In [266]:
import pandas as pd
import sklearn
import tweepy
import csv
import json

In [273]:
# Load Twitter tokens

with open('secrets.json') as data_file:    
    secrets = json.load(data_file)
    for key in secrets:
        key = secrets[key]

'P9KHuEfRyk6KH67oMtBEvYEsY'

In [130]:
def get_all_tweets(screen_name):
    # Twitter only allows access to a users most recent 3240 tweets with this method
    
    # Authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)
    
    # Initialize a list to hold all the tweepy Tweets
    alltweets = []    
    
    # Make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name = screen_name, count=200)
    
    # Save most recent tweets
    alltweets.extend(new_tweets)
    
    # Save the id of the oldest tweet less one
    oldest = alltweets[-1].id - 1
    
    # Keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        print "getting tweets before %s" % (oldest)
        
        # All subsiquent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name = screen_name, count=200, max_id=oldest)
        
        # Save most recent tweets
        alltweets.extend(new_tweets)
        
        # Update the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1
        
        print "...%s tweets downloaded so far" % (len(alltweets))
    
    # Transform the tweepy tweets into a 2D array that will populate the csv    
    
    # Keep only non-RTs
    alltweets = [x for x in alltweets if not hasattr(x, 'retweeted_status')] 
    outtweets = [[tweet.id_str, tweet.created_at, tweet.retweet_count, tweet.text.encode("utf-8")] for tweet in alltweets]
    
    # Write the csv    
    with open('%s_tweets.csv' % screen_name, 'wb') as f:
        writer = csv.writer(f)
        writer.writerow(["id", "created_at", "retweets", "text"])
        writer.writerows(outtweets)
    
    pass

if __name__ == '__main__':
    # Pass in the username of the account you want to download
    get_all_tweets("jaymahabal")

getting tweets before 877984097242664959
...400 tweets downloaded so far
getting tweets before 865268143735832575
...600 tweets downloaded so far
getting tweets before 851052506583052287
...800 tweets downloaded so far
getting tweets before 835154553918083071
...999 tweets downloaded so far
getting tweets before 824825134624149503
...1199 tweets downloaded so far
getting tweets before 804329878304751615
...1399 tweets downloaded so far
getting tweets before 784063659563548672
...1599 tweets downloaded so far
getting tweets before 765975667527806975
...1799 tweets downloaded so far
getting tweets before 751065860190703615
...1999 tweets downloaded so far
getting tweets before 748888487953362943
...2199 tweets downloaded so far
getting tweets before 746038398486470655
...2399 tweets downloaded so far
getting tweets before 734425237991915520
...2599 tweets downloaded so far
getting tweets before 723012656161013759
...2799 tweets downloaded so far
getting tweets before 708035665477439487
.

In [174]:
# Grab .csv of all tweets and their retweet numbers

woj = pd.read_csv('wojespn_tweets.csv')
woj.head()

Unnamed: 0,id,created_at,retweets,text
0,882790853172920321,2017-07-06 02:38:35,2690,"Andre Roberson has agreed to a new three-year,..."
1,882773040341430272,2017-07-06 01:27:48,1813,"Jazz, Celtics engage on Gordon Hayward sign-an..."
2,882750266335838208,2017-07-05 23:57:18,2301,"Sources: Dion Waiters has agreed to a 4-year, ..."
3,882717396577943552,2017-07-05 21:46:42,889,"Sources: Waiters, Heat finalizing a new four-y..."
4,882712854771716098,2017-07-05 21:28:39,4256,Free agent guard Dion Waiters is finalizing a ...


In [175]:
# Run TFIDF on text column

import re

def clean_text(tweet):
    return re.sub(r'http\S+', '', tweet)

woj["cleaned_text"] = woj["text"].apply(lambda x: clean_text(x))
del woj["text"]

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
vectorizer = CountVectorizer(ngram_range=(2,3), stop_words='english')
vectorizer = vectorizer.fit(woj["cleaned_text"])
tfidf = vectorizer.transform(woj["cleaned_text"])

In [176]:
words = vectorizer.get_feature_names()
woj = pd.concat([woj, pd.DataFrame(tfidf.toarray(), columns=words)], axis=1)
woj

Unnamed: 0,id,created_at,retweets,cleaned_text,01 league,01 league sources,01 meetings,01 meetings teams,01 nears,01 nears free,...,zhou qi 43,zhou qi agrees,zimmerman 41,zimmerman 41 sources,zimmerman makes,zimmerman makes decision,zo kenyon,zo kenyon nets,zubac 32nd,zubac 32nd pick
0,882790853172920321,2017-07-06 02:38:35,2690,"Andre Roberson has agreed to a new three-year,...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,882773040341430272,2017-07-06 01:27:48,1813,"Jazz, Celtics engage on Gordon Hayward sign-an...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,882750266335838208,2017-07-05 23:57:18,2301,"Sources: Dion Waiters has agreed to a 4-year, ...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,882717396577943552,2017-07-05 21:46:42,889,"Sources: Waiters, Heat finalizing a new four-y...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,882712854771716098,2017-07-05 21:28:39,4256,Free agent guard Dion Waiters is finalizing a ...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,882668721508163584,2017-07-05 18:33:17,442,ESPN story on Nick Young joining the Golden St...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,882668215062601729,2017-07-05 18:31:16,2066,Free agent Dion Waiters inching closer to reso...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,882659393954652160,2017-07-05 17:56:13,8234,Free agent guard Nick Young has agreed to a on...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,882580263750053889,2017-07-05 12:41:47,1237,Free agent JaVale McGee has a meeting set with...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,882460718628569089,2017-07-05 04:46:45,324,ESPN story on Clippers reaching agreement for ...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [177]:
# Build the model

import numpy as np
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
woj['is_train'] = np.random.uniform(0, 1, len(woj)) <= .75
train, test = woj[woj['is_train']==True], woj[woj['is_train']==False]

clf.fit(train[words], train['retweets'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [178]:
clf.predict(test[words])

array([ 2301,   460,    79,   610,   720,   580,  1827,   440,    29,
         919,   139,    29,  3879,    29,  3879,  1642,    29,  3879,
        3879,   440,   491,   720,   100,   160,    29,   610,   160,
          29,   375,    38,  2689,   160,   932,   321,   160,   321,
         160,   580,    29,    29,   292,    29,   442,  2689,  1812,
          29,    29,   436,    29,    29,  2689,   949,    29,   207,
         200,    29,    11,    30,    29,    17,   312,   580,    30,
         262,   333,    59,    30,    29,   205,    10,  5580,   107,
          11,    38,    29,    29,    29,   192,    29,    29,    17,
          29,    29,    29,    29,    11,    29,   160,    29,    11,
          29,    29,   192,    20,   208,    94,   580,  2689,    29,
        1648,    20,   580,   321,    29,    29,    29,    17,   111,
        2823,   580,    29,   337,   111,   321,   321,   101,    29,
          38,    29,   231,   117,   321,   160,   123,   374,    96,
          29,    29,

In [179]:
import matplotlib.pyplot as plt
%matplotlib inline

pd.concat([test[['retweets', 'cleaned_text']].reset_index(), pd.DataFrame(clf.predict(test[words]), columns=["predicted"])], axis=1).sort_values('predicted')

Unnamed: 0,index,retweets,cleaned_text,predicted
216,857,228,The Vertical's Front Office Insider @BobbyMark...,4
363,1428,86,Why Ben Simmons is still off-limits to NBA exe...,4
69,265,165,Story on @TheVertical: Orlando has hired Toron...,10
402,1594,125,Next Euro star destined to hit NBA Draft: 7-fo...,11
72,281,62,Sources on @TheVertical: Duke guard Frank Jack...,11
56,217,18,Bobby Marks joins @TheVertical Pod. We talk (m...,11
359,1420,192,Vertical Sources: Seven-foot UNLV freshman Ste...,11
85,342,278,Sources on @TheVertical: South Carolina sophom...,11
89,361,379,".@luka7doncic is 18, a star for Real Madrid an...",11
138,546,2093,Portland has three first-round picks in the lo...,11


In [180]:
clf.feature_importances_
feature_imps = pd.concat([pd.DataFrame(words, columns=["words"]), pd.DataFrame(clf.feature_importances_, columns=["importances"])], axis=1)
feature_imps.sort_values("importances", ascending=False)

Unnamed: 0,words,importances
3087,bruce best,0.001068
2689,bookmark thevertical,0.001029
23018,terrific story,0.000984
4710,column thevertical,0.000904
14736,mvp nba history,0.000896
21411,sources thevertical,0.000757
24418,turn tv join,0.000737
17141,play nba,0.000734
3151,bucks owner,0.000732
23714,timkawakami vinparise,0.000731


In [257]:
# export model

from sklearn.externals import joblib
joblib.dump(clf, 'model.pkl')
joblib.dump(words, 'model_columns.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

In [212]:
trial_tweet = "Magic league sources."
clf.predict(vectorizer.transform([trial_tweet]).toarray())[0]

10731

In [213]:
trial_tweet = "Magic league."
clf.predict(vectorizer.transform([trial_tweet]).toarray())[0]

29

In [215]:
trial_tweet = "league sources"
clf.predict(vectorizer.transform([trial_tweet]).toarray())[0]

159

In [216]:
trial_tweet = "Magic sources."
clf.predict(vectorizer.transform([trial_tweet]).toarray())[0]

29

In [265]:
import requests
import json

payload = {"tweet":"Magic league sources."}
r = requests.post('http://127.0.0.1:5000/predict', data = json.dumps(payload), headers={'Content-Type': 'application/json'})
r.json()["prediction"]

10731