In [266]:
import pandas as pd
import sklearn
import tweepy
import csv
import json

In [273]:
# Load Twitter tokens

with open('secrets.json') as data_file:    
    secrets = json.load(data_file)
    for key in secrets:
        key = secrets[key]

'P9KHuEfRyk6KH67oMtBEvYEsY'

In [130]:
def get_all_tweets(screen_name):
    # Twitter only allows access to a users most recent 3240 tweets with this method
    
    # Authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)
    
    # Initialize a list to hold all the tweepy Tweets
    alltweets = []    
    
    # Make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name = screen_name, count=200)
    
    # Save most recent tweets
    alltweets.extend(new_tweets)
    
    # Save the id of the oldest tweet less one
    oldest = alltweets[-1].id - 1
    
    # Keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        print "getting tweets before %s" % (oldest)
        
        # All subsiquent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name = screen_name, count=200, max_id=oldest)
        
        # Save most recent tweets
        alltweets.extend(new_tweets)
        
        # Update the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1
        
        print "...%s tweets downloaded so far" % (len(alltweets))
    
    # Transform the tweepy tweets into a 2D array that will populate the csv    
    
    # Keep only non-RTs
    alltweets = [x for x in alltweets if not hasattr(x, 'retweeted_status')] 
    outtweets = [[tweet.id_str, tweet.created_at, tweet.retweet_count, tweet.text.encode("utf-8")] for tweet in alltweets]
    
    # Write the csv    
    with open('%s_tweets.csv' % screen_name, 'wb') as f:
        writer = csv.writer(f)
        writer.writerow(["id", "created_at", "retweets", "text"])
        writer.writerows(outtweets)
    
    pass

if __name__ == '__main__':
    # Pass in the username of the account you want to download
    get_all_tweets("jaymahabal")

getting tweets before 877984097242664959
...400 tweets downloaded so far
getting tweets before 865268143735832575
...600 tweets downloaded so far
getting tweets before 851052506583052287
...800 tweets downloaded so far
getting tweets before 835154553918083071
...999 tweets downloaded so far
getting tweets before 824825134624149503
...1199 tweets downloaded so far
getting tweets before 804329878304751615
...1399 tweets downloaded so far
getting tweets before 784063659563548672
...1599 tweets downloaded so far
getting tweets before 765975667527806975
...1799 tweets downloaded so far
getting tweets before 751065860190703615
...1999 tweets downloaded so far
getting tweets before 748888487953362943
...2199 tweets downloaded so far
getting tweets before 746038398486470655
...2399 tweets downloaded so far
getting tweets before 734425237991915520
...2599 tweets downloaded so far
getting tweets before 723012656161013759
...2799 tweets downloaded so far
getting tweets before 708035665477439487
.

In [284]:
# Grab .csv of all tweets and their retweet numbers

woj = pd.read_csv('wojespn_tweets.csv')
woj.head()

Unnamed: 0,id,created_at,retweets,text
0,882790853172920321,2017-07-06 02:38:35,2690,"Andre Roberson has agreed to a new three-year,..."
1,882773040341430272,2017-07-06 01:27:48,1813,"Jazz, Celtics engage on Gordon Hayward sign-an..."
2,882750266335838208,2017-07-05 23:57:18,2301,"Sources: Dion Waiters has agreed to a 4-year, ..."
3,882717396577943552,2017-07-05 21:46:42,889,"Sources: Waiters, Heat finalizing a new four-y..."
4,882712854771716098,2017-07-05 21:28:39,4256,Free agent guard Dion Waiters is finalizing a ...


In [285]:
# Run TFIDF on text column

import re

def clean_text(tweet):
    return re.sub(r'http\S+', '', tweet)

woj["cleaned_text"] = woj["text"].apply(lambda x: clean_text(x))
del woj["text"]

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
vectorizer = CountVectorizer(ngram_range=(2,3), stop_words='english')
vectorizer = vectorizer.fit(woj["cleaned_text"])
tfidf = vectorizer.transform(woj["cleaned_text"])

In [286]:
words = vectorizer.get_feature_names()
woj = pd.concat([woj, pd.DataFrame(tfidf.toarray(), columns=words)], axis=1)
woj

Unnamed: 0,id,created_at,retweets,cleaned_text,01 league,01 league sources,01 meetings,01 meetings teams,01 nears,01 nears free,...,zhou qi 43,zhou qi agrees,zimmerman 41,zimmerman 41 sources,zimmerman makes,zimmerman makes decision,zo kenyon,zo kenyon nets,zubac 32nd,zubac 32nd pick
0,882790853172920321,2017-07-06 02:38:35,2690,"Andre Roberson has agreed to a new three-year,...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,882773040341430272,2017-07-06 01:27:48,1813,"Jazz, Celtics engage on Gordon Hayward sign-an...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,882750266335838208,2017-07-05 23:57:18,2301,"Sources: Dion Waiters has agreed to a 4-year, ...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,882717396577943552,2017-07-05 21:46:42,889,"Sources: Waiters, Heat finalizing a new four-y...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,882712854771716098,2017-07-05 21:28:39,4256,Free agent guard Dion Waiters is finalizing a ...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,882668721508163584,2017-07-05 18:33:17,442,ESPN story on Nick Young joining the Golden St...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,882668215062601729,2017-07-05 18:31:16,2066,Free agent Dion Waiters inching closer to reso...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,882659393954652160,2017-07-05 17:56:13,8234,Free agent guard Nick Young has agreed to a on...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,882580263750053889,2017-07-05 12:41:47,1237,Free agent JaVale McGee has a meeting set with...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,882460718628569089,2017-07-05 04:46:45,324,ESPN story on Clippers reaching agreement for ...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [287]:
# Build the model

import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

clf = RandomForestRegressor(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
woj['is_train'] = np.random.uniform(0, 1, len(woj)) <= .75
train, test = woj[woj['is_train']==True], woj[woj['is_train']==False]

clf.fit(train[words], train['retweets'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

In [288]:
clf.predict(test[words])

array([  696.83269231,  1032.9       ,  3286.7       ,   401.50530983,
        2565.7       ,  1046.45146368,   328.05723291,  1133.3       ,
         526.04628205,   870.23269231,  1218.5       ,   548.        ,
        1758.5       ,  1218.5       ,   863.7       ,    99.4       ,
         863.7       ,   328.05723291,   328.05723291,   399.65098291,
         328.05723291,   328.05723291,  1840.        ,  1078.8682906 ,
         338.8       ,   648.90530983,   348.        ,   161.16358974,
         328.05723291,   325.9       ,   540.1       ,   736.        ,
         325.9       ,   419.5       ,   574.2       ,  1480.08076923,
        4180.3       ,   325.9       ,   328.05723291,  1329.44820513,
         418.93223291,   484.43858974,  1435.1       ,   328.05723291,
         328.05723291,  1307.02320513,  1218.4424359 ,   298.21935897,
         328.05723291,   331.65098291,   492.25384615,   473.6       ,
         328.05723291,   419.        ,    50.93125   ,    99.67777778,
      

In [289]:
import matplotlib.pyplot as plt
%matplotlib inline

pd.concat([test[['retweets', 'cleaned_text']].reset_index(), pd.DataFrame(clf.predict(test[words]), columns=["predicted"])], axis=1).sort_values('predicted')

Unnamed: 0,index,retweets,cleaned_text,predicted
151,618,50,Coming Monday on @TheVertical Podcast: Sit-dow...,27.600000
264,1110,40,We go all-in on @KDTrey5's free agency on @The...,28.700000
207,840,25,"On doorstep of Allen Iverson's HOF induction, ...",29.800000
421,1629,27,Sixers coach Brett Brown for a revealing hour ...,29.800000
409,1597,59,Scal on 'Zo-Kenyon Nets fight; telling Rod Tho...,29.800000
311,1271,13,Next on @TheVertical Pod w/ Woj: Chris Herren\...,31.500000
401,1582,40,Jeff Van Gundy on @TheVertical pod w/ Woj: Pla...,31.500000
374,1479,16,Plus: @TheVertical Pod w/Woj Insider Segment: ...,31.500000
373,1476,16,.@TheVertical Pod w/ Woj: Nets GM Sean Marks o...,31.500000
206,839,46,New @TheVertical Pod: Ex-76ers GM Billy King o...,34.400000


In [290]:
clf.feature_importances_
feature_imps = pd.concat([pd.DataFrame(words, columns=["words"]), pd.DataFrame(clf.feature_importances_, columns=["importances"])], axis=1)
feature_imps.sort_values("importances", ascending=False)

Unnamed: 0,words,importances
5025,contract clippers trading,0.037227
14656,mozgov angelo,0.036800
24254,trading tim,0.026832
16547,paul agreed,0.026532
16549,paul george,0.022550
2939,brook lopez 27th,0.021976
397,27th pick,0.021967
21400,sources tell,0.019737
1231,agreed opt contract,0.018799
20317,serge ibaka,0.018206


In [291]:
# export model

from sklearn.externals import joblib
joblib.dump(clf, 'model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

In [292]:
trial_tweet = "Magic league sources."
clf.predict(vectorizer.transform([trial_tweet]).toarray())[0]

276.83979700854701

In [293]:
trial_tweet = "Magic league."
clf.predict(vectorizer.transform([trial_tweet]).toarray())[0]

328.05723290598291

In [294]:
trial_tweet = "league sources"
clf.predict(vectorizer.transform([trial_tweet]).toarray())[0]

276.83979700854701

In [295]:
trial_tweet = "Magic sources."
clf.predict(vectorizer.transform([trial_tweet]).toarray())[0]

328.05723290598291

In [265]:
import requests
import json

payload = {"tweet":"Magic league sources."}
r = requests.post('http://127.0.0.1:5000/predict', data = json.dumps(payload), headers={'Content-Type': 'application/json'})
r.json()["prediction"]

10731