In [None]:
import pandas as pd
import sklearn
import tweepy
import csv
import json

In [None]:
# Read the Twitter API credentials from a local JSON file.
# The parsed dict is kept in `secrets` for the download function to use.
with open('secrets.json') as data_file:
    secrets = json.loads(data_file.read())

In [None]:
def get_all_tweets(screen_name):
    """Download a user's timeline and save the non-retweets to '<screen_name>_tweets.csv'.

    Pages backwards through ``api.user_timeline`` 200 tweets at a time, passing
    ``max_id`` so each request starts just below the oldest tweet already
    fetched. Twitter only allows access to a user's most recent 3240 tweets
    with this method.

    Statuses that carry a ``retweeted_status`` attribute are dropped. The CSV
    has the columns ``id``, ``created_at``, ``retweets`` and ``text``. If the
    first request returns no tweets, the CSV contains only the header row.

    Credentials are read from the module-level ``secrets`` dict
    (``consumer_key``, ``consumer_secret``, ``access_key``, ``access_secret``).

    Args:
        screen_name: Screen name of the account to download.

    Returns:
        None. The result is the CSV file written to the working directory.
    """
    # Authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(secrets["consumer_key"], secrets["consumer_secret"])
    auth.set_access_token(secrets["access_key"], secrets["access_secret"])
    api = tweepy.API(auth)

    # Make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)

    # List holding every tweepy Tweet fetched so far
    alltweets = list(new_tweets)

    # Keep grabbing tweets until a request comes back empty. The page is
    # checked before it is indexed, so an empty first page skips the loop
    # instead of raising IndexError.
    while len(new_tweets) > 0:
        # Id of the oldest tweet fetched so far, less one
        oldest = new_tweets[-1].id - 1
        print("getting tweets before %s" % oldest)

        # All subsequent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest)
        alltweets.extend(new_tweets)

        print("...%s tweets downloaded so far" % len(alltweets))

    # Keep only non-RTs and turn each remaining tweet into one CSV row
    outtweets = [
        [tweet.id_str, tweet.created_at, tweet.retweet_count, tweet.text.encode("utf-8")]
        for tweet in alltweets
        if not hasattr(tweet, 'retweeted_status')
    ]

    # Write the csv.
    # NOTE(review): binary mode plus pre-encoded text is the Python 2 csv
    # convention; running this under Python 3 would need a text-mode file.
    with open('%s_tweets.csv' % screen_name, 'wb') as f:
        writer = csv.writer(f)
        writer.writerow(["id", "created_at", "retweets", "text"])
        writer.writerows(outtweets)

if __name__ == '__main__':
    # Screen name of the account whose timeline should be downloaded
    get_all_tweets(screen_name="wojespn")

In [None]:
# Load the saved tweets, including their retweet counts, into a DataFrame
woj = pd.read_csv('wojespn_tweets.csv')

# Preview the first five rows
woj.head(5)

In [None]:
# Run countvectorizer on text column

import re

# Remove URLs
def clean_text(tweet):
    """Return ``tweet`` with every 'http' followed by non-whitespace characters removed.

    Only the matched text is deleted; whitespace around it is left as it was.
    """
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', tweet)

# Strip URLs from the raw text; this cleaned column is what gets vectorized below
woj["cleaned_text"] = woj["text"].apply(clean_text)

# Get the length of the tweet (measured on the raw text, before URL removal)
woj["tweet_length"] = woj["text"].apply(len)

# Get the hour and day of the tweet. The timestamp column is parsed once as a
# whole instead of calling pd.to_datetime on every row for each feature.
created_ts = pd.to_datetime(woj["created_at"])
woj["tweet_hour"] = created_ts.dt.hour
woj["tweet_day"] = created_ts.dt.dayofweek


# NOTE: once "text" is removed this cell cannot be re-run on the same frame;
# reload `woj` from the CSV first.
del woj["text"]

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Count unigrams, bigrams and trigrams, ignoring English stop words
vectorizer = CountVectorizer(ngram_range=(1,3), stop_words='english')
vectorizer = vectorizer.fit(woj["cleaned_text"])
countvectorizer = vectorizer.transform(woj["cleaned_text"])

# Preview only the first rows rather than rendering the whole frame
woj.head()

In [None]:
# One column per vocabulary term, in the vectorizer's feature order
words = vectorizer.get_feature_names()
word_counts = pd.DataFrame(countvectorizer.toarray(), columns=words)

# Append the term counts to the tweet-level columns.
# NOTE(review): a vocabulary term equal to an existing column name
# (e.g. "retweets") would produce duplicate column labels here - verify.
woj = pd.concat([woj, word_counts], axis=1)
woj.head()

In [None]:
# Build the model

import numpy as np
from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)

# Hold out roughly 25% of the rows for testing. The split is drawn from a
# seeded generator so the same rows land in train/test on every run; the
# forest itself is already seeded through random_state.
split_rng = np.random.RandomState(0)
woj['is_train'] = split_rng.uniform(0, 1, len(woj)) <= .75
train, test = woj[woj['is_train']], woj[~woj['is_train']]

# Fit on the term-count columns only, predicting the retweet count
clf.fit(train[words], train['retweets'])

In [None]:
## Play around with various parameters: compare candidate regressors on the held-out rows

from sklearn.metrics import mean_squared_error

# Baseline kept for reference:
# clf = RandomForestRegressor(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
# clf.fit(train[words], train['retweets'])
# mean_squared_error(test[['retweets']], clf.predict(test[words]))

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV

from sklearn.model_selection import GridSearchCV

# Settings for the gradient boosting candidate (currently commented out below)
params = dict(n_estimators=100, max_depth=4, min_samples_split=2,
              learning_rate=0.01, loss='ls')

# Candidates to evaluate, keyed by display name; uncomment an entry to include it
ESTIMATORS = {
#     "Extra trees": ExtraTreesRegressor(n_estimators=10),
#     "Random Forests": RandomForestRegressor(n_estimators=10),
#     "K-nn": KNeighborsRegressor(),
    "Ridge": RidgeCV(),
#     "Lasso": LassoCV(), 
#     "Gradient Boosting": GradientBoostingRegressor(**params),
}

# The same grid is searched for every enabled candidate
parameter_candidates = [
  dict(fit_intercept=[True, False], normalize=[True, False]),
]

y_test_predict = {}
for name, estimator in ESTIMATORS.items():
    # Wrap the candidate in a grid search, fit on the training rows,
    # then report mean squared error on the test rows
    estimator = GridSearchCV(estimator=estimator, param_grid=parameter_candidates, n_jobs=-1)
    estimator.fit(train[words], train['retweets'])
    mse = mean_squared_error(test[['retweets']], estimator.predict(test[words]))
    print("%s: %s" % (name, mse))

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Put each held-out tweet next to the model's prediction, lowest prediction first.
# reset_index() gives the test rows a fresh 0..n-1 index so they line up with
# the freshly built prediction frame.
actual = test[['retweets', 'cleaned_text']].reset_index()
predicted = pd.DataFrame(clf.predict(test[words]), columns=["predicted"])
pd.concat([actual, predicted], axis=1).sort_values('predicted')

In [None]:
# Pair every vocabulary term with the importance the fitted model assigned to it
feature_imps = pd.DataFrame(
    {"words": words, "importances": clf.feature_importances_},
    columns=["words", "importances"],
)

# Most important terms first
feature_imps.sort_values("importances", ascending=False)

In [None]:
# export model

# scikit-learn deprecated its bundled copy of joblib in 0.21 and removed it in
# 0.23, so prefer the standalone package and fall back to the bundled one on
# older installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist both the fitted model and the fitted vectorizer: predicting on new
# text needs the same vocabulary the model was trained on.
joblib.dump(clf, 'model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

In [None]:
# Does this model work?

In [None]:
# Predicted retweets for a hand-written tweet containing all three terms
trial_tweet = "Magic league sources."
trial_counts = vectorizer.transform([trial_tweet]).toarray()
clf.predict(trial_counts)[0]

In [None]:
# Same probe with "sources" left out
trial_tweet = "Magic league."
trial_counts = vectorizer.transform([trial_tweet]).toarray()
clf.predict(trial_counts)[0]

In [None]:
# Same probe with "Magic" left out
trial_tweet = "league sources"
trial_counts = vectorizer.transform([trial_tweet]).toarray()
clf.predict(trial_counts)[0]

In [None]:
# Same probe with "league" left out
trial_tweet = "Magic sources."
trial_counts = vectorizer.transform([trial_tweet]).toarray()
clf.predict(trial_counts)[0]