In [1]:
import pandas as pd
import sklearn
import tweepy
import csv
import json

In [None]:
# Load Twitter tokens

with open('secrets.json') as data_file:
    secrets = json.load(data_file)

# Bind each credential to the module-level name that get_all_tweets() reads.
# Rebinding a loop variable does not create these names, so they are
# assigned explicitly here.
# NOTE(review): assumes secrets.json uses exactly these four keys -- confirm.
consumer_key = secrets['consumer_key']
consumer_secret = secrets['consumer_secret']
access_key = secrets['access_key']
access_secret = secrets['access_secret']

In [None]:
def get_all_tweets(screen_name):
    """Download a user's timeline and save the non-retweets to a CSV file.

    Pages backwards through ``api.user_timeline`` 200 tweets at a time until
    a request comes back empty, drops retweets, and writes the remaining
    tweets to ``'<screen_name>_tweets.csv'`` with the columns
    id, created_at, retweets, text.

    Twitter only allows access to a user's most recent 3240 tweets with this
    method.

    Authentication uses the module-level names ``consumer_key``,
    ``consumer_secret``, ``access_key`` and ``access_secret``.

    Args:
        screen_name: screen name of the account to download.

    Returns:
        None. The result is the CSV file written to the working directory;
        an account with no tweets produces a file with only the header row.
    """
    # Authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)

    # Holds every tweepy Tweet downloaded so far
    alltweets = []

    # Initial request for the most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)
    alltweets.extend(new_tweets)

    # Keep grabbing tweets until a request returns nothing. Testing new_tweets
    # before indexing alltweets[-1] means an empty timeline no longer raises
    # IndexError.
    while new_tweets:
        # Id of the oldest tweet less one; passed as max_id so that all
        # subsequent requests cannot return duplicates
        oldest = alltweets[-1].id - 1
        print("getting tweets before %s" % (oldest))

        new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest)
        alltweets.extend(new_tweets)

        print("...%s tweets downloaded so far" % (len(alltweets)))

    # Keep only non-RTs
    alltweets = [x for x in alltweets if not hasattr(x, 'retweeted_status')]

    # Transform the tweepy tweets into a 2D array that will populate the csv
    outtweets = [
        [tweet.id_str, tweet.created_at, tweet.retweet_count, tweet.text.encode("utf-8")]
        for tweet in alltweets
    ]

    # Write the csv. Binary mode plus utf-8 encoded text is the Python 2 csv
    # convention, which matches the rest of this notebook.
    with open('%s_tweets.csv' % screen_name, 'wb') as f:
        writer = csv.writer(f)
        writer.writerow(["id", "created_at", "retweets", "text"])
        writer.writerows(outtweets)

if __name__ == '__main__':
    # Screen name of the account whose timeline should be downloaded
    target_account = "jaymahabal"
    get_all_tweets(target_account)

In [2]:
# Load the exported tweets (one row per tweet, including its retweet count)
# and preview the first rows.

tweets_csv_path = 'wojespn_tweets.csv'
woj = pd.read_csv(tweets_csv_path)
woj.head()

Unnamed: 0,id,created_at,retweets,text
0,882790853172920321,2017-07-06 02:38:35,2690,"Andre Roberson has agreed to a new three-year,..."
1,882773040341430272,2017-07-06 01:27:48,1813,"Jazz, Celtics engage on Gordon Hayward sign-an..."
2,882750266335838208,2017-07-05 23:57:18,2301,"Sources: Dion Waiters has agreed to a 4-year, ..."
3,882717396577943552,2017-07-05 21:46:42,889,"Sources: Waiters, Heat finalizing a new four-y..."
4,882712854771716098,2017-07-05 21:28:39,4256,Free agent guard Dion Waiters is finalizing a ...


In [3]:
# Strip URLs from the text column, then build n-gram count features (CountVectorizer, not TF-IDF)

import re

# Matches "http" followed by any run of non-whitespace characters
_URL_PATTERN = re.compile(r'http\S+')

def clean_text(tweet):
    """Return `tweet` with every URL-like token (from 'http' up to the next whitespace) removed."""
    return _URL_PATTERN.sub('', tweet)

# Store the URL-free text in a new column and drop the raw text. The guard
# keeps this cell re-runnable: once "text" has been removed, running the cell
# again would otherwise raise a KeyError.
if "text" in woj.columns:
    woj["cleaned_text"] = woj["text"].apply(clean_text)
    del woj["text"]

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Count word bigrams and trigrams, ignoring English stop words.
# NOTE: despite its name, `tfidf` holds the CountVectorizer output (raw
# counts), not TF-IDF weights; the name is kept because later cells use it.
vectorizer = CountVectorizer(ngram_range=(2,3), stop_words='english')
vectorizer = vectorizer.fit(woj["cleaned_text"])
tfidf = vectorizer.transform(woj["cleaned_text"])

In [4]:
# Feature names reported by the fitted vectorizer, used as column labels
words = vectorizer.get_feature_names()

# Expand the vectorizer output into a dense frame and attach it column-wise
# next to the existing tweet columns.
ngram_counts = pd.DataFrame(tfidf.toarray(), columns=words)
woj = pd.concat([woj, ngram_counts], axis=1)
woj

Unnamed: 0,id,created_at,retweets,cleaned_text,01 league,01 league sources,01 meetings,01 meetings teams,01 nears,01 nears free,...,zhou qi 43,zhou qi agrees,zimmerman 41,zimmerman 41 sources,zimmerman makes,zimmerman makes decision,zo kenyon,zo kenyon nets,zubac 32nd,zubac 32nd pick
0,882790853172920321,2017-07-06 02:38:35,2690,"Andre Roberson has agreed to a new three-year,...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,882773040341430272,2017-07-06 01:27:48,1813,"Jazz, Celtics engage on Gordon Hayward sign-an...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,882750266335838208,2017-07-05 23:57:18,2301,"Sources: Dion Waiters has agreed to a 4-year, ...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,882717396577943552,2017-07-05 21:46:42,889,"Sources: Waiters, Heat finalizing a new four-y...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,882712854771716098,2017-07-05 21:28:39,4256,Free agent guard Dion Waiters is finalizing a ...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,882668721508163584,2017-07-05 18:33:17,442,ESPN story on Nick Young joining the Golden St...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,882668215062601729,2017-07-05 18:31:16,2066,Free agent Dion Waiters inching closer to reso...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,882659393954652160,2017-07-05 17:56:13,8234,Free agent guard Nick Young has agreed to a on...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,882580263750053889,2017-07-05 12:41:47,1237,Free agent JaVale McGee has a meeting set with...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,882460718628569089,2017-07-05 04:46:45,324,ESPN story on Clippers reaching agreement for ...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Build the model

import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

clf = RandomForestRegressor(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)

# Roughly 75/25 train/test split, one uniform draw per row. A dedicated
# seeded generator makes the split identical on every run, in line with the
# fixed random_state of the regressor.
split_rng = np.random.RandomState(0)
woj['is_train'] = split_rng.uniform(0, 1, len(woj)) <= .75
train, test = woj[woj['is_train']], woj[~woj['is_train']]

# Fit here: the later cells that call clf.predict(...), read
# clf.feature_importances_ and export the model all need a fitted estimator.
clf.fit(train[words], train['retweets'])

In [None]:
## Play around with various parameters

from sklearn.metrics import mean_squared_error

# Earlier baseline, kept for reference:
# clf = RandomForestRegressor(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
# clf.fit(train[words], train['retweets'])
# mean_squared_error(test[['retweets']], clf.predict(test[words]))

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV

from sklearn.model_selection import GridSearchCV

# Settings for the gradient boosting candidate (currently disabled below)
params = dict(
    n_estimators=100,
    max_depth=4,
    min_samples_split=2,
    learning_rate=0.01,
    loss='ls',
)

# Candidate models keyed by display name; only the uncommented entries are searched
ESTIMATORS = {
#     "Extra trees": ExtraTreesRegressor(n_estimators=10),
#     "Random Forests": RandomForestRegressor(n_estimators=10),
#     "K-nn": KNeighborsRegressor(),
    "Ridge": RidgeCV(),
#     "Lasso": LassoCV(),
#     "Gradient Boosting": GradientBoostingRegressor(**params),
}

# Grid handed to GridSearchCV for every candidate model
ridge_grid = {
    'alphas': [1, 2],
    'fit_intercept': [True, False],
    'normalize': [True, False],
}
parameter_candidates = [ridge_grid]

# clf = GridSearchCV(estimator=estimator, param_grid=parameter_candidates, n_jobs=-1)

y_test_predict = {}
for name, estimator in ESTIMATORS.items():
    # Wrap the candidate in a grid search, fit on the training rows, then
    # report the mean squared error on the held-out rows.
    estimator = GridSearchCV(estimator=estimator, param_grid=parameter_candidates, n_jobs=-1)
    estimator.fit(train[words], train['retweets'])
    test_mse = mean_squared_error(test[['retweets']], estimator.predict(test[words]))
    print(name + ": " + str(test_mse))

In [16]:
### Try using H2O

import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
# from h2o.estimators.deeplearning import H2ODeepLearningEstimator

# Initialise the H2O connection before any frame is uploaded, so that this
# cell does not depend on a later cell having been run first.
h2o.init()

# dl_model = H2ODeepLearningEstimator(hidden=[50,50,50,50], epochs=50)

dl_model = H2OGradientBoostingEstimator(ntrees=100,
                                        max_depth=6,
                                        learn_rate=0.1)

# NOTE(review): the saved output of this cell shows H2O rejecting the upload
# with "Cannot determine file type"; the cause is not visible here -- confirm
# whether these frames parse before relying on the numbers below.
h2o_train = h2o.H2OFrame(train)
h2o_test = h2o.H2OFrame(test)

dl_model.train(x = words,
          y = "retweets",
          training_frame = h2o_train,
          validation_frame = h2o_test)

# Score against the uploaded H2OFrames rather than the pandas frames, which
# model_performance() does not accept.
print(dl_model.model_performance(h2o_train).mse())
print(dl_model.model_performance(h2o_test).mse())


H2OResponseError: Server error water.exceptions.H2OIllegalArgumentException:
  Error: Cannot determine file type. for upload_9f21d6f739f6139252b8b90aa2994f66
  Request: POST /3/ParseSetup
    data: {u'source_frames': u'["upload_9f21d6f739f6139252b8b90aa2994f66"]', u'check_header': '1', u'separator': '44'}


In [15]:
import h2o

# Connect to a local H2O server; per the output below, one is started if none is running.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.6.0_65"; Java(TM) SE Runtime Environment (build 1.6.0_65-b14-468-11M4833); Java HotSpot(TM) 64-Bit Server VM (build 20.65-b04-468, mixed mode)
  Starting server from /Users/jaymahabal/Documents/2017/personal-projects/wojbomb/venv/lib/python2.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/by/7y9lhkz90bn5046h0c_y90h00000gn/T/tmpyxkAQA
  JVM stdout: /var/folders/by/7y9lhkz90bn5046h0c_y90h00000gn/T/tmpyxkAQA/h2o_jaymahabal_started_from_python.out
  JVM stderr: /var/folders/by/7y9lhkz90bn5046h0c_y90h00000gn/T/tmpyxkAQA/h2o_jaymahabal_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,06 secs
H2O cluster version:,3.10.4.8
H2O cluster version age:,1 month and 19 days
H2O cluster name:,H2O_from_python_jaymahabal_bxz9qc
H2O cluster total nodes:,1
H2O cluster free memory:,123.9 Mb
H2O cluster total cores:,4
H2O cluster allowed cores:,4
H2O cluster status:,"accepting new members, healthy"
H2O connection url:,http://127.0.0.1:54321


In [None]:
# Kernel smoke test
print("hi")

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

pd.concat([test[['retweets', 'cleaned_text']].reset_index(), pd.DataFrame(clf.predict(test[words]), columns=["predicted"])], axis=1).sort_values('predicted')

In [None]:
# One row per n-gram feature with the importance the fitted model assigns to it,
# most important first.
feature_imps = pd.DataFrame(
    {"words": words, "importances": clf.feature_importances_},
    columns=["words", "importances"],
)
feature_imps.sort_values("importances", ascending=False)

In [None]:
# export model

# Older scikit-learn bundles joblib under sklearn.externals; newer releases
# removed that copy, so fall back to the standalone joblib package.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

# Persist the fitted model and the fitted vectorizer side by side so that new
# text can be transformed and scored later.
joblib.dump(clf, 'model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

In [None]:
# Predicted retweets for a hand-written example tweet
trial_tweet = "Magic league sources."
trial_features = vectorizer.transform([trial_tweet]).toarray()
clf.predict(trial_features)[0]

In [None]:
# Predicted retweets for a hand-written example tweet
trial_tweet = "Magic league."
trial_features = vectorizer.transform([trial_tweet]).toarray()
clf.predict(trial_features)[0]

In [None]:
# Predicted retweets for a hand-written example tweet
trial_tweet = "league sources"
trial_features = vectorizer.transform([trial_tweet]).toarray()
clf.predict(trial_features)[0]

In [None]:
# Predicted retweets for a hand-written example tweet
trial_tweet = "Magic sources."
trial_features = vectorizer.transform([trial_tweet]).toarray()
clf.predict(trial_features)[0]