In [1]:
import csv
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.linear_model import LinearRegression, LassoLars, Ridge, ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
#clean non-ascii characters (non-WEuropean language characters) and create a kosher csv file
tweet_file = 'tweets.csv'
clean_file = 'tweets_new.csv'

in_csv = csv.reader(open(tweet_file, "r"), delimiter = ',')
out_csv = csv.writer(open(clean_file, 'w'))
out_txt = []
for row in in_csv:
    out_txt.append(["".join(a if ord(a) < 128 else '' for a in i) for i in row])
out_csv.writerows(out_txt)

In [3]:
#load data into memory
df_tweets = pd.read_csv(clean_file)
df_tweets.head()

Unnamed: 0,TweetPostedTime,TweetID,TweetBody,TweetRetweetFlag,TweetSource,TweetInReplyToStatusID,TweetInReplyToUserID,TweetInReplyToScreenName,TweetRetweetCount,TweetFavoritesCount,...,UserDescription,UserLink,UserExpandedLink,UserFollowersCount,UserFriendsCount,UserListedCount,UserSignupDate,UserTweetCount,MacroIterationNumber,tweet.place
0,Tue Dec 20 10:57:00 +0000 2016,811163485052817408,RT @BeachyMaldives: Local interaction is a gre...,True,"<a href=""http://twitter.com/download/iphone"" r...",,,,1,0,...,Pls donate 2 https://t.co/RvOUK9lAWI #YearEndG...,https://t.co/jghZVBsiQF,http://cjqenterprises.com,6334,6144,1917,Sun Jun 14 22:36:15 +0000 2015,33556,0,
1,Tue Dec 20 10:56:59 +0000 2016,811163483463122944,RT @TechTerraEd: Need #giftideas for your kid(...,True,"<a href=""http://twitter.com/download/iphone"" r...",,,,1,0,...,"Educator of students with special needs, Mothe...",,,154,371,180,Sat Jan 02 13:36:23 +0000 2010,3201,0,
2,Tue Dec 20 10:56:55 +0000 2016,811163466387988480,Seven Questions Before Choosing a Cruise Line ...,False,"<a href=""http://www.google.com/"" rel=""nofollow...",,,,0,0,...,Thrifty Mom Media social media consulting and ...,https://t.co/cEhGzaQJp6,http://www.thriftymommastips.com/,23433,24762,961,Tue May 26 21:26:09 +0000 2009,147958,0,
3,Tue Dec 20 10:56:55 +0000 2016,811163465125679104,"RT @CMGsportsclub: Yoga do Brasil, un havre de...",True,"<a href=""https://roundteam.co"" rel=""nofollow"">...",,,,1,0,...,"Adventure travel, yoga, paleo, Crossfit, runni...",https://t.co/3IHwXkgAkA,https://primalsanctuary.com,11136,10081,978,Sat Sep 12 20:29:18 +0000 2015,28988,0,
4,Tue Dec 20 10:56:53 +0000 2016,811163457508642817,"RT @StylishRentals: Love this! ""Palm Springs M...",True,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,,,3065,0,...,I really have got giant ambitions. I start com...,,,55,21,31,Wed Sep 07 16:22:15 +0000 2016,19581,0,


In [4]:
#Feature selection based on a first examination of the data
df_filtered = df_tweets[['TweetRetweetCount', 'TweetPostedTime', 'TweetFavoritesCount',
                        'TweetHashtags', 'UserFollowersCount', 
                         'UserFriendsCount', 'UserListedCount']]

In [5]:
#utility functions

#returns the hour of the day: 
def hour_of_day(time_string):
    r = time_string.split(" ")[3].split(':')
    return r[0]

#returns binary variable for favourite tweets 
def is_fav_tweet(number):
    if(number > 0):
        return 1
    else:
        return number

In [6]:
#preparing the data for modelling
#Test of retweets have time-of-day pattern
#reformulate TweetPostedTime to hour_of_day
df_filtered['tweet_hr'] = df_filtered['TweetPostedTime'].apply(hour_of_day)
new_df_hr = df_filtered.groupby('tweet_hr')
new_df_hr['TweetRetweetCount'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
tweet_hr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1813.0,621.803089,1217.605934,0.0,0.0,1.0,47.0,3181.0
1,5577.0,1631.529675,1532.155912,0.0,0.0,3086.0,3086.0,4056.0
2,3589.0,837.866537,1290.806474,0.0,0.0,2.0,2831.0,3086.0
3,6161.0,1577.889953,1415.883373,0.0,2.0,1536.0,3228.0,3309.0
4,2624.0,204.391387,542.004622,0.0,0.0,1.0,74.0,3043.0
5,3837.0,978.870211,1454.84277,0.0,0.0,3.0,3180.0,3228.0
6,6567.0,2061.774631,1525.686514,0.0,5.0,3069.0,3241.0,3309.0
7,2303.0,68.278767,414.923478,0.0,0.0,0.0,5.0,3309.0
8,4896.0,870.980188,1214.384919,0.0,1.0,217.0,743.0,3309.0
9,2668.0,114.450525,573.818425,0.0,0.0,0.0,7.0,3309.0


In [7]:
# based on analysis, tweet favourite gruoup can be described as a binary variable
df_filtered['tweet_fav'] = df_filtered['TweetFavoritesCount'].apply(is_fav_tweet)
new_df_fav = df_filtered.groupby('tweet_fav')
new_df_fav['TweetRetweetCount'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
tweet_fav,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,38202.0,1197.835401,1460.855198,0.0,0.0,45.0,3086.0,4056.0
1,4160.0,7.685096,138.719493,0.0,0.0,0.0,1.0,3309.0


In [8]:
#binned followers: turns out most important explanatory variable
labels =[1,2,3,4]
df_filtered['foll_bins'] = pd.qcut(df_filtered['UserFollowersCount'], q=4, labels=labels)
new_df_foll = df_filtered.groupby('foll_bins')
new_df_foll['TweetRetweetCount'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
foll_bins,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,10874.0,1855.009564,1351.382044,0.0,321.0,1536.0,3180.0,3309.0
2,10310.0,2296.418526,1372.857944,0.0,71.25,3069.0,3228.0,4056.0
3,10587.0,162.788231,672.937956,0.0,0.0,0.0,4.0,3309.0
4,10591.0,20.846946,157.857095,0.0,0.0,1.0,4.0,3309.0


In [9]:
#binned user-friends feature
frds = np.array(df_filtered['UserFriendsCount'].unique())

df_filtered['frnd_bins'] = pd.qcut(df_filtered['UserFriendsCount'], q=4, labels=labels)
new_df_frnds = df_filtered.groupby('frnd_bins')
new_df_frnds['TweetRetweetCount'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
frnd_bins,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,10616.0,2457.596929,1277.271533,0.0,2831.0,3086.0,3241.0,3309.0
2,10583.0,1579.410375,1519.959009,0.0,0.0,1536.0,3086.0,4056.0
3,10671.0,255.505576,540.444311,0.0,0.0,1.0,91.0,3309.0
4,10492.0,24.821674,173.293615,0.0,0.0,1.0,5.0,3309.0


In [10]:
#binner user listed feature: of only marginal importance
listed = np.array(df_filtered['UserListedCount'].unique())
print(np.sort(listed))

df_filtered['listed_bins'] = pd.qcut(df_filtered['UserListedCount'], q=4, labels=labels)
new_df_listed = df_filtered.groupby('listed_bins')
new_df_listed['TweetRetweetCount'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


[    0     1     2 ..., 16290 20671 26577]


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
listed_bins,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,11046.0,1272.520369,1361.293186,0.0,3.0,743.0,3086.0,4056.0
2,10238.0,2156.876441,1431.601856,0.0,13.25,3069.0,3228.0,3309.0
3,10504.0,892.964871,1404.160826,0.0,0.0,1.0,2973.0,3309.0
4,10574.0,25.876111,185.237836,0.0,0.0,1.0,5.0,3309.0


In [11]:
#unique hashtags feature: the hastags associated with each tweet were extracted
#and a list of most popular hashtags created. Each tweeet was scored by the
#number of popular hashtags contained. 

unique_hashtag_lines = df_tweets['TweetHashtags'].unique()
hashtags = []
for i, h in enumerate(np.array(df_tweets['TweetHashtags'])):
    try:
        hashtags.append(h.split(','))
    except:
        pass
#flatten list of lists: and in lowercase
list_tags = [item.lower() for sublist in hashtags for item in sublist]
#strip whitespaces
lst_tags = [x.strip() for x in list_tags]
counts = Counter(lst_tags)
stripped_lst = [x for x in lst_tags if(counts)] 
popular_tags = counts.most_common (30) #30 of the most common chosen as 
#counts stabilize out thereafter
global popular_tags
popular_tags = [x[0] for x in popular_tags]
print(popular_tags)

['travel', 'holiday', 'travelblogger', 'israel', 'jerusalem', 'travelpics', 'rome', 'vatican', 'nikoncanada', 'store', 'ttot', 'vacation', 'photography', 'christmas', 'japan', 'tourism', 'traveller', 'lp', 'japantravel', 'nature', 'vietnam', 'flights', 'adventure', 'packages', 'familytravel', 'wanderlust', 'win', '??', 'ad', 'photo']


In [12]:
def get_tag_score(hashtags):
    try:
        l = hashtags.split(',')
        l_strip = [x.strip() for x in l]
        p = set(l_strip) & set(popular_tags)
        return len(p)
    except:
        return 0

In [13]:
df_filtered['tag_score'] = df_filtered['TweetHashtags'].apply(get_tag_score)
new_df_tag_score = df_filtered.groupby('tag_score')
new_df_tag_score['TweetRetweetCount'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
tag_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,15368.0,1604.443584,1465.998862,0.0,1.0,1536.0,3069.0,3309.0
1,19275.0,1044.483735,1480.386435,0.0,0.0,3.0,3086.0,4056.0
2,4710.0,10.77155,35.234952,0.0,0.0,0.0,4.0,911.0
3,2841.0,333.837029,290.184698,0.0,12.0,219.0,743.0,1186.0
4,155.0,19.303226,61.600132,0.0,0.0,0.0,0.0,222.0
5,9.0,0.888889,1.763834,0.0,0.0,0.0,0.0,4.0
6,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
#Define the training and test sets
X = df_filtered[['tweet_hr', 'tweet_fav', 'foll_bins','frnd_bins', 'listed_bins', 'tag_score']]
y = df_filtered['TweetRetweetCount']
X.shape
y.shape

(42362,)

In [15]:
#Training-test split: arbitrary 70-30 split
#haven't used k-fold cross-validation for lack of time
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [16]:
#Training:
#Benchmark model: linear regression

lr = LinearRegression()
lr.fit(X_train, y_train)
print(lr.coef_, lr.intercept_)
print(lr.score(X_train, y_train))

[ -44.54280753 -311.5024126  -398.78720762 -562.51083324  104.88372362
 -405.32080664] 3828.3246087
0.567616717212


In [17]:
#Training Decision Tree
model_dt = DecisionTreeRegressor(max_depth=4)
model_dt.fit(X_train, y_train)
print(model_dt.score(X_train, y_train))

0.717890399512


In [18]:
#Training:
#Random Forest regression

rf = RandomForestRegressor(n_estimators=150, min_samples_split=4)
rf.fit(X_train, y_train)
print(rf.feature_importances_)
print(rf.score(X_train, y_train))

[ 0.10937917  0.02946178  0.54374907  0.11294231  0.03881028  0.16565739]
0.90020435247


In [19]:
#testing the models
#linear regression
y_pred = lr.predict(X_test)
#Decision tree
y_pred_dt = model_dt.predict(X_test)
#random forest
y_pred_rf = rf.predict(X_test)

In [20]:
#measure of test set performance
#linear regression
print(lr.score(X_test, y_test))
#decision tree
print(model_dt.score(X_test, y_test))
#random forest
print(rf.score(X_test, y_test))

0.557641340072
0.707291936011
0.887992147036
