In [45]:
import pandas as pd
import quantopian
import blaze
import matplotlib.pyplot as plt
import pytz
from sklearn import ensemble, preprocessing, metrics, linear_model, neighbors
import sklearn
from collections import OrderedDict

Import the data set with all the sentiment analysis done

In [2]:
data = local_csv('TrumpNov29WithSScore.csv')

In [3]:
df = data.dropna()
df.head()

Unnamed: 0,created_at,id_str,is_retweet,source,text,sent_score
0,11/29/18 12:32,1.06812e+18,False,Twitter for iPhone,Billions of Dollars are pouring into the coffe...,-0.1154
1,11/29/18 12:16,1.06812e+18,False,Twitter for iPhone,When will this illegal Joseph McCarthy style W...,-0.3793
2,11/29/18 11:54,1.06811e+18,False,Twitter for iPhone,Did you ever see an investigation more in sear...,-0.5769
3,11/29/18 11:37,1.06811e+18,False,Twitter for iPhone,General Motors is very counter to what other a...,0.0385
4,11/29/18 4:39,1.068e+18,False,Twitter for iPhone,So much happening with the now discredited Wit...,-0.3636


Convert the times and assign them to the data

In [4]:
utc = pytz.timezone('UTC')
times = []
for i, row in df.iterrows():
    try:
        times.append(pd.Timestamp(row['created_at'], tz=utc))
    except Exception as e:
        print(row)

In [5]:
df = df.assign(time_utc=times)
df.head(15)

Unnamed: 0,created_at,id_str,is_retweet,source,text,sent_score,time_utc
0,11/29/18 12:32,1.06812e+18,False,Twitter for iPhone,Billions of Dollars are pouring into the coffe...,-0.1154,2018-11-29 12:32:00+00:00
1,11/29/18 12:16,1.06812e+18,False,Twitter for iPhone,When will this illegal Joseph McCarthy style W...,-0.3793,2018-11-29 12:16:00+00:00
2,11/29/18 11:54,1.06811e+18,False,Twitter for iPhone,Did you ever see an investigation more in sear...,-0.5769,2018-11-29 11:54:00+00:00
3,11/29/18 11:37,1.06811e+18,False,Twitter for iPhone,General Motors is very counter to what other a...,0.0385,2018-11-29 11:37:00+00:00
4,11/29/18 4:39,1.068e+18,False,Twitter for iPhone,So much happening with the now discredited Wit...,-0.3636,2018-11-29 04:39:00+00:00
5,11/29/18 4:36,1.068e+18,False,Twitter for iPhone,"Sebastian Gorka, a very talented man who I got...",0.2381,2018-11-29 04:36:00+00:00
6,11/28/18 23:32,1.06792e+18,False,Twitter for iPhone,On behalf of @FLOTUS Melania and the entire Tr...,0.5185,2018-11-28 23:32:00+00:00
7,11/28/18 16:09,1.06781e+18,False,Twitter for iPhone,Steel Dynamics announced that it will build a ...,0.25,2018-11-28 16:09:00+00:00
8,11/28/18 14:49,1.06779e+18,False,Twitter for iPhone,.....and G.M. would not be closing their plant...,0.1875,2018-11-28 14:49:00+00:00
9,11/28/18 14:43,1.06779e+18,False,Twitter for iPhone,The reason that the small truck business in th...,0.0645,2018-11-28 14:43:00+00:00


In [6]:
prices = get_pricing(['SPY'], start_date="2018-11-29", end_date="2018-11-29", fields='price', frequency='minute')  
prices.head()

Unnamed: 0,Equity(8554 [SPY])
2018-11-29 14:31:00+00:00,273.64
2018-11-29 14:32:00+00:00,273.74
2018-11-29 14:33:00+00:00,273.85
2018-11-29 14:34:00+00:00,273.88
2018-11-29 14:35:00+00:00,273.61


In [8]:
def get_momentum(time, num_min_momentum, time_list, price_list):
    index = time_list.index(time)
    momentum_loc = 0
    if index+1 >= num_min_momentum:
        momentum_loc = index+1 - num_min_momentum
        
    val1 = price_list[index]
    val2 = price_list[momentum_loc]
    
    momentum = val1/val2
    
    return momentum, index
    

In [9]:
def get_time_prices(date):
    
    date_string = "{}-{}-{}".format(date.year, date.month, date.day)
    prices = get_pricing(['SPY'], start_date=date_string, end_date=date_string, fields='price', frequency='minute')
    
    price_dict = OrderedDict()
    for i, row in prices.iterrows():
        price_dict[i] = row[0]
        
    return price_dict.keys(), price_dict.values()    
    

In [10]:
def classify_time(date):
    time = date.time()
    morning = pd.Timestamp('2000-01-01 14:30:00 UTC').time()
    before_noon = pd.Timestamp('2000-01-01 16:00:00 UTC').time()
    after_noon = pd.Timestamp('2000-01-01 17:30:00 UTC').time()
    late_afternoon = pd.Timestamp('2000-01-01 19:00:00 UTC').time()
    closing = pd.Timestamp('2000-01-01 21:00:00 UTC').time()
    
    times = [0,0,0,0]
    
    index = None
    
    if time > morning and time <= before_noon:
        index = 0
    elif time > before_noon and time <= after_noon:
        index = 1
    elif time > after_noon and time <= late_afternoon:
        index = 2
    else:
        index = 3
        
    times[index] = 1
    
    return times

In [11]:
def classify_day(date):
    days = [0,0,0,0,0]
    days[date.dayofweek] = 1
    
    return days

Build the data set for training

In [12]:
open_time = pd.Timestamp('2000-01-01 14:30:00 UTC')
close_time = pd.Timestamp('2000-01-01 20:55:00 UTC')

X_data = []
Y_data_c = []
Y_data_d = []

for i, row in df.iterrows():
#     if i > 1000:
#         break
    
    date = row['time_utc']
    time = date.time()
    day = date.dayofweek
    
    #if its monday-friday and the tweet is during market operating hours
    
    if day < 5 and time > open_time.time() and time <= close_time.time():
        try:
            times, day_prices = get_time_prices(row['time_utc'])
            #normalize the momentums to get them around 0
            mom, index = get_momentum(row['time_utc'], 15, times, day_prices)
            mom = (mom - 1) * 1000
            next_price = day_prices[index + 5]

            x = [mom]
            x.extend(classify_day(date))
            x.extend(classify_time(date))
                        
            y_continuous = day_prices[index + 5] - day_prices[index]
            y_discrete = None
            
            if y_continuous < -0.1:
                y_discrete = -1
            elif y_continuous > 0.1:
                y_discrete = 1
            else:
                y_discrete = 0
                
            X_data.append(x)
            Y_data_c.append(y_continuous)
            Y_data_d.append(y_discrete)
            
            
        except Exception as e:
            pass
#             print(e)
        

In [13]:
print(len(X_data))
print(len(Y_data_c))
print(len(Y_data_d))

10605
10605
10605


In [17]:
# Models
forest = ensemble.RandomForestClassifier()
ada = ensemble.AdaBoostClassifier()
knn = neighbors.KNeighborsClassifier()


Test the models

In [24]:
models_c = [forest, ada, knn]
forest_scores = []
ada_scores = []
knn_scores = []
model_names = [forest_scores, ada_scores, knn_scores]

for i in range(10):
    x_train, x_test, y_train, y_test = sklearn.cross_validation.train_test_split(X_data, Y_data_d, test_size=0.2)
    for i in range(len(models_c)):
        model = models_c[i]
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        acc = metrics.accuracy_score(y_test, y_pred)
        pre = None
        f1 = None
        if i != 2:
            acc = metrics.accuracy_score(y_test, y_pred)
            prec = metrics.precision_score(y_test, y_pred, average='weighted')
            f1 = metrics.f1_score(y_test, y_pred, average='weighted')
            
        model_names[i].append((acc, prec, f1))
    

In [27]:
# scoring for randomforest
avg_a = 0
avg_p = 0
avg_f = 0
for i in forest_scores:   
    avg_a += i[0]
    avg_p += i[1]
    avg_f += i[2]
    
print(avg_a/10)
print(avg_p/10)
print(avg_f/10)

0.617680339463
0.60542788123
0.61102604023


In [28]:
# Ada
avg_a = 0
avg_p = 0
avg_f = 0
for i in ada_scores:   
    avg_a += i[0]
    avg_p += i[1]
    avg_f += i[2]
    
print(avg_a/10)
print(avg_p/10)
print(avg_f/10)

0.723762376238
0.584667404017
0.614359250358


In [30]:
# KNN doesn't have well defined meaning for precision and f1 score and throws a warning
# we decided not to include them
avg_a = 0

for i in knn_scores:   
    avg_a += i[0]

print(avg_a/10)


0.676001885903


Load in the emoji dataset

In [34]:
emoj = local_csv('trumpemoji.csv')

In [35]:
emoj = emoj.rename(index=str, columns={'created_at': 'text', 'Text': 'created_at'})
emoj.head()

Unnamed: 0,text,created_at,Top5%,Emoji_1,Emoji_2,Emoji_3,Emoji_4,Emoji_5,Pct_1,Pct_2,Pct_3,Pct_4,Pct_5
0,Billions of Dollars are pouring into the coffe...,11/29/18 12:32,0.290665,32,55,33,62,25,0.100459,0.071994,0.042611,0.038831,0.036771
1,When will this illegal Joseph McCarthy style W...,11/29/18 12:16,0.425414,32,46,55,34,27,0.105558,0.103549,0.080169,0.077162,0.058977
2,Did you ever see an investigation more in sear...,11/29/18 11:54,0.427425,32,55,12,41,25,0.157205,0.103903,0.064565,0.056425,0.045328
3,General Motors is very counter to what other a...,11/29/18 11:37,0.403117,32,55,19,25,37,0.129634,0.109936,0.073316,0.051891,0.038339
4,So much happening with the now discredited Wit...,11/29/18 4:39,0.292888,12,62,32,52,43,0.065822,0.065414,0.056404,0.055462,0.049786


In [36]:
# convert time
utc = pytz.timezone('UTC')
times = []
for i, row in emoj.iterrows():
    try:
        times.append(pd.Timestamp(row['created_at'], tz=utc))
    except Exception as e:
        print(row)

In [37]:
emoj = emoj.assign(time_utc=times)
emoj.head()

Unnamed: 0,text,created_at,Top5%,Emoji_1,Emoji_2,Emoji_3,Emoji_4,Emoji_5,Pct_1,Pct_2,Pct_3,Pct_4,Pct_5,time_utc
0,Billions of Dollars are pouring into the coffe...,11/29/18 12:32,0.290665,32,55,33,62,25,0.100459,0.071994,0.042611,0.038831,0.036771,2018-11-29 12:32:00+00:00
1,When will this illegal Joseph McCarthy style W...,11/29/18 12:16,0.425414,32,46,55,34,27,0.105558,0.103549,0.080169,0.077162,0.058977,2018-11-29 12:16:00+00:00
2,Did you ever see an investigation more in sear...,11/29/18 11:54,0.427425,32,55,12,41,25,0.157205,0.103903,0.064565,0.056425,0.045328,2018-11-29 11:54:00+00:00
3,General Motors is very counter to what other a...,11/29/18 11:37,0.403117,32,55,19,25,37,0.129634,0.109936,0.073316,0.051891,0.038339,2018-11-29 11:37:00+00:00
4,So much happening with the now discredited Wit...,11/29/18 4:39,0.292888,12,62,32,52,43,0.065822,0.065414,0.056404,0.055462,0.049786,2018-11-29 04:39:00+00:00


Build the data for training and testing

In [39]:
open_time = pd.Timestamp('2000-01-01 14:30:00 UTC')
close_time = pd.Timestamp('2000-01-01 20:55:00 UTC')
include = ['Top5%','Emoji_1', 'Emoji_2', 'Emoji_3', 'Emoji_4', 'Emoji_5',
           'Pct_1', 'Pct_2', 'Pct_3', 'Pct_4', 'Pct_5']

X_data_e = []
Y_data_ce = []
Y_data_de = []

for i, row in emoj.iterrows():
    index = int(i)
#     if index > 1000:
#         break
    
    date = row['time_utc']
    time = date.time()
    day = date.dayofweek
        
    #if its monday-friday and the tweet is during market operating hours
    
    if day < 5 and time > open_time.time() and time <= close_time.time():
        try:
            times, day_prices = get_time_prices(row['time_utc'])
            _, index = get_momentum(row['time_utc'], 15, times, day_prices)

            x = []
        
        
            for val in include:
                x.append(row[val])

            next_price = day_prices[index + 5]

            y_continuous = day_prices[index + 5] - day_prices[index]
            y_discrete = None

            if y_continuous < -0.1:
                y_discrete = -1
            elif y_continuous > 0.1:
                y_discrete = 1
            else:
                y_discrete = 0

            X_data_e.append(x)
            Y_data_ce.append(y_continuous)
            Y_data_de.append(y_discrete)


        except Exception as e:
            pass
#             print(e)

In [40]:
# models
forest_e = ensemble.RandomForestClassifier()
ada_e = ensemble.AdaBoostClassifier()
knn_e = neighbors.KNeighborsClassifier()


In [41]:
models_c = [forest, ada, knn]
forest_scores = []
ada_scores = []
knn_scores = []
model_names = [forest_scores, ada_scores, knn_scores]

for i in range(10):
    x_train, x_test, y_train, y_test = sklearn.cross_validation.train_test_split(X_data, Y_data_d, test_size=0.2)
    for i in range(len(models_c)):
        model = models_c[i]
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        acc = metrics.accuracy_score(y_test, y_pred)
        pre = None
        f1 = None
        if i != 2:
            acc = metrics.accuracy_score(y_test, y_pred)
            prec = metrics.precision_score(y_test, y_pred, average='weighted')
            f1 = metrics.f1_score(y_test, y_pred, average='weighted')
            
        model_names[i].append((acc, prec, f1))
    

In [42]:
avg_a = 0
avg_p = 0
avg_f = 0
for i in forest_scores:   
    avg_a += i[0]
    avg_p += i[1]
    avg_f += i[2]
    
print(avg_a/10)
print(avg_p/10)
print(avg_f/10)

0.615841584158
0.603888996817
0.60948182461


In [43]:
avg_a = 0
avg_p = 0
avg_f = 0
for i in ada_scores:   
    avg_a += i[0]
    avg_p += i[1]
    avg_f += i[2]
    
print(avg_a/10)
print(avg_p/10)
print(avg_f/10)

0.725695426686
0.601664163765
0.616570427037


In [44]:
# KNN doesn't have well defined meaning for precision and f1 score and throws a warning
# we decided not to include them
avg_a = 0

for i in knn_scores:   
    avg_a += i[0]

print(avg_a/10)


0.676944837341
