In [1]:
import time
start = time.time()
import os

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob

#plotting
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt

#statistics & econometrics
import statsmodels.tsa.api as smt
import statsmodels.api as sm

#model fiiting and selection
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost.sklearn import XGBRegressor
import xgboost as xgb

In [2]:
df = pd.read_csv("Combined_News_DJIA.csv",low_memory=False,
                    parse_dates=[0])

full_stock = pd.read_csv("upload_DJIA_table.csv",low_memory=False,
                    parse_dates=[0])

#add the closing stock value to the df - this will be the y variable
df["Close"]=full_stock.Close

#show how the dataset looks like
df.head(5)

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25,Close
0,2008-08-08,0,"b""Georgia 'downs two Russian warplanes' as cou...",b'BREAKING: Musharraf to be impeached.',b'Russia Today: Columns of troops roll into So...,b'Russian tanks are moving towards the capital...,"b""Afghan children raped with 'impunity,' U.N. ...",b'150 Russian tanks have entered South Ossetia...,"b""Breaking: Georgia invades South Ossetia, Rus...","b""The 'enemy combatent' trials are nothing but...",...,b'Al-Qaeda Faces Islamist Backlash',"b'Condoleezza Rice: ""The US would not act to p...",b'This is a busy day: The European Union has ...,"b""Georgia will withdraw 1,000 soldiers from Ir...",b'Why the Pentagon Thinks Attacking Iran is a ...,b'Caucasus in crisis: Georgia invades South Os...,b'Indian shoe manufactory - And again in a se...,b'Visitors Suffering from Mental Illnesses Ban...,"b""No Help for Mexico's Kidnapping Surge""",17949.369141
1,2008-08-11,1,b'Why wont America and Nato help us? If they w...,b'Bush puts foot down on Georgian conflict',"b""Jewish Georgian minister: Thanks to Israeli ...",b'Georgian army flees in disarray as Russians ...,"b""Olympic opening ceremony fireworks 'faked'""",b'What were the Mossad with fraudulent New Zea...,b'Russia angered by Israeli military sale to G...,b'An American citizen living in S.Ossetia blam...,...,"b'""Do not believe TV, neither Russian nor Geor...",b'Riots are still going on in Montreal (Canada...,b'China to overtake US as largest manufacturer',b'War in South Ossetia [PICS]',b'Israeli Physicians Group Condemns State Tort...,b' Russia has just beaten the United States ov...,b'Perhaps *the* question about the Georgia - R...,b'Russia is so much better at war',"b""So this is what it's come to: trading sex fo...",17929.990234
2,2008-08-12,0,b'Remember that adorable 9-year-old who sang a...,"b""Russia 'ends Georgia operation'""","b'""If we had no sexual harassment we would hav...","b""Al-Qa'eda is losing support in Iraq because ...",b'Ceasefire in Georgia: Putin Outmaneuvers the...,b'Why Microsoft and Intel tried to kill the XO...,b'Stratfor: The Russo-Georgian War and the Bal...,"b""I'm Trying to Get a Sense of This Whole Geor...",...,b'Why Russias response to Georgia was right',"b'Gorbachev accuses U.S. of making a ""serious ...","b'Russia, Georgia, and NATO: Cold War Two'",b'Remember that adorable 62-year-old who led y...,b'War in Georgia: The Israeli connection',b'All signs point to the US encouraging Georgi...,b'Christopher King argues that the US and NATO...,b'America: The New Mexico?',"b""BBC NEWS | Asia-Pacific | Extinction 'by man...",17694.679688
3,2008-08-13,0,b' U.S. refuses Israel weapons to attack Iran:...,"b""When the president ordered to attack Tskhinv...",b' Israel clears troops who killed Reuters cam...,b'Britain\'s policy of being tough on drugs is...,b'Body of 14 year old found in trunk; Latest (...,b'China has moved 10 *million* quake survivors...,"b""Bush announces Operation Get All Up In Russi...",b'Russian forces sink Georgian ships ',...,b'US humanitarian missions soon in Georgia - i...,"b""Georgia's DDOS came from US sources""","b'Russian convoy heads into Georgia, violating...",b'Israeli defence minister: US against strike ...,b'Gorbachev: We Had No Choice',b'Witness: Russian forces head towards Tbilisi...,b' Quarter of Russians blame U.S. for conflict...,b'Georgian president says US military will ta...,b'2006: Nobel laureate Aleksander Solzhenitsyn...,17409.720703
4,2008-08-14,1,b'All the experts admit that we should legalis...,b'War in South Osetia - 89 pictures made by a ...,b'Swedish wrestler Ara Abrahamian throws away ...,b'Russia exaggerated the death toll in South O...,b'Missile That Killed 9 Inside Pakistan May Ha...,"b""Rushdie Condemns Random House's Refusal to P...",b'Poland and US agree to missle defense deal. ...,"b'Will the Russians conquer Tblisi? Bet on it,...",...,"b""Georgia confict could set back Russia's US r...",b'War in the Caucasus is as much the product o...,"b'""Non-media"" photos of South Ossetia/Georgia ...",b'Georgian TV reporter shot by Russian sniper ...,b'Saudi Arabia: Mother moves to block child ma...,b'Taliban wages war on humanitarian aid workers',"b'Russia: World ""can forget about"" Georgia\'s...",b'Darfur rebels accuse Sudan of mounting major...,b'Philippines : Peace Advocate say Muslims nee...,17140.240234


In [3]:
#drop the label column
df = df.drop(["Label"], axis=1)

In [4]:
#check for NAN
df.isnull().sum()

Date     0
Top1     0
Top2     0
Top3     0
Top4     0
Top5     0
Top6     0
Top7     0
Top8     0
Top9     0
Top10    0
Top11    0
Top12    0
Top13    0
Top14    0
Top15    0
Top16    0
Top17    0
Top18    0
Top19    0
Top20    0
Top21    0
Top22    0
Top23    1
Top24    3
Top25    3
Close    0
dtype: int64

In [5]:
df = df.replace(np.nan, ' ', regex=True)

#sanity check
df.isnull().sum().sum()

0

In [6]:
df = df.replace('b\"|b\'|\\\\|\\\"', '', regex=True)
df.head(2)

Unnamed: 0,Date,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,Top9,...,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25,Close
0,2008-08-08,Georgia 'downs two Russian warplanes' as count...,BREAKING: Musharraf to be impeached.',Russia Today: Columns of troops roll into Sout...,Russian tanks are moving towards the capital o...,"Afghan children raped with 'impunity,' U.N. of...",150 Russian tanks have entered South Ossetia w...,"Breaking: Georgia invades South Ossetia, Russi...",The 'enemy combatent' trials are nothing but a...,Georgian troops retreat from S. Osettain capit...,...,Al-Qaeda Faces Islamist Backlash',Condoleezza Rice: The US would not act to prev...,This is a busy day: The European Union has ap...,"Georgia will withdraw 1,000 soldiers from Iraq...",Why the Pentagon Thinks Attacking Iran is a Ba...,Caucasus in crisis: Georgia invades South Osse...,Indian shoe manufactory - And again in a seri...,Visitors Suffering from Mental Illnesses Banne...,No Help for Mexico's Kidnapping Surge,17949.369141
1,2008-08-11,Why wont America and Nato help us? If they won...,Bush puts foot down on Georgian conflict',Jewish Georgian minister: Thanks to Israeli tr...,Georgian army flees in disarray as Russians ad...,Olympic opening ceremony fireworks 'faked',What were the Mossad with fraudulent New Zeala...,Russia angered by Israeli military sale to Geo...,An American citizen living in S.Ossetia blames...,Welcome To World War IV! Now In High Definition!',...,"Do not believe TV, neither Russian nor Georgia...",Riots are still going on in Montreal (Canada) ...,China to overtake US as largest manufacturer',War in South Ossetia [PICS]',Israeli Physicians Group Condemns State Torture',Russia has just beaten the United States over...,Perhaps *the* question about the Georgia - Rus...,Russia is so much better at war',So this is what it's come to: trading sex for ...,17929.990234


In [7]:
Anakin = SentimentIntensityAnalyzer()

In [8]:
Anakin.polarity_scores(" ")

{'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}

In [9]:
def detect_subjectivity(text):
    return TextBlob(text).sentiment.subjectivity

detect_subjectivity(" ") #should return 0

0.0

In [10]:
#get the headline columns' names
cols = df.columns

cols=cols[1:-1]

In [11]:
cols

Index(['Top1', 'Top2', 'Top3', 'Top4', 'Top5', 'Top6', 'Top7', 'Top8', 'Top9',
       'Top10', 'Top11', 'Top12', 'Top13', 'Top14', 'Top15', 'Top16', 'Top17',
       'Top18', 'Top19', 'Top20', 'Top21', 'Top22', 'Top23', 'Top24', 'Top25'],
      dtype='object')

In [12]:
start_vect=time.time()
print("ANAKIN: 'Intializing the process..'")



for col in cols:
    df[col] = df[col].astype(str) # Make sure data is treated as a string
    df[col+'_comp']= df[col].apply(lambda x:Anakin.polarity_scores(x)['compound'])
    df[col+'_sub'] = df[col].apply(detect_subjectivity)
    print("{} Done".format(col))
    
print("VADER: Vaderization completed after %0.2f Minutes"%((time.time() - start_vect)/60))

ANAKIN: 'Intializing the process..'
Top1 Done
Top2 Done
Top3 Done
Top4 Done
Top5 Done
Top6 Done
Top7 Done
Top8 Done
Top9 Done
Top10 Done
Top11 Done
Top12 Done
Top13 Done
Top14 Done
Top15 Done
Top16 Done
Top17 Done
Top18 Done
Top19 Done
Top20 Done
Top21 Done
Top22 Done
Top23 Done
Top24 Done
Top25 Done
VADER: Vaderization completed after 0.29 Minutes


In [13]:
#the text isn't required anymore
df = df.drop(cols,axis=1)
df.head(5)

Unnamed: 0,Date,Close,Top1_comp,Top1_sub,Top2_comp,Top2_sub,Top3_comp,Top3_sub,Top4_comp,Top4_sub,...,Top21_comp,Top21_sub,Top22_comp,Top22_sub,Top23_comp,Top23_sub,Top24_comp,Top24_sub,Top25_comp,Top25_sub
0,2008-08-08,17949.369141,-0.5994,0.0,0.0,0.0,-0.3612,0.0,-0.7089,0.2,...,-0.7579,0.666667,-0.6249,0.0,-0.2755,0.0,-0.8519,0.2,0.128,0.0
1,2008-08-11,17929.990234,0.8156,0.0,-0.3182,0.288889,0.4404,0.1,-0.1965,0.0,...,-0.802,0.0,0.0,0.0,-0.3182,0.0,-0.1832,0.5,0.0,0.0
2,2008-08-12,17694.679688,0.0258,1.0,0.0,0.0,-0.7845,0.833333,-0.6124,1.0,...,-0.5994,0.0,0.5267,0.0,0.3818,0.35,0.0,0.454545,0.0,0.0
3,2008-08-13,17409.720703,-0.7184,0.0,-0.8074,0.0,-0.6369,0.0,-0.128,0.444444,...,-0.296,0.0,0.4939,0.0,-0.5719,0.0,-0.4215,0.1,-0.34,0.0
4,2008-08-14,17140.240234,0.2023,0.0,-0.5994,0.0,0.6808,0.4,-0.8689,0.666667,...,-0.4404,0.0,-0.5994,0.0,0.1779,0.0,-0.6908,0.5,0.7096,0.0


In [14]:
comp_cols = []
for col in cols:
    comp_col = col + "_comp"
    comp_cols.append(comp_col)

In [16]:
w = np.arange(1,26,1).tolist()
w.reverse()

In [18]:
weighted_comp = []
max_comp = []
min_comp = []
for i in range(0,len(df)):
    a = df.loc[i,comp_cols].tolist()
    weighted_comp.append(np.average(a, weights=w))
    max_comp.append(max(a))
    min_comp.append(min(a))

df['compound_mean'] = weighted_comp
df['compound_max'] = max_comp
df['compound_min'] = min_comp


sub_cols = []
for col in cols:
    sub_col = col + "_sub"
    sub_cols.append(sub_col)


weighted_sub = []
max_sub = []
min_sub = []
for i in range(0,len(df)):
    a = df.loc[i,sub_cols].tolist()
    weighted_sub.append(np.average(a, weights=w))
    max_sub.append(max(a))
    min_sub.append(min(a))

df['subjectivity_mean'] = weighted_sub
df['subjectivity_max'] = max_sub
df['subjectivity_min'] = min_sub

to_drop = sub_cols+comp_cols
df = df.drop(to_drop, axis=1)

In [19]:
df_sentiments=df.iloc[:,2:]

In [20]:
df_sentiments

Unnamed: 0,compound_mean,compound_max,compound_min,subjectivity_mean,subjectivity_max,subjectivity_min
0,-0.350337,0.2144,-0.9260,0.163685,0.666667,0.0
1,-0.085277,0.8156,-0.8271,0.202921,0.720000,0.0
2,-0.318394,0.5423,-0.8591,0.374076,1.000000,0.0
3,-0.162032,0.5106,-0.8074,0.176371,0.900000,0.0
4,-0.194879,0.7177,-0.8689,0.319615,1.000000,0.0
...,...,...,...,...,...,...
1984,-0.178949,0.8442,-0.8720,0.359450,1.000000,0.0
1985,-0.027012,0.8316,-0.8818,0.324485,1.000000,0.0
1986,-0.202743,0.7783,-0.9211,0.179425,0.906250,0.0
1987,-0.235306,0.7351,-0.9578,0.186443,0.700000,0.0


In [21]:
df_sentiments.to_csv("sentiments.csv")