In [83]:
import json
import math
import os
import pickle

import gensim
import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Creating Corpus

In [69]:
# Market quotes table; later cells read its 'Date' column and pick price
# columns by position.
market_data = pd.read_csv('data/market_data.csv')

# Headlines lookup. Later cells index it with dash-less date strings and join
# each entry's items as text (presumably a list of headline strings per day --
# verify against the JSON file).
with open('json/readable.json') as headlines_fp:
    heads = json.loads(headlines_fp.read())

In [70]:
# Build the aligned date list, the two target price series, and the text corpus.
#
# - dates:      market dates (in market_data order) that also have headlines.
# - vix_close:  column 4 of market_data for those dates
#               (presumably the VIX close -- confirm against the CSV header).
# - dji_close:  third-from-last column of market_data for those dates
#               (presumably the DJI close -- confirm against the CSV header).
# - corpus:     one document per 5-day sliding window of headlines; corpus[i]
#               covers dates[i] .. dates[i+4], so it lines up with the target
#               at index i + 5 (later cells slice the targets with [5:]).
head_dates = list(heads.keys())  # every date key present in the headlines file

# Keys in `heads` carry no dashes, while market_data['Date'] values do.
dates = [d for d in market_data['Date'] if d.replace("-", "") in heads]

# One vectorised membership test instead of scanning the `dates` list once per
# row (which was quadratic in the number of rows).
has_headlines = market_data['Date'].isin(set(dates))
matched_rows = market_data.loc[has_headlines]
vix_close = matched_rows.iloc[:, 4].tolist()
dji_close = matched_rows.iloc[:, -3].tolist()

# Join each day's headlines once, then glue consecutive days together; the
# original re-joined every day's headlines for each window that contained it.
window_days = 5
daily_text = [" . ".join(heads[d.replace("-", "")]) for d in dates]
corpus = [' . '.join(daily_text[start:start + window_days])
          for start in range(len(dates) - window_days)]

# Create tf-idf Scores for '5 Day Advance Headlines'

In [71]:
# TF-IDF over the sliding-window corpus. The document-frequency bounds are
# given as floats (min_df=0.01, max_df=0.2) and norm=None leaves the scores
# un-normalised.
vectorizer = TfidfVectorizer(
    norm=None,
    min_df=0.01,
    max_df=0.2,
)
# Learn the vocabulary and produce the score matrix in one pass.
tfidf_scores = vectorizer.fit_transform(corpus)

In [72]:
# Densify the TF-IDF scores and wrap them in a DataFrame with one column per
# vocabulary term, for inspection.
tfidf_matrix = tfidf_scores.toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back for older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

df_tfidf_scores = pd.DataFrame(tfidf_matrix, columns=feature_names)
df_tfidf_scores.head()

Unnamed: 0,01,02,03,04,05,06,07,08,09,1000,...,zones,zoning,zoo,zoom,zte,zuckerberg,zuckerman,zuma,zurich,zynga
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [73]:
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# VXX

In [74]:
# Fit a polynomial-kernel ridge regressor on the TF-IDF features against
# vix_close and report held-out metrics.
# Dropping the first 5 targets aligns y[i] with tfidf_matrix[i], whose text
# comes from the 5 preceding dates.
y = vix_close[5::]

# NOTE(review): neighbouring corpus rows share 4 of their 5 days of headlines,
# and train_test_split shuffles by default, so near-duplicate rows land on both
# sides of the split. A chronological split (shuffle=False) would likely give a
# more honest score -- confirm the intended evaluation protocol.
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, y, test_size=0.3, random_state=0)

clf = KernelRidge(alpha=1, kernel="poly")
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

# Score once and reuse it; clf.score() re-runs the prediction on every call.
r2 = clf.score(X_test, y_test)
print("R^2 Score: " + str(r2))
# R^2 can be negative on held-out data, where math.sqrt would raise ValueError.
print("R : " + str(math.sqrt(r2) if r2 >= 0 else float("nan")))
# Conventional argument order is (y_true, y_pred); the MSE value is unchanged.
print("MSE: " + str(mean_squared_error(y_test, pred)))

R^2 Score: 0.7893665090022701
R : 0.888463003733003
MSE: 15.33323117045582


# DJI

In [75]:
# Fit a polynomial-kernel ridge regressor on the TF-IDF features against
# dji_close and report held-out metrics. This rebinds `clf`, so later cells
# that call clf.predict use this model when the notebook runs top to bottom.
# Dropping the first 5 targets aligns y[i] with tfidf_matrix[i], whose text
# comes from the 5 preceding dates.
y = dji_close[5::]

# NOTE(review): neighbouring corpus rows share 4 of their 5 days of headlines,
# and train_test_split shuffles by default, so near-duplicate rows land on both
# sides of the split. A chronological split (shuffle=False) would likely give a
# more honest score -- confirm the intended evaluation protocol.
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, y, test_size=0.3, random_state=0)

clf = KernelRidge(alpha=1, kernel="poly")
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

# Score once and reuse it; clf.score() re-runs the prediction on every call.
r2 = clf.score(X_test, y_test)
print("R^2 Score: " + str(r2))
# R^2 can be negative on held-out data, where math.sqrt would raise ValueError.
print("R : " + str(math.sqrt(r2) if r2 >= 0 else float("nan")))
# Conventional argument order is (y_true, y_pred); the MSE value is unchanged.
print("MSE: " + str(mean_squared_error(y_test, pred)))

R^2 Score: 0.9098585352599282
R : 0.953865050864077
MSE: 2562835.623697717


In [82]:
# Empty mapping that a later cell fills with one prediction per date.
data = dict()

In [90]:
# Predictions for every corpus row (train and test alike) from whichever model
# `clf` currently refers to -- the DJI regressor when cells run top to bottom.
whole = clf.predict(X=tfidf_matrix)

In [103]:
# Record the model's prediction for every target date in `data`.
# dates[5:] lines up one-to-one with the rows of tfidf_matrix (row i is paired
# with the date 5 positions after the start of its headline window).
#
# A single batched predict replaces one predict call per row, and the
# descriptive loop names avoid clobbering the notebook-level `y` target list.
# Values are stored as plain floats so the dict displays and serialises cleanly.
all_predictions = clf.predict(tfidf_matrix)
data.update(
    (target_date, float(predicted))
    for target_date, predicted in zip(dates[5:], all_predictions)
)

In [104]:
# Display the date -> predicted value mapping built above.
data

{'2000-01-10': 11581.933753882795,
 '2000-01-11': 11630.270387401579,
 '2000-01-12': 11518.086471472667,
 '2000-01-13': 11523.630594133358,
 '2000-01-14': 11742.029354765093,
 '2000-01-18': 11587.564968293551,
 '2000-01-19': 11437.80755593484,
 '2000-01-20': 11316.473616983498,
 '2000-01-21': 11250.779815210866,
 '2000-01-24': 11132.34133524598,
 '2000-01-25': 10938.525337524035,
 '2000-01-26': 10949.036837691645,
 '2000-01-27': 10861.395956633905,
 '2000-01-28': 10769.559550411888,
 '2000-01-31': 10879.287692926257,
 '2000-02-01': 11030.336006096066,
 '2000-02-02': 11047.875024556133,
 '2000-02-03': 10997.260868895724,
 '2000-02-04': 11039.75478123804,
 '2000-02-07': 10875.545255520023,
 '2000-02-08': 10829.497296385294,
 '2000-02-09': 10738.062064182239,
 '2000-02-10': 11258.990625700435,
 '2000-02-11': 10963.190222682879,
 '2000-02-14': 10584.817103534097,
 '2000-02-15': 10698.69129165513,
 '2000-02-16': 10638.279232945342,
 '2000-02-17': 10407.739475021406,
 '2000-02-18': 10267.181

In [106]:
# Persist the date -> prediction mapping as JSON.
output_path = "pred_data/dji_data.json"
# Create the output folder first: opening a file for writing raises
# FileNotFoundError when its parent directory does not exist.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w') as fp:
    json.dump(data, fp)