In [2]:
import json
import pandas as pd
import gensim
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np
import math
import pickle

# Creating Corpus

In [3]:
# Daily market data; the 'Date' column is used below to line rows up with headlines.
market_data = pd.read_csv('data/market_data.csv')
# Headlines keyed by date. JSON is read as UTF-8 explicitly so the result does not
# depend on the platform's default text encoding.
with open('json/readable.json', encoding='utf-8') as json_file:
    heads = json.load(json_file)

In [4]:
head_dates = list(heads.keys())  # the headline dates, as a list

# Keep only the market rows whose date (dashes removed) has headlines.
# A single boolean mask replaces the per-row scans of the `dates` list.
has_headlines = market_data['Date'].str.replace("-", "", regex=False).isin(head_dates)
matched_rows = market_data[has_headlines]

dates = matched_rows['Date'].tolist()

# Columns are picked by position, exactly as before.
# NOTE(review): presumably column 4 is the VIX close and column -3 the DJI close --
# confirm against the CSV header.
vix_close = matched_rows.iloc[:, 4].tolist()
dji_close = matched_rows.iloc[:, -3].tolist()

# corpus[i] is one document: the headlines of dates[i] .. dates[i+4] joined with ' . '.
# The final five dates start no window, so there are len(dates) - 5 documents.
corpus = [' . '.join(" . ".join(heads[day.replace("-", "")]) for day in dates[i:i + 5])
          for i in range(len(dates) - 5)]

# Create tf-idf Scores for the '5 Day Advance Headlines' (the five days of headlines preceding each target close)

In [5]:
# Build the vocabulary and the tf-idf scores for the 5-day headline documents.
# min_df / max_df drop terms that occur in fewer than 1% or in more than 20% of
# the documents; norm=None leaves the rows un-normalised.
vectorizer = TfidfVectorizer(
    min_df=0.01,
    max_df=0.2,
    norm=None,
)
tfidf_scores = vectorizer.fit_transform(corpus)

In [6]:
# Densify the tf-idf scores: one row per document, one column per vocabulary term.
tfidf_matrix = tfidf_scores.toarray()

# get_feature_names() was removed in scikit-learn 1.2; use its replacement and
# fall back only on releases that predate get_feature_names_out().
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

df_tfidf_scores = pd.DataFrame(tfidf_matrix, columns=feature_names)
df_tfidf_scores.head()

Unnamed: 0,01,02,03,04,05,06,07,08,09,1000,...,zones,zoning,zoo,zoom,zte,zuckerberg,zuckerman,zuma,zurich,zynga
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# VXX

In [10]:
# Target: the close that follows each 5-day headline window. Row i of the tf-idf
# matrix covers dates[i] .. dates[i+4], so it is paired with the close at index i+5.
y = vix_close[5:]

# NOTE(review): train_test_split shuffles by default, and neighbouring rows share
# four of their five headline days, so the held-out scores below are probably
# optimistic. Consider a chronological split (shuffle=False) -- confirm before
# relying on these numbers.
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, y, test_size=0.3, random_state=0)

vix_clf = KernelRidge(alpha=1, kernel="poly")
vix_clf.fit(X_train, y_train)
pred = vix_clf.predict(X_test)

# Score once and reuse the value instead of running the model twice.
vix_r2 = vix_clf.score(X_test, y_test)
# R^2 is negative when the model does worse than predicting the mean;
# math.sqrt would raise ValueError there, so report NaN instead.
vix_r = math.sqrt(vix_r2) if vix_r2 >= 0 else float("nan")

print("R^2 Score: " + str(vix_r2))
print("R : " + str(vix_r))
# Arguments are in the documented (y_true, y_pred) order; the value is unchanged.
print("MSE: " + str(mean_squared_error(y_test, pred)))

R^2 Score: 0.7893665090022701
R : 0.888463003733003
MSE: 15.33323117045582


# DJI

In [11]:
# Target: the close that follows each 5-day headline window. Row i of the tf-idf
# matrix covers dates[i] .. dates[i+4], so it is paired with the close at index i+5.
y = dji_close[5:]

# NOTE(review): train_test_split shuffles by default, and neighbouring rows share
# four of their five headline days, so the held-out scores below are probably
# optimistic. Consider a chronological split (shuffle=False) -- confirm before
# relying on these numbers.
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, y, test_size=0.3, random_state=0)

dji_clf = KernelRidge(alpha=1, kernel="poly")
dji_clf.fit(X_train, y_train)
pred = dji_clf.predict(X_test)

# Score once and reuse the value instead of running the model twice.
dji_r2 = dji_clf.score(X_test, y_test)
# R^2 is negative when the model does worse than predicting the mean;
# math.sqrt would raise ValueError there, so report NaN instead.
dji_r = math.sqrt(dji_r2) if dji_r2 >= 0 else float("nan")

print("R^2 Score: " + str(dji_r2))
print("R : " + str(dji_r))
# Arguments are in the documented (y_true, y_pred) order; the value is unchanged.
print("MSE: " + str(mean_squared_error(y_test, pred)))

R^2 Score: 0.9098585352599282
R : 0.953865050864077
MSE: 2562835.623697717


In [12]:
# Empty mapping that later cells fill with date -> predicted DJI close.
data = dict()

In [13]:
# Predict the DJI close for every row of the tf-idf matrix in a single call.
# This covers the rows the model was fitted on as well as the held-out ones.
whole = dji_clf.predict(tfidf_matrix)

In [15]:
# Map each target date to its predicted DJI close. dates[5:] lines up with the
# rows of the tf-idf matrix (row i is the window that ends the day before dates[i+5]).
# The batch predictions in `whole` are reused rather than calling predict() once
# per row, and no loop variable overwrites the notebook-level `y` any more.
# .tolist() yields plain Python floats, which json.dump writes as before.
data.update(zip(dates[5:], whole.tolist()))

In [16]:
# Write the date -> prediction mapping to disk as a single JSON document.
with open("pred_data/dji_data.json", 'w') as out_file:
    out_file.write(json.dumps(data))

In [None]:
# Save the fitted DJI model. The with-block closes the file, so the pickle is
# fully flushed to disk instead of depending on garbage collection.
with open('model/dji.sav', 'wb') as model_file:
    pickle.dump(dji_clf, model_file)

In [None]:
# Save the fitted VIX model. The with-block closes the file, so the pickle is
# fully flushed to disk instead of depending on garbage collection.
with open('model/vxx.sav', 'wb') as model_file:
    pickle.dump(vix_clf, model_file)