Final Project
Group Members: Jesse Zou, Andy Li, Yuhan Zheng, Zhiyao Bao

Introduction
What is the data science problem you are trying to solve?
Why does the problem matter?
What could the results of your predictive model be used for?
Why would we want to be able to predict the thing you’re trying to predict?
Then describe the dataset that you will use to tackle this problem

In [2]:
# Imports
import warnings
#warnings.simplefilter("ignore")
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

In [3]:
%matplotlib inline

data cleaning
data exploration
feature engineering
describe and clarify each part of the process

In [26]:
# Load the DOW30 table and drop the Date, TEDSpread and EFFR columns in one step
data = (
    pd.read_csv("DOW30.csv")
    .drop(columns=['Date', 'TEDSpread', 'EFFR'])
)
data.head()

Unnamed: 0,LABEL,Open,High,Low,Close,Volume,InterestRate,ExchangeRate,VIX,Gold,Oil
0,0,12266.63965,12659.82031,12266.46973,12654.36035,295530000.0,1.77,1.5615,22.68,897.0,100.92
1,1,12651.66992,12696.29004,12555.16992,12608.91992,232760000.0,1.72,1.5618,23.43,893.5,104.83
2,0,12605.83008,12675.12012,12527.75,12626.03027,183870000.0,1.7,1.5667,23.21,898.25,103.92
3,1,12626.03027,12688.48047,12528.16016,12609.41992,181260000.0,1.67,1.5735,22.45,905.25,106.09
4,0,12612.58984,12733.66016,12583.28027,12612.42969,198070000.0,1.62,1.5713,22.42,914.7,108.91


In [27]:
# Separate the feature columns (everything except LABEL) from the target column,
# then preview both and report their shapes.
data_X = data.drop(columns='LABEL')
data_Y = data['LABEL']
print(data_X.head(), data_Y.head(), data_Y.shape, data_X.shape, sep="\n")

          Open         High          Low        Close       Volume  \
0  12266.63965  12659.82031  12266.46973  12654.36035  295530000.0   
1  12651.66992  12696.29004  12555.16992  12608.91992  232760000.0   
2  12605.83008  12675.12012  12527.75000  12626.03027  183870000.0   
3  12626.03027  12688.48047  12528.16016  12609.41992  181260000.0   
4  12612.58984  12733.66016  12583.28027  12612.42969  198070000.0   

   InterestRate  ExchangeRate    VIX    Gold     Oil  
0          1.77        1.5615  22.68  897.00  100.92  
1          1.72        1.5618  23.43  893.50  104.83  
2          1.70        1.5667  23.21  898.25  103.92  
3          1.67        1.5735  22.45  905.25  106.09  
4          1.62        1.5713  22.42  914.70  108.91  
0    0
1    1
2    0
3    1
4    0
Name: LABEL, dtype: int64
(2448,)
(2448, 10)


In [38]:
# SVM
def SVM_trainer(data_X, data_Y):
    """Evaluate a scaler -> PCA -> SVC pipeline with nested cross-validation.

    The inner loop is a 5-fold grid search over the number of PCA components
    (1 to 10) and the SVC kernel ('linear', 'rbf', 'poly'); the outer loop uses
    10 stratified folds. The function prints the mean outer-fold accuracy and a
    classification report computed from the out-of-fold predictions.

    The outer loop is executed only once: the out-of-fold predictions are
    produced on explicit splits and the per-fold accuracies are derived from
    those same predictions, instead of repeating the whole nested search for
    the scores and again for the predictions.

    Parameters
    ----------
    data_X : feature table, one row per sample.
    data_Y : class labels aligned row-for-row with ``data_X``.

    Returns
    -------
    GridSearchCV
        The grid-search object that was evaluated. This function never calls
        ``fit`` on it directly, so callers should treat it as unfitted.
    """
    # Local import keeps this cell self-contained (no change to the imports cell needed).
    from sklearn.model_selection import StratifiedKFold

    svm_scaler = StandardScaler()
    svm_pca = PCA()
    svm = SVC()

    svm_ppl = Pipeline(steps=[('scaler', svm_scaler), ('pca', svm_pca), ('svm', svm)])

    svm_param_grid = {
        'pca__n_components': list(range(1, 11)),
        'svm__kernel': ['linear', 'rbf', 'poly']
    }

    svm_grid_search = GridSearchCV(svm_ppl, svm_param_grid, cv=5, scoring='accuracy')

    # Materialise the 10 outer folds once so the predictions and the per-fold
    # scores are guaranteed to refer to exactly the same splits.
    outer_splits = list(StratifiedKFold(n_splits=10).split(data_X, data_Y))
    svm_preds = cross_val_predict(svm_grid_search, data_X, data_Y, cv=outer_splits)

    # Per-fold accuracy from the out-of-fold predictions (positional indexing,
    # hence the conversion of the labels to a plain array).
    true_labels = np.asarray(data_Y)
    svm_scores = np.array([
        accuracy_score(true_labels[test_idx], svm_preds[test_idx])
        for _, test_idx in outer_splits
    ])

    print("Accuracy:", svm_scores.mean()*100, "%")
    print("classification report:\n",classification_report(data_Y, svm_preds))
    return svm_grid_search

# SVM_trainer(data_X, data_Y)

In [39]:
# KNN
def KNN_trainer(data_X, data_Y):
    """Tune and evaluate a scaler -> PCA -> k-nearest-neighbours pipeline.

    Three things are printed, in order:
      1. the mean 5-fold accuracy of the untuned pipeline (k=7, default PCA);
      2. the best parameters and best score of a 5-fold grid search over the
         number of PCA components (1 to 10) and k (1 to 25), fitted on all data;
      3. the mean accuracy of a 5-fold cross-validation of that grid search
         (i.e. a nested cross-validation).

    Parameters
    ----------
    data_X : feature table, one row per sample.
    data_Y : class labels aligned row-for-row with ``data_X``.

    Returns
    -------
    GridSearchCV
        The grid search after it has been fitted on ``data_X`` / ``data_Y``.
    """
    ppl = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('pca', PCA()),
        ('knn', KNeighborsClassifier(n_neighbors=7)),
    ])

    # 1. Baseline accuracy of the pipeline with its initial settings.
    baseline_accuracy = cross_val_score(ppl, data_X, data_Y, cv=5).mean()
    print("Accuracy:", baseline_accuracy*100, "%")

    # 2. Exhaustive search over PCA dimensionality and neighbourhood size.
    search_space = {
        'pca__n_components': list(range(1, 11)),
        'knn__n_neighbors': list(range(1, 26)),
    }
    knn_grid_search = GridSearchCV(ppl, search_space, cv=5, scoring='accuracy')
    knn_grid_search.fit(data_X, data_Y)
    print("Best parameters:", knn_grid_search.best_params_)
    print("Best score:", knn_grid_search.best_score_*100, "%")

    # 3. Cross-validate the search itself (outer 5 folds around the inner 5 folds).
    nested_accuracy = cross_val_score(knn_grid_search, data_X, data_Y, cv=5).mean()
    print("Accuracy:", nested_accuracy*100, "%")
    return knn_grid_search
# knn_grid_search = KNN_trainer(data_X, data_Y)

In [40]:
# NN
# Fixed seed for the MLP so that the accuracies reported further down are
# reproducible after Restart Kernel -> Run All, rather than changing from
# run to run.
NN_RANDOM_STATE = 42

nn_scaler = StandardScaler()
nn = MLPClassifier(random_state=NN_RANDOM_STATE)

# Standardise the features, then feed them to the neural network.
nn_ppl = Pipeline(steps=[('scaler', nn_scaler), ('nn', nn)])

# Search space handed to GridSearchCV by NN_trainer. The hidden-layer candidates
# are the bare ints 30, 40, 50 and 60 - presumably each is read as one hidden
# layer of that width (TODO confirm against the MLPClassifier docs).
nn_param_grid = {
    'nn__hidden_layer_sizes': list(range(30, 61, 10)),
    'nn__activation': ['logistic', 'tanh', 'relu']
}

@ignore_warnings(category=ConvergenceWarning)
def NN_trainer(nn_ppl, nn_param_grid, data_X, data_Y):
    """Estimate the accuracy of a grid-searched pipeline with nested 5x5 cross-validation.

    A 5-fold accuracy-scored grid search over ``nn_param_grid`` is wrapped
    around ``nn_ppl`` and that search is itself cross-validated with 5 folds.
    The mean outer-fold accuracy is printed as a percentage.
    ConvergenceWarning is silenced for the duration of the call by the decorator.

    Parameters
    ----------
    nn_ppl : the estimator/pipeline to tune.
    nn_param_grid : parameter grid passed to ``GridSearchCV``.
    data_X : feature table, one row per sample.
    data_Y : class labels aligned row-for-row with ``data_X``.

    Returns
    -------
    GridSearchCV
        The grid-search object that was evaluated; this function never calls
        ``fit`` on it directly.
    """
    nn_grid_search = GridSearchCV(
        estimator=nn_ppl,
        param_grid=nn_param_grid,
        scoring='accuracy',
        cv=5,
    )
    mean_accuracy = cross_val_score(nn_grid_search, data_X, data_Y, cv=5).mean()
    print("Accuracy:", mean_accuracy*100, "%")
    return nn_grid_search

# nn_grid_search = NN_trainer(nn_ppl, nn_param_grid, data_X, data_Y)

In [42]:
# Ensemble
@ignore_warnings(category=ConvergenceWarning)
def ensamble_trainer(svm_grid_search, knn_grid_search, nn_grid_search, eclf=None, X=None, y=None):
    """Cross-validate the three base searches and their hard-voting ensemble.

    For each of SVM, KNN, Neural Network and the ensemble, a 5-fold
    accuracy-scored cross-validation is run and its mean and standard
    deviation are printed.

    Parameters
    ----------
    svm_grid_search, knn_grid_search, nn_grid_search :
        The base estimators to compare and to combine.
    eclf : optional
        A ready-made ensemble to evaluate. When omitted (``None``), a
        hard-voting ``VotingClassifier`` over the three base estimators is
        built here. Previously this argument was accepted but always
        overwritten.
    X, y : optional
        Features and labels to evaluate on. They must be given together.
        When both are omitted the notebook-level ``data_X`` / ``data_Y`` are
        used, which is what the function always did before.

    Raises
    ------
    ValueError
        If only one of ``X`` and ``y`` is supplied.
    """
    if (X is None) != (y is None):
        raise ValueError("X and y must be provided together")
    if X is None:
        # Backward-compatible default: fall back to the notebook globals.
        X, y = data_X, data_Y

    if eclf is None:
        eclf = VotingClassifier(
            estimators=[('svm', svm_grid_search), ('knn', knn_grid_search), ('nn', nn_grid_search)],
            voting='hard')

    candidates = [
        ('SVM', svm_grid_search),
        ('KNN', knn_grid_search),
        ('Neural Network', nn_grid_search),
        ('Ensemble', eclf),
    ]
    for label, clf in candidates:
        scores = cross_val_score(clf, X, y, scoring='accuracy', cv=5)
        print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
        
# ensamble_trainer(svm_grid_search, knn_grid_search, nn_grid_search, eclf)

In [33]:
# Read the three index datasets (DOW30, S&P 500, NASDAQ) in that order
data_dow, data_sp, data_nas = (
    pd.read_csv(csv_path)
    for csv_path in ("DOW30.csv", "SP500.csv", "NASDAQ.csv")
)

def process_data(target_dataset, dataset_label1, dataset_label2, label1, label2, label_col='LABEL'):
    """Build a modelling table for one dataset, adding the other two datasets' lagged labels.

    The 'Date', 'TEDSpread' and 'EFFR' columns are dropped from a copy of
    ``target_dataset``. Two columns are then appended: the ``label_col`` column
    of ``dataset_label1`` and of ``dataset_label2``, each shifted down by one
    row so that every row carries the previous row's label. The first row,
    which has no previous label (it receives the filler value -1), is removed
    from the result.

    The label columns are selected by name rather than by position, so the
    result does not depend on the column order of the input files.

    Parameters
    ----------
    target_dataset : DataFrame to turn into the modelling table (not modified).
    dataset_label1, dataset_label2 : DataFrames supplying the extra label columns.
    label1, label2 : names given to the two appended columns.
    label_col : name of the label column to read from the two label datasets.

    Returns
    -------
    DataFrame
        The processed table without its first row.

    Raises
    ------
    KeyError
        If a column to drop is missing from ``target_dataset`` or ``label_col``
        is missing from one of the label datasets.
    """
    data_processed = target_dataset.drop(['Date', 'TEDSpread', 'EFFR'], axis=1)

    for new_column, label_source in ((label1, dataset_label1), (label2, dataset_label2)):
        # Assign first (aligns on the index of the target table), then lag by one row.
        data_processed[new_column] = label_source[label_col]
        data_processed[new_column] = data_processed[new_column].shift(periods=1, fill_value=-1)

    # Row 0 only holds the -1 filler for the lagged columns, so discard it.
    return data_processed.iloc[1:, :]

Accuracy: 53.941953830712606 %
classification report:
               precision    recall  f1-score   support

           0       0.48      0.07      0.12      1119
           1       0.54      0.94      0.69      1328

    accuracy                           0.54      2447
   macro avg       0.51      0.50      0.40      2447
weighted avg       0.51      0.54      0.43      2447



In [None]:
# DOW30
data_dow_processed = process_data(data_dow, data_sp, data_nas, "SP500", "NASDAQ")
data_dow_Y = data_dow_processed['LABEL']
data_dow_X = data_dow_processed.drop(['LABEL'],axis=1)

# All three models are evaluated on the same processed DOW30 table so that
# their reported accuracies are comparable (KNN and NN previously used the
# unrelated data_X / data_Y from an earlier cell).
svm_grid_search = SVM_trainer(data_dow_X, data_dow_Y)
knn_grid_search = KNN_trainer(data_dow_X, data_dow_Y)
nn_grid_search = NN_trainer(nn_ppl, nn_param_grid, data_dow_X, data_dow_Y)

# Hard-voting ensemble over the three tuned searches.
eclf = VotingClassifier(
        estimators=[('svm', svm_grid_search), ('knn', knn_grid_search), ('nn', nn_grid_search)],
        voting='hard')

In [None]:
# SP500: add the lagged DOW30 and NASDAQ labels, then evaluate the SVM pipeline
data_sp_processed = process_data(
    target_dataset=data_sp,
    dataset_label1=data_dow,
    dataset_label2=data_nas,
    label1="DOW30",
    label2="NASDAQ",
)
data_sp_X = data_sp_processed.drop(columns='LABEL')
data_sp_Y = data_sp_processed['LABEL']
SVM_trainer(data_sp_X, data_sp_Y)

In [None]:
# NASDAQ: add the lagged DOW30 and SP500 labels, then evaluate the SVM pipeline
data_nas_processed = process_data(
    target_dataset=data_nas,
    dataset_label1=data_dow,
    dataset_label2=data_sp,
    label1="DOW30",
    label2="SP500",
)
data_nas_X = data_nas_processed.drop(columns='LABEL')
data_nas_Y = data_nas_processed['LABEL']
SVM_trainer(data_nas_X, data_nas_Y)