Final Project
Group Members: Jesse Zou, Andy Li, Yuhan Zheng, Zhiyao Bao

# Introduction

Introduction
* What is the data science problem you are trying to solve?
* * We are trying to predict the next-day trend of the stock market (whether the price will go up or down overall on the following day) for the DOW30, SP500 and NASDAQ indices, respectively.
* Why does the problem matter?
* * The problem matters for two reasons: first, it provides an unbiased analysis of the market and the economy; second, it may help people involved in the stock market to earn money.
* What could the results of your predictive model be used for?
* Why would we want to be able to predict the thing you’re trying to predict?
* Then describe the dataset that you will use to tackle this problem

In [None]:
# Imports
import warnings
#warnings.simplefilter("ignore")
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
# from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

data cleaning
data exploration
feature engineering
describe and clarify each part of the process

# Load datasets

In [4]:
# Load the three index datasets from CSV files in the working directory.
data_dow, data_sp, data_nas = (
    pd.read_csv(csv_name)
    for csv_name in ("DOW30.csv", "SP500.csv", "NASDAQ.csv")
)

# Data Cleaning

# Data Exploration

In [None]:
def box_plot(x):
    """Draw a horizontal box plot of ``x`` on a new figure.

    Flier points are drawn as red squares and the axes are titled
    'Horizontal Boxes'. Nothing is returned.
    """
    flier_style = {'markerfacecolor': 'r', 'marker': 's'}
    _, axes = plt.subplots()
    axes.set_title('Horizontal Boxes')
    axes.boxplot(x, vert=False, flierprops=flier_style)

# NOTE(review): the column name below is an empty string, which looks like an
# unfinished placeholder. Unless DOW30.csv really has a column named "", this
# lookup will fail -- TODO: fill in the column that should be plotted.
box_plot(data_dow[""])

def line_graph(x, y):
    """Placeholder for a line-graph helper; not implemented yet.

    Accepts ``x`` and ``y`` but currently does nothing and returns ``None``.
    """
    return None


# Models

In [None]:
# SVM
def SVM_trainer(data_X, data_Y):
    """Build an unfitted grid search over a scaler -> PCA -> SVM pipeline.

    The pipeline standardises the features, projects them with PCA and
    classifies with an ``SVC``. The grid search tunes the number of PCA
    components and the SVM kernel using 5-fold cross-validation scored by
    accuracy.

    Args:
        data_X: Feature matrix. Only its column count is inspected, so the
            PCA component grid never exceeds the number of features.
        data_Y: Target labels. Not used here; kept so all trainers share
            the same signature.

    Returns:
        An unfitted ``GridSearchCV``. The caller is expected to fit it or
        pass it to ``cross_val_score``.
    """
    # PCA cannot keep more components than there are features. The grid was
    # a fixed 1..10, which produces failed fits on data with fewer than 10
    # columns, so cap it by the column count when that is known.
    shape = np.shape(data_X)
    max_components = max(1, min(10, shape[1])) if len(shape) == 2 else 10

    svm_ppl = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('pca', PCA()),
        ('svm', SVC()),
    ])
    svm_param_grid = {
        'pca__n_components': list(range(1, max_components + 1)),
        'svm__kernel': ['linear', 'rbf', 'poly'],
    }
    return GridSearchCV(svm_ppl, svm_param_grid, cv=5, scoring='accuracy')

# SVM_trainer(data_X, data_Y)

In [None]:
# KNN
def KNN_trainer(data_X, data_Y):
    """Build an unfitted grid search over a scaler -> PCA -> KNN pipeline.

    The pipeline standardises the features, projects them with PCA and
    classifies with k-nearest neighbours. The grid search tunes the number
    of PCA components and ``n_neighbors`` (1..25) using 5-fold
    cross-validation scored by accuracy.

    Args:
        data_X: Feature matrix. Only its column count is inspected, so the
            PCA component grid never exceeds the number of features.
        data_Y: Target labels. Not used here; kept so all trainers share
            the same signature.

    Returns:
        An unfitted ``GridSearchCV``. The caller is expected to fit it or
        pass it to ``cross_val_score``.
    """
    # PCA cannot keep more components than there are features. The grid was
    # a fixed 1..10, which produces failed fits on data with fewer than 10
    # columns, so cap it by the column count when that is known.
    shape = np.shape(data_X)
    max_components = max(1, min(10, shape[1])) if len(shape) == 2 else 10

    ppl = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('pca', PCA()),
        # n_neighbors=7 is only a starting value; the grid below overrides it.
        ('knn', KNeighborsClassifier(n_neighbors=7)),
    ])
    param_grid = {
        'pca__n_components': list(range(1, max_components + 1)),
        'knn__n_neighbors': list(range(1, 26)),
    }
    return GridSearchCV(ppl, param_grid, cv=5, scoring='accuracy')
# knn_grid_search = KNN_trainer(data_X, data_Y)

In [None]:
# NN
# @ignore_warnings(category=ConvergenceWarning)
def NN_trainer(data_X, data_Y):
    """Build an unfitted grid search over a scaler -> MLP pipeline.

    The pipeline standardises the features and classifies with an
    ``MLPClassifier``. The grid search tunes the hidden layer size
    (30, 40, 50 or 60) and the activation function using 5-fold
    cross-validation scored by accuracy.

    ``data_X`` and ``data_Y`` are accepted for a uniform trainer signature
    but are not used; the returned ``GridSearchCV`` is unfitted.
    """
    pipeline = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('nn', MLPClassifier()),
    ])
    search_space = {
        'nn__hidden_layer_sizes': [30, 40, 50, 60],
        'nn__activation': ['logistic', 'tanh', 'relu'],
    }
    return GridSearchCV(pipeline, search_space, cv=5, scoring='accuracy')

# nn_grid_search = NN_trainer(data_X, data_Y)

In [None]:
# Ensemble
# @ignore_warnings(category=ConvergenceWarning)
def ensamble_trainer(svm_grid_search, knn_grid_search, nn_grid_search, data_X, data_Y):
    """Cross-validate each base model and their hard-voting ensemble.

    A ``VotingClassifier`` with hard voting is built from the three given
    estimators. Each of the three estimators, and then the ensemble, is
    scored with 5-fold cross-validated accuracy on ``data_X`` / ``data_Y``.
    The mean and standard deviation of the fold scores are printed per
    model. Nothing is returned.
    """
    voter = VotingClassifier(
        estimators=[
            ('svm', svm_grid_search),
            ('knn', knn_grid_search),
            ('nn', nn_grid_search),
        ],
        voting='hard',
    )
    candidates = [
        ('SVM', svm_grid_search),
        ('KNN', knn_grid_search),
        ('Neural Network', nn_grid_search),
        ('Ensemble', voter),
    ]
    for label, clf in candidates:
        scores = cross_val_score(clf, data_X, data_Y, scoring='accuracy', cv=5)
        print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
        
# ensamble_trainer(svm_grid_search, knn_grid_search, nn_grid_search, data_X, data_Y)

# Results with Various Feature Engineering

## 1: without tracking the prev day; no date, TEDSpread, EFFR

In [None]:
# DOW30: drop Date, TEDSpread and EFFR, split off LABEL as the target,
# then build the three grid searches and evaluate them with the ensemble.
data_dow_processed = data_dow.drop(columns=['Date', 'TEDSpread', 'EFFR'])
data_dow_Y = data_dow_processed['LABEL']
data_dow_X = data_dow_processed.drop(columns='LABEL')

svm_grid_search = SVM_trainer(data_dow_X, data_dow_Y)
knn_grid_search = KNN_trainer(data_dow_X, data_dow_Y)
nn_grid_search = NN_trainer(data_dow_X, data_dow_Y)
ensamble_trainer(
    svm_grid_search, knn_grid_search, nn_grid_search,
    data_dow_X, data_dow_Y,
)


In [None]:
# SP500: drop Date, TEDSpread and EFFR, split off LABEL as the target,
# then build the three grid searches and evaluate them with the ensemble.
data_sp_processed = data_sp.drop(columns=['Date', 'TEDSpread', 'EFFR'])
data_sp_Y = data_sp_processed['LABEL']
data_sp_X = data_sp_processed.drop(columns='LABEL')

svm_grid_search = SVM_trainer(data_sp_X, data_sp_Y)
knn_grid_search = KNN_trainer(data_sp_X, data_sp_Y)
nn_grid_search = NN_trainer(data_sp_X, data_sp_Y)
ensamble_trainer(
    svm_grid_search, knn_grid_search, nn_grid_search,
    data_sp_X, data_sp_Y,
)

In [None]:
# NASDAQ: drop Date, TEDSpread and EFFR, split off LABEL as the target,
# then build the three grid searches and evaluate them with the ensemble.
data_nas_processed = data_nas.drop(['Date', 'TEDSpread', 'EFFR'], axis=1)
data_nas_Y = data_nas_processed['LABEL']
data_nas_X = data_nas_processed.drop(['LABEL'], axis=1)
# A duplicate SVM_trainer call whose result was discarded has been removed.
svm_grid_search = SVM_trainer(data_nas_X, data_nas_Y)
knn_grid_search = KNN_trainer(data_nas_X, data_nas_Y)
nn_grid_search = NN_trainer(data_nas_X, data_nas_Y)
ensamble_trainer(svm_grid_search, knn_grid_search, nn_grid_search, data_nas_X, data_nas_Y)

## 2: tracking the prev day; no date, TEDSpread, EFFR

In [None]:
# preprocess data
def process_data(target_dataset, dataset_label1, dataset_label2, label1, label2,
                 drop_columns=('Date', 'TEDSpread', 'EFFR')):
    """Add the other two datasets' previous-row labels as features.

    Args:
        target_dataset: DataFrame to build the feature table from. It is
            not modified; a copy with ``drop_columns`` removed is used.
        dataset_label1: DataFrame whose second column (position 1) supplies
            the values for the new ``label1`` column.
            NOTE(review): presumably position 1 is that dataset's LABEL
            column -- confirm against the CSV layout.
        dataset_label2: Same as ``dataset_label1``, for ``label2``.
        label1: Name of the first new column.
        label2: Name of the second new column.
        drop_columns: Columns removed from ``target_dataset`` first.
            Defaults to ``('Date', 'TEDSpread', 'EFFR')``, the original
            hard-coded set.

    Returns:
        A new DataFrame in which each added column holds the value from
        the preceding row of the corresponding dataset. The first row,
        which has no preceding row, is dropped.
    """
    # list() so that a tuple is treated as several labels, not as one key.
    data_processed = target_dataset.drop(columns=list(drop_columns))
    for name, source in ((label1, dataset_label1), (label2, dataset_label2)):
        data_processed[name] = source.iloc[:, 1]
        # Shift down one row so each row sees the previous row's value;
        # -1 is a temporary filler for the first row, dropped below.
        data_processed[name] = data_processed[name].shift(periods=1, fill_value=-1)
    return data_processed.iloc[1:]

In [None]:
# DOW30: add the previous-row SP500 and NASDAQ labels as features, split off
# LABEL as the target, then evaluate the three grid searches and the ensemble.
data_dow_processed = process_data(
    data_dow, data_sp, data_nas, "SP500", "NASDAQ"
)
data_dow_Y = data_dow_processed['LABEL']
data_dow_X = data_dow_processed.drop(columns='LABEL')

svm_grid_search = SVM_trainer(data_dow_X, data_dow_Y)
knn_grid_search = KNN_trainer(data_dow_X, data_dow_Y)
nn_grid_search = NN_trainer(data_dow_X, data_dow_Y)
ensamble_trainer(
    svm_grid_search, knn_grid_search, nn_grid_search,
    data_dow_X, data_dow_Y,
)

In [None]:
# SP500: add the previous-row DOW30 and NASDAQ labels as features, split off
# LABEL as the target, then evaluate the three grid searches and the ensemble.
data_sp_processed = process_data(
    data_sp, data_dow, data_nas, "DOW30", "NASDAQ"
)
data_sp_Y = data_sp_processed['LABEL']
data_sp_X = data_sp_processed.drop(columns='LABEL')

svm_grid_search = SVM_trainer(data_sp_X, data_sp_Y)
knn_grid_search = KNN_trainer(data_sp_X, data_sp_Y)
nn_grid_search = NN_trainer(data_sp_X, data_sp_Y)
ensamble_trainer(
    svm_grid_search, knn_grid_search, nn_grid_search,
    data_sp_X, data_sp_Y,
)

In [None]:
# NASDAQ: add the previous-row DOW30 and SP500 labels as features, split off
# LABEL as the target, then evaluate the three grid searches and the ensemble.
data_nas_processed = process_data(data_nas, data_dow, data_sp, "DOW30", "SP500")
data_nas_Y = data_nas_processed['LABEL']
data_nas_X = data_nas_processed.drop(['LABEL'], axis=1)
# A duplicate SVM_trainer call whose result was discarded has been removed.
svm_grid_search = SVM_trainer(data_nas_X, data_nas_Y)
knn_grid_search = KNN_trainer(data_nas_X, data_nas_Y)
nn_grid_search = NN_trainer(data_nas_X, data_nas_Y)
ensamble_trainer(svm_grid_search, knn_grid_search, nn_grid_search, data_nas_X, data_nas_Y)

## 3: tracking the prev day; no date

In [None]:
# preprocess data
def process_data_2(target_dataset, dataset_label1, dataset_label2, label1, label2):
    """Add the other two datasets' previous-row labels, dropping only Date.

    Works like ``process_data`` but removes only the 'Date' column from
    ``target_dataset``. The second column (position 1) of each of
    ``dataset_label1`` and ``dataset_label2`` is added under the names
    ``label1`` and ``label2`` and shifted down by one row, so each row
    carries the value from the preceding row. The first row, which has no
    preceding row, is dropped. ``target_dataset`` itself is not modified.

    NOTE(review): presumably position 1 is each dataset's LABEL column --
    confirm against the CSV layout.
    """
    frame = target_dataset.drop(columns=['Date'])
    lagged_sources = [
        (label1, dataset_label1.iloc[:, 1]),
        (label2, dataset_label2.iloc[:, 1]),
    ]
    for column, values in lagged_sources:
        frame[column] = values
        # -1 temporarily fills the first row, which is discarded below.
        frame[column] = frame[column].shift(periods=1, fill_value=-1)
    return frame.iloc[1:, :]

In [None]:
# DOW30: keep TEDSpread and EFFR, add the previous-row SP500 and NASDAQ
# labels, split off LABEL, then evaluate the grid searches and the ensemble.
data_dow_processed = process_data_2(
    data_dow, data_sp, data_nas, "SP500", "NASDAQ"
)
data_dow_Y = data_dow_processed['LABEL']
data_dow_X = data_dow_processed.drop(columns='LABEL')

svm_grid_search = SVM_trainer(data_dow_X, data_dow_Y)
knn_grid_search = KNN_trainer(data_dow_X, data_dow_Y)
nn_grid_search = NN_trainer(data_dow_X, data_dow_Y)
ensamble_trainer(
    svm_grid_search, knn_grid_search, nn_grid_search,
    data_dow_X, data_dow_Y,
)

In [None]:
# SP500: keep TEDSpread and EFFR, add the previous-row DOW30 and NASDAQ
# labels, split off LABEL, then evaluate the grid searches and the ensemble.
data_sp_processed = process_data_2(
    data_sp, data_dow, data_nas, "DOW30", "NASDAQ"
)
data_sp_Y = data_sp_processed['LABEL']
data_sp_X = data_sp_processed.drop(columns='LABEL')

svm_grid_search = SVM_trainer(data_sp_X, data_sp_Y)
knn_grid_search = KNN_trainer(data_sp_X, data_sp_Y)
nn_grid_search = NN_trainer(data_sp_X, data_sp_Y)
ensamble_trainer(
    svm_grid_search, knn_grid_search, nn_grid_search,
    data_sp_X, data_sp_Y,
)

In [None]:
# NASDAQ: keep TEDSpread and EFFR, add the previous-row DOW30 and SP500
# labels, split off LABEL, then evaluate the grid searches and the ensemble.
data_nas_processed = process_data_2(data_nas, data_dow, data_sp, "DOW30", "SP500")
data_nas_Y = data_nas_processed['LABEL']
data_nas_X = data_nas_processed.drop(['LABEL'], axis=1)
# A duplicate SVM_trainer call whose result was discarded has been removed.
svm_grid_search = SVM_trainer(data_nas_X, data_nas_Y)
knn_grid_search = KNN_trainer(data_nas_X, data_nas_Y)
nn_grid_search = NN_trainer(data_nas_X, data_nas_Y)
ensamble_trainer(svm_grid_search, knn_grid_search, nn_grid_search, data_nas_X, data_nas_Y)