In [None]:
import time
import os
import pandas as pd
import numpy as np
from datetime import datetime, date, time, timedelta
import statsmodels.api as sm
import matplotlib.pyplot as plt
from math import *
import calendar
import pickle
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from keras.callbacks import History 
from keras.models import load_model
from keras.models import model_from_json

def columnToFloat(table,colname):
    """Return the values of ``table[colname]`` as a plain list of floats.

    Each value in the column is passed through ``float()``, so numeric
    strings such as ``'1.5'`` become ``1.5``. A value that ``float()``
    cannot parse raises ``ValueError``.
    """
    return [float(value) for value in table[colname]]

def add_months(sourcedate,months):
    """Return ``sourcedate`` shifted by ``months`` calendar months as a ``date``.

    ``months`` may be negative. When the target month is shorter than the
    source month, the day is clamped to the last day of the target month
    (e.g. Jan 31 + 1 month -> Feb 28/29). Any time-of-day component of
    ``sourcedate`` is discarded because a ``datetime.date`` is returned.
    """
    # Work on a zero-based month index so divmod splits it into
    # (whole years to carry, month index within the year).
    year_carry, month_index = divmod(sourcedate.month - 1 + months, 12)
    target_year = sourcedate.year + year_carry
    target_month = month_index + 1
    last_day_of_month = calendar.monthrange(target_year, target_month)[1]
    return date(target_year, target_month, min(sourcedate.day, last_day_of_month))

def getFactordata(path='./factor_data/Total_Data5.csv'):
    """Load and clean the factor data set.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to the original hard-coded
        ``./factor_data/Total_Data5.csv``.

    Returns
    -------
    pandas.DataFrame
        One row per (DATE, TICKER) pair restricted to the factor columns
        listed below, with ``RET`` divided by 100, ``ES``/``CTEF``/``PM1``
        converted to float and no missing values remaining.
    """
    factorColumns = ['DATE','TICKER','mcap','EP','BP','CP','SP','ES', 'CRET','RET','REP','RBP','RCP','RSP','CTEF','PM1', 'BR2','VOL','TOT','MRV1']
    # Columns in which the file marks a missing value with the string '.'
    # and which must be numeric afterwards.
    dotMarkedColumns = ['ES', 'CTEF', 'PM1']

    #Read in the total_data5 set, IT's MASSIVE
    totalset = pd.read_csv(path)
    totalset = totalset[factorColumns]
    totalset = totalset.drop_duplicates(subset = ['DATE','TICKER'],keep='first')

    #Remove missing values from data
    for colname in dotMarkedColumns:
        totalset = totalset[totalset[colname] != '.']

    #RET is in percent, put in decimal
    totalset['RET'] = totalset['RET']/100
    totalset = totalset.reset_index(drop=True)

    for colname in dotMarkedColumns:
        totalset[colname] = columnToFloat(totalset, colname)

    # Any remaining '.' markers in other columns become NaN (np.nan: the
    # np.NaN alias was removed in NumPy 2.0), are forward-filled, and rows
    # that are still incomplete are dropped.
    # NOTE(review): ffill runs over the frame's row order, not per ticker,
    # so a value can be carried from one ticker's row into the next - confirm
    # this is intended.
    totalset = totalset.replace('.', np.nan)
    totalset = totalset.ffill()
    totalset = totalset.dropna()
    return totalset
    
def filterUnivTotalSet(totalset,DATE):
    """Select the stock universe for a single date.

    Steps, in order:
      1. keep only the rows whose ``DATE`` column equals ``DATE``;
      2. keep the 4000 rows with the largest ``mcap``;
      3. sort those by ``ES`` in descending order and keep the rows from
         position ``int(n*7/10)`` to the end.

    The result is returned with a fresh 0..k-1 index.

    NOTE(review): because the sort in step 3 is descending, the retained
    rows are the ones with the *lowest* ES values, whereas the original
    comment said "70th percentile and above" - confirm which is intended.
    """
    #Important for part 3
    rows_on_date = totalset.loc[totalset['DATE'] == DATE].reset_index(drop=True)

    #PART 2 - A: largest 4000 names by market cap
    largest_caps = rows_on_date.sort_values(by='mcap', ascending=False).head(4000)

    #PART 2 - B (Eli Schwartz): ES-based cut
    ranked_by_es = largest_caps.sort_values(by='ES', ascending=False)
    cutoff = int(len(ranked_by_es) * 7 / 10)

    return ranked_by_es.iloc[cutoff:].reset_index(drop=True)

def subtractYear(dateCons, year):
    """Return midnight of the day ``year`` years before ``dateCons``.

    Parameters
    ----------
    dateCons : datetime.date or datetime.datetime
        Reference date; only its year, month and day are used.
    year : int
        Number of years to go back.

    Returns
    -------
    datetime.datetime
        Same month and day in the earlier year, with the time set to
        00:00:00. If the day does not exist in the target year (Feb 29 in
        a non-leap year) it is clamped to the last day of that month; the
        previous string round-trip through ``strptime`` raised
        ``ValueError`` in that case.
    """
    targetYear = dateCons.year - year
    # Same clamping rule add_months uses for short months.
    lastDay = calendar.monthrange(targetYear, dateCons.month)[1]
    return datetime(targetYear, dateCons.month, min(dateCons.day, lastDay))

def splitTrainTest(dataset, testSize):
    """Randomly split ``dataset`` into training and test parts.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Column 0 is taken as the target; columns 1 through 18 (positional)
        are taken as the features.
    testSize : float or int
        Forwarded to ``train_test_split`` as ``test_size``. Previously this
        argument was ignored and a fixed 0.4 was used regardless of what
        the caller asked for.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the ``y`` parts are
        single-column DataFrames.
    """
    y = dataset.iloc[:, 0:1]
    data = dataset.iloc[:, 1:19]
    X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=testSize)
    return  X_train, X_test, y_train, y_test


def getTrainingFrame(dateCons, totalset, tickList10Fac, yearsBack=25):
    """Build the supervised training frame of factors vs. next-period CRET.

    For each of ``yearsBack * 12`` consecutive months, the factor values of
    one month are joined (by ticker) with the ``CRET`` value dated one
    month later. The first factor month is ``dateCons`` minus ``yearsBack``
    years plus one month; with the default 25 years the final pair uses
    factors dated ``dateCons`` and ``CRET`` dated one month after it.

    Parameters
    ----------
    dateCons : datetime.datetime
        Reference date.
    totalset : pandas.DataFrame
        Factor data with a ``DATE`` column (expected to be datetime64, as
        the script converts it before calling) plus ``TICKER``, ``CRET``
        and the factor columns.
    tickList10Fac : list
        Tickers to keep.
    yearsBack : int, optional
        Length of the look-back window in years (default 25, i.e. the
        original 300 monthly steps).

    Returns
    -------
    pandas.DataFrame
        ``CRET`` in the first column followed by the factor columns; the
        ``TICKER`` column is removed.
    """
    factorList = ['TICKER','mcap','EP','BP','CP','SP','ES','RET','REP','RBP','RCP','RSP','CTEF','PM1', 'BR2','VOL','TOT','MRV1']

    datePrevYear = add_months(subtractYear(dateCons, yearsBack), 1)
    monthlyFrames = []
    for _ in range(yearsBack * 12):
        dateNext = add_months(datePrevYear, 1)

        # add_months returns datetime.date; pandas 2.x treats a date as
        # unequal to every datetime64 value, so compare against Timestamps.
        isFactorMonth = totalset['DATE'] == pd.Timestamp(datePrevYear)
        isReturnMonth = totalset['DATE'] == pd.Timestamp(dateNext)

        retNextPeriod = totalset.loc[isReturnMonth, ['TICKER', 'CRET']]
        retNextPeriod = retNextPeriod.loc[retNextPeriod.TICKER.isin(tickList10Fac)]
        smallFil = totalset.loc[isFactorMonth, factorList]
        smallFil = smallFil.loc[smallFil.TICKER.isin(tickList10Fac)]

        monthlyFrames.append(pd.merge(retNextPeriod, smallFil, on='TICKER'))
        datePrevYear = dateNext

    if not monthlyFrames:
        # Zero-length window: return an empty frame with the usual columns.
        return pd.DataFrame(columns=['CRET'] + factorList[1:])

    # DataFrame.append was removed in pandas 2.0; a single concat also
    # avoids re-copying the accumulated frame on every iteration.
    df = pd.concat(monthlyFrames)
    return df.drop(columns='TICKER')

def KNNPredictionUsingOptimalK(df):
    """Pick a K for a KNN regressor, fit it and pickle the fitted model.

    ``df`` is split with ``splitTrainTest(df, 0.3)``. For every K from 2 to
    19 a uniform-weight ``KNeighborsRegressor`` is fitted on the training
    part and the sum of squared errors is computed on both parts. The
    errors are plotted, and the K with the smallest absolute gap between
    training and test error is chosen (the first such K on ties).

    The final model is refitted with that K and pickled to
    ``./<resultpath>/KNNStockPredictor.pkl``. ``resultpath`` is read as a
    module-level global, so it must be defined before this is called.

    NOTE(review): the two errors are *sums* over differently sized sets, so
    their gap depends on the split ratio - confirm this selection rule is
    intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Target in column 0, features in the following columns.

    Returns
    -------
    int
        The selected number of neighbours.
    """
    X_train, X_test, y_train, y_test = splitTrainTest(df, 0.3)

    # Flat float vectors for the error computation; the models themselves
    # are still fitted on the single-column frames, as before.
    trainTarget = np.asarray(y_train, dtype=float).ravel()
    testTarget = np.asarray(y_test, dtype=float).ravel()

    errorRows = []
    for n in range(2, 20):
        neigh = KNeighborsRegressor(n_neighbors=n, algorithm = 'auto', weights = 'uniform')
        neigh.fit(X_train, y_train)
        TrainPred = np.asarray(neigh.predict(X_train), dtype=float).ravel()
        TestPred = np.asarray(neigh.predict(X_test), dtype=float).ravel()
        errorRows.append({
            'K': n,
            'TrainError': float(np.sum((TrainPred - trainTarget) ** 2)),
            'TestError': float(np.sum((TestPred - testTarget) ** 2)),
        })

    # Built in one go: DataFrame.append was removed in pandas 2.0.
    NumberResultsKNN = pd.DataFrame(errorRows, columns=['K', 'TrainError', 'TestError'])
    plt.scatter(NumberResultsKNN['K'], NumberResultsKNN['TrainError'], c='k', label='TrainError')
    plt.plot(NumberResultsKNN['K'], NumberResultsKNN['TestError'], c='g', label='TestError')
    plt.legend()
    plt.show()

    # min() keeps the first row among equal gaps, matching a strict '<' scan.
    bestRow = min(errorRows, key=lambda row: abs(row['TrainError'] - row['TestError']))
    BestK = bestRow['K']

    neigh = KNeighborsRegressor(n_neighbors = BestK, algorithm = 'auto', weights = 'uniform')
    neigh.fit(X_train, y_train)
    filename = "./"+ resultpath + '/KNNStockPredictor.pkl'
    # Context manager so the file handle is closed (and flushed) promptly.
    with open(filename, 'wb') as modelFile:
        pickle.dump(neigh, modelFile)

    return BestK

def neuralNetwork(df):
    """Train a small feed-forward regression network and save it to disk.

    ``df`` is split with ``splitTrainTest(df, 0.3)`` and only the training
    part is used. The network has four ReLU hidden layers (16, 16, 8, 8
    units) and one linear output unit, and is trained for 5 epochs with
    batch size 100 using mean squared error and the Adadelta optimizer.

    The architecture is written to ``./<resultpath>/modelNN.json`` and the
    weights to ``./<resultpath>/NNStockPredictor.h5``. ``resultpath`` is
    read as a module-level global, so it must be defined before this is
    called.

    Parameters
    ----------
    df : pandas.DataFrame
        Target in column 0, features in the following columns.

    Returns
    -------
    None
    """
    X_train, _, y_train, _ = splitTrainTest(df, 0.3)

    # Size the input layer from the data: the previous hard-coded
    # input_dim=6 does not match the 17 feature columns produced by
    # getTrainingFrame/splitTrainTest and makes fit() fail on shape.
    numFeatures = X_train.shape[1]

    model = Sequential()
    model.add(Dense(16, input_dim=numFeatures, kernel_initializer='normal', activation='relu'))
    model.add(Dense(16, kernel_initializer='normal', activation='relu'))
    model.add(Dense(8, kernel_initializer='normal', activation='relu'))
    model.add(Dense(8, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer=keras.optimizers.Adadelta())

    feature_cols = X_train
    labels = y_train.values

    model.fit(np.array(feature_cols), np.array(labels), epochs=5, batch_size=100)

    filename = "./"+ resultpath + '/NNStockPredictor.h5'

    # serialize model to JSON
    model_json = model.to_json()
    with open("./"+ resultpath + "/modelNN.json", "w") as json_file:
        json_file.write(model_json)
    # serialize weights to HDF5
    model.save_weights(filename)
    print("Saved model to disk")
    
    
if __name__=='__main__':
    # Script entry point: load the factor data, build the stock universe as
    # of the construction date, assemble the training frame, then fit and
    # save the KNN and neural-network predictors.

    # Output directory for the fitted models. This has to stay a
    # module-level name: KNNPredictionUsingOptimalK and neuralNetwork read
    # `resultpath` as a global when they build their output file names.
    resultpath = "Model"
    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs(resultpath, exist_ok=True)

    totalset = getFactordata()
    totalset['DATE'] = pd.to_datetime(totalset['DATE'])

    # Portfolio construction date.
    dateString = '12/01/2004'
    dateCons = datetime.strptime(dateString, '%m/%d/%Y')

    filteredTotalSet = filterUnivTotalSet(totalset, dateCons)
    filteredTotalSet['DATE'] = pd.to_datetime(filteredTotalSet['DATE'])
    filteredTotalSet = filteredTotalSet.sort_values(by=['TICKER'])
    tickerList = filteredTotalSet['TICKER'].values.tolist()

    df = getTrainingFrame(dateCons, totalset, tickerList)
    # DataFrame.convert_objects was removed in pandas 1.0; to_numeric with
    # errors='coerce' turns unparseable entries into NaN, which the
    # dropna() below then removes.
    df = df.apply(pd.to_numeric, errors='coerce')
    df =  df.dropna()

    print(df.shape)

    bestK = KNNPredictionUsingOptimalK(df)
    print("Our Best K found:" , bestK )

    neuralNetwork(df)
    print("finished")

  if self.run_code(code, result):


In [3]:
# Write the DataFrame `df` to data.csv in the current working directory.
# NOTE(review): presumably `df` is the training frame left over from the
# cell above - confirm the notebook state before re-running this cell.
df.to_csv("data.csv")