# This is my capstone project for the Udacity Machine Learning Nanodegree.

Import the libraries needed.

In [None]:
import pandas as pd
import numpy as np
import keras as kr
from keras.models import Sequential
from keras.layers import LSTM, Dense
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from os import listdir

Get the data.

In [None]:
# Folder expected to hold one `<SYMBOL>_data.csv` file per stock.
directory = 'sandp500/individual_stocks_5yr'
# Alternative dataset locations, kept for reference:
#directory = 'OneDrive/Documents/Projects/MachineLearning/Udacity/Capstone/sandp500/individual_stocks_5yr'
#directory = 'Capstone/sandp500/individual_stocks_5yr'
# os.listdir() returns entries in arbitrary order, so sort them to make
# positional lookups into the listing reproducible across machines.
directory_listing = sorted(listdir(directory))

In [None]:
# Data files are named `<SYMBOL>_data.csv`; keep only those entries so stray
# files in the folder are not mistaken for ticker symbols.
symbols_list = [
    file_name.split('_')[0]
    for file_name in directory_listing
    if file_name.endswith('_data.csv')
]

print(len(symbols_list))
# Guard the preview so an empty data folder does not raise IndexError here.
if symbols_list:
    print(symbols_list[0])

In [None]:
# Load the price history of the first symbol in the list into `dataset`.
csv_file = '{}/{}_data.csv'.format(directory, symbols_list[0])
dataset = pd.read_csv(csv_file)

Since the file name already tells us which stock we are looking at, we can drop the `Name` column from the dataframe.

In [None]:
# Parse the raw `Date` strings into datetimes and keep them in a new
# `trading_date` column.
dataset['trading_date'] = pd.to_datetime(dataset['Date'])

In [None]:
# `Name` is redundant (the ticker is known from the file name) and the raw
# `Date` strings are superseded by the parsed `trading_date` column.
# `axis` is passed by keyword: the positional form drop(label, 1) was
# deprecated in pandas 1.3 and removed in pandas 2.0.
dataset = dataset.drop(['Name', 'Date'], axis=1)
dataset.set_index(['trading_date'], inplace=True)

In [None]:
# Pair every row with the close of the following row; shift(-1) moves values
# up by one position, so the last row has no successor and becomes NaN.
dataset['NextDayClose'] = dataset['Close'].shift(-1)

In [None]:
# Preview the first ten rows, including the new NextDayClose column.
dataset.head(10)

In [None]:
# The three indicator columns below are disabled in this cell; the same
# calculations are applied to `df_preprocessed` later in the notebook.

# Daily returns, the input to the Sharpe ratio:
#df = dataset.assign(Daily_Returns = np.divide((dataset.Open - dataset.Close), dataset.Close) * 100)

# Sharpe ratio column:
#df = df.assign(Sharpe_Ratio = np.divide((df.Daily_Returns - 0.046), np.std(np.array([df.Open, df.High, df.Low, df.Close]))))

# Rate of change (momentum) for the specific stock:
#df = df.assign(Rate_of_Change = (np.divide(df.Close, df.Open) - 1) * 100)

# Row-over-row change in the closing price; the first row has no predecessor,
# so diff() leaves it as NaN.
df = dataset.assign(Difference_of_Close = dataset['Close'].diff())

In [None]:
# Preview the first ten rows, now including Difference_of_Close.
df.head(10)

This code is for comparing the different columns of the raw data. 

In [None]:
# Remove the raw Open/High/Low/Volume columns from `df`; the remaining
# columns are kept unchanged.
df = df.drop(['Open', 'High', 'Low', 'Volume'], axis=1)

In [None]:
# Preview the first ten rows of the columns that remain after the drop.
df.head(10)

The following cell plots, for a few stocks, the opening price and the volume on a single graph with two different y-axes.  I chose this comparison because the raw price features all follow roughly the same line as the opening price, whereas volume is an important feature in its own right. [http://www.investopedia.com/terms/v/volume.asp]

In [None]:
def getting_preprocessed_data(symbol, data_dir=None):
    """Load one stock's CSV, index it by date and drop rows with no opening price.

    Parameters
    ----------
    symbol : str
        Ticker symbol; the file read is ``<data_dir>/<symbol>_data.csv``.
    data_dir : str, optional
        Folder containing the CSV files. Defaults to the module-level
        ``directory`` when omitted.

    Returns
    -------
    pandas.DataFrame
        The file's contents without the ``Name`` column, indexed by the
        (unparsed) ``Date`` column, with rows whose ``Open`` value is empty
        or NaN removed.
    """
    if data_dir is None:
        data_dir = directory
    csv_file = '{}/{}_data.csv'.format(data_dir, symbol)
    df = pd.read_csv(csv_file)
    # `axis` must be a keyword: drop('Name', 1) was removed in pandas 2.0.
    df = df.drop('Name', axis=1)
    df = df.set_index('Date')
    # Assign the result back instead of calling replace(..., inplace=True) on
    # df['Open']: the in-place form acts on a temporary selection and is not
    # guaranteed to update `df` (it does not under pandas copy-on-write).
    # Technique from https://stackoverflow.com/questions/29314033/python-pandas-dataframe-remove-empty-cells
    df['Open'] = df['Open'].replace('', np.nan)
    df = df.dropna(subset=['Open'])
    return df

def plotting_stocks(symbols_list, amount_of_stocks=0):
    """Show exploratory price, volume and Sharpe-ratio charts for a run of symbols.

    For each selected symbol the data is loaded with
    ``getting_preprocessed_data`` and four figures are shown:

    1. opening price with volume on a secondary y-axis,
    2. log(volume) against log(open) as a scatter plot,
    3. Sharpe ratio against opening price,
    4. Sharpe ratio against volume.

    Parameters
    ----------
    symbols_list : sequence of str
        Ticker symbols to choose from.
    amount_of_stocks : int, optional
        Number of symbols, taken from the start of ``symbols_list``, to plot.
        ``0`` (the default) plots every symbol.

    Returns
    -------
    None
        The function only prints each symbol and displays figures.
    """
    if amount_of_stocks == 0:
        amount_of_stocks = len(symbols_list)

    for symbol in symbols_list[:amount_of_stocks]:
        df = getting_preprocessed_data(symbol)
        print(symbol)

        # Figure 1: opening price (left axis) against volume (right axis).
        fig, ax = plt.subplots()
        fig.subplots_adjust(right=0.7)
        df.Open.plot(ax=ax, style='b-', figsize=(20, 10))
        # Same ax as above; secondary_y puts the volume scale on the right.
        df.Volume.plot(ax=ax, style='r-', secondary_y=True, figsize=(20, 10))
        plt.show()

        # Daily returns in percent, the input to the Sharpe ratio.
        daily_returns = np.divide(df.Open - df.Close, df.Close) * 100
        # One scalar spread over all Open/High/Low/Close values pooled
        # together; ddof=1 matches the notebook-level Sharpe_Ratio cell.
        price_spread = np.std(
            np.array([df.Open, df.High, df.Low, df.Close]), ddof=1
        )
        # NOTE(review): 0.046 is presumably the risk-free rate - confirm.
        df_preprocessed = df.assign(
            Daily_Returns=daily_returns,
            Sharpe_Ratio=np.divide(daily_returns - 0.046, price_spread),
        )

        # Figure 2: log-log scatter of volume against opening price,
        # labelled with the symbol actually being plotted.
        log_df = np.log(df)
        log_df.plot.scatter(x='Volume', y='Open', label=symbol, figsize=(20, 10))
        plt.show()

        # Figures 3 and 4: Sharpe ratio against opening price and volume.
        df_preprocessed.plot.scatter(x='Open', y='Sharpe_Ratio', label="Sharpe Ratio Open", figsize=(20, 10))
        plt.show()
        df_preprocessed.plot.scatter(x='Volume', y='Sharpe_Ratio', label="Sharpe Ratio Volume", figsize=(20, 10), use_index=True)
        plt.show()

In [None]:
# Plot the first ten stocks to get an idea of how each stock is individually represented.
plotting_stocks(symbols_list, 10)

# `df` had its Open/High/Low/Volume columns dropped earlier, so the log frame
# is built from the raw price/volume columns that `dataset` still carries.
log_df = np.log(dataset[['Open', 'High', 'Low', 'Close', 'Volume']])
log_df.plot.scatter(x='Volume', y='Open', label="AAL", figsize=(20,10))
plt.show()

log_df.plot.scatter(x='Volume', y='Close', label="AAL", figsize=(20,10))
plt.show()

# Log opening price over time. use_index=True already puts the index on the
# x-axis; `x` must be a column label, so the index object is not passed as `x`.
log_df.plot(y='Open', label="AAL", figsize=(20,10), use_index=True, style='.')
plt.show()

In [None]:
# Daily returns in percent, the input to the Sharpe ratio.
# Start from the raw price/volume columns of `dataset`: `df` had
# Open/High/Low/Volume dropped earlier, so `df.Open` no longer exists.
# Selecting only these columns also leaves out the NaN-bearing
# NextDayClose column.
df_preprocessed = dataset[['Open', 'High', 'Low', 'Close', 'Volume']].assign(
    Daily_Returns=lambda raw: np.divide(raw.Open - raw.Close, raw.Close) * 100
)

In [None]:
# Sharpe ratio column: daily returns minus a constant, divided by a single
# sample standard deviation (ddof=1) taken over all Open/High/Low/Close
# values pooled together.
# NOTE(review): 0.046 is presumably the risk-free rate - confirm.
df_preprocessed = df_preprocessed.assign(
    Sharpe_Ratio=(df_preprocessed['Daily_Returns'] - 0.046) / np.std(
        np.array([
            df_preprocessed['Open'],
            df_preprocessed['High'],
            df_preprocessed['Low'],
            df_preprocessed['Close'],
        ]),
        ddof=1,
    )
)

In [None]:
# Rate of change for the stock: close relative to open, as a percentage.
df_preprocessed = df_preprocessed.assign(
    Rate_of_Change=(df_preprocessed['Close'] / df_preprocessed['Open'] - 1) * 100
)

In [None]:
# Scatter the Sharpe ratio against trading volume.
df_preprocessed.plot.scatter(x='Volume', y='Sharpe_Ratio', label="AAL", figsize=(20,10))
plt.show()

In [None]:
# Bare expression: displays the frame as the notebook cell's output.
df_preprocessed

In [None]:
# Save the engineered frame to disk.
df_preprocessed.to_csv('AAL_preprocessed_data.csv')

# Copy of the frame in which Daily_Returns holds magnitudes only.
df_preprocessed_abs = df_preprocessed.assign(
    Daily_Returns=df_preprocessed['Daily_Returns'].abs()
)

df_preprocessed_abs

In [None]:
from IPython.display import display
# Render only the first row, as a quick check of the column layout.
display(df_preprocessed.head(n=1))

In [None]:
# I am using some of the techniques I learned from previous projects.  The below is from the Finding Donors Project.
# Target: the closing price cast to int, i.e. each distinct whole-number
# price becomes one label for the classifier fitted below.
closing = df_preprocessed['Close'].astype(int)
# Features: every remaining column except the target's source column.
features = df_preprocessed.drop('Close', axis = 1)

#closing_raw
#features_raw

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    closing,
    test_size=0.2,
    random_state=0,
)

print("Training set has {} samples.".format(len(X_train)))
print("Testing set has {} samples.".format(len(X_test)))

In [None]:
# Support-vector classifier with a fixed seed.
clf = SVC(random_state=2)

# Fit on the training split; `learner` holds whatever fit() returns
# (presumably the fitted estimator itself, per scikit-learn convention).
learner = clf.fit(X_train, y_train)

In [None]:
# Predict on the held-out rows and report the share of exact label matches
# as a percentage.
pred = clf.predict(X_test)
accuracy = 100 * accuracy_score(y_test, pred)

print("Accuracy is: {:.4f}%".format(accuracy))