In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import math


# Read in data and display first 5 rows
data = pd.read_csv('/Users/jackieoh/datathon/Aggregated Data/SYF-Aggregated.csv')

#adding date columns
date = data['Date'].str.split('/', expand=True)
data['Month'] = date[0]
data['Day'] = date[1]
data['Year'] = date[2]


# Create numpy array of data without Close
data = data[np.isfinite(data['AvgClose60Days'])]
labels = np.array(data['Close'])  # Labels are the values we want to predict
dates = np.array(data['Date'])
data = data.drop('Close', axis=1)
data = data.drop('Date', axis=1)
factors_list = list(data.columns)
data = np.array(data)
# Split the data into training and testing sets
train_data, test_data, train_labels, test_labels, train_date, test_date = train_test_split(data, labels, dates, test_size=0.0132, shuffle=False)


# Get baseline prediction
average_close = train_labels.mean()
baseline_errors = abs(average_close - test_labels)
baseline_errors_squared = baseline_errors**2
average_baseline_error = round(np.mean(baseline_errors), 2)
print('Average baseline error: ', average_baseline_error)
squared_baseline_error = round(np.mean(baseline_errors_squared), 2)
print('Mean Squared baseline error: ', squared_baseline_error)


# Instantiate and train model with 1000 decision trees
rf = RandomForestRegressor(n_estimators=1000, random_state=42)
rf.fit(train_data, train_labels);

# Use the forest's predict method on the test data
predictions = rf.predict(test_data)

# Calculate errors
errors = abs(predictions - test_labels)
errorSquared = errors**2
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')
print('Mean Squared Error:', round(np.mean(errorSquared), 2), 'degrees.')

# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')



r2 = r2_score(predictions, test_labels)
print('R^2: ', round(r2, 2), '%.')

# Get numerical feature importances
importances = list(rf.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(factors_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];


training = pd.DataFrame({'Date': train_date, 'Close': train_labels})
training.index = training['Date']
actual = pd.DataFrame({'Date': test_date, 'Close': test_labels})
actual.index = actual['Date']
predicted = pd.DataFrame({'Date': test_date, 'Close': predictions})
predicted.index = predicted['Date']

Average baseline error:  1.18
Mean Squared baseline error:  1.57
Mean Absolute Error: 1.33 degrees.
Mean Squared Error: 1.93 degrees.
Accuracy: 95.62 %.
R^2:  -2.05 %.
Variable: Discover             Importance: 0.46
Variable: Competitor2          Importance: 0.2
Variable: OMF                  Importance: 0.11
Variable: GeneralStockPrice    Importance: 0.07
Variable: Competitor1          Importance: 0.06
Variable: Year                 Importance: 0.05
Variable: AvgClose60Days       Importance: 0.02
Variable: DOW                  Importance: 0.01
Variable: Month                Importance: 0.0
Variable: Day                  Importance: 0.0


In [3]:
predictions = pd.read_csv('lstm_predictions/lstm_syf.csv')

In [4]:
predictions.columns # = ['Date', 'Pred', 'Actual']

Index(['Date', 'Pred', 'Actual'], dtype='object')

In [5]:
# Standard plotly imports
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
# Using plotly + cufflinks in offline mode
import cufflinks
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)

In [6]:
stock_price_line_SYF_training = go.Scatter(x=training['Date'], y=training['Close'], name='Training', marker = dict(color = '#45afdc'))
stock_price_line_SYF_predicted = go.Scatter(x=predictions['Date'], y=predictions['Pred'], name='Predicted closing values', marker = dict(color = '#ed9e46'))
stock_price_line_SYF_actual = go.Scatter(x=predictions['Date'], y=predictions['Actual'], name='Actual closing values', marker = dict(color = '#146c8e'))

data = [stock_price_line_SYF_training, stock_price_line_SYF_predicted, stock_price_line_SYF_actual]

layout = go.Layout(
    title='Synchrony Closing Value Predictions using LSTM Model',
    xaxis=dict(
        title='Date'
    ),
    yaxis=dict(
        title='Stock Closing Price'
    )
)
fig = go.Figure(data=data, layout=layout)

iplot(fig, filename='bar-line')

In [8]:
# values sourced from model_results.xlsx
x_sfy = ['DOW Jones Avg', 'Competitors', 'Avg close 60 days']
y_sfy = [0.35, 0.47, 0.07]

x_hon = ['Avg close 60 days', 'Competitors', 'DOW Jones Avg']
y_hon = [0.34, 0.55, 0.09]

x_mmm = ['Competitors', 'DOW Jones Avg', 'Avg close 60 days']
y_mmm = [0.63, 0.26, 0.11]

x_bayer = ['Avg close 60 days', 'Competitors', 'DOW Jones Avg']
y_bayer = [0.86, 0.1, 0.03]

importance_plot_sfy = go.Bar(x=x_sfy, y=y_sfy, name='SFY', marker= dict(color = '#45afdc'))
importance_plot_hon = go.Bar(x=x_hon, y=y_hon, name='HON', marker= dict(color = '#ed9e46'))
importance_plot_mmm = go.Bar(x=x_mmm, y=y_mmm, name='MMM', marker= dict(color = '#146c8e'))
importance_plot_bayer = go.Bar(x=x_bayer, y=y_bayer, name='BAYZF', marker= dict(color = '#ffc800'))

layout = go.Layout(
    title='Most Important Features Across All Models',
    yaxis=dict(
        title='Importance'
    )
)

data = [importance_plot_sfy, importance_plot_hon, importance_plot_mmm, importance_plot_bayer]

fig = go.Figure(data=data, layout=layout)

iplot(fig, filename='bar-line')