In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import math


# Read in data and display first 5 rows
data = pd.read_csv('/Users/jackieoh/datathon/Aggregated Data/BAYZF-Aggregated.csv')

#adding date columns
date = data['Date'].str.split('/', expand=True)
data['Month'] = date[0]
data['Day'] = date[1]
data['Year'] = date[2]


# Create numpy array of data without Close
data = data[np.isfinite(data['AvgClose60Days'])]
labels = np.array(data['Close'])  # Labels are the values we want to predict
dates = np.array(data['Date'])
data = data.drop('Close', axis=1)
data = data.drop('Date', axis=1)
data = data.drop('JNJ', axis=1)
data = data.drop('NVS', axis=1)
data = data.drop('DOW', axis=1)
factors_list = list(data.columns)
data = np.array(data)
print(data)
# Split the data into training and testing sets
train_data, test_data, train_labels, test_labels, train_date, test_date = train_test_split(data, labels, dates, test_size=0.0132, shuffle=False)


# Get baseline prediction
average_close = train_labels.mean()
baseline_errors = abs(average_close - test_labels)
baseline_errors_squared = baseline_errors**2
average_baseline_error = round(np.mean(baseline_errors), 2)
print('Average baseline error: ', average_baseline_error)
squared_baseline_error = round(np.mean(baseline_errors_squared), 2)
print('Mean Squared baseline error: ', squared_baseline_error)


# Instantiate and train model with 1000 decision trees
rf = RandomForestRegressor(n_estimators=1000, random_state=42)
rf.fit(train_data, train_labels);

# Use the forest's predict method on the test data
predictions = rf.predict(test_data)

# Calculate errors
errors = abs(predictions - test_labels)
errorSquared = errors**2
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')
print('Mean Squared Error:', round(np.mean(errorSquared), 2), 'degrees.')

# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')



r2 = r2_score(predictions, test_labels)
print('R^2: ', round(r2, 2), '%.')

# Get numerical feature importances
importances = list(rf.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(factors_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];


training = pd.DataFrame({'Date': train_date, 'Close': train_labels})
training.index = training['Date']
actual = pd.DataFrame({'Date': test_date, 'Close': test_labels})
actual.index = actual['Date']
predicted = pd.DataFrame({'Date': test_date, 'Close': predictions})
predicted.index = predicted['Date']

[[49.48233312 7662.534033 51.94366677 ... '6' '26' '09']
 [49.4006665 7672.845703 51.90866679999999 ... '6' '29' '09']
 [49.35399985 7685.050529000001 51.894833500000004 ... '6' '30' '09']
 ...
 [78.03600038 25037.06204 140.07083290000003 ... '2' '13' '19']
 [77.69266707 24978.5222 139.86616630000003 ... '2' '14' '19']
 [77.39883378 24916.66322 139.6459996 ... '2' '15' '19']]
Average baseline error:  27.13
Mean Squared baseline error:  739.87
Mean Absolute Error: 5.58 degrees.
Mean Squared Error: 35.08 degrees.
Accuracy: 92.63 %.
R^2:  -1732.02 %.
Variable: AvgClose60Days       Importance: 0.86
Variable: Competitor1          Importance: 0.07
Variable: GeneralStockPrice    Importance: 0.03
Variable: Competitor2          Importance: 0.02
Variable: Year                 Importance: 0.02
Variable: Month                Importance: 0.01
Variable: Day                  Importance: 0.0


In [28]:
# Standard plotly imports
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
# Using plotly + cufflinks in offline mode
import cufflinks
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)

In [24]:
stock_price_line_SYF_training = go.Scatter(x=training['Date'], y=training['Close'], name='Training', marker = dict(color = '#45afdc'))
stock_price_line_SYF_predicted = go.Scatter(x=predicted['Date'], y=predicted['Close'], name='Predicted closing values', marker = dict(color = '#ed9e46'))
stock_price_line_SYF_actual = go.Scatter(x=actual['Date'], y=actual['Close'], name='Actual closing values', marker = dict(color = '#146c8e'))

data = [stock_price_line_SYF_training, stock_price_line_SYF_predicted, stock_price_line_SYF_actual]

layout = go.Layout(
    title='MMM Closing Value Predictions using Random Forest Model',
    xaxis=dict(
        title='Date'
    ),
    yaxis=dict(
        title='Stock Closing Price'
    )
)
fig = go.Figure(data=data, layout=layout)

iplot(fig, filename='bar-line')

In [21]:
stock_price_line_MMM_training = go.Scatter(x=training['Date'], y=training['Close'], name='Training', marker = dict(color = '#45afdc'))
stock_price_line_MMM_predicted = go.Scatter(x=predicted['Date'], y=predicted['Close'], name='Predicted closing values', marker = dict(color = '#ed9e46'))
stock_price_line_MMM_actual = go.Scatter(x=actual['Date'], y=actual['Close'], name='Actual closing values', marker = dict(color = '#146c8e'))

data = [stock_price_line_SYF_training, stock_price_line_SYF_predicted, stock_price_line_SYF_actual]

layout = go.Layout(
    title='3M Closing Value Predictions using Random Forest Model',
    xaxis=dict(
        title='Date'
    ),
    yaxis=dict(
        title='Stock Closing Price'
    )
)
fig = go.Figure(data=data, layout=layout)

iplot(fig, filename='bar-line')