In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error
import math
from sklearn.metrics import mean_absolute_error
import featuretools as ft

# Functions

In [2]:
def get_evaluation_metrics(y_true, y_pred, model_type=None):
    """Print MSE, RMSE, explained variance score and MAE for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, in the same order as ``y_true``.
    model_type : str, optional
        Label printed as a header line before the metrics. The header is
        skipped when this is falsy.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    if model_type:
        print("Model Type: {}".format(model_type))
    # Score the MSE once and derive the RMSE from it instead of scoring twice.
    mse = mean_squared_error(y_true, y_pred)
    print("\tMSE: {:.4f}".format(mse))
    print("\tRMSE: {:.4f}".format(math.sqrt(mse)))
    print("\tExplained Variance Score: {:.4f}".format(explained_variance_score(y_true, y_pred)))
    print("\tMAE: {:.4f}\n==========".format(mean_absolute_error(y_true, y_pred)))
    
def get_data_split_by_ticker(feature_set, tickers=("AAPL", "AMZN", "GOOG", "GOOGL", "MSFT", "TSLA")):
    """Split a feature frame into one sub-frame per ticker symbol.

    Parameters
    ----------
    feature_set : pandas.DataFrame
        Frame containing a ``ticker_symbol`` column.
    tickers : sequence of str, optional
        Symbols to extract, in the order the sub-frames are returned.
        Defaults to AAPL, AMZN, GOOG, GOOGL, MSFT, TSLA.

    Returns
    -------
    tuple of pandas.DataFrame
        One frame per entry of ``tickers`` holding the rows whose
        ``ticker_symbol`` equals that entry. A symbol with no matching rows
        yields an empty frame. Rows keep their original index labels.
    """
    symbol_column = feature_set["ticker_symbol"]
    return tuple(feature_set.loc[symbol_column == symbol] for symbol in tickers)



# Load Data

In [3]:
# The data directories live under the parent of the current working directory.
# os.path is used instead of splitting on "/" so the paths are also correct
# where os.getcwd() does not use "/" as its separator.
cd = os.getcwd()
project_root = os.path.dirname(cd)

# The trailing "" makes os.path.join end each directory path with a separator,
# so a file name can still be appended to it by plain string concatenation.
raw_data_directory_path = os.path.join(project_root, "data", "raw", "")
processed_data_directory_path = os.path.join(project_root, "data", "processed", "")

weekly_data = pd.read_csv(os.path.join(processed_data_directory_path, "weekly_data.csv"))

# Weekly Linear Regression `close_value` Model

In [4]:
# Column positions of `weekly_data` used as predictors by the single model.
# NOTE(review): position 0 is presumably `ticker_symbol` (the column that is
# one-hot encoded in the next cell) and position 5 presumably `close_value`
# -- confirm against the CSV header.
current_features_indices = [
    0,
    6, 7, 8, 9, 10, 11, 12,
    14, 15,
]

# Target first, then the predictor columns; both are positional slices.
response_variable = weekly_data.iloc[:, 5]
features = weekly_data.iloc[:, current_features_indices]

In [5]:
# One-hot encode the non-numeric feature column(s); the generated dummy
# columns are named `ticker_<value>`.
current_features = pd.get_dummies(features, prefix="ticker")

# Hold out 25% of the rows for testing, with a fixed seed for reproducibility.
split_parts = train_test_split(
    current_features,
    response_variable,
    test_size=0.25,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [6]:
# Display the training feature matrix (numeric features plus the `ticker_*`
# dummy columns produced by the one-hot encoding).
X_train

Unnamed: 0,volume,reaction_num,compound,writer,body,positive,negative,ratio_pos,ratio_neg,ticker_AAPL,ticker_AMZN,ticker_GOOG,ticker_GOOGL,ticker_MSFT,ticker_TSLA
86,196224543240,5267,0.060526,901,7725,6323,1402,0.818511,0.181489,1,0,0,0,0,0
1107,52779543060,1144,0.092671,238,1546,1351,195,0.873868,0.126132,0,0,0,0,1,0
1488,24107129303,22466,0.093710,953,4144,3034,1110,0.732143,0.267857,0,0,0,0,0,1
365,5188814536,2390,0.103006,557,2137,1816,321,0.849789,0.150211,0,1,0,0,0,0
265,11945130597,1748,0.117113,658,2023,1826,197,0.902620,0.097380,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1130,71559604970,1459,0.086105,413,2322,1938,384,0.834625,0.165375,0,0,0,0,1,0
1294,49923543340,5219,0.164745,733,1597,1402,195,0.877896,0.122104,0,0,0,0,1,0
860,1069602551,957,0.090259,296,707,614,93,0.868458,0.131542,0,0,0,1,0,0
1459,12113841226,6156,0.119133,790,2308,1902,406,0.824090,0.175910,0,0,0,0,0,1


In [7]:
# Single model: one linear regression fitted on all tickers at once, then
# scored on the held-out rows.
reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

In [8]:
# Report the single model's error metrics on the held-out rows.
get_evaluation_metrics(y_true=y_test, y_pred=y_pred)

	MSE: 48093.5355
	RMSE: 219.3024
	Explained Variance Score: 0.7985
	MAE: 154.0381


# Weekly Linear Regression `close_value` Combined Model

In [11]:
aapl, amzn, goog, googl, msft, tsla = get_data_split_by_ticker(weekly_data)

# Same predictor positions as the single model's `current_features_indices`,
# minus position 0: each frame below holds a single ticker, so that column is
# left out of the per-ticker models.
current_feature_indices = [6, 7, 8, 9, 10, 11, 12, 14, 15]


def _features_and_response(ticker_frame):
    """Return (predictor columns, response column) of one ticker's frame."""
    predictors = ticker_frame.iloc[:, current_feature_indices]
    response = ticker_frame.iloc[:, 5]
    return predictors, response


aapl_features, aapl_rv = _features_and_response(aapl)
amzn_features, amzn_rv = _features_and_response(amzn)
goog_features, goog_rv = _features_and_response(goog)
googl_features, googl_rv = _features_and_response(googl)
msft_features, msft_rv = _features_and_response(msft)
tsla_features, tsla_rv = _features_and_response(tsla)

In [12]:
def _split_ticker(ticker_features, ticker_response):
    """Split one ticker's data 75/25 into train/test with a fixed seed.

    Returns the four pieces in train_test_split order:
    (X_train, X_test, y_train, y_test).
    """
    return train_test_split(ticker_features, ticker_response, test_size=0.25, random_state=42)


# Each ticker is split on its own, so every ticker contributes 25% of its
# rows to the combined test set.
X_train_aapl, X_test_aapl, y_train_aapl, y_test_aapl = _split_ticker(aapl_features, aapl_rv)
X_train_amzn, X_test_amzn, y_train_amzn, y_test_amzn = _split_ticker(amzn_features, amzn_rv)
X_train_goog, X_test_goog, y_train_goog, y_test_goog = _split_ticker(goog_features, goog_rv)
X_train_googl, X_test_googl, y_train_googl, y_test_googl = _split_ticker(googl_features, googl_rv)
X_train_msft, X_test_msft, y_train_msft, y_test_msft = _split_ticker(msft_features, msft_rv)
X_train_tsla, X_test_tsla, y_train_tsla, y_test_tsla = _split_ticker(tsla_features, tsla_rv)

In [13]:
def _fit_and_predict(train_features, train_response, test_features):
    """Fit a linear regression on one ticker and predict its test rows.

    Returns the fitted model and its predictions for ``test_features``.
    """
    model = LinearRegression().fit(train_features, train_response)
    return model, model.predict(test_features)


# One independent model per ticker.
reg_aapl, y_pred_aapl = _fit_and_predict(X_train_aapl, y_train_aapl, X_test_aapl)
reg_amzn, y_pred_amzn = _fit_and_predict(X_train_amzn, y_train_amzn, X_test_amzn)
reg_goog, y_pred_goog = _fit_and_predict(X_train_goog, y_train_goog, X_test_goog)
reg_googl, y_pred_googl = _fit_and_predict(X_train_googl, y_train_googl, X_test_googl)
reg_msft, y_pred_msft = _fit_and_predict(X_train_msft, y_train_msft, X_test_msft)
reg_tsla, y_pred_tsla = _fit_and_predict(X_train_tsla, y_train_tsla, X_test_tsla)

In [14]:
# Stack the per-ticker test targets end to end. `+` must not be used here:
# Series addition aligns on index labels, and rows of different tickers never
# share a label, so the sum would be a Series of NaN rather than a stacked
# vector of true values.
all_ticker_true = pd.concat([y_test_aapl, y_test_amzn, y_test_goog, y_test_googl, y_test_msft, y_test_tsla])

In [15]:
# Both lists use the same ticker order so that true values and predictions
# stay aligned position by position after concatenation.
per_ticker_truth = [y_test_aapl, y_test_amzn, y_test_goog, y_test_googl, y_test_msft, y_test_tsla]
per_ticker_predictions = [y_pred_aapl, y_pred_amzn, y_pred_goog, y_pred_googl, y_pred_msft, y_pred_tsla]

all_ticker_true = pd.concat(per_ticker_truth)
all_ticker_pred = np.concatenate(per_ticker_predictions)

In [16]:
# Report both approaches one after the other: the pooled predictions of the
# six per-ticker models first, then the single model.
get_evaluation_metrics(y_true=all_ticker_true, y_pred=all_ticker_pred, model_type="Combined Ticker Models")
get_evaluation_metrics(y_true=y_test, y_pred=y_pred, model_type="Single Model")

Model Type: Combined Ticker Models
	MSE: 14761.0156
	RMSE: 121.4949
	Explained Variance Score: 0.9396
	MAE: 76.5495
Model Type: Single Model
	MSE: 48093.5355
	RMSE: 219.3024
	Explained Variance Score: 0.7985
	MAE: 154.0381


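### Between the two model types, there is a substantial performance boost when switching from the single model to a combination of separate models, one per `ticker_symbol`: RMSE drops from 219.30 to 121.49, MAE from 154.04 to 76.55, and the explained variance score rises from 0.7985 to 0.9396. Every metric is worse for the single model that uses `ticker_symbol` as a feature.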

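#### Note: the performance increase may be due to stratification on the `ticker_symbol` variable rather than to the different types of models. The per-ticker models are split one ticker at a time, so each ticker contributes exactly 25% of its rows to the test set, while the single model uses one random split over all rows; the two approaches are therefore scored on different test rows. We should look into this, e.g. by evaluating both approaches on the same split stratified by `ticker_symbol`.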