In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error
import math
from sklearn.metrics import mean_absolute_error
import featuretools as ft

# Functions

In [2]:
def get_evaluation_metrics(y_true, y_pred, model_type=None):
    """Print MSE, RMSE, explained variance and MAE for a set of predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, in the same order as ``y_true``.
        model_type: Optional label; when truthy it is printed as a header
            line before the metrics.

    Returns:
        None. The report is written to stdout, ending with a ``==========``
        separator line.
    """
    if model_type:
        print("Model Type: {}".format(model_type))

    # RMSE is the square root of the MSE, so the MSE is computed once and reused.
    mse = mean_squared_error(y_true, y_pred)
    print("\tMSE: {:.4f}".format(mse))
    print("\tRMSE: {:.4f}".format(math.sqrt(mse)))

    explained_variance = explained_variance_score(y_true, y_pred)
    print("\tExplained Variance Score: {:.4f}".format(explained_variance))

    mae = mean_absolute_error(y_true, y_pred)
    print("\tMAE: {:.4f}\n==========".format(mae))
    
def get_data_split_by_ticker(feature_set):
    """Split a feature table into one sub-frame per supported ticker.

    Args:
        feature_set: DataFrame containing a ``ticker_symbol`` column.

    Returns:
        A 6-tuple of DataFrames holding the rows for AAPL, AMZN, GOOG, GOOGL,
        MSFT and TSLA, in that order. Rows with any other symbol appear in
        none of the frames; a symbol with no rows yields an empty frame.
    """
    # The order here defines the order of the returned tuple.
    ticker_order = ("AAPL", "AMZN", "GOOG", "GOOGL", "MSFT", "TSLA")
    symbols = feature_set["ticker_symbol"]
    return tuple(feature_set.loc[symbols == ticker] for ticker in ticker_order)


# Load data

In [3]:
# Resolve the data directories relative to the parent of the current working
# directory (assumes the notebook runs from a folder that sits next to
# `data/` -- confirm against the repository layout).
cd = os.getcwd()
project_root = os.path.dirname(cd)

# os.path.dirname/join use the platform's path separator. Splitting the cwd on
# "/" only works on POSIX: on Windows the cwd contains no "/", so the parent
# collapses to an empty string. The trailing "" keeps each directory path
# terminated with a separator.
raw_data_directory_path = os.path.join(project_root, "data", "raw", "")
processed_data_directory_path = os.path.join(project_root, "data", "processed", "")

weekly_data = pd.read_csv(os.path.join(processed_data_directory_path, "weekly_data.csv"))

# Weekly linear regression `close_value` model

In [4]:
def build_lr_model_one():
    """Fit a single linear regression on all rows of ``weekly_data`` and print test metrics.

    Predictors are selected by column position and passed through
    ``pd.get_dummies`` (dummy columns are prefixed with ``ticker``); the
    target is the column at position 5. The data is split 75/25 with a fixed
    seed, the model is fitted on the training part, and the held-out part is
    scored with ``get_evaluation_metrics``.

    Returns:
        None. Metrics are printed to stdout.
    """
    # Positional selectors into weekly_data. Position 0 is presumably
    # `ticker_symbol` and position 5 `close_value` -- TODO confirm against the CSV.
    target_position = 5
    predictor_positions = [0, 6, 7, 8, 9, 10, 11, 12, 14, 15]

    target = weekly_data.iloc[:, target_position]
    predictors = pd.get_dummies(
        weekly_data.iloc[:, predictor_positions], prefix='ticker'
    )

    X_train, X_test, y_train, y_test = train_test_split(
        predictors, target, test_size=0.25, random_state=42
    )

    model = LinearRegression()
    model.fit(X_train, y_train)

    get_evaluation_metrics(y_test, model.predict(X_test))

In [5]:
# Fit the single all-ticker model and print its held-out metrics.
build_lr_model_one()

	MSE: 48093.9168
	RMSE: 219.3033
	Explained Variance Score: 0.7985
	MAE: 154.0412


# Weekly linear regression `close_value` combined model

In [6]:
def build_lr_model_two():
    """Fit one linear regression per ticker and print metrics over the pooled test sets.

    ``weekly_data`` is split into per-ticker frames with
    ``get_data_split_by_ticker``. For each frame the predictors are selected
    by column position, the target is the column at position 5, the rows are
    split 75/25 with a fixed seed, and a separate ``LinearRegression`` is
    fitted. The held-out targets and predictions of all tickers are then
    concatenated (in the order the frames are returned) and scored together.

    Returns:
        None. Metrics are printed by ``get_evaluation_metrics`` under the
        label "Combined Ticker Models".
    """
    # Positional selectors into weekly_data. Position 5 is presumably
    # `close_value` -- TODO confirm against the CSV.
    current_feature_indices = [6, 7, 8, 9, 10, 11, 12, 14, 15]
    response_index = 5

    true_parts = []
    pred_parts = []
    for ticker_data in get_data_split_by_ticker(weekly_data):
        features = ticker_data.iloc[:, current_feature_indices]
        response = ticker_data.iloc[:, response_index]

        # An integer random_state makes each split independent of the others,
        # so splitting and fitting ticker by ticker gives the same partitions
        # as doing all the splits up front.
        X_train, X_test, y_train, y_test = train_test_split(
            features, response, test_size=0.25, random_state=42
        )
        reg = LinearRegression().fit(X_train, y_train)

        true_parts.append(y_test)
        pred_parts.append(reg.predict(X_test))

    # Stack with concat/concatenate: `+` on Series aligns on index labels and
    # adds values instead of appending rows.
    all_ticker_true = pd.concat(true_parts)
    all_ticker_pred = np.concatenate(pred_parts)

    get_evaluation_metrics(all_ticker_true, all_ticker_pred, "Combined Ticker Models")


In [7]:
# Fit the per-ticker models and print metrics over their combined test sets.
build_lr_model_two()

Model Type: Combined Ticker Models
	MSE: 14760.5070
	RMSE: 121.4928
	Explained Variance Score: 0.9396
	MAE: 76.5480


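### Between the two model types, there is a substantial improvement when switching from a single model to a combination of separate models fitted per `ticker_symbol`: RMSE falls from 219.30 to 121.49, MAE falls from 154.04 to 76.55, and the explained variance score rises from 0.7985 to 0.9396. Every metric is worse for the single model that uses `ticker_symbol` as a feature.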

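#### Note: the improvement may be partly due to the per-ticker train/test splits, which effectively stratify the data by `ticker_symbol`, rather than to the different types of models. The two approaches are also scored on different test rows, so their metrics are not directly comparable. TODO: re-run the single model with a split stratified by `ticker_symbol` to check.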

# Weekly linear regression `close_value` model without `volume` as feature

In [8]:
def build_lr_model_three():
    """Fit a single all-ticker linear regression on a reduced predictor set and print test metrics.

    Same procedure as the other single-model variant in this notebook, but
    the predictor list omits column position 6 (per the section heading this
    is `volume` -- TODO confirm against the CSV). Predictors go through
    ``pd.get_dummies`` with the ``ticker`` prefix, the target is the column
    at position 5, and the split is 75/25 with a fixed seed.

    Returns:
        None. Metrics are printed to stdout by ``get_evaluation_metrics``.
    """
    predictor_positions = [0, 7, 8, 9, 10, 11, 12, 14, 15]

    raw_predictors = weekly_data.iloc[:, predictor_positions]
    encoded_predictors = pd.get_dummies(raw_predictors, prefix='ticker')
    target = weekly_data.iloc[:, 5]

    split = train_test_split(
        encoded_predictors, target, test_size=0.25, random_state=42
    )
    X_train, X_test, y_train, y_test = split

    fitted = LinearRegression().fit(X_train, y_train)
    predictions = fitted.predict(X_test)

    get_evaluation_metrics(y_test, predictions)

In [9]:
# Fit the single all-ticker model with the reduced predictor set and print its held-out metrics.
build_lr_model_three()

	MSE: 48281.0135
	RMSE: 219.7294
	Explained Variance Score: 0.7978
	MAE: 155.3529


# Weekly linear regression `close_value` combined model without `volume` as feature

In [10]:
def build_lr_model_four():
    """Fit one linear regression per ticker on a reduced predictor set and print pooled test metrics.

    ``weekly_data`` is split into per-ticker frames with
    ``get_data_split_by_ticker``. For each frame the predictors are selected
    by column position (position 6 is omitted; per the section heading this
    is `volume` -- TODO confirm against the CSV), the target is the column at
    position 5, the rows are split 75/25 with a fixed seed, and a separate
    ``LinearRegression`` is fitted. The held-out targets and predictions of
    all tickers are concatenated (in the order the frames are returned) and
    scored together.

    Returns:
        None. Metrics are printed by ``get_evaluation_metrics`` under the
        label "Combined Ticker Models".
    """
    current_feature_indices = [7, 8, 9, 10, 11, 12, 14, 15]
    response_index = 5

    true_parts = []
    pred_parts = []
    for ticker_data in get_data_split_by_ticker(weekly_data):
        features = ticker_data.iloc[:, current_feature_indices]
        response = ticker_data.iloc[:, response_index]

        # An integer random_state makes each split independent of the others,
        # so splitting and fitting ticker by ticker gives the same partitions
        # as doing all the splits up front.
        X_train, X_test, y_train, y_test = train_test_split(
            features, response, test_size=0.25, random_state=42
        )
        reg = LinearRegression().fit(X_train, y_train)

        true_parts.append(y_test)
        pred_parts.append(reg.predict(X_test))

    # Stack with concat/concatenate: `+` on Series aligns on index labels and
    # adds values instead of appending rows.
    all_ticker_true = pd.concat(true_parts)
    all_ticker_pred = np.concatenate(pred_parts)

    get_evaluation_metrics(all_ticker_true, all_ticker_pred, "Combined Ticker Models")


In [11]:
# Fit the per-ticker models with the reduced predictor set and print metrics over their combined test sets.
build_lr_model_four()

Model Type: Combined Ticker Models
	MSE: 15294.3563
	RMSE: 123.6704
	Explained Variance Score: 0.9374
	MAE: 77.9087
