# PROJECT

You will construct a linear regression model to predict the return of a ticker, given the returns of an index (SPY).  You will source the data, assemble it into a useful form, and transform it as needed.  Finally, you will use sklearn to build the model and evaluate it using the RMSE Performance metric.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# <span style="color:red">Import any other modules you need</span>

In [2]:
# Your imports
import warnings
warnings.filterwarnings("ignore")
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

# <span style="color:red">Create function to obtain the train and test data</span>

In [3]:
def getData(ticker, indx):
    """
    Retrieve two timeseries: one for a ticker and one for an index.
    Return a DataFrame containing the two timeseries.
   
    Parameters
    ----------
    ticker, indx: Strings representing the stock symbol for "ticker" and the "index"
    
    The two timeseries are in separate CSV files.  The code below will construct the names of the files from
    the stock symbol strings.
    
    The files contain multiple features. The feature of interest to us is "Close", which is the closing price.
        
    
    Returns
    --------
    df: a DataFrame with the following properties
    
    df.index should be the dates in the timeseries
    df should have (at least) 2 columns, with names:
    "Dependent"
    "Independent"
    
    df.loc[:, "Dependent"] should be the timeseries of the "Close" attribute for the ticker
    df.loc[:, "Independent"] should be the timeseries for the "Close" attribute of the index against which we are computing beta.
    """

    # Construct the name of the files containing the ticker and the "index"
    ticker_file = "./data/assignment_1/{t}.csv".format(t=ticker)
    indx_file   = "./data/assignment_1/{t}.csv".format(t=indx)
    
    # Create the function body according to the spec
    
        
    ticker_data = pd.read_csv("data/assignment_1/"+ticker+".csv")
    indx_data = pd.read_csv("data/assignment_1/"+indx+".csv")
    df1 = pd.merge(ticker_data,indx_data,on='Date',how='inner')
    df = df1[['Date','Close_x','Close_y']]
    df.rename(columns={'Close_x':'Dependent'},inplace=True)
    df.rename(columns={'Close_y':'Independent'},inplace=True)
    df = df.set_index('Date')

    # Change the return statement as appropriate
    return df

In [4]:
# Ticker: BA (Boeing), Index: SPY (the ETF for the S&P 500)
df = getData("BA", "SPY")

X = df.loc[:, ["Independent"] ]
y = df.loc[:, ["Dependent"] ]

In [5]:
X

Unnamed: 0_level_0,Independent
Date,Unnamed: 1_level_1
2018-01-02,268.769989
2018-01-03,270.470001
2018-01-04,271.609985
2018-01-05,273.420013
2018-01-08,273.920013
2018-01-09,274.540009
2018-01-10,274.119995
2018-01-11,276.119995
2018-01-12,277.920013
2018-01-16,276.970001


In [6]:
y

Unnamed: 0_level_0,Dependent
Date,Unnamed: 1_level_1
2018-01-02,296.839996
2018-01-03,297.799988
2018-01-04,296.670013
2018-01-05,308.839996
2018-01-08,310.149994
2018-01-09,318.429993
2018-01-10,320.260010
2018-01-11,328.119995
2018-01-12,336.209991
2018-01-16,335.160004


# <span style="color:red">Create function to split the full data into train and test data</span>

In [7]:
def split(X, y):
    """
    Split the data into a training and test set
    
    The training data should span the date range from 1/1/2018 to 6/30/2018
    The test data should span the date range from 7/1/2018 to 7/31/2018
    
    Parameters
    ----------
    X: DataFrame containing the independent variable(s) (i.e, features, predictors)
    y: DataFrame containing the dependent variable (i.e., the target)
    
    Optional
    --------
    seed: Integer used as the seed for a random number generator
      You don't necessarily NEED to use a random number generator but, if you do, please use the default value for seed
    
    Returns
    -------
    X_train: DataFrame containing training data for independent variable(s)
    X_test:  DataFrame containing test data for independent variable(s)
    y_train: DataFrame containing training data for dependent variable
    y_test:  DateFrame containing test data for dependent variable
    """
    
    # IF  you need to use a random number generator, use rng.
    #rng = np.random.RandomState(seed)
    
    # Create the function body according to the spec
        
    X_train = X.loc['2018-01-01':'2018-06-30',"Independent"]
    X_test = X.loc['2018-07-01':'2018-07-31',"Independent"]
    y_train = y.loc['2018-01-01':'2018-06-30',"Dependent"]
    y_test = y.loc['2018-07-01':'2018-07-31',"Dependent"]

    # Change the return statement as appropriate
    return X_train, X_test, y_train, y_test
   

In [8]:
# Split the data into a training and a test set
X_train, X_test, y_train, y_test = split(X, y)


In [9]:
X_train

Date
2018-01-02    268.769989
2018-01-03    270.470001
2018-01-04    271.609985
2018-01-05    273.420013
2018-01-08    273.920013
2018-01-09    274.540009
2018-01-10    274.119995
2018-01-11    276.119995
2018-01-12    277.920013
2018-01-16    276.970001
2018-01-17    279.609985
2018-01-18    279.140015
2018-01-19    280.410004
2018-01-22    282.690002
2018-01-23    283.290009
2018-01-24    283.179993
2018-01-25    283.299988
2018-01-26    286.579987
2018-01-29    284.679993
2018-01-30    281.760010
2018-01-31    281.899994
2018-02-01    281.579987
2018-02-02    275.450012
2018-02-05    263.929993
2018-02-06    269.130005
2018-02-07    267.670013
2018-02-08    257.630005
2018-02-09    261.500000
2018-02-12    265.339996
2018-02-13    266.000000
                 ...    
2018-05-18    271.329987
2018-05-21    273.369995
2018-05-22    272.609985
2018-05-23    273.359985
2018-05-24    272.799988
2018-05-25    272.149994
2018-05-29    269.019989
2018-05-30    272.609985
2018-05-31    270.94

In [10]:
X_test

Date
2018-07-02    271.859985
2018-07-03    270.899994
2018-07-05    273.109985
2018-07-06    275.420013
2018-07-09    277.899994
2018-07-10    278.899994
2018-07-11    276.859985
2018-07-12    279.369995
2018-07-13    279.589996
2018-07-16    279.339996
2018-07-17    280.470001
2018-07-18    281.059998
2018-07-19    280.000000
2018-07-20    279.679993
2018-07-23    280.200012
2018-07-24    281.609985
2018-07-25    284.010010
2018-07-26    283.339996
2018-07-27    281.420013
2018-07-30    279.950012
2018-07-31    281.329987
Name: Independent, dtype: float64

In [11]:
y_train

Date
2018-01-02    296.839996
2018-01-03    297.799988
2018-01-04    296.670013
2018-01-05    308.839996
2018-01-08    310.149994
2018-01-09    318.429993
2018-01-10    320.260010
2018-01-11    328.119995
2018-01-12    336.209991
2018-01-16    335.160004
2018-01-17    351.010010
2018-01-18    340.160004
2018-01-19    337.730011
2018-01-22    338.000000
2018-01-23    335.589996
2018-01-24    334.690002
2018-01-25    343.109985
2018-01-26    343.220001
2018-01-29    340.820007
2018-01-30    337.709991
2018-01-31    354.369995
2018-02-01    356.940002
2018-02-02    348.910004
2018-02-05    328.880005
2018-02-06    340.910004
2018-02-07    348.119995
2018-02-08    329.660004
2018-02-09    332.829987
2018-02-12    343.799988
2018-02-13    343.160004
                 ...    
2018-05-18    351.230011
2018-05-21    363.920013
2018-05-22    355.019989
2018-05-23    359.209991
2018-05-24    359.000000
2018-05-25    360.089996
2018-05-29    352.480011
2018-05-30    358.190002
2018-05-31    352.16

In [12]:
y_test

Date
2018-07-02    336.079987
2018-07-03    332.929993
2018-07-05    333.179993
2018-07-06    334.640015
2018-07-09    341.920013
2018-07-10    347.160004
2018-07-11    340.600006
2018-07-12    346.029999
2018-07-13    350.790009
2018-07-16    356.100006
2018-07-17    356.880005
2018-07-18    360.230011
2018-07-19    355.329987
2018-07-20    354.899994
2018-07-23    353.269989
2018-07-24    358.269989
2018-07-25    355.920013
2018-07-26    359.320007
2018-07-27    360.649994
2018-07-30    351.059998
2018-07-31    356.299988
Name: Dependent, dtype: float64

# <span style="color:red">Create function to convert the DataFrames to ndarrays</span>

In [13]:
def pd2ndarray( dfList ):
    """
    For each DataFrame in the list dfList, prepare the ndarray needed by the sklearn model
    
    Parameters
    ----------
    dfList: List of DataFrames
    
    Returns
    --------
    ndList: a list of ndarrays
    """
    
    # Create the function body according to the spec
    
    X_train_np = X_train.as_matrix()
    X_test_np = X_test.as_matrix()
    y_train_np = y_train.as_matrix()
    y_test_np = y_test.as_matrix()
    
    X_train_np = X_train_np.reshape(-1,1)
    X_test_np = X_test_np.reshape(-1,1)
    y_train_np = y_train_np.reshape(-1,1)
    y_test_np = y_test_np.reshape(-1,1)
    
    # Change the return statement as appropriate
    return [X_train_np, X_test_np, y_train_np, y_test_np]
   

In [14]:
# sklearn takes ndarrays as arguments, not DataFrames; convert your DataFrames to the appropriate ndarray
# You will need to replace the empty list argument
X_train, X_test, y_train, y_test = pd2ndarray( [X_train, X_test, y_train, y_test] )

In [15]:
X_train

array([[268.769989],
       [270.470001],
       [271.609985],
       [273.420013],
       [273.920013],
       [274.540009],
       [274.119995],
       [276.119995],
       [277.920013],
       [276.970001],
       [279.609985],
       [279.140015],
       [280.410004],
       [282.690002],
       [283.290009],
       [283.179993],
       [283.299988],
       [286.579987],
       [284.679993],
       [281.76001 ],
       [281.899994],
       [281.579987],
       [275.450012],
       [263.929993],
       [269.130005],
       [267.670013],
       [257.630005],
       [261.5     ],
       [265.339996],
       [266.      ],
       [269.589996],
       [273.029999],
       [273.109985],
       [271.399994],
       [270.049988],
       [270.399994],
       [274.709991],
       [277.899994],
       [274.429993],
       [271.649994],
       [267.700012],
       [269.079987],
       [272.190002],
       [272.880005],
       [272.779999],
       [274.100006],
       [278.869995],
       [278.5

In [16]:
X_test

array([[271.859985],
       [270.899994],
       [273.109985],
       [275.420013],
       [277.899994],
       [278.899994],
       [276.859985],
       [279.369995],
       [279.589996],
       [279.339996],
       [280.470001],
       [281.059998],
       [280.      ],
       [279.679993],
       [280.200012],
       [281.609985],
       [284.01001 ],
       [283.339996],
       [281.420013],
       [279.950012],
       [281.329987]])

In [17]:
y_train

array([[296.839996],
       [297.799988],
       [296.670013],
       [308.839996],
       [310.149994],
       [318.429993],
       [320.26001 ],
       [328.119995],
       [336.209991],
       [335.160004],
       [351.01001 ],
       [340.160004],
       [337.730011],
       [338.      ],
       [335.589996],
       [334.690002],
       [343.109985],
       [343.220001],
       [340.820007],
       [337.709991],
       [354.369995],
       [356.940002],
       [348.910004],
       [328.880005],
       [340.910004],
       [348.119995],
       [329.660004],
       [332.829987],
       [343.799988],
       [343.160004],
       [344.850006],
       [356.459991],
       [355.040009],
       [353.350006],
       [352.369995],
       [355.920013],
       [356.660004],
       [363.480011],
       [364.640015],
       [362.209991],
       [349.690002],
       [344.670013],
       [352.75    ],
       [348.920013],
       [347.040009],
       [348.730011],
       [354.519989],
       [344.1

In [18]:
y_test

array([[336.079987],
       [332.929993],
       [333.179993],
       [334.640015],
       [341.920013],
       [347.160004],
       [340.600006],
       [346.029999],
       [350.790009],
       [356.100006],
       [356.880005],
       [360.230011],
       [355.329987],
       [354.899994],
       [353.269989],
       [358.269989],
       [355.920013],
       [359.320007],
       [360.649994],
       [351.059998],
       [356.299988]])

# <span style="color:red">Create function to return the sklearn model you need</span>

In [19]:
def createModel():
    """
    Create an sklearn model object
    
    Parameters
    ----------
    None
    
    Returns
    -------
    model: An sklearn model object,
    i.e., responds to model.fit(), model.predict()
    """
    
    # Create the function body according to the spec
    
    regr = linear_model.LinearRegression()
    
    # Change the return statement as appropriate
    return regr

In [20]:
# Create linear regression object
model = createModel()

# Train the model using the training sets
_ = model.fit(X_train, y_train)

# The coefficients
print('Coefficients: \n', model.intercept_, model.coef_)

Coefficients: 
 [54.88037132] [[1.05394596]]


# <span style="color:red">Create function to compute a Root Mean Squared Error</span>

In [21]:
def computeRMSE( target, predicted ):
    """
    Compute the Root Mean Squared Error (RMSE)
    
    Parameters
    -----------
    target: ndarray of target values
    predicted: ndarray of predicted values
    
    Returns
    -------
    rmse: a Scalar value containg the RMSE
    """
    
    # Create the function body according to the spec
    
    rmse = mean_squared_error(target,predicted)
        
    # Change the return statement as appropriate
    return rmse

# <span style="color:red">Evaluate in and out of sample Root Mean Squared Error</span>

In [22]:
# Predictions:


# predict in sample: You will need to change the None argument
y_pred_train = model.predict( X_train )

# predict out of sample: You will need to change the None argument
y_pred_test = model.predict( X_test )

In [23]:
# Compute the in-sample fit
# - you will need to replace the None's below with the appropriate argument
rmse_insample = computeRMSE( y_train, y_pred_train )
print("RMSE (train): {r:2.3f}".format(r=rmse_insample))

# Compute the out of sample fit
# - you will need to replace the None's below with the appropriate argument
rmse_outOfsample = computeRMSE( y_test, y_pred_test)
print("RMSE (test): {r:2.3f}".format(r=rmse_outOfsample))


RMSE (train): 180.274
RMSE (test): 37.997
