# Linear Regression: Demo

In [None]:
# Start with importing the usual stuff

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import math
from matplotlib.pyplot import subplots
import statsmodels.api as sm
from sklearn.model_selection import train_test_split 

pd.options.mode.copy_on_write = True

In [None]:
# This contains some functions written by the authors of the textbook (ISLP)
# we'll be adapting most of this demo from their material

from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
                         summarize,
                         poly)

## Set up data

In [None]:
# First we'll read in our data and look quickly at its contents.
# The CSV is expected in the current working directory. No na_values are
# passed, so placeholder strings in the file (such as the '?' entries
# checked for further down) are kept as text.

# Alternate location of the data file, kept for reference:
#Auto = pd.read_csv('../../data/Auto.csv')
Auto = pd.read_csv('Auto.csv')
Auto

In [None]:
# Sometimes it's helpful to look at the list of column names on its own
Auto.columns

In [None]:
# What is the data type of each column?
# Columns that hold any text show up with a non-numeric dtype here.
Auto.dtypes

## Review the target variable

In [None]:
# Since we want to predict mpg, let's start by listing the distinct values
# that actually occur in the data
np.unique(Auto['mpg'])

In [None]:
# Are there any missing values in there?
# isnull() returns True for each missing entry (NaN/None) and False otherwise;
# summing the booleans counts the Trues, i.e. the number of missing values

Auto['mpg'].isnull().sum()

# CAUTION: This isn't foolproof! It won't find missing values that have been
# coded as a placeholder such as 0, -1, 9999, '?', etc.

In [None]:
# Let's create a function that allows us to display a histogram for a variable
# Credit to Hastie and Tibshirani--see their notes for more details on this

def histogram(var):
    """Draw a frequency histogram of ``var`` with pyplot.

    ``var`` is handed to ``plt.hist`` as the data and must expose a ``name``
    attribute that can be appended to a string (it is used in the title).
    Nothing is returned.
    """
    bar_style = {'color': '#0504aa', 'alpha': 0.7, 'rwidth': 0.85}
    # bins='auto' leaves the choice of bin edges to the plotting library
    plt.hist(var, bins='auto', **bar_style)
    plt.grid(axis='y', alpha=0.75)
    plt.title('Frequency of ' + var.name)
    plt.xlabel('Value')
    plt.ylabel('Frequency')

In [None]:
# Look at the distribution of the target variable
histogram(Auto['mpg'])

## Feature Engineering

### Missing Values

In [None]:
# List the distinct values of horsepower; watch for entries that are not numbers
np.unique(Auto['horsepower'])

In [None]:
# Show the records whose horsepower is the '?' placeholder
Auto.loc[Auto['horsepower'] == '?']

In [None]:
# Should we remove those records?
# Auto = Auto[(Auto['horsepower']!='?')]

In [None]:
# Or should we impute the mean value?

# First work out the mean: entries that cannot be parsed as numbers are
# coerced to NaN, and the result is truncated to a whole number
hp_as_number = pd.to_numeric(Auto['horsepower'], errors='coerce')
int(hp_as_number.mean())

In [None]:
# Now we'll impute the mean into the missing cells.
# The fill value is derived from the data rather than typed in by hand, so it
# stays correct if the data file changes. The column still holds text at this
# point, hence the value is inserted as a string.
# NOTE(review): the mean is taken over all rows, including rows that later
# end up in the test set -- confirm that is acceptable for this demo.
hp_fill = str(int(pd.to_numeric(Auto['horsepower'], errors='coerce').mean()))
Auto.replace({'horsepower': '?'}, hp_fill, inplace=True)

# Confirm that the '?' placeholder no longer appears
np.unique(Auto['horsepower'])

In [None]:
# Convert the horsepower column to numeric.
# No errors= argument is given, so this relies on every remaining entry
# being parseable as a number.

Auto['horsepower'] = pd.to_numeric(Auto['horsepower'])

## Set up structure for modeling

In [None]:
# Pick the predictor columns (X) and the target column (y)

X = Auto.loc[:, ['horsepower', 'weight', 'year']]
y = Auto.loc[:, 'mpg']

In [None]:
# Add a constant column of ones to the features matrix; its coefficient
# plays the role of the intercept in the models below

X['intercept'] = 1.0
X

In [None]:
# Create training and testing sets

# Hold out 25% of the rows for testing; the fixed random_state makes the
# shuffled split reproducible from run to run
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=314,
                                                    test_size=0.25,
                                                    shuffle=True) 
# Train/Test put the predictors and the target side by side in one frame.
# The row labels are passed as the merge key, which shows up as a helper
# column named 'key_0' that is dropped again right away.
# NOTE(review): the merged frames presumably come back ordered by that key
# with a fresh index, i.e. not in the shuffled order of X_train/X_test -- confirm
Train = pd.merge_ordered(X_train,y_train,left_on=X_train.index,right_on=y_train.index).drop(columns=['key_0'])
Test = pd.merge_ordered(X_test,y_test,left_on=X_test.index,right_on=y_test.index).drop(columns=['key_0'])

In [None]:
# Spot check the train and test sets: first rows and shape of every piece,
# with a blank line between consecutive pieces

for position, piece in enumerate((X_train, y_train, X_test, y_test, Train, Test)):
    if position:
        print()
    print(piece.head())
    print(piece.shape)

In [None]:
# Print the distinct values of each modeling column in the training frame
# as a quick check for odd or missing entries

for heading, column in (('MPG', 'mpg'),
                        ('Horsepower', 'horsepower'),
                        ('Weight', 'weight'),
                        ('Year', 'year')):
    print(heading)
    print(np.unique(Train[column]))

## Simple Linear Regression, Example 1

In [None]:
# Fit mpg on horsepower (plus the constant column) by ordinary least squares
# using the training rows only, then show the coefficient table.
# Take note of the p-values.

model_hp = sm.OLS(endog=y_train, exog=X_train[['intercept', 'horsepower']])
results_hp = model_hp.fit()
summarize(results_hp)

In [None]:
# We can also assess the overall fit of the model.
# results_hp was fit on the training rows, so this is the training R^2.

print('R^2 on train:',results_hp.rsquared)

In [None]:
# Create helper functions for computing the mean squared error

def predict(X, model):
    """Return the predictions of ``model`` for the rows of ``X``.

    ``model`` must provide ``get_prediction(X)`` whose result exposes a
    ``predicted`` attribute (in this notebook it is a fitted results object).
    The values come back as a Series named 'y_hat' that carries the index
    of ``X``.
    """
    # get_prediction hands back bare values, so re-attach X's row labels
    fitted_values = model.get_prediction(X).predicted
    return pd.Series(fitted_values, index=X.index, name='y_hat')

def mse(y, y_hat):
    """Return the mean squared error between ``y`` and ``y_hat``.

    Parameters
    ----------
    y : array-like with a ``shape`` attribute (e.g. Series or ndarray)
        Observed values.
    y_hat : array-like
        Predicted values; ``y - y_hat`` must be defined.

    Returns
    -------
    float
        Sum of squared residuals divided by ``y.shape[0]``. The result is
        NaN if any squared residual is NaN.

    Raises
    ------
    ZeroDivisionError
        If ``y`` contains no records.
    """
    # calculate the residual error for each individual record
    resid = y - y_hat
    # square the residual (hence "squared error")
    sq_resid = resid ** 2
    # sum of squared errors, reduced in one vectorized call instead of a
    # Python-level loop; NaNs are deliberately not skipped so that bad or
    # misaligned input stays visible in the result
    ssr = float(np.asarray(sq_resid, dtype=float).sum())
    # divide by the number of records to get the mean squared error
    return ssr / y.shape[0]

In [None]:
# MSE of the horsepower model: predict on both splits first, then report
# the training error followed by the test error

predictions_hp_train = predict(X_train.loc[:, ['intercept', 'horsepower']], results_hp)
predictions_hp_test = predict(X_test.loc[:, ['intercept', 'horsepower']], results_hp)
print('mse train:', mse(y_train, predictions_hp_train))
print('mse test:', mse(y_test, predictions_hp_test))

In [None]:
# Define a function to draw a line given coefficients [credit to Hastie & Tibshirani]

def abline(ax, b, m, *args, **kwargs):
    """Draw the line y = m*x + b across the current x-limits of ``ax``.

    ``b`` is the intercept and ``m`` the slope. Any further positional and
    keyword arguments are forwarded unchanged to ``ax.plot``.
    """
    x_span = ax.get_xlim()
    # evaluate the line at the left and right edge of the visible x-range
    y_span = [m * x_edge + b for x_edge in (x_span[0], x_span[1])]
    ax.plot(x_span, y_span, *args, **kwargs)

In [None]:
# Plot hp vs mpg on training set, with the fitted line on top

ax = Train.plot.scatter('horsepower', 'mpg')
ax.set_title("Plot of hp vs mpg (Train)")
# The coefficients are labeled with the design-matrix column names, so look
# them up by label; integer keys on a labeled Series are a deprecated
# positional lookup in pandas
abline(ax,
       results_hp.params['intercept'],
       results_hp.params['horsepower'],
       'r--',
       linewidth=3)

In [None]:
# Plot hp vs mpg on test set, with the line fitted on the training set on top

ax = Test.plot.scatter('horsepower', 'mpg')
ax.set_title("Plot of hp vs mpg (Test)")
# The coefficients are labeled with the design-matrix column names, so look
# them up by label; integer keys on a labeled Series are a deprecated
# positional lookup in pandas
abline(ax,
       results_hp.params['intercept'],
       results_hp.params['horsepower'],
       'g--',
       linewidth=3)

In [None]:
# Residuals (observed minus fitted) against fitted values, training set;
# the dashed line marks a residual of zero

ax = subplots(figsize=(8, 8))[1]
ax.scatter(x=predictions_hp_train, y=y_train - predictions_hp_train)
ax.set(xlabel='Fitted value', ylabel='Residual')
ax.axhline(y=0, color='k', linestyle='--');

In [None]:
# Residuals (observed minus fitted) against fitted values, test set;
# the dashed line marks a residual of zero

ax = subplots(figsize=(8, 8))[1]
ax.scatter(x=predictions_hp_test, y=y_test - predictions_hp_test)
ax.set(xlabel='Fitted value', ylabel='Residual')
ax.axhline(y=0, color='k', linestyle='--');

## Simple Linear Regression, Example 2

In [None]:
# Fit mpg on weight (plus the constant column) by ordinary least squares
# using the training rows only, then show the coefficient table

model_weight = sm.OLS(endog=y_train, exog=X_train[['intercept', 'weight']])
results_weight = model_weight.fit()
summarize(results_weight)

In [None]:
# R^2 for weight model (results_weight was fit on the training rows)
print('R^2 on train:',results_weight.rsquared)

In [None]:
# MSE of the weight model: predict on both splits first, then report
# the training error followed by the test error

predictions_weight_train = predict(X_train.loc[:, ['intercept', 'weight']], results_weight)
predictions_weight_test = predict(X_test.loc[:, ['intercept', 'weight']], results_weight)
print('mse train:', mse(y_train, predictions_weight_train))
print('mse test:', mse(y_test, predictions_weight_test))

In [None]:
# Plot weight vs mpg on training set, with the fitted line on top

ax = Train.plot.scatter('weight', 'mpg')
ax.set_title("Plot of weight vs mpg (Train)")
# The coefficients are labeled with the design-matrix column names, so look
# them up by label; integer keys on a labeled Series are a deprecated
# positional lookup in pandas
abline(ax,
       results_weight.params['intercept'],
       results_weight.params['weight'],
       'r--',
       linewidth=3)

In [None]:
# Plot weight vs mpg on test set, with the line fitted on the training set on top

ax = Test.plot.scatter('weight', 'mpg')
ax.set_title("Plot of weight vs mpg (Test)")
# The coefficients are labeled with the design-matrix column names, so look
# them up by label; integer keys on a labeled Series are a deprecated
# positional lookup in pandas
abline(ax,
       results_weight.params['intercept'],
       results_weight.params['weight'],
       'g--',
       linewidth=3)

In [None]:
# Residuals (observed minus fitted) against fitted values, training set;
# the dashed line marks a residual of zero

ax = subplots(figsize=(8, 8))[1]
ax.scatter(x=predictions_weight_train, y=y_train - predictions_weight_train)
ax.set(xlabel='Fitted value', ylabel='Residual')
ax.axhline(y=0, color='k', linestyle='--');

In [None]:
# Residuals (observed minus fitted) against fitted values, test set;
# the dashed line marks a residual of zero

ax = subplots(figsize=(8, 8))[1]
ax.scatter(x=predictions_weight_test, y=y_test - predictions_weight_test)
ax.set(xlabel='Fitted value', ylabel='Residual')
ax.axhline(y=0, color='k', linestyle='--');

## Simple Linear Regression, Example 3

In [None]:
# Build a linear model where year predicts mpg



In [None]:
# Compute the R^2 for year model (on train)



In [None]:
# Compute the MSE for year model (on train and then on test)



In [None]:
# Plot year vs mpg (on train)



## Categorical Predictors

In [None]:
# One potentially useful variable is the name of the vehicle.
# It is free text, so it has to be turned into categories before modeling.

Auto['name']

In [None]:
# Keep only the text before the first space of each vehicle name,
# which appears to be the "make"

Auto['make'] = Auto['name'].str.split(' ', n=1).str.get(0)

In [None]:
# Take a look at the possible values and how many records carry each one;
# rare or oddly spelled entries are candidates for cleaning

Auto['make'].value_counts()

In [None]:
# What cleaning should be done?








In [None]:
# If we want to include this variable in our model, we might want to do one hot encoding.
# Ask for 0/1 integer indicator columns explicitly: recent pandas versions
# produce boolean dummies by default, and a design matrix that mixes boolean
# and numeric columns cannot be passed to the regression as-is.

one_hot = pd.get_dummies(Auto, columns=['make'], dtype=int)
one_hot

## Simple (or Multiple?) Linear Regression, Example 4

In [None]:
# Build a linear model where 'make_toyota', 'make_ford', and 'make_pontiac' predict mpg



In [None]:
# Compute the R^2 for this model (on train)



In [None]:
# Compute the MSE for this model (on train and then on test)

