## Assignment: Multiple Assumptions

Use the model specified below and check the following assumptions:

1. Assess Linearity & fix if necessary
2. Assess Independence of Errors -- if violated stop here
3. Assess Normality of Errors & fix if necessary
4. Assess Multicollinearity & fix if necessary
5. Assess Equal Variance of Errors & fix if necessary
6. Determine if any influential points are in our data and fix if necessary

In [0]:
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import numpy as np

# Load the computers data set from the shared Data folder.
data_path = "../Data/Computers.csv"
computers = pd.read_csv(data_path)

# Preview the final rows of the loaded frame.
computers.tail()

In [0]:
# Pairwise correlations, restricted to the numeric columns.
correlations = computers.corr(numeric_only=True)
correlations

In [0]:
# Linearity: not entirely clear from these plots, though plausible for some
# of the features, so the model is fit both with and without polynomial terms.

sns.pairplot(data=computers, corner=True)

In [0]:
# Average price for each distinct RAM value, drawn as a bar chart.
mean_price_by_ram = computers.groupby("ram").agg({"price": "mean"})
mean_price_by_ram.plot.bar()

### Model Fitting & Evaluation

In [0]:
# Independence of errors: the DW test statistic looks great.

cols = ["ram", "speed", "hd", "screen", "ads", "trend"]

# Target is the raw price; predictors are the columns above plus an intercept.
y = computers["price"]
X = sm.add_constant(computers.loc[:, cols])

model = sm.OLS(endog=y, exog=X).fit()

model.summary()

In [0]:
### Residual plotter helper function

def residual_analysis_plots(model):
    """Draw a residual plot and a normal Q-Q plot side by side.

    The left panel scatters ``model.resid`` against the values returned by
    ``model.predict()``; the right panel is a normal probability plot of the
    same residuals. The two panels share their y-axis.

    Parameters
    ----------
    model
        Fitted results object exposing ``predict()`` and ``resid``; the
        notebook passes the result of ``sm.OLS(...).fit()``.

    Returns
    -------
    None
        The figure is drawn through matplotlib; nothing is returned.

    Notes
    -----
    Relies on ``sns`` (seaborn) being available in the enclosing namespace.
    """
    # Local imports keep the helper self-contained.
    import matplotlib.pyplot as plt
    import scipy.stats as stats

    fitted = model.predict()
    resid = model.resid

    _, (resid_ax, qq_ax) = plt.subplots(
        nrows=1, ncols=2, sharey="all", figsize=(10, 6)
    )

    # Left panel: residuals against predictions.
    sns.scatterplot(x=fitted, y=resid, alpha=.3, ax=resid_ax)
    resid_ax.set(title="Residual Plot", xlabel="Prediction", ylabel="Residuals")

    # Right panel: probplot draws onto the axes; the title is then replaced.
    stats.probplot(resid, dist="norm", plot=qq_ax)
    qq_ax.set_title("Normal Q-Q Plot")

In [0]:
# Equal variance: looks like there are slight issues here.

fitted_values = model.predict()
residual_values = model.resid
sns.scatterplot(x=fitted_values, y=residual_values)

sns.despine()

In [0]:
# Residual-vs-prediction scatter and normal Q-Q plot for the current `model`.
residual_analysis_plots(model)

In [0]:
# Normality of errors is violated, so a log transform is tried next.

import matplotlib.pyplot as plt
import scipy.stats as stats

# Normal probability plot of the residuals; the trailing semicolon keeps the
# notebook from echoing probplot's return value.
model_residuals = model.resid
stats.probplot(model_residuals, dist="norm", plot=plt);

### Outliers & Influence

In [0]:
# Influence diagnostics for the fitted model.
influence = model.get_influence()
# Tabulate the diagnostics; a later cell reads the "cooks_d" column of this frame.
inf_summary_df = influence.summary_frame()
inf_summary_df.head()

In [0]:
# Largest Cook's distance values first: no influential points.

cooks_distance = inf_summary_df["cooks_d"]
cooks_distance.sort_values(ascending=False).head()

### Model with feature engineering, polynomial terms, and log-transformed target

In [0]:
# Add squared terms to address the potential linearity issue, encode whether
# premium / multi / cd equal "yes" as 0/1 flags, and log-transform the target.

flag_sources = ("premium", "multi", "cd")

engineered_columns = {
    "ram2": computers["ram"] ** 2,
    "hd2": computers["hd"] ** 2,
    **{
        f"{name}_flag": np.where(computers[name] == "yes", 1, 0)
        for name in flag_sources
    },
    "trend2": computers["trend"] ** 2,
}
computers = computers.assign(**engineered_columns)

# "ads" was part of the first model but is not included in this specification.
cols = [
    "ram", "ram2",
    "speed",
    "hd", "hd2",
    "screen",
    "trend", "trend2",
    "premium_flag", "multi_flag", "cd_flag",
]

# The target is now log(price) rather than the raw price.
y = np.log(computers["price"])
X = sm.add_constant(computers[cols])

model = sm.OLS(y, X).fit()

model.summary()

In [0]:
# Residuals look much better now, with no clear issues.
# Draws the residual-vs-prediction scatter and normal Q-Q plot for the
# current `model`.

residual_analysis_plots(model)

### Multicollinearity

In [0]:
# Multicollinearity looks good: VIFs are all below 5, not counting the
# polynomial terms, which are exceptions.

from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

# Design matrix as statsmodels sees it, one column per entry of X.columns.
variables = sm.OLS(y, X).exog

column_positions = range(variables.shape[1])
vif_values = [vif(variables, position) for position in column_positions]

pd.Series(data=vif_values, index=X.columns)