### Assignment: Feature Engineering

Create the following features and determine if they improve model accuracy. 

1. Create 2nd- and 3rd-order polynomial columns for 'hd'.
2. Then, try combining the 'cd' and 'multi' features into a single indicator: 1 if 'cd' AND 'multi' both equal "yes", else 0.
3. Finally, bin 'ram' values into '<=4', '8', and '16+' - does this improve model fit?
4. Create dummy variables for any categorical columns.

Remove any features that don't improve model accuracy and score your final model on the test dataset.

In [0]:
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import numpy as np

# Read the raw dataset from a path relative to the notebook's working directory.
computers = pd.read_csv("../Data/Computers.csv")

# Preview the last five rows as a quick sanity check on the load.
computers.tail()

### Feature Engineering

In [0]:
# Lookup that collapses raw RAM sizes into the three bins requested by the
# assignment. It is only consumed by the optional `ram_categories` feature
# below, which is currently switched off.
ram_dict = {
    2: "<=4",
    4: "<=4",
    8: "8",
    16: "16+",
    24: "16+",
    32: "16+",
}

# Build the engineered frame: add polynomial terms for `hd`, then one-hot
# encode every remaining categorical column (dropping the first level of each
# to avoid perfectly collinear dummies alongside the intercept).
computers_eng = pd.get_dummies(
    computers.assign(
        hd2=computers["hd"] ** 2,
        hd3=computers["hd"] ** 3,
        # Candidate features from the assignment, left disabled here;
        # uncomment a line to re-evaluate it in cross-validation.
        # ports=np.where((computers["cd"] == "yes") & (computers["multi"] == "yes"), 1, 0),
        # ram_categories=computers["ram"].map(ram_dict),
    ),
    drop_first=True,
    # pandas >= 2.0 emits bool dummies by default. Mixed with the numeric
    # columns, the frame converts to an object array, which statsmodels' OLS
    # rejects. Integer 0/1 dummies keep the design matrix purely numeric.
    dtype=int,
)

computers_eng.head()

### Data Splitting

In [0]:
from sklearn.model_selection import train_test_split


# Numeric-only baseline columns; referenced solely by the alternative
# (commented-out) design matrix below.
features = ["speed", "hd", "ram", "screen", "ads", "trend"]

# Design matrix: every engineered column except the target, plus an intercept.
X = sm.add_constant(computers_eng.drop(columns="price"))
# X = sm.add_constant(computers[features])

# The model is fit to log(price) rather than raw price.
y = np.log(computers["price"])

# Hold out 20% of the rows as the test set. X and y are rebound to the
# remaining 80%, which is what cross-validation and the final fit use.
X, X_test, y, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=12345,
)

### Cross-Validation

In [0]:
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score as r2
from sklearn.metrics import mean_absolute_error as mae



# Five shuffled folds with a fixed seed so the fold assignment is repeatable.
kf = KFold(n_splits=5, shuffle=True, random_state=2023)

# Per-fold validation metrics, filled in by the loop below.
cv_lm_r2s, cv_lm_mae = [], []

for train_ind, val_ind in kf.split(X, y):
    # Slice this fold's training and validation rows by position.
    X_train, X_val = X.iloc[train_ind], X.iloc[val_ind]
    y_train, y_val = y.iloc[train_ind], y.iloc[val_ind]

    # Fit on the fold's training rows only.
    model = sm.OLS(y_train, X_train).fit()

    # Predict the validation rows once and score both metrics from it.
    val_pred = model.predict(X_val)
    cv_lm_r2s.append(r2(y_val, val_pred))
    cv_lm_mae.append(mae(y_val, val_pred))

# Report each fold's score, then mean +- standard deviation across folds.
print("All Validation R2s: ", [round(x, 3) for x in cv_lm_r2s])
print(f"Cross Val R2s: {round(np.mean(cv_lm_r2s), 3)} +- {round(np.std(cv_lm_r2s), 3)}")

print("All Validation MAEs: ", [round(x, 3) for x in cv_lm_mae])
print(f"Cross Val MAEs: {round(np.mean(cv_lm_mae), 3)} +- {round(np.std(cv_lm_mae), 3)}")

In [0]:
def residual_analysis_plots(model):
    """Draw a residual scatter plot and a normal Q-Q plot side by side.

    Parameters
    ----------
    model
        Fitted results object that exposes ``predict()`` (callable with no
        arguments) and a ``resid`` attribute, such as the object returned by
        ``sm.OLS(...).fit()``.

    Returns
    -------
    None
        The figure is created through matplotlib; nothing is returned.
    """
    import matplotlib.pyplot as plt
    import scipy.stats as stats

    fitted = model.predict()
    resid = model.resid

    # One row, two panels, sharing a single y-axis.
    fig, (ax_resid, ax_qq) = plt.subplots(1, 2, sharey="all", figsize=(10, 6))

    # Left panel: residuals against predictions.
    sns.scatterplot(x=fitted, y=resid, ax=ax_resid, alpha=.3)
    ax_resid.set(title="Residual Plot", xlabel="Prediction", ylabel="Residuals")

    # Right panel: residual quantiles against a normal distribution.
    stats.probplot(resid, dist="norm", plot=ax_qq)
    ax_qq.set_title("Normal Q-Q Plot")


In [0]:
# When cells run top to bottom, `model` is still the fit from the last
# cross-validation fold at this point, not a fit on the full training split.
residual_analysis_plots(model)

### Fit On All Training Data

In [0]:
# Refit on the whole training split (X and y after the test rows were held
# out), replacing the per-fold model left over from cross-validation.
model = sm.OLS(endog=y, exog=X).fit()

# Display the regression summary table as the cell output.
model.summary()

### Test Performance

In [0]:
from sklearn.metrics import r2_score as r2
from sklearn.metrics import mean_absolute_error as mae

# Score the final model on the held-out test rows. Predictions are computed
# once and shared by both metrics.
test_pred = model.predict(X_test)

print(f"Test R2: {r2(y_test, test_pred)}")
print(f"Test MAE: {mae(y_test, test_pred)}")