An explanation of the steps in this notebook is available at https://youranalystbuddy.com/model-evaluation/

## Model evaluation

### Loading data and building the pipeline

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Raw student records, read from a CSV in the current working directory.
data = pd.read_csv(filepath_or_buffer='students1000.csv')

# Regression target, followed by the feature groups; each group is routed
# to its own preprocessing branch further down.
target = 'FirstYearGPA'
cat_cols = ['Major', 'State']                                      # one-hot encoded
num_cols = ['HighSchoolGPA', 'AvgDailyStudyTime', 'TotalAbsence']  # standardized + imputed
log_cols = ['FamilyIncome']                                        # log-transformed first

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Standard numeric branch for HighSchoolGPA, AvgDailyStudyTime, and TotalAbsence.
# Order matters: values are standardized first, then any remaining missing
# entries are filled with the median of the standardized column.
num_pipeline = Pipeline(steps=[
    ('standardize', StandardScaler()),
    ('impute', SimpleImputer(strategy='median')),
])

#log pipeline with log transformation added for FamilyIncome
def log_transform(data):
    """Return the element-wise natural logarithm of ``data``.

    Thin wrapper around ``np.log`` so the operation can be plugged into a
    ``FunctionTransformer``. No guarding is done for non-positive inputs;
    whatever ``np.log`` produces for them is passed through unchanged.
    """
    logged = np.log(data)
    return logged

# Same steps as the regular numeric branch, with a natural-log step in front:
# log -> standardize -> median-impute.
log_pipeline = Pipeline(steps=[
    ('log transform', FunctionTransformer(func=log_transform)),
    ('standardize', StandardScaler()),
    ('impute', SimpleImputer(strategy='median')),
])

# Categorical branch for Major and State: a single one-hot encoding step.
# max_categories=5 caps the number of output columns per feature (less
# frequent categories are grouped together), and 'infrequent_if_exist' sends
# categories unseen during fit to that grouped column when there is one.
cat_pipeline = Pipeline(steps=[
    (
        'encode',
        OneHotEncoder(handle_unknown='infrequent_if_exist', max_categories=5),
    ),
])

# Route each column group to its own branch and concatenate the results.
# Each entry is (name, transformer, columns). No `remainder` is specified,
# so the ColumnTransformer default applies to columns not listed here.
processing_pipeline = ColumnTransformer(transformers=[
    ('log trans', log_pipeline, log_cols),
    ('numeric', num_pipeline, num_cols),
    ('class', cat_pipeline, cat_cols),
])

<h3>Regression Model</h3>

<h4>Create and train model</h4>

In [2]:
from sklearn.linear_model import LinearRegression

# End-to-end estimator: preprocessing followed by ordinary least squares.
# Bundling both steps means fit/score/cross-validation re-fit the
# preprocessing together with the model on whatever data they are given.
linear_reg_pipeline = Pipeline(steps=[
    ('processing', processing_pipeline),
    ('modeling', LinearRegression()),
])

### Motivation

In [18]:
# Hold out 20% of the rows for testing. No random_state is passed, so every
# run draws a different split -- presumably intentional in this section, to
# show how much the scores move from one split to the next.
train, test = train_test_split(data, test_size=0.2)

# Fit on the training rows, then report R^2 on both partitions.
# The full frames are passed as X; the pipeline picks the columns it needs.
linear_reg_pipeline.fit(X=train, y=train[[target]])
print('training r2:', linear_reg_pipeline.score(X=train, y=train[[target]]))
print('testing r2:', linear_reg_pipeline.score(X=test, y=test[[target]]))

training r2: 0.8471151657163838
testing r2: 0.8837116449064917


In [12]:
# Draw another unseeded 80/20 split and repeat the fit/score steps, so the
# resulting R^2 values can be compared with those from a different split.
train, test = train_test_split(data, test_size=0.2)

# Re-fit the whole pipeline on the new training rows, then score both parts.
linear_reg_pipeline.fit(X=train, y=train[[target]])
print('training r2:', linear_reg_pipeline.score(X=train, y=train[[target]]))
print('testing r2:', linear_reg_pipeline.score(X=test, y=test[[target]]))

training r2: 0.8592036109478167
testing r2: 0.840746003829508


## Cross-validation scoring

In [5]:
# Fixed 80/20 split used by the cross-validation cells that follow.
# random_state pins the shuffle so the cross-validation scores computed on
# `train` are reproducible after Restart & Run All; change the seed (or drop
# it) to draw a different split.
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [6]:
from sklearn.model_selection import cross_val_score

# Per-fold MSE from 5-fold cross-validation on the training split.
# The scorer is 'neg_mean_squared_error', so the values come back negated
# (scikit-learn scorers follow a "greater is better" convention).
mse_lr_cv = cross_val_score(
    estimator=linear_reg_pipeline,
    X=train,
    y=train[[target]],
    scoring='neg_mean_squared_error',
    cv=5,
)
print(mse_lr_cv)

[-0.04142114 -0.04307921 -0.04465915 -0.03976443 -0.04902567]


In [7]:
# Same 5-fold negated-MSE scores, summarized as one number: average the
# folds and flip the sign to report a regular (positive) MSE.
mse_lr_cv = cross_val_score(
    estimator=linear_reg_pipeline,
    X=train,
    y=train[[target]],
    scoring='neg_mean_squared_error',
    cv=5,
)
print(-mse_lr_cv.mean())

0.04358991987909061


In [9]:
# Mean R^2 over 10-fold cross-validation on the training split.
# Stored under its own name (r2_lr_cv) so the variable says what it holds and
# the MSE scores kept in mse_lr_cv are not overwritten by R^2 values.
r2_lr_cv = cross_val_score(linear_reg_pipeline, train, train[[target]], cv=10, scoring='r2')
print(np.mean(r2_lr_cv))

0.8464072905351188


## Cross-validation prediction

In [10]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error, r2_score

# Out-of-fold predictions for the training rows. No `cv` is passed, so
# cross_val_predict uses its default splitting strategy.
trainY_pred = cross_val_predict(
    estimator=linear_reg_pipeline,
    X=train,
    y=train[[target]],
)

# Compare those predictions against the true targets.
print('training mse: ', mean_squared_error(y_true=train[[target]], y_pred=trainY_pred))
print('training r2: ', r2_score(y_true=train[[target]], y_pred=trainY_pred))

training mse:  0.04358991987909061
training r2:  0.8484520155783549
