# MMAI5040 - Business Application of AI 1
## Lab 2 - Evaluation of model performance

Import required packages

In [None]:
%matplotlib inline
import math
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, roc_curve, auc
import matplotlib.pylab as plt

# install dmba package containing utility functions (see: https://pypi.org/project/dmba/) 
import sys
!{sys.executable} -m pip install dmba
from dmba import regressionSummary, classificationSummary, liftChart, gainsChart 

Measurement of prediction error (Numerical value outcome)

In [None]:
# Mount Google Drive so files can be accessed from within Colab.
# The pd.read_csv calls later in this notebook read from paths under
# /content/drive/MyDrive, so this cell has to run before them.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the Toyota Corolla data set from the mounted Drive folder; the
# following cells pre-process it and split it into training/validation sets.
toyota_csv = '/content/drive/MyDrive/Colab Notebooks/ToyotaCorolla.csv'
car_df = pd.read_csv(toyota_csv)
# Show (rows, columns) as the cell output
car_df.shape

In [None]:
# Preview the first three rows of the loaded data
car_df.head(n=3)

In [None]:
# Show the dtype pandas assigned to each column
car_df.dtypes

In [None]:
# Pre-processing: choose the outcome column and the predictor columns.
outcome = 'Price'

# Columns kept out of the predictor list: the outcome itself plus the
# identifier and text columns.
# NOTE(review): membership is an exact, case-sensitive string match against
# car_df.columns -- confirm that 'ID' is spelled exactly like the CSV header.
excludeColumns = (outcome, 'ID', 'Model', 'Fuel_Type', 'Color')

# Every remaining column, in its original order, is used as a predictor
predictors = [column for column in car_df.columns
              if column not in excludeColumns]

In [None]:
# Split the data 60% / 40% into training and validation partitions
# (random_state is fixed so the split is reproducible).
X, y = car_df[predictors], car_df[outcome]

partitions = train_test_split(X, y, test_size=0.4, random_state=1)
train_X, valid_X, train_y, valid_y = partitions

# Report the size of each partition
for label, frame in (('Training   :', train_X), ('Validation :', valid_X)):
    print(label, frame.shape)

In [None]:
# Fit an ordinary linear regression on the training partition only;
# the fit call is left as the last expression so the cell still echoes it.
reg = LinearRegression()
reg.fit(X=train_X, y=train_y)

In [None]:
# Evaluate performance: summarise the prediction error on the training data
# first, then on the validation data.
for actual, features in ((train_y, train_X), (valid_y, valid_X)):
    regressionSummary(actual, reg.predict(features))

In [None]:
# Plot prediction errors (residual = actual - predicted) for the training and
# validation sets: one histogram per set plus a side-by-side box plot.

pred_error_train = pd.DataFrame({
    'residual': train_y - reg.predict(train_X),
    'data set': 'training'
})
pred_error_valid = pd.DataFrame({
    'residual': valid_y - reg.predict(valid_X),
    'data set': 'validation'
})
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
# so stack the two frames with pd.concat instead (same result, renumbered index).
boxdata_df = pd.concat([pred_error_train, pred_error_valid], ignore_index=True)

fig, axes = plt.subplots(nrows=1, ncols=3)
fig.set_size_inches(9, 4)
# Same bins and range for both histograms so the two panels are comparable
common = {'bins': 100, 'range': [-6500, 6500]}
pred_error_train.hist(ax=axes[0], **common)
pred_error_valid.hist(ax=axes[1], **common)
# Box plot of the residuals grouped by the 'data set' label
boxdata_df.boxplot(ax=axes[2], by='data set')

axes[0].set_title('training')
axes[1].set_title('validation')
# Blank out the title on the box-plot panel
axes[2].set_title(' ')
# Match the box plot's y-range to the histograms' x-range
axes[2].set_ylim(-6500, 6500)
plt.suptitle('Prediction errors')
plt.subplots_adjust(bottom=0.15, top=0.85, wspace=0.35)

plt.show()

# notice that on the boxplot, most errors are in the [-2000, 2000] range...

Cumulative gains and lift charts: 
* *Goal* = search among a set of new records for the subset of records that gives the highest cumulative predicted values. 

* E.g., Select cars likely to generate the most resale value

In [None]:
# Cumulative gains and lift charts for the regression model.
# Pair each validation record's prediction with its actual value, then order
# the records from highest to lowest prediction.
df = pd.DataFrame(
    {'predicted': reg.predict(valid_X), 'actual': valid_y}
).sort_values(by=['predicted'], ascending=False)

fig, axes = plt.subplots(nrows=1, ncols=2)
gains_axis, lift_axis = axes

# Left panel: cumulative gains of the actual values in predicted order
ax = gainsChart(df['actual'], ax=gains_axis)
ax.set_ylabel('Cumulative Price')
ax.set_title('Cumulative Gains Chart')

# Right panel: lift chart of the same ordered values, without bar labels
ax = liftChart(df['actual'], ax=lift_axis, labelBars=False)
ax.set_ylabel('Lift')

plt.tight_layout()
plt.show()

Measurement of Classification Accuracy (Categorical outcome)

In [None]:
# Confusion matrices based on different cut-offs.
# The ownerExample data set contains data on lawn mower ownership.
owner_csv = '/content/drive/MyDrive/Colab Notebooks/ownerExample.csv'
owner_df = pd.read_csv(owner_csv)
# Preview the first rows
owner_df.head()

In [None]:
# Class labels passed as class_names to classificationSummary in the cells below
class_names = ['nonowner','owner']

In [None]:
# Confusion matrix for a cutoff of 0.5: a record is labelled 'owner' only
# when its probability is strictly greater than the cutoff.
owner_probs = owner_df['Probability']
predicted = ['owner' if prob > 0.5 else 'nonowner' for prob in owner_probs]
classificationSummary(owner_df['Class'], predicted, class_names=class_names)

In [None]:
# Confusion matrix for a cutoff of 0.25: a record is labelled 'owner' only
# when its probability is strictly greater than the cutoff.
owner_probs = owner_df['Probability']
predicted = ['owner' if prob > 0.25 else 'nonowner' for prob in owner_probs]
classificationSummary(owner_df['Class'], predicted, class_names=class_names)

In [None]:
# Confusion matrix for a cutoff of 0.75: a record is labelled 'owner' only
# when its probability is strictly greater than the cutoff.
owner_probs = owner_df['Probability']
predicted = ['owner' if prob > 0.75 else 'nonowner' for prob in owner_probs]
classificationSummary(owner_df['Class'], predicted, class_names=class_names)

In [None]:
# Examine accuracy over multiple cut-offs, using the liftExample data.
lift_csv = '/content/drive/MyDrive/Colab Notebooks/liftExample.csv'
df = pd.read_csv(lift_csv)
# Preview the first rows
df.head()

In [None]:
# For the liftExample data, compute the accuracy at cutoffs 0.0, 0.1, ..., 1.0
# and plot accuracy and overall error against the cutoff.

cutoffs = [i * 0.1 for i in range(11)]
accT = []
for cutoff in cutoffs:
    # Predict class 1 only when the probability is strictly above the cutoff
    predicted = [int(p > cutoff) for p in df.prob]
    accT.append(accuracy_score(df.actual, predicted))

# Overall error is the complement of accuracy
error_rates = [1 - acc for acc in accT]

# plt.plot returns a one-element list of lines; unpack the single line
line_accuracy, = plt.plot(cutoffs, accT, '-', label='Accuracy')
line_error, = plt.plot(cutoffs, error_rates, '--', label='Overall error')
plt.ylim([0, 1])
plt.xlabel('Cutoff Value')
plt.legend(handles=[line_accuracy, line_error])

plt.show()

#### Receiver Operating Characteristics (ROC) Curve

In [None]:
# ROC curve for the liftExample classification, with the area under the
# curve reported in the legend.
fpr, tpr, _ = roc_curve(df['actual'], df['prob'])
roc_auc = auc(fpr, tpr)
roc_label = 'ROC curve (area = %0.4f)' % roc_auc

lw = 2
plt.figure(figsize=[5, 5])
# Model's ROC curve
plt.plot(fpr, tpr, color='darkorange', lw=lw, label=roc_label)
# Dashed diagonal from (0, 0) to (1, 1) as the reference line
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.legend(loc="lower right")

plt.show()

Gains Chart

In [None]:
# Gains chart for the liftExample classification.

# The records must first be ordered by predicted probability, highest first.
df = df.sort_values(by=['prob'], ascending=False)

gainsChart(df['actual'], figsize=(4, 4))

plt.tight_layout()
plt.show()

Decile Lift Chart

In [None]:
# Decile lift chart, drawn without bar labels, for the same data; df is
# expected to already be sorted by 'prob' in descending order at this point.
liftChart(df['actual'], labelBars=False)
plt.tight_layout()
plt.show()