# Supervised Learning Model Evaluation Lab

Complete the exercises below to solidify your knowledge and understanding of supervised learning model evaluation.

In [75]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

## Regression Model Evaluation

In [76]:
data = pd.read_csv('housing.csv')

#In the steps below it was found that the column MEDV (Y) has NaN values. 
#Confirming and dropping them off

print(data.isnull().any())


CRIM       False
ZN         False
INDUS      False
CHAS       False
NOX        False
RM         False
AGE        False
DIS        False
RAD        False
TAX        False
PTRATIO    False
B          False
LSTAT      False
MEDV        True
dtype: bool


In [77]:
data = data.fillna(0)

In [78]:
print(data.isnull().any())

CRIM       False
ZN         False
INDUS      False
CHAS       False
NOX        False
RM         False
AGE        False
DIS        False
RAD        False
TAX        False
PTRATIO    False
B          False
LSTAT      False
MEDV       False
dtype: bool


## 1. Split this data set into training (80%) and testing (20%) sets.

The `MEDV` field represents the median value of owner-occupied homes (in $1000's) and is the target variable that we will want to predict.

In [79]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, f1_score, recall_score, confusion_matrix
from sklearn.linear_model import LinearRegression

In [80]:
#Checking all columns in data_dataframe, and defining the variables for the sets

#print (data.keys())
columns_regression = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
X = data[columns_regression]
y = data['MEDV']



In [81]:
#splitting the data into learning and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [82]:
#Creating the model
model = LinearRegression()

## 2. Train a `LinearRegression` model on this data set and generate predictions on both the training and the testing set.

In [83]:
#Perform the fitting
model.fit(X_train,y_train)

#Make prediction
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

## 3. Calculate and print R-squared for both the training and the testing set.

In [84]:
#Evaluate the model
mse = mean_squared_error(y_test, y_test_pred)
r2 = r2_score (y_test, y_test_pred)

#Compute
print("Score:")
print(f"Training: {r2_score(y_train, y_train_pred)},\nTests: {r2_score(y_test, y_test_pred)}")

Score:
Training: 0.7523932016050755,
Tests: 0.7949292755607812


## 4. Calculate and print mean squared error for both the training and the testing set.

In [85]:
print (f"MEDV average is:{data['MEDV'].mean()}\n")

print('MSE:')
print(f"Training: {mean_squared_error(y_train, y_train_pred)},\nTests: {mean_squared_error(y_test, y_test_pred)}\n")


MEDV average is:21.215810276679843

MSE:
Training: 31.253313649105575,
Tests: 22.31062669302033



## 5. Calculate and print mean absolute error for both the training and the testing set.

In [86]:
#Calculate MAE for training set & test set
mae_train = mean_absolute_error(y_train, y_train_pred)
mae_test = mean_absolute_error (y_train, y_train_pred)

print("Mean abolsute error for trainin set:", mae_train)
print("Mean absolute error for test set:", mae_test)

#Lower values indicate better model performance

Mean abolsute error for trainin set: 4.079237999224006
Mean absolute error for test set: 4.079237999224006


## Classification Model Evaluation

In [87]:
from sklearn.datasets import load_iris

data_iris = load_iris()

X_c = pd.DataFrame(data_iris["data"], columns=data_iris["feature_names"])
y_c = pd.DataFrame(data_iris["target"], columns=["class"])

data_iris = pd.concat([X_c, y_c], axis=1)

## 6. Split this data set into training (80%) and testing (20%) sets.

The `class` field represents the type of flower and is the target variable that we will want to predict.

In [88]:
! pip install plotly

Defaulting to user installation because normal site-packages is not writeable


In [89]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
plt.figure(figsize=(8,8))
pd.options.plotting.backend = "plotly"
sns.set(rc={'figure.figsize':(6,6)});

<Figure size 800x800 with 0 Axes>

In [90]:
#Defining the predictor and response
print (data_iris.keys())
all_columns = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']

x = data_iris[all_columns]
y = data_iris['class']

#split the data into training and test sets

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size=0.2)

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)', 'class'],
      dtype='object')


## 7. Train a `LogisticRegression` model on this data set and generate predictions on both the training and the testing set.

In [91]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(max_iter=2000)

logreg.fit(x_train, y_train)

y_train_pred_ = logreg.predict(x_train)
y_test_pred_ =  logreg.predict(x_test)



## 8. Calculate and print the accuracy score for both the training and the testing set.

    This function resolves the 8,9,10,11,12 & 13 exercices.

In [92]:
accuracy_train = accuracy_score(y_train, y_train_pred_)

accuracy_test = accuracy_score(y_test, y_test_pred_)

print("Accuracy score for training set is:", accuracy_train)
print("Accuracy score for test set is:", accuracy_test)

Accuracy score for training set is: 0.975
Accuracy score for test set is: 1.0


## 9. Calculate and print the balanced accuracy score for both the training and the testing set.

In [93]:
bal_acc_score_train = balanced_accuracy_score(y_train, y_train_pred_)
bal_acc_score_test = balanced_accuracy_score(y_test, y_test_pred_)

print("Balanced accuracy score for training set is:", bal_acc_score_train)
print("Balanced accuracy score for test set is:", bal_acc_score_test)

Balanced accuracy score for training set is: 0.975609756097561
Balanced accuracy score for test set is: 1.0


## 10. Calculate and print the precision score for both the training and the testing set.

In [95]:
prec_score_train = precision_score(y_train, y_train_pred_, average='macro')
prec_score_test = precision_score(y_test, y_test_pred_, average= 'macro')

print("Precision score for training set is:", bal_acc_score_train)
print("Precision score for test set is:", bal_acc_score_test)

Precision score for training set is: 0.975609756097561
Precision score for test set is: 1.0


## 11. Calculate and print the recall score for both the training and the testing set.

In [97]:
rec_score_train = recall_score(y_train, y_train_pred_, average='macro')
rec_score_test = recall_score(y_test, y_test_pred_, average='macro')

print("Recall score for training set is:", rec_score_train)
print("Recall score for test set is:", rec_score_test)

Recall score for training set is: 0.975609756097561
Recall score for test set is: 1.0


## 12. Calculate and print the F1 score for both the training and the testing set.

In [99]:
f1_train = f1_score(y_train, y_train_pred_, average='macro')
f1_test = f1_score(y_test, y_test_pred_, average='macro')

print("F1 score for training set is:", f1_train)
print("F1 score for test set is:", f1_test)

F1 score for training set is: 0.9749960931395533
F1 score for test set is: 1.0


## 13. Generate confusion matrices for both the training and the testing set.

In [100]:
conf_mat_train = confusion_matrix(y_train, y_train_pred_)
conf_mat_test = confusion_matrix(y_test, y_test_pred_)

print("Confusion matrix for training set is:", conf_mat_train)
print("Confusion matrix for test set is:", conf_mat_test)

Confusion matrix for training set is: [[40  0  0]
 [ 0 38  3]
 [ 0  0 39]]
Confusion matrix for test set is: [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


## Bonus: For each of the data sets in this lab, try training with some of the other models you have learned about, recalculate the evaluation metrics, and compare to determine which models perform best on each data set.