# Supervised Learning Model Evaluation Lab

Complete the exercises below to solidify your knowledge and understanding of supervised learning model evaluation.

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

## Regression Model Evaluation

In [22]:
from sklearn.datasets import fetch_california_housing

# Load the California housing data set. The loader returns a dict-like Bunch,
# kept under its own name so `data` below only ever refers to the DataFrame.
housing_bunch = fetch_california_housing()

ca_housing = housing_bunch['data']        # feature values
ca_mhv = housing_bunch['target']          # target values (median house value)
columns = housing_bunch['feature_names']  # labels for the feature columns

# Features and target as DataFrames; both get the same default RangeIndex.
X = pd.DataFrame(ca_housing, columns=columns)
y = pd.DataFrame(ca_mhv, columns=['MHV'])

# Join features and target side by side. The default axis=0 would stack y
# *below* X, filling MHV with NaN on every feature row (and the features with
# NaN on every target row), so the axis has to be given explicitly.
data = pd.concat([X, y], axis='columns')

# Sanity checks: one row per sample, one extra column, and no missing targets.
assert data.shape == (len(X), X.shape[1] + 1)
assert not data['MHV'].isna().any()


## 1. Split this data set into training (80%) and testing (20%) sets.

The `MHV` field represents the median house value of each California block group (expressed in units of $100,000) and is the target variable that we will want to predict.

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [4]:
# Reproducible hold-out split: 80% of the rows for training, 20% for testing.
seed = 161803
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=seed,
)

## 2. Train a `LinearRegression` model on this data set and generate predictions on both the training and the testing set.

In [5]:
from sklearn.linear_model import LinearRegression

# Fit on the training split only, then predict for both splits so the
# in-sample and out-of-sample metrics below can be compared.
model = LinearRegression().fit(X_train, y_train)
pred_training, pred_test = model.predict(X_train), model.predict(X_test)

## 3. Calculate and print R-squared for both the training and the testing set.

In [6]:
# Coefficient of determination for each split (1.0 would be a perfect fit).
r2_train, r2_test = r2_score(y_train, pred_training), r2_score(y_test, pred_test)

print('Train R²: ', r2_train)
print('Test R²: ', r2_test)

Train R²:  0.604040982891235
Test R²:  0.6052651123675199


## 4. Calculate and print mean squared error for both the training and the testing set.

In [7]:
# Mean squared error for each split, in squared target units.
mse_train, mse_test = (
    mean_squared_error(y_train, pred_training),
    mean_squared_error(y_test, pred_test),
)

print('Mean squared error training: ', mse_train)
print('Mean squared error test: ', mse_test)

Mean squared error training:  0.5266982738591474
Mean squared error test:  0.5276647081078614


## 5. Calculate and print mean absolute error for both the training and the testing set.

In [8]:
# Mean absolute error for each split, in the same units as the target.
mae_train, mae_test = (
    mean_absolute_error(y_train, pred_training),
    mean_absolute_error(y_test, pred_test),
)

print('Mean absolute error train: ', mae_train)
print('Mean absolute error test: ', mae_test)

Mean absolute error train:  0.5302225783319179
Mean absolute error test:  0.5345053907946938


## Classification Model Evaluation

In [9]:
# Your code here

%reset -f

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix

from sklearn.datasets import load_iris

seed = 161803

irisdf = load_iris()
# print(irisdf)
# print(irisdf.DESCR)
iris_data = irisdf['data']
columns = irisdf['feature_names']
# pd.DataFrame(irisdf)
# print(irisdf['target'])
iris_class = irisdf['target']

X = pd.DataFrame(iris_data, columns=columns)
y = pd.DataFrame(iris_class, columns=['iris class'])

# df = pd.concat([X, y], axis='columns')
# df.sample(5)

## 6. Split this data set into training (80%) and testing (20%) sets.

The `iris class` field holds the integer-encoded species of each flower and is the target variable that we will want to predict.

In [10]:
# Reproducible hold-out split: 80% of the rows for training, 20% for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=seed,
)

## 7. Train a `LogisticRegression` model on this data set and generate predictions on both the training and the testing set.

In [11]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

# y_train is a single-column DataFrame, but the classifier expects a 1-D label
# vector of shape (n_samples,). Passing the column vector only works through an
# implicit conversion that raises a DataConversionWarning (hidden here by the
# global warnings filter), so flatten the labels explicitly.
model.fit(X_train, y_train.values.ravel())

# Predict on both splits so train and test metrics can be compared below.
pred_train = model.predict(X_train)
print(pred_train)
pred_test = model.predict(X_test)
print(pred_test)

[0 2 1 2 0 2 0 2 0 1 0 2 2 1 2 0 0 2 0 2 2 2 1 2 2 1 1 2 2 1 2 0 0 1 1 2 1
 0 0 2 0 2 2 1 0 0 0 2 1 1 2 0 0 1 2 1 1 1 2 0 2 1 0 1 0 1 2 0 1 2 1 0 0 1
 2 0 1 2 0 0 0 0 1 0 2 0 1 1 1 1 2 2 1 0 1 2 0 0 1 2 1 1 1 0 0 2 2 0 1 2 2
 2 2 1 1 0 2 2 2 2]
[0 1 1 0 1 2 0 1 2 0 0 2 1 1 1 2 0 0 0 2 2 0 2 0 0 1 1 1 0 1]


## 8. Calculate and print the accuracy score for both the training and the testing set.

In [12]:
# Fraction of correctly classified samples on each split.
acc_train = accuracy_score(y_train, pred_train)
acc_test = accuracy_score(y_test, pred_test)

# Report both scores with the same three-decimal formatting so they are
# directly comparable; the builtin format() replaces the direct dunder call.
print('Accuracy score train: ', format(acc_train, '.3f'))
print('Accuracy score test: ', format(acc_test, '.3f'))

Accuracy score train:  0.975
Accuracy score test:  0.933


## 9. Calculate and print the balanced accuracy score for both the training and the testing set.

In [13]:
# Balanced accuracy for each split, shown to three decimals.
bal_train, bal_test = (
    balanced_accuracy_score(y_train, pred_train),
    balanced_accuracy_score(y_test, pred_test),
)

print('Balanced accuracy score train: ', format(bal_train, ".3f"))
print('Balanced accuracy score test: ', format(bal_test, ".3f"))

Balanced accuracy score train:  0.975
Balanced accuracy score test:  0.922
