# Supervised Learning Model Evaluation Lab

Complete the exercises below to solidify your knowledge and understanding of supervised learning model evaluation.

In [95]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, balanced_accuracy_score, precision_score, f1_score, recall_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder

## Regression Model Evaluation

In [60]:
from sklearn.datasets import load_boston

data = load_boston()

X = pd.DataFrame(data["data"], columns=data["feature_names"])
y = pd.DataFrame(data["target"], columns=['MEDV'])

data = pd.concat([X, y], axis=1)

## 1. Split this data set into training (80%) and testing (20%) sets.

The `MEDV` field represents the median value of owner-occupied homes (in $1000's) and is the target variable that we will want to predict.

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

## 2. Train a `LinearRegression` model on this data set and generate predictions on both the training and the testing set.

In [41]:
lr=LinearRegression()

In [42]:
lr.fit(X_train,y_train)

LinearRegression()

In [43]:
y_pred_test=lr.predict(X_test)
y_pred_train=lr.predict(X_train)

In [44]:
print('Coefficients: \n', lr.coef_)

Coefficients: 
 [[-1.09486736e-01  4.86363778e-02  3.31882536e-02  7.00229657e-01
  -1.64293646e+01  3.92215755e+00 -4.81787462e-03 -1.44041471e+00
   2.92437355e-01 -1.43907227e-02 -9.48415994e-01  8.80565049e-03
  -4.91206837e-01]]


## 3. Calculate and print R-squared for both the training and the testing set.

In [46]:
print('Coefficient of determination test:',r2_score(y_test, y_pred_test))
print('Coefficient of determination train:',r2_score(y_train, y_pred_train))

Coefficient of determination test: 0.6224874929699081
Coefficient of determination train: 0.7636579873179555


## 4. Calculate and print mean squared error for both the training and the testing set.

In [47]:
print('Mean squared error test:', mean_squared_error(y_test, y_pred_test))
print('Mean squared error train:', mean_squared_error(y_train, y_pred_train))

Mean squared error test: 31.279792262139207
Mean squared error train: 20.020711855729527


## 5. Calculate and print mean absolute error for both the training and the testing set.

In [48]:
print('Mean absolute error test:',mean_absolute_error(y_test, y_pred_test))
print('Mean absolute error train:',mean_absolute_error(y_train, y_pred_train))

Mean absolute error test: 3.3795222428059275
Mean absolute error train: 3.2018162656302263


## Classification Model Evaluation

In [64]:
from sklearn.datasets import load_iris

data = load_iris()

X = pd.DataFrame(data["data"], columns=data["feature_names"])
y = pd.DataFrame(data["target"], columns=["class"])

data = pd.concat([X, y], axis=1)

## 6. Split this data set into training (80%) and testing (20%) sets.

The `class` field represents the type of flower and is the target variable that we will want to predict.

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

## 7. Train a `LogisticRegression` model on this data set and generate predictions on both the training and the testing set.

In [73]:
lr= LogisticRegression(max_iter=10000)

In [75]:
lr.fit(X_train,y_train.values.ravel())

LogisticRegression(max_iter=10000)

In [76]:
y_pred_train = lr.predict(X_train)
y_pred_test = lr.predict(X_test)

## 8. Calculate and print the accuracy score for both the training and the testing set.

In [87]:
print('Accuracy score test:',accuracy_score(y_test, y_pred_test))
print('Accuracy score train:',accuracy_score(y_train, y_pred_train))

Accuracy score test: 0.9333333333333333
Accuracy score train: 0.9833333333333333


## 9. Calculate and print the balanced accuracy score for both the training and the testing set.

In [88]:
print('Accuracy balance test:',balanced_accuracy_score(y_test, y_pred_test))
print('Accuracy balance train:',balanced_accuracy_score(y_train, y_pred_train))

Accuracy balance test: 0.9333333333333332
Accuracy balance train: 0.9833333333333334


## 10. Calculate and print the precision score for both the training and the testing set.

In [89]:
print('Precision score test:',precision_score(y_test, y_pred_test, average="weighted"))
print('Precision score train:',precision_score(y_train, y_pred_train, average="weighted"))

Precision score test: 0.9333333333333333
Precision score train: 0.9841269841269842


## 11. Calculate and print the recall score for both the training and the testing set.

In [93]:
print('Recall score test:',recall_score(y_test, y_pred_test,average="weighted"))
print('Recall score train:',recall_score(y_train, y_pred_train,average="weighted"))

Recall score test: 0.9333333333333333
Recall score train: 0.9833333333333333


## 12. Calculate and print the F1 score for both the training and the testing set.

In [94]:
print('F1 score test:',f1_score(y_test, y_pred_test,average="weighted"))
print('F1 score train:',f1_score(y_train, y_pred_train,average="weighted"))

F1 score test: 0.9333333333333333
F1 score train: 0.9833229101521785


## 13. Generate confusion matrices for both the training and the testing set.

In [98]:
confusion_test=confusion_matrix(y_test, y_pred_test)
pd.DataFrame(confusion_test)

Unnamed: 0,0,1,2
0,10,0,0
1,0,9,1
2,0,1,9


In [99]:
confusion_train=confusion_matrix(y_train, y_pred_train)
pd.DataFrame(confusion_train)

Unnamed: 0,0,1,2
0,40,0,0
1,0,38,2
2,0,0,40


## Bonus: For each of the data sets in this lab, try training with some of the other models you have learned about, recalculate the evaluation metrics, and compare to determine which models perform best on each data set.

# Regression Models

In [117]:
from sklearn.datasets import load_boston

data = load_boston()

X = pd.DataFrame(data["data"], columns=data["feature_names"])
y = pd.DataFrame(data["target"], columns=['MEDV'])

data = pd.concat([X, y], axis=1)

In [118]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [119]:
from sklearn.linear_model import Lasso, Ridge, ElasticNet

## LASSO

In [120]:
ls=Lasso()
ls.fit(X_train,y_train)
y_pred_test=ls.predict(X_test)
y_pred_train=ls.predict(X_train)
print('Coefficient of determination test:',r2_score(y_test, y_pred_test))
print('Coefficient of determination train:',r2_score(y_train, y_pred_train))
print('Mean squared error test:', mean_squared_error(y_test, y_pred_test))
print('Mean squared error train:', mean_squared_error(y_train, y_pred_train))

Coefficient of determination test: 0.6894557105781416
Coefficient of determination train: 0.681451649578849
Mean squared error test: 22.63703464085066
Mean squared error train: 27.807446076013264


## Ridge

In [121]:
rd=Ridge()
rd.fit(X_train,y_train)
y_pred_test=rd.predict(X_test)
y_pred_train=rd.predict(X_train)
print('Coefficient of determination test:',r2_score(y_test, y_pred_test))
print('Coefficient of determination train:',r2_score(y_train, y_pred_train))
print('Mean squared error test:', mean_squared_error(y_test, y_pred_test))
print('Mean squared error train:', mean_squared_error(y_train, y_pred_train))

Coefficient of determination test: 0.7205592761509207
Coefficient of determination train: 0.7369273687550079
Mean squared error test: 20.369749376530443
Mean squared error train: 22.964733604014647


## ElasticNet

In [122]:
en=ElasticNet()
en.fit(X_train,y_train)
y_pred_test=en.predict(X_test)
y_pred_train=en.predict(X_train)
print('Coefficient of determination test:',r2_score(y_test, y_pred_test))
print('Coefficient of determination train:',r2_score(y_train, y_pred_train))
print('Mean squared error test:', mean_squared_error(y_test, y_pred_test))
print('Mean squared error train:', mean_squared_error(y_train, y_pred_train))

Coefficient of determination test: 0.6935526172663258
Coefficient of determination train: 0.6840149551190398
Mean squared error test: 22.33839183278803
Mean squared error train: 27.58368418714784


## LinearRegression

In [123]:
# As I re-splitted the data, I can't compare with the data I above so 
# I re-calcule the linear regression in order to compare bothe results.

In [124]:
lr=LinearRegression()
lr.fit(X_train,y_train)
y_pred_test=lr.predict(X_test)
y_pred_train=lr.predict(X_train)
print('Coefficient of determination test:',r2_score(y_test, y_pred_test))
print('Coefficient of determination train:',r2_score(y_train, y_pred_train))
print('Mean squared error test:', mean_squared_error(y_test, y_pred_test))
print('Mean squared error train:', mean_squared_error(y_train, y_pred_train))

Coefficient of determination test: 0.7288127379306977
Coefficient of determination train: 0.7386563067644092
Mean squared error test: 19.768115707582364
Mean squared error train: 22.81380722062062


## Conclusion

The result of the linear regression model and the Ridge model are really similar and those are the best models. 
The Lasso model and the ElasticNet one have a lower r2 and a higher mean squared error, so in these cases they are not better model, it could be a coincidence anyway because there is not a substancial difference.

# Classification Model

In [161]:
from sklearn.datasets import load_iris

data = load_iris()

X = pd.DataFrame(data["data"], columns=data["feature_names"])
y = pd.DataFrame(data["target"], columns=["class"])

data = pd.concat([X, y], axis=1)

In [162]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

## LinearSVC

In [163]:
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier

In [164]:
svc = LinearSVC(max_iter=10000)
svc.fit(X_train, y_train.values.ravel())
y_pred_train = svc.predict(X_train)
y_pred_test = svc.predict(X_test)
print('Accuracy score test:',accuracy_score(y_test, y_pred_test))
print('Accuracy score train:',accuracy_score(y_train, y_pred_train))
print('Precision score test:',precision_score(y_test, y_pred_test, average="weighted"))
print('Precision score train:',precision_score(y_train, y_pred_train, average="weighted"))

Accuracy score test: 0.9333333333333333
Accuracy score train: 0.975
Precision score test: 0.9466666666666667
Precision score train: 0.9751682986536108


In [165]:
confusion_test=confusion_matrix(y_test, y_pred_test)
pd.DataFrame(confusion_test)

Unnamed: 0,0,1,2
0,11,0,0
1,0,9,2
2,0,0,8


In [166]:
confusion_test=confusion_matrix(y_train, y_pred_train)
pd.DataFrame(confusion_train)

Unnamed: 0,0,1,2
0,40,0,0
1,0,38,2
2,0,0,40


## KNeighborsClassifier

In [167]:
kn = KNeighborsClassifier()
kn.fit(X_train,y_train.values.ravel())
y_pred = kn.predict(X_test)
y_pred_train = kn.predict(X_train)
y_pred_test = kn.predict(X_test)
print('Accuracy score test:',accuracy_score(y_test, y_pred_test))
print('Accuracy score train:',accuracy_score(y_train, y_pred_train))
print('Precision score test:',precision_score(y_test, y_pred_test, average="weighted"))
print('Precision score train:',precision_score(y_train, y_pred_train, average="weighted"))

Accuracy score test: 0.9666666666666667
Accuracy score train: 0.9833333333333333
Precision score test: 0.9703703703703703
Precision score train: 0.9833333333333333


In [168]:
confusion_test=confusion_matrix(y_test, y_pred_test)
pd.DataFrame(confusion_test)

Unnamed: 0,0,1,2
0,11,0,0
1,0,10,1
2,0,0,8


In [169]:
confusion_test=confusion_matrix(y_train, y_pred_train)
pd.DataFrame(confusion_train)

Unnamed: 0,0,1,2
0,40,0,0
1,0,38,2
2,0,0,40


## LogisticRegression

As I re-splitted the data, I can't compare with the data I above so I re-calcule the linear regression in order to compare bothe results.

In [170]:
lr = LogisticRegression(max_iter=10000)
lr.fit(X_train,y_train.values.ravel())
y_pred = lr.predict(X_test)
y_pred_train = lr.predict(X_train)
y_pred_test = lr.predict(X_test)
print('Accuracy score test:',accuracy_score(y_test, y_pred_test))
print('Accuracy score train:',accuracy_score(y_train, y_pred_train))
print('Precision score test:',precision_score(y_test, y_pred_test, average="weighted"))
print('Precision score train:',precision_score(y_train, y_pred_train, average="weighted"))

Accuracy score test: 1.0
Accuracy score train: 0.975
Precision score test: 1.0
Precision score train: 0.9766666666666667


In [171]:
confusion_test=confusion_matrix(y_test, y_pred_test)
pd.DataFrame(confusion_test)

Unnamed: 0,0,1,2
0,11,0,0
1,0,11,0
2,0,0,8


In [172]:
confusion_test=confusion_matrix(y_train, y_pred_train)
pd.DataFrame(confusion_train)

Unnamed: 0,0,1,2
0,40,0,0
1,0,38,2
2,0,0,40


## Conclusion

The LogisticRegression had the best result after this one the KNeighborsClassifier and "the worst" fitted would be the RegressionSVC but again all three have a really insignificant difference and all three are good enough in for this dataset.