# Model Validation in Python

## Regression model

In [1]:
import pandas as pd
# Load the candy dataset (path is relative to the notebook) and preview the first rows
candy = pd.read_csv('datasets/candy-data.csv')
candy.head()

Unnamed: 0,competitorname,chocolate,fruity,caramel,peanutyalmondy,nougat,crispedricewafer,hard,bar,pluribus,sugarpercent,pricepercent,winpercent
0,100 Grand,1,0,1,0,0,1,0,1,0,0.732,0.86,66.971725
1,3 Musketeers,1,0,0,0,1,0,0,1,0,0.604,0.511,67.602936
2,One dime,0,0,0,0,0,0,0,0,0,0.011,0.116,32.261086
3,One quarter,0,0,0,0,0,0,0,0,0,0.011,0.511,46.116505
4,Air Heads,0,1,0,0,0,0,0,0,0,0.906,0.511,52.341465


In [2]:
# Features: every column except the target and the candy's name
X = candy.drop(columns=['winpercent', 'competitorname'])
# Target: the win percentage
y = candy['winpercent']

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the candy rows for testing; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error as mae

# Seeded 50-tree random forest, trained on the training split
# (fit returns the estimator itself, so the chained call binds the fitted model)
model = RandomForestRegressor(n_estimators=50, random_state=1111).fit(X_train, y_train)

# Predictions for rows the model has seen (train) and has not seen (test)
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)

# Mean absolute error on each split
train_error = mae(y_train, train_predictions)
test_error = mae(y_test, test_predictions)

# Report the error (MAE, not accuracy) for seen and unseen data
print("Model error on seen data: {0:.2f}.".format(train_error))
print("Model error on unseen data: {0:.2f}.".format(test_error))

Model error on seen data: 3.52.
Model error on unseen data: 8.89.


In [5]:
# Array of importance scores, one per feature column used to fit the model
model.feature_importances_

array([0.36580965, 0.04109648, 0.02573308, 0.06999166, 0.00470775,
       0.01048645, 0.01194061, 0.05098452, 0.02945032, 0.20212108,
       0.18767839])

In [6]:
# Show each training column next to its importance score (two decimals)
for column_name, importance in zip(X_train.columns, model.feature_importances_):
    print("{0:s}: {1:.2f}".format(column_name, importance))

chocolate: 0.37
fruity: 0.04
caramel: 0.03
peanutyalmondy: 0.07
nougat: 0.00
crispedricewafer: 0.01
hard: 0.01
bar: 0.05
pluribus: 0.03
sugarpercent: 0.20
pricepercent: 0.19


## Classification Model

In [7]:
# Load the tic-tac-toe endgame boards (path is relative to the notebook) and preview them
tic_tac_toe = pd.read_csv('datasets/tic-tac-toe.csv')
tic_tac_toe.head()

Unnamed: 0,Top-Left,Top-Middle,Top-Right,Middle-Left,Middle-Middle,Middle-Right,Bottom-Left,Bottom-Middle,Bottom-Right,Class
0,x,x,x,x,o,o,x,o,o,positive
1,x,x,x,x,o,o,o,x,o,positive
2,x,x,x,x,o,o,o,o,x,positive
3,x,x,x,x,o,o,o,b,b,positive
4,x,x,x,x,o,o,b,o,b,positive


In [8]:
# RandomForestClassifier needs numeric inputs, so encode each board symbol:
#   'x' ->  1  (player 1's move)
#   'o' -> -1  (player 2's move)
#   'b' ->  0  (blank square)
# One mapping handles all three symbols in a single pass, and rebinding the
# result avoids mutating the frame in place.
symbol_codes = {'x': 1, 'o': -1, 'b': 0}
tic_tac_toe = tic_tac_toe.replace(symbol_codes)

tic_tac_toe.head()

Unnamed: 0,Top-Left,Top-Middle,Top-Right,Middle-Left,Middle-Middle,Middle-Right,Bottom-Left,Bottom-Middle,Bottom-Right,Class
0,1,1,1,1,-1,-1,1,-1,-1,positive
1,1,1,1,1,-1,-1,-1,1,-1,positive
2,1,1,1,1,-1,-1,-1,-1,1,positive
3,1,1,1,1,-1,-1,-1,0,0,positive
4,1,1,1,1,-1,-1,0,-1,0,positive


In [9]:
# Count how many boards fall in each class to check the class balance
tic_tac_toe['Class'].value_counts()

positive    626
negative    332
Name: Class, dtype: int64

In [10]:
# Represent the positive class as 1 and the negative class as 0.
# The nested mapping restricts the replacement to the 'Class' column, and the
# result is rebound instead of mutating the frame in place.
tic_tac_toe = tic_tac_toe.replace({'Class': {'positive': 1, 'negative': 0}})
tic_tac_toe.head()

Unnamed: 0,Top-Left,Top-Middle,Top-Right,Middle-Left,Middle-Middle,Middle-Right,Bottom-Left,Bottom-Middle,Bottom-Right,Class
0,1,1,1,1,-1,-1,1,-1,-1,1
1,1,1,1,1,-1,-1,-1,1,-1,1
2,1,1,1,1,-1,-1,-1,-1,1,1
3,1,1,1,1,-1,-1,-1,0,0,1
4,1,1,1,1,-1,-1,0,-1,0,1


In [11]:
# Confirm every column is numeric and non-null after the encoding steps
tic_tac_toe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 958 entries, 0 to 957
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   Top-Left       958 non-null    int64
 1   Top-Middle     958 non-null    int64
 2   Top-Right      958 non-null    int64
 3   Middle-Left    958 non-null    int64
 4   Middle-Middle  958 non-null    int64
 5   Middle-Right   958 non-null    int64
 6   Bottom-Left    958 non-null    int64
 7   Bottom-Middle  958 non-null    int64
 8   Bottom-Right   958 non-null    int64
 9   Class          958 non-null    int64
dtypes: int64(10)
memory usage: 75.0 KB


In [12]:
# Split the frame into explanatory variables (the nine squares) and the target.
# NOTE(review): X and y were bound to the candy data earlier in the notebook;
# re-running those earlier cells after this one would pick up the tic-tac-toe data.
y = tic_tac_toe['Class']
X = tic_tac_toe.drop(columns='Class')

In [13]:
# Hold out 30% of the boards for testing, with a fixed seed
X_train_ttt, X_test_ttt, y_train_ttt, y_test_ttt = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Seeded random forest classifier: 50 trees, each limited to depth 6
rfc = RandomForestClassifier(n_estimators=50, max_depth=6, random_state=1111)
rfc.fit(X_train_ttt, y_train_ttt)

# Hard class labels for the held-out boards
Class_prediction = rfc.predict(X_test_ttt)

# Predicted probabilities for the same boards, one column per class
Class_proba_prediction = rfc.predict_proba(X_test_ttt)

# Show the first five labels, a blank line, then the first five probability rows
print(Class_prediction[:5])
print()
print(Class_proba_prediction[:5])

[1 1 1 0 0]

[[0.4424578  0.5575422 ]
 [0.11066166 0.88933834]
 [0.21968656 0.78031344]
 [0.5979667  0.4020333 ]
 [0.85271481 0.14728519]]


In [15]:
# Mean accuracy on the held-out test set (a classifier's .score is accuracy, not R^2)
rfc.score(X_test_ttt, y_test_ttt)

0.9652777777777778

# Validation Basics

In [16]:
import pandas as pd 

# Reload the raw board data so the symbols are still the original 'x'/'o'/'b' strings
tic_tac_toe_1 = pd.read_csv('datasets/tic-tac-toe.csv')

# One-hot encode the nine board squares. Columns are selected by name rather than
# by position so a reordered CSV cannot leak the target into the features.
X_1 = pd.get_dummies(tic_tac_toe_1.drop(columns='Class'))

# Target keeps its original string labels ('positive' / 'negative')
y_1 = tic_tac_toe_1['Class']

In [17]:
# Inspect the one-hot encoded features (three indicator columns per board square: _b, _o, _x)
X_1

Unnamed: 0,Top-Left_b,Top-Left_o,Top-Left_x,Top-Middle_b,Top-Middle_o,Top-Middle_x,Top-Right_b,Top-Right_o,Top-Right_x,Middle-Left_b,Middle-Left_o,Middle-Left_x,Middle-Middle_b,Middle-Middle_o,Middle-Middle_x,Middle-Right_b,Middle-Right_o,Middle-Right_x,Bottom-Left_b,Bottom-Left_o,Bottom-Left_x,Bottom-Middle_b,Bottom-Middle_o,Bottom-Middle_x,Bottom-Right_b,Bottom-Right_o,Bottom-Right_x
0,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0
1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0
2,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1
3,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0
4,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,1,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
953,0,1,0,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1
954,0,1,0,0,0,1,0,1,0,0,0,1,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1
955,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0,0,1,0,1,0,0,0,1
956,0,1,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1


In [18]:
# Inspect the target labels, which are still the original strings
y_1

0      positive
1      positive
2      positive
3      positive
4      positive
         ...   
953    negative
954    negative
955    negative
956    negative
957    negative
Name: Class, Length: 958, dtype: object

## Creating train, test and validation datasets
![image](image.png)


In [19]:
# Step 1: set aside 20% of the data as the final test set.
# NOTE(review): this rebinds X_test / y_test (and, below, X_train / y_train),
# which earlier cells bound to the candy split.
X_temp, X_test, y_temp, y_test = train_test_split(
    X_1, y_1, test_size=0.2, random_state=1111
)

# Step 2: split the remaining 80% into training and validation sets.
# 25% of that remainder is 20% of the full data, giving a 60/20/20 split.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=1111
)

## Accuracy metrics: regression models

- Mean Absolute Error (MAE): $\mathrm{MAE} = \frac{1}{n}\sum_{i=1}^{n} \lvert y_i - \hat{y}_i \rvert$
![image-2.png](image-2.png)

- Mean Squared Error (MSE): $\mathrm{MSE} = \frac{1}{n}\sum_{i=1}^{n} (y_i - \hat{y}_i)^2$
![image-3](image-3.png)


In [20]:
# Candy features (everything but the name and the target) and the target itself
y_candy = candy['winpercent']
X_candy = candy.drop(columns=['competitorname', 'winpercent'])

# 70/30 train/test split with a fixed seed, under candy-specific names
X_train_candy, X_test_candy, y_train_candy, y_test_candy = train_test_split(
    X_candy,
    y_candy,
    test_size=0.3,
    random_state=42,
)

In [21]:
from sklearn.ensemble import RandomForestRegressor

# Seeded random forest regressor (50 trees, depth capped at 6), fitted on the
# candy training split; fit returns the estimator, so rfr is the fitted model
rfr = RandomForestRegressor(
    n_estimators=50, max_depth=6, random_state=1111
).fit(X_train_candy, y_train_candy)

# Predicted win percentages for the held-out candies; preview the first five
predictions = rfr.predict(X_test_candy)
predictions[:5]

array([45.46772591, 68.82094526, 43.36377122, 43.59687643, 48.17881586])

In [22]:
from sklearn.metrics import mean_absolute_error

# MAE by hand: mean of the absolute differences between truth and prediction
n = len(predictions)
absolute_errors = abs(y_test_candy - predictions)
mae_one = sum(absolute_errors) / n
print('With a manual calculation, the error is {}'.format(mae_one))

# Same metric from scikit-learn, as a cross-check
mae_two = mean_absolute_error(y_test_candy, predictions)
print('Using scikit-learn, the error is {}'.format(mae_two))

With a manual calculation, the error is 8.78166188617276
Using scikit-learn, the error is 8.781661886172758


In [23]:
from sklearn.metrics import mean_squared_error

# MSE by hand: mean of the squared differences between truth and prediction
n = len(predictions)
squared_errors = (y_test_candy - predictions) ** 2
mse_one = sum(squared_errors) / n
print('With a manual calculation, the error is {}'.format(mse_one))

# Same metric from scikit-learn, as a cross-check
mse_two = mean_squared_error(y_test_candy, predictions)
print('Using scikit-learn, the error is {}'.format(mse_two))

With a manual calculation, the error is 130.8122303680407
Using scikit-learn, the error is 130.81223036804073


## Classification metrics
- **Accuracy** 
![image-4.png](image-4.png)

- **Precision**
![image-5](image-5.png)

- **Recall**
![image-6](image-6.png)


In [26]:
# Preview the one-hot encoded features used for the classification metrics below
X_1.head()

Unnamed: 0,Top-Left_b,Top-Left_o,Top-Left_x,Top-Middle_b,Top-Middle_o,Top-Middle_x,Top-Right_b,Top-Right_o,Top-Right_x,Middle-Left_b,Middle-Left_o,Middle-Left_x,Middle-Middle_b,Middle-Middle_o,Middle-Middle_x,Middle-Right_b,Middle-Right_o,Middle-Right_x,Bottom-Left_b,Bottom-Left_o,Bottom-Left_x,Bottom-Middle_b,Bottom-Middle_o,Bottom-Middle_x,Bottom-Right_b,Bottom-Right_o,Bottom-Right_x
0,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0
1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0
2,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1
3,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0
4,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,1,0,0,0,1,0,1,0,0


In [28]:
# Target labels for the classification metrics ('positive' / 'negative' strings)
y_1

0      positive
1      positive
2      positive
3      positive
4      positive
         ...   
953    negative
954    negative
955    negative
956    negative
957    negative
Name: Class, Length: 958, dtype: object

In [29]:
# 70/30 split of the one-hot features and string labels.
# This rebinds the *_ttt names used for the earlier integer-encoded split.
X_train_ttt, X_test_ttt, y_train_ttt, y_test_ttt = train_test_split(
    X_1,
    y_1,
    test_size=0.3,
    random_state=42,
)

In [31]:
from sklearn.ensemble import RandomForestClassifier

# Same classifier settings as before (50 trees, depth 6, fixed seed), now fitted
# on the one-hot encoded boards; fit returns the estimator itself
rfc = RandomForestClassifier(
    n_estimators=50, max_depth=6, random_state=1111
).fit(X_train_ttt, y_train_ttt)

# Predicted string labels for the held-out boards; preview the first five
y_pred = rfc.predict(X_test_ttt)
y_pred[:5]

array(['positive', 'positive', 'positive', 'negative', 'negative'],
      dtype=object)

In [33]:
from sklearn.metrics import confusion_matrix

# Create and print the confusion matrix.
# Rows are the true labels and columns the predicted labels; with these string
# labels the order is ['negative', 'positive'], which matches the recall and
# precision reported for pos_label='positive' further down.
cm = confusion_matrix(y_test_ttt, y_pred)
print(cm)

[[ 70  25]
 [  2 191]]


In [38]:
from sklearn.metrics import accuracy_score, recall_score, precision_score

# Report accuracy, recall and precision on the held-out boards.
# pos_label is required because the labels are strings rather than 0/1.
# (Label typo "acuuracy" fixed; re-run the cell to refresh the recorded output.)
print("accuracy: ", accuracy_score(y_test_ttt, y_pred))
print("recall: ", recall_score(y_test_ttt, y_pred, pos_label='positive'))
print("precision: ", precision_score(y_test_ttt, y_pred, pos_label='positive'))

acuuracy:  0.90625
recall:  0.9896373056994818
precision:  0.8842592592592593
