# Introductin to scikit-learn

This notebook demonstrates some of the most useful functions of the beautiful Scikit-learn library.

What we're going to cover

0. An end-to-end Scikit-learn workflow
1. Getting the data ready
2. Choose the right estimator/algorithm for our problems
3. Fit the model/algorithm and use it to make predictions on our data
4. Evaluating a model
5. Improve a model
6. Save and load a traned model
7. Putting it all together!

## 0. An end-to-end Scikit-learn workflow

In [1]:
# 1. Get the data ready
import pandas as pd
import numpy as np
heart_disease = pd.read_csv("../data/heart-disease.csv")
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [2]:
#  Create X (features matrix)
x = heart_disease.drop("target", axis=1) # We want every columns except target column

# Create y (labels)
y = heart_disease["target"] #'Target' column here is our label column

In [3]:
# 2. Choose the right model and hyperparemter
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

# We'll keep the default hyperparameters
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [4]:
# 3. Fit  the model to the training data
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
# So 80% of data will be use for training
# and 20% of data for testing

In [5]:
clf.fit(x_train, y_train);

In [6]:
x_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
85,67,0,2,115,564,0,0,160,0,1.6,1,0,3
130,54,0,2,160,201,0,1,163,0,0.0,2,1,2
197,67,1,0,125,254,1,1,163,0,0.2,1,2,3
241,59,0,0,174,249,0,1,143,1,0.0,1,0,2
274,47,1,0,110,275,0,0,118,1,1.0,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
192,54,1,0,120,188,0,1,113,0,1.4,1,1,3
235,51,1,0,140,299,0,1,173,1,1.6,2,0,3
58,34,1,3,118,182,0,0,174,0,0.0,2,0,2
39,65,0,2,160,360,0,0,151,0,0.8,2,0,2


In [7]:
x_test

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
154,39,0,2,138,220,0,1,152,0,0.0,1,0,2
177,64,1,2,140,335,0,1,158,0,0.0,2,0,2
196,46,1,2,150,231,0,1,147,0,3.6,1,0,2
179,57,1,0,150,276,0,0,112,1,0.6,1,1,1
101,59,1,3,178,270,0,0,145,0,4.2,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,51,1,2,94,227,0,1,154,1,0.0,2,1,3
248,54,1,1,192,283,0,0,195,0,0.0,2,1,3
287,57,1,1,154,232,0,0,164,0,0.0,2,1,2
198,62,1,0,120,267,0,1,99,1,1.8,1,2,3


In [8]:
x_test

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
154,39,0,2,138,220,0,1,152,0,0.0,1,0,2
177,64,1,2,140,335,0,1,158,0,0.0,2,0,2
196,46,1,2,150,231,0,1,147,0,3.6,1,0,2
179,57,1,0,150,276,0,0,112,1,0.6,1,1,1
101,59,1,3,178,270,0,0,145,0,4.2,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,51,1,2,94,227,0,1,154,1,0.0,2,1,3
248,54,1,1,192,283,0,0,195,0,0.0,2,1,3
287,57,1,1,154,232,0,0,164,0,0.0,2,1,2
198,62,1,0,120,267,0,1,99,1,1.8,1,2,3


In [9]:
# Make a prediction
y_label = clf.predict(np.array([1, 2, 3, 4]))



ValueError: Expected 2D array, got 1D array instead:
array=[1. 2. 3. 4.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
y_pred = clf.predict(x_test)
y_pred

In [None]:
y_test

In [None]:
# 4. Evaluating the model
clf.score(x_train, y_train)

In [None]:
clf.score(x_test, y_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score 

print(classification_report(y_test, y_pred))

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
# 5. Improve a model
# Try different amount of n_estimation
np.random.seed(42)
for i in range(10, 100, 10):
    print(f'Trying model with {i} estimators...')
    clf = RandomForestClassifier(n_estimators=i).fit(x_train, y_train)
    print(f"Model accuracy on set: {clf.score(x_test, y_test) * 100:.2f}%")
    print("")

In [None]:
# 6. save the model and load it
import pickle

pickle.dump(clf, open("random_forst_model_1.pkl", "wb"))

In [None]:
load_model = pickle.load(open("random_forst_model_1.pkl", "rb"))
load_model.score(x_test, y_test)

In [None]:
import sklearn
sklearn.show_versions()

In [None]:
what_were_covering = ["0. An end-to-end Scikit-learn workflow",
"1. Getting the data ready",
"2. Choose the right estimator/algorithm for our problems",
"3. Fit the model/algorithm and use it to make predictions on our data",
"4. Evaluating a model",
"5. Improve a model",
"6. Save and load a traned model",
"7. Putting it all together!",]

In [None]:
what_were_covering

In [None]:
# Standard import 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## 1. Getting our data ready to be used with machine learning 

Thee main things we have to do:

    1. Split the data into features and labels (usually `x` & `y`)
    2. Filling (also called imputting) or disregarding missing values
    3. Converting non-numericals values to numericals values( alsoi called feature enconding)

In [None]:
heart_disease

In [None]:
x= heart_disease.drop("target", axis=1) # axis=1 ==> columns
x.head()

In [None]:
y = heart_disease["target"]
y

In [None]:
# Split our data into train and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, ytrain, y_test = train_test_split(x, y, test_size=.2)

In [None]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

In [None]:
x.shape[0] * 0.8

In [None]:
len(heart_disease)

## 1.1 Make sure it's all numerical

In [None]:
car_sales = pd.read_csv("../data/car-sales-extended.csv")
car_sales.head()

In [None]:
len(car_sales)

In [None]:
car_sales.dtypes

In [None]:
# Split the data
x = car_sales.drop("Price", axis=1)
y = car_sales["Price"]

# Split into train and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2)

In [None]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(x_train,y_train)
model.score(x_test, y_test)

In [None]:
# Turn the categorimes into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                 one_hot,
                                 categorical_features)],
                               remainder="passthrough")

transformed_x = transformer.fit_transform(x)
transformed_x

In [None]:
pd.DataFrame(transformed_x)

In [None]:
# ncode our data using Pandas
dummies = pd.get_dummies(car_sales[["Make", "Colour", "Doors"]])
dummies

In [None]:
# Let's refit the model
np.random.seed(42) 
x_train, x_test, y_train, y_test = train_test_split(transformed_x, 
                                                    y, 
                                                    test_size=.2)
model.fit(x_train, y_train)

In [None]:
model.score(x_test, y_test)

### 1.2 What if we have missing values ?

1. Fill them with some value(also know as imputation)
2. Remove the samples with missing data altogether

In [None]:
# Import car sales missing data
car_sales_missing = pd.read_csv("../data/car-sales-extended-missing-data.csv")
car_sales_missing.head()

In [None]:
car_sales_missing.isna().sum()

In [None]:
car_sales_missing

In [None]:
# Crate x/y
x = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

In [None]:
#  Let's try and convert into number
# Turn the categorimes into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                 one_hot,
                                 categorical_features)],
                               remainder="passthrough")

transformed_x = transformer.fit_transform(x)
transformed_x

In [None]:
transformed_x

## option 1: Fill missing data with Pandas

In [None]:
# Fill the "Make" column 
car_sales_missing['Make'].fillna("missing", inplace=True)

# Fill the "Colour" column 
car_sales_missing['Colour'].fillna("missing", inplace=True)

# Fill the "Odometer (KM)" column 
car_sales_missing['Odometer (KM)'].fillna(car_sales_missing['Odometer (KM)'].mean(), inplace=True)

# Fill the "Doors" column 
car_sales_missing['Doors'].fillna(4, inplace=True)

In [None]:
car_sales_missing.isna().sum()

In [None]:
# Remove rows with missing Prince value
car_sales_missing.dropna(inplace=True)


In [None]:
car_sales_missing.isna().sum()

In [None]:
len(car_sales_missing)

In [None]:
x = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

In [None]:
#  Let's try and convert into number
# Turn the categorimes into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                 one_hot,
                                 categorical_features)],
                               remainder="passthrough")

transformed_x = transformer.fit_transform(car_sales_missing)
transformed_x

## Option 2: Fill Missing values using Scikit-Learn

In [None]:
car_sales_missing = pd.read_csv("../data/car-sales-extended-missing-data.csv")
car_sales_missing

In [None]:
car_sales_missing.isna().sum()

In [None]:
# Drop columns with null labels
car_sales_missing.dropna(subset=["Price"], inplace=True)
car_sales_missing.isna().sum()

In [None]:
# Split into x & y
x = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

In [None]:
# fill missing values with Scikit-learn 
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Fill categorical values with 'missing' and numecical values with mean
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
num_imputer = SimpleImputer(strategy="mean")

# Define columns
cat_features = ['Make', 'Colour']
door_features = ['Doors']
num_features = ['Odometer (KM)']

# Create an imputer  (something that fills missing data)
imputer = ColumnTransformer([
    ("cat_imputer", cat_imputer, cat_features),
    ("door_imputer", door_imputer, door_features),
    ("num_imputer", num_imputer, num_features)
])

# Transform the data
filled_x = imputer.fit_transform(x)
filled_x

In [None]:
car_sales_filled = pd.DataFrame(filled_x, 
                                columns=["Make", "Colour", "Doors", "Odometer (KM)"])
car_sales_filled

In [None]:
car_sales_filled.isna().sum()

In [None]:
# Let's try and convert into number
# Turn the categorimes into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                 one_hot,
                                 categorical_features)],
                               remainder="passthrough")

transformed_x = transformer.fit_transform(car_sales_filled)
transformed_x

In [None]:
# Now we've got our data as numbers and filled (no missing values )
# Let's fit a model
np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(transformed_x,y, test_size=.2)
model = RandomForestRegressor()
model.fit(x_train, y_train)
model.score(x_test, y_test)

## 2. Chosing the right estimator/algorithm for our problem

Scikit-learn uses estimator as another term for machine learning model or algorithm. 

* Classifcation - predicting whether a sample is one thing or another 
* Regression - predicting a number 

Step 1 - Check the Scikit-Learn machine learning map...https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

### 2.1 Picking the right machine learning model for a regression problem

In [None]:
# Import Bostom housing dataset
# from sklearn.datasets import load_boston
# boston = load_boston()
# boston
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
housing

In [None]:
housing_df = pd.DataFrame(housing['data'], columns=housing['feature_names'])
housing_df['target'] = pd.Series(housing['target'])
housing_df

In [None]:
# How many samples?
len(housing_df)

In [None]:
# Let's try Ridge regression model 
from sklearn.linear_model import Ridge

# Setup random seed
np.random.seed(42)

# Create the data 
x = housing_df.drop("target", axis=1)
y = housing_df["target"]

# Split into train and test data
X_train, X_test,y_train, y_test = train_test_split(x, y, test_size=.2)

# Instantiate Ridge model
model =Ridge()
model.fit(X_train, y_train)

# Check the score of the ridge model on the data
model.score(X_test, y_test)

How do we improve thiks score? 

What if Ridge wasn't working?

let's refer back to our Scikit-learning map

In [None]:
# Let's try the random forest regressor
from sklearn.ensemble import RandomForestRegressor

# Setup random seed 
np.random.seed(42)

# Create the data 
x = housing_df.drop('target', axis=1)
y = housing_df["target"]

# Split into train and test data
X_train, X_test,y_train, y_test = train_test_split(x, y, test_size=.2)

# Instantiate Ridge model
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

# Evaluate the random forest regressor
rf.score(X_test, y_test)

In [None]:
# Check the ridge model again
model.score(X_test, y_test)

### 2.2 Choosing an estimator for classification problem

In [None]:
heart_disease = pd.read_csv("../data/heart-disease.csv")
heart_disease.head()

In [None]:
len(heart_disease)

Consulting the map and it says to try `Linear SVC`

In [None]:
# Import the LinearSVC estimator class
from sklearn.svm import LinearSVC

# Setup random seed 
np.random.seed(42)

# Make the data
X = heart_disease.drop("target", axis=1)
y = heart_disease['target']

# Split our data into train & test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

# Instantiate LinearSVC
clf = LinearSVC(max_iter=1000)
clf.fit(X_train, y_train)

# Evaluate the LinearSVC
clf.score(X_test, y_test)

In [None]:
# Import the RandomForestClassifier estimator class
from sklearn.ensemble import RandomForestClassifier

# Setup random seed 
np.random.seed(42)

# Make the data
X = heart_disease.drop("target", axis=1)
y = heart_disease['target']

# Split our data into train & test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

# Instantiate Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

# Evaluate the LinearSVC
clf.score(X_test, y_test)

Tidbit:

    1. If you have structured data, use ensemble methods
    2. If you have unstructured data(videos, images, audio...), use deep learning or transfer learning

## 3. Fit the model/algorithm and use it to make predictions on our data

### 3.1 Fitting the model to our data

Different names for:
* X = features, features variables, data
* y = labels, targets, target variables

In [None]:
# Import the RandomForestClassifier estimator class
from sklearn.ensemble import RandomForestClassifier

# Setup random seed 
np.random.seed(42)

# Make the data
X = heart_disease.drop("target", axis=1)
y = heart_disease['target']

# Split our data into train & test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

# Instantiate Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100)

# Fit the model to the data (training the machine learning model)
clf.fit(X_train, y_train)

# Evaluate the Random Forest Classifier (use the patterns the model has learn)
clf.score(X_test, y_test)

In [None]:
X.head()

In [None]:
y.tail()

### 3.2 Make preddiction using the machine learning model

2 ways to make predictions:

1. predict()
2. predict_proba()

In [None]:
# Use a train model to make prediction
clf.predict(np.array[1, 7, 8, 3, 4])  # This doesn't work...

In [None]:
X_test.shape

In [None]:
clf.predict(X_test)

In [None]:
np.array(y_test)

In [None]:
np.array(y_test) == clf.predict(X_test)

In [None]:
# Compare predictions to truth labels to evaluate the model
y_preds = clf.predict(X_test)
np.mean(y_preds == y_test)

In [None]:
clf.score(X_test, y_test)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_preds)

Make predictions with `predict_proba()`

In [None]:
# predict_proba() returns the probabilities of a classification labels
clf.predict_proba(X_test[:5])

In [None]:
# Let's predict() on the same data...
clf.predict(X_test[:5])

`predict()` can also be used for regression models.

In [None]:
# Booston data sets
housing_df.head()

In [None]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

# Create the data
X = housing_df.drop("target", axis=1)
y =  housing_df["target"]

# Split our data into train & test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

# Initiate the model & fit the model
model = RandomForestRegressor().fit(X_train, y_train)
# model.fit(X_train, y_train)

# Make prediction
y_preds = model.predict(X_test)
y_preds

In [None]:
X

In [None]:
y

In [None]:
# Check the score of our model
model.score(X_test, y_test)

In [None]:
# Compare the prediction to the truth
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, y_preds)

In [None]:
what_were_covering

## 4. Evaluating a machine learning model

Three ways to evaluate a machine learning algo
1. Estimator `score` method
2. The `scoring` parameter
3. Problem-specific metric function

### 4.1 Evaluating a model with the score method

In [None]:
from sklearn.ensemble import RandomForestClassifier

np.random.seed()

X = heart_disease.drop("target", axis=1)
y = heart_disease['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

clf = RandomForestClassifier()

clf.fit(X_train, y_train)

In [None]:
clf.score(X_train, y_train)

In [None]:
clf.score(X_test, y_test)

Let's do the same but for regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

# Create the data
X = housing_df.drop("target", axis=1)
y =  housing_df["target"]

# Split our data into train & test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

# Initiate the model & fit the model
# model = RandomForestRegressor().fit(X_train, y_train)
model = RandomForestRegressor()
model.fit(X_train, y_train)


In [None]:
# Let's check the score
model.score(X_train, y_train)

In [None]:
model.score(X_test, y_test)

## 4.2 Evaluating the model using scoring 

In [None]:
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestClassifier

np.random.seed()

X = heart_disease.drop("target", axis=1)
y = heart_disease['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

clf = RandomForestClassifier(n_estimators=100)

clf.fit(X_train, y_train)

In [None]:
clf.score(X_test, y_test)

In [None]:
cross_val_score(clf, X, y)

In [None]:
cross_val_score(clf, X, y, cv=10)

In [None]:
np.random.seed(42)

# Single training and test split score
clf_single_score = clf.score(X_test, y_test)

# Take the mean of 5-flod cross-validation score
clf_cross_val_score = np.mean(cross_val_score(clf, X, y, cv=5))

# Compare the two
clf_single_score, clf_cross_val_score

In [None]:
# Default scoring parameter = mean accuracy
clf.score()

In [None]:
# Scoring parameter set to null be default
cross_val_score(clf, X, y, cv=5, scoring=None)

## 4.2.1 Classification model evaluation metrics

1. Accuracy
2. Area under ROC curve
3. Confusion matrix
4. Classification report

In [None]:
heart_disease.head()

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

X = heart_disease.drop('target', axis=1)
y = heart_disease['target']

clf = RandomForestClassifier(n_estimators=100)
cross_val_score = cross_val_score(clf, X, y, cv=5)

In [None]:
np.mean(cross_val_score)

In [None]:
print(f"Heart Disease Classifier Cross-validated Accuracy: {np.mean(cross_val_score) * 100:.2f}%")

**Area under the receiver operating characteristic curve (AUC/ROC)**

* Area under the curve(AUC)
* ROC curve

ROC curves are a comparaison of a model's true positive rate (tpr) versus a models false positive rate (fpr).

* True positive = model predicts 1 when truth is 1
* False positive = model predicts 1 when the truth is 0
* True negative = model predicts 0 when truth is 0
* Falsenegative = model predicts 0 when truth is 1

In [None]:
# Create X_train... etc 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [None]:
from sklearn.metrics import roc_curve

# Fit the classifier
clf.fit(X_train, y_train)

# Make predictions with the probabilities
y_probs = clf.predict_proba(X_test)

y_probs[:10]

In [None]:
y_probs_positive = y_probs[:, 1]
y_probs_positive[:10]

In [None]:
# Calculate fpr, tpr and thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_probs_positive)

# Check the false positive
fpr

In [None]:
tpr

In [None]:
# Create the function for plotting ROC curve
import matplotlib.pyplot as plt

def plot_roc_curve(fpr, tpr):
    """
    Plots a ROC curve given the false positive rate(fpr)
    and true positive rate (tpr) of a model.
    """
    # Plot roc curve
    plt.plot(fpr, tpr, color='orange', label='ROC')
    # Plot line with no predictive power (baselines)
    plt.plot([0, 1], [0,1], color="darkblue", linestyle=":", label="Guessing")
    
    # Customize the plot
    plt.xlabel("False positive rate (fpr)")
    plt.ylabel("True positive rate (tpr)")
    plt.title("Receiver Operating Characteristic(ROC) curve")
    plt.legend()
    plt.show()
    

plot_roc_curve(fpr, tpr)
    

In [None]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_test, y_probs_positive)

In [None]:
# Plot perfect ROC curve and AUC score
fpr, tpr, thresholds = roc_curve(y_test, y_test)
plot_roc_curve(fpr, tpr)

In [None]:
# Perfect AUC score
roc_auc_score(y_test, y_test)

**Confusion Matrix**

A confusion matrix is a quick way to compare the labels a model predicts and labels it was supposed to predict.
In essence, giving you an idea of where the model is getting confused.

In [None]:
from sklearn.metrics import confusion_matrix

y_preds = clf.predict(X_test)

confusion_matrix(y_test, y_preds)

In [None]:
# Visualize the confusion matrix with pd.crosstab()
pd.crosstab(y_test, y_preds, rownames=["Actual Labels"], colnames=["Predicted Labels"])

In [None]:
22+7+8+24


In [None]:
len(X_test)

In [None]:
# How to install a conda package from a jupyter notebook
import sys
!conda install --yes --prefix {sys.prefix} seaborn

# this is to allow the system to install the seaborn library while running the program

In [None]:
# Make our confusion matrix more visual with seaborn heatmap
import seaborn as sns

# Set the scale
sns.set(font_scale=1.5)

# Create a confusion matrix
conf_mat = confusion_matrix(y_test, y_preds)

# Plot it using Seaborn
sns.heatmap(conf_mat);

In [None]:
def plot_conf_mat(conf_mat_input): 
    """
    Plots a confusion matrix using Seaborn's heatmap().
    """
    fig, ax = plt.subplots(figsize=(3, 3))
    ax = sns.heatmap(conf_mat_input,
                    annot=True # Annotate the boxes with conf_mat info
                    )
    plt.xlabel("True label")
    plt.ylabel("predicted label");
    
    # Fix the broken annatation (this happened in Matplotlib 3.1.1)
    # bottom, top = ax.get_ylim()
    # ax.set_ylim(bottom +0.5, top - 0.5)
    
    
plot_conf_mat(conf_mat)


Creating a confusion matrix using Scikit-Learn

Scikit-Learn has multiple different implementations of plotting confusion matrices:

    sklearn.metrics.ConfusionMatrixDisplay.from_estimator(estimator, X, y) - this takes a fitted estimator (like our clf model), features (X) and labels (y), it then uses the trained estimator to make predictions on X and compares the predictions to y by displaying a confusion matrix.
    sklearn.metrics.ConfusionMatrixDisplay.from_predictions(y_true, y_pred) - this takes truth labels and predicted labels and compares them by displaying a confusion matrix.

    Note: Both of these methods/classes require Scikit-Learn 1.0+. To check your version of Scikit-Learn run:

import sklearn
sklearn.__version__

    If you don't have 1.0+, you can upgrade at: https://scikit-learn.org/stable/install.html



In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_estimator(estimator=clf, X=X, y=y);

In [None]:
# Plot confusion matrix from predictions
ConfusionMatrixDisplay.from_predictions(y_true=y_test, 
                                        y_pred=y_preds);

**Classification Report**

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_preds))

In [None]:
# Where precision and recall become valuable
disease_true = np.zeros(10000)
disease_true[0] = 1 # Only positive case

disease_preds = np.zeros(10000) # model predicts every case as 0

pd.DataFrame(classification_report(disease_true, disease_preds, output_dict=True))

To summarize classification metrics:

* **Accuracy** is a good measure to start with if all classes are balanced(e.g. same amount of samples which are labelled with 0 or 1)
* **Precision** and **recall** become important when classes are imbalanced.
* if false positive predictions are worse than false negatives, aim for higher precision
* If false negative predictions are worse than false positives, aim for higher recall.
* **F1-score** is a combinaition of precision and recall.

### 4.2.2 Regression model evaluation matrix

model evaluation metrics documentation - https://scikit-learn.org/stable/modules/model_evaluation.html

1. R^2 (pronounced r-squared) or coefficient of determination.
2. Mean absulute error (MAE)
3. Mean squared error (MSE)

**R^2**

What R-squared does: Compare our model prediction to the mean of the target. Value of R-squared can range from negative infinity  (a very poor model) to 1. For example, if all our model does is predict the mean of the targets it's R^2 value will be 0. and if our model perfectly predicts a range of numbers it's R^2 value would be 1.

In [None]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

X = housing_df.drop("target", axis=1)
y = housing_df["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = RandomForestRegressor(n_estimators=100)
model.fit(X_train, y_train)

In [None]:
model.score(X_test, y_test)

In [None]:
from sklearn.metrics import r2_score

# Fill a array with y_test mean
y_test_mean = np.full(len(y_test), y_test.mean())

In [None]:
y_test_mean

In [None]:
y_test_mean.mean()

In [None]:
# Model only predicting the mean gets an R^2 score of 0
r2_score(y_test, y_test_mean)

In [None]:
# Model predicting perfectly the correct values gets an R^2 score of 1
r2_score(y_test, y_test)

**Meanabsolute error (MAE)**

MAE is the average of the difference b/w predictions and actual values. It gives us an idea of how wrong our model predictions are.

In [None]:
# Mean absolute error
from sklearn.metrics import mean_absolute_error

y_preds = model.predict(X_test)
mae = mean_absolute_error(y_test, y_preds)
mae

In [None]:
df = pd.DataFrame(data={
    "actual values": y_test,
    "predicted values": y_preds
})
df["difference"] = df["predicted values"] - df["actual values"]
df

**Mean squred error (MSE)**

In [None]:
# Mean squared  error
from sklearn.metrics import mean_squared_error

y_preds = model.predict(X_test)
mse = mean_squared_error(y_test, y_preds)
mse

In [None]:
# Calculate MSE by hand
squared = np.square(df["difference"])
squared.mean()



Evaluating the results of a machine learning model is as important as building one.

But just like how different problems have different machine learning models, different machine learning models have different evaluation metrics.

Below are some of the most important evaluation metrics you'll want to look into for classification and regression models.

Classification Model Evaluation Metrics/Techniques

    Accuracy - The accuracy of the model in decimal form. Perfect accuracy is equal to 1.0.

    Precision - Indicates the proportion of positive identifications (model predicted class 1) which were actually correct. A model which produces no false positives has a precision of 1.0.

    Recall - Indicates the proportion of actual positives which were correctly classified. A model which produces no false negatives has a recall of 1.0.

    F1 score - A combination of precision and recall. A perfect model achieves an F1 score of 1.0.

    Confusion matrix - Compares the predicted values with the true values in a tabular way, if 100% correct, all values in the matrix will be top left to bottom right (diagonal line).

    Cross-validation - Splits your dataset into multiple parts and train and tests your model on each part then evaluates performance as an average.

    Classification report - Sklearn has a built-in function called classification_report() which returns some of the main classification metrics such as precision, recall and f1-score.

    ROC Curve - Also known as receiver operating characteristic is a plot of true positive rate versus false-positive rate.

    Area Under Curve (AUC) Score - The area underneath the ROC curve. A perfect model achieves an AUC score of 1.0.

Which classification metric should you use?

    Accuracy is a good measure to start with if all classes are balanced (e.g. same amount of samples which are labelled with 0 or 1).

    Precision and recall become more important when classes are imbalanced.

    If false-positive predictions are worse than false-negatives, aim for higher precision.

    If false-negative predictions are worse than false-positives, aim for higher recall.

    F1-score is a combination of precision and recall.

    A confusion matrix is always a good way to visualize how a classification model is going.

Regression Model Evaluation Metrics/Techniques

    R^2 (pronounced r-squared) or the coefficient of determination - Compares your model's predictions to the mean of the targets. Values can range from negative infinity (a very poor model) to 1. For example, if all your model does is predict the mean of the targets, its R^2 value would be 0. And if your model perfectly predicts a range of numbers it's R^2 value would be 1.

    Mean absolute error (MAE) - The average of the absolute differences between predictions and actual values. It gives you an idea of how wrong your predictions were.

    Mean squared error (MSE) - The average squared differences between predictions and actual values. Squaring the errors removes negative errors. It also amplifies outliers (samples which have larger errors).

Which regression metric should you use?

    R2 is similar to accuracy. It gives you a quick indication of how well your model might be doing. Generally, the closer your R2 value is to 1.0, the better the model. But it doesn't really tell exactly how wrong your model is in terms of how far off each prediction is.

    MAE gives a better indication of how far off each of your model's predictions are on average.

    As for MAE or MSE, because of the way MSE is calculated, squaring the differences between predicted values and actual values, it amplifies larger differences. Let's say we're predicting the value of houses (which we are).

        Pay more attention to MAE: When being $10,000 off is twice as bad as being $5,000 off.

        Pay more attention to MSE: When being $10,000 off is more than twice as bad as being $5,000 off.

For more resources on evaluating a machine learning model, be sure to check out the following resources:

    Scikit-Learn documentation for metrics and scoring (quantifying the quality of predictions)

    Beyond Accuracy: Precision and Recall by Will Koehrsen

    Stack Overflow answer describing MSE (mean squared error) and RSME (root mean squared error)



### 4. FInally using the `Scoring` parameter



In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

clf = RandomForestClassifier()
clf

In [None]:
np.random.seed(42)
cv_acc = cross_val_score(clf, X, y, cv=5)
cv_acc

In [None]:
# Cross-validated accuracy
print(f"The cross-validated accuracy is: {np.mean(cv_acc)*100:.2f}%")

In [None]:
np.random.seed(42)
cv_acc = cross_val_score(clf, X, y, cv=5, scoring="accuracy")
print(f"The cross-validated accuracy is: {np.mean(cv_acc)*100:.2f}%")

In [None]:
# Precision
cv_precision = cross_val_score(clf, X, y, cv=5, scoring="precision")
np.mean(cv_precision)

In [None]:
# Recall
cv_recall = cross_val_score(clf, X, y, cv=5, scoring="recall")
np.mean(cv_recall)

In [None]:
cv_f1 = cross_val_score(clf, X, y, scoring="f1")
np.mean(cv_f1)

# How about our regression model?

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

X = housing_df.drop("target", axis=1)
y = housing_df["target"]

model = RandomForestRegressor(n_estimators=100)
model

In [None]:
X

In [None]:
y

In [None]:
np.random.seed(42)
cv_r2 = cross_val_score(model, X, y,scoring=None)
cv_r2

In [None]:
np.random.seed(42)
cv_r2 = cross_val_score(model, X, y, cv=5, scoring="r2")
cv_r2
# print(f"The cross-validated accuracy is: {np.mean(cv_r2)*100:.2f}%")

In [None]:
# Mean absolute error
np.random.seed(42)
cv_mae = cross_val_score(model, X, y, cv=5, scoring="neg_mean_absolute_error")
cv_mae
# print(f"The cross-validated accuracy is: {np.mean(cv_r2)*100:.2f}%")

In [None]:
# Mean squared error
cv_mse = cross_val_score(model, X, y, cv=5, scoring="neg_mean_squared_error")
np.mean(cv_mse)

### 4.3 Using different evaluation metrics as Scikit-Learning functions

**Classification evaluation function**

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

np.random.seed(42)

X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

# Make some predictions
y_preds = clf.predict(X_test)

# Evaluate the classifier
print("Classifier metrics on the test set")
print(f"Accurancy: {accuracy_score(y_test, y_preds)*100:.2f}%")
print(f"Precision: {precision_score(y_test, y_preds)}")
print(f"F1: {f1_score(y_test, y_preds)}")

**Regression Evaluation functions**

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error,mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

np.random.seed(42)

X = housing_df.drop("target", axis=1)
y = housing_df["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

model = RandomForestRegressor(n_estimators=100)
model.fit(X_train, y_train)

# Make predictions using regression model
y_preds = model.predict(X_test)

# Evaluate the regression model
print("Regression model metrics on the test set")
print(f"R^2: {r2_score(y_test, y_preds)}")
print(f"MAE: {mean_absolute_error(y_test, y_preds)}")
print(f"MSE: {mean_squared_error(y_test, y_preds)}")

In [None]:
what_were_covering

## 5. Improving a model

First prediction = baseline predictions.
First model = baseline model.

From a data perspective:
* Could we collect more data? (generaly, the more data, the better)
* Could we imporve our data? 

From a model perspective: 
* Is therre a better model we could use?
* Could we improve the current model?

Hyperparameters vs. paraameters
* Parameters = model find these patterns in data
* Hyperparameters = settings on a model you can adjust to (potentially) improve its ability to find patterns

Three ways to adjust hyperparameters:
1. By hand
2. Randomly with RandomSearchCV
3. Exhaustively with GridSearchCV

In [None]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf.get_params()

## 5.1 Turing hyperparameters by hand

Let's make three sets, Training, Validation and Test 


We are going to try and adjust: 
    
    *  `max_depth`
    *  `max_features`
    *  `min_sample_leaf`
    *  `min_sample_split`   
    *  `n_estimators`        

In [None]:
def evaluate_preds(y_true, y_preds):
    """
    Perform evaluation comparaison on y_true labels vs. y_preds labels
    on a classification.
    """
    accuracy = accuracy_score(y_true, y_preds)
    precision = precision_score(y_true, y_preds)
    recall = recall_score(y_true, y_preds)   
    f1 = f1_score(y_true, y_preds)  
    metric_dict = {
        'accuracy': round(accuracy, 2),
        'precision': round(precision,2),
        'recall': round(recall, 2),
        'f1': round(f1, 2)
    }
    print(f"Acc: {accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 score: {f1:.2f}")  
    
    return metric_dict

In [None]:
heart_disease

In [None]:
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

# Shuffle the data
heart_disease_suffled = heart_disease.sample(frac=1)

# Split into x & y

X = heart_disease_suffled.drop("target", axis=1)
y = heart_disease_suffled["target"]

# Split the data into train, validation & test sets
train_split = round(.7 * len(heart_disease_suffled)) # 70% of data
valid_split = round(train_split + .15 * len(heart_disease_suffled)) # 15% of data
X_train, y_train = X[:train_split], y[:train_split]
X_valid, y_valid = X[train_split:valid_split], y[train_split:valid_split]
X_test, y_test = X[valid_split:], y[valid_split: ]

# len(X_train), len(X_valid), len(X_test)
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# mAKE baseline PREDICTIONS
y_preds = clf.predict(X_valid)

# Evaluate the classifier on validation set
baseline_metrics = evaluate_preds(y_valid, y_preds)

baseline_metrics

In [None]:
clf.get_params()

In [None]:
np.random.seed(42)

# Create the classifier with different hyperparameters
clf_2 = RandomForestClassifier(n_estimators=100)
clf_2.fit(X_train, y_train)

# Making predictions with different hyperparameters
y_preds_2 = clf_2.predict(X_valid)

# Make predictions with different hyperparameters
clf_2_metrics = evaluate_preds(y_valid, y_preds_2)

In [None]:
np.random.seed(42)

# Create the classifier with different hyperparameters
clf_3 = RandomForestClassifier(n_estimators=100000, max_depth=100000)
clf_3.fit(X_train, y_train)

# Making predictions with different hyperparameters
y_preds_3 = clf_3.predict(X_valid)

# Make predictions with different hyperparameters
clf_3_metrics = evaluate_preds(y_valid, y_preds_3)

### 5.2 Hyperparameters turing with RandomizedSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV

grid = {"n_estimators": [10, 100, 200, 500, 1000, 1200],
       "max_depth": [None, 5, 10, 20, 30],
        "max_features": ["auto", "sqrt"],
        "min_samples_split": [2, 4, 6],
        "min_samples_leaf": [1,2,4,]
       }

np.random.seed(42)

# Split into x & y
X = heart_disease_suffled.drop("target", axis=1)
y = heart_disease_suffled["target"]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

# Instantiate RandomForestClassifier
clf = RandomForestClassifier(n_jobs=1)

# Setup RandomizedSearchCV
rs_clf = RandomizedSearchCV(estimator=clf,
                           param_distributions=grid,
                           n_iter=10, #number of models try
                           cv=5,
                           verbose=2)

# Fit the RandomizedSearchCV version of clf
rs_clf.fit(X_train, y_train)

In [None]:
rs_clf.best_params_

In [None]:
# Make predictions with the best hyper
rs_y_preds = rs_clf.predict(X_test)

# /Evaluate the predictions
rs_metrics = evaluate_preds(y_test, rs_y_preds)

In [None]:
grid_2 = {"n_estimators": [10, 100, 200, 500],
       "max_depth": [None],
        "max_features": ["auto", "sqrt"],
        "min_samples_split": [6],
        "min_samples_leaf": [1, 2]
       }

In [10]:
from sklearn.model_selection import GridSearchCV, train_test_split

np.random.seed(42)

# Split into x & y
X = heart_disease_suffled.drop("target", axis=1)
y = heart_disease_suffled["target"]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

# Instantiate RandomForestClassifier
clf = RandomForestClassifier(n_jobs=1)

# Setup GridSearchCV                                                                                                                                                                
gs_clf = GridSearchCV(estimator=clf,
                           param_grid=grid_2,
                           cv=5,
                           verbose=2)

# Fit the GridSearchCV version of clf
gs_clf.fit(X_train, y_train);

NameError: name 'heart_disease_suffled' is not defined

In [None]:
gs_clf.best_params_

In [None]:
gs_clf_preds = gs_clf.predict(X_test)

# Evaluate the prediction
gs_metrics = evaluate_preds(y_test, gs_clf_preds)

Let's compare our different model metrics

In [None]:
compare_metrics = pd.DataFrame({"baseline": baseline_metrics,
                                "clf_2": clf_2_metrics,
                                "random search": rs_metrics,
                                "grid search": gs_metrics
                               })
compare_metrics.plot.bar(figsize=(10, 8));

In [None]:
compare_metrics

## 6. Savind and loading trained machine learning models

2 wayns to do the same
1. With Python's `pickle` module
2. With the `jobLib` module

**Pickle**

In [None]:
import pickle

# Save and existing model to file
pickle.dump(gs_clf, open("../export/gs_random_forest.pkl", "wb"))

In [11]:
# load a saved model
load_pickle_model = pickle.load(open("../export/gs_random_forest.pkl", "rb"))

NameError: name 'pickle' is not defined

In [None]:
# Make predictions 
pickle_y_preds = load_pickle_model.predict(X_test)
evalluate_preds(y_test, pickle_y_preds)

**Job Lib**

In [None]:
from joblib import dump, load

# Save model to file
dump(gs_clf, filename="../export/gs_random_forest.joblib")

In [None]:
# Load the saved joblib model
loaded_job_lib_model = load(filename="../export/gs_random_forest.joblib")

In [None]:
# Make and evalute Joblib predictions
joblib_y_preds = loaded_job_lib_model.predict(X_test)
evalluate_preds(y_test, joblib_y_preds)

## 7. Putting all together

In [12]:
data = pd.read_csv("../data/car-sales-extended-missing-data.csv")
data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [13]:
# To view a data type of our Data Frame
data.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

In [15]:
# To check missing value in each column
data.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

Step we want to do (all in one cell):
    
    1. Fill missing data
    2. Convert data to numbers
    3. Build a model on the data
    

In [19]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Modelling
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

# Setup random seed
np.random.seed(42)

# Import data and drop rows which contains missing labels
data = pd.read_csv("../data/car-sales-extended-missing-data.csv")
data.dropna(subset=["Price"], inplace=True)

# Define different features and transfomer pi[peline
categorical_features =["Make", "Colour"]
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing"), 
     ("onehot", OneHotEncoder(handle_unknown="ingnore"))
    )
])

door_features = ["Doors"]
door_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value=4))
])

numeric_features = ["Odometer (KM)"]
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean"))
])

# Setup the preprocessing step (Fill missing value then convert to numbers)
preprocessor = ColumnTransformer(transformers=[
    ("cat", categorical_transformer, categorical_features),
    ("door", door_transformer, door_features),
    ("num", numeric_transformer, numeric_features)
])

# Creating a preprocessing and modelling pipeline
model = Pipeline(steps=[("preprocessor", preprocessor),
                        ("model", RandomForestRegressor())])

# Split data
X = data.drop("Price", axis=1)
y = data["Price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit and score the model
# model.fit(X_train, y_train)
# model.score(X_test, y_test)


# Fit and score the model
model.fit(X_train, y_train)
model.score(X_test, y_test)

ValueError: too many values to unpack (expected 2)

In [22]:
# Getting data ready
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Modelling
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

# Setup random seed
import numpy as np
np.random.seed(42)

# Import data and drop rows with missing labels
data = pd.read_csv("../data/car-sales-extended-missing-data.csv")
data.dropna(subset=["Price"], inplace=True)

# Define different features and transformer pipeline
categorical_features = ["Make", "Colour"]
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))])

door_feature = ["Doors"]
door_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value=4))
])

numeric_features = ["Odometer (KM)"]
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean"))
])

# Setup preprocessing steps (fill missing values, then convert to numbers)
preprocessor = ColumnTransformer(
                    transformers=[
                        ("cat", categorical_transformer, categorical_features),
                        ("door", door_transformer, door_feature),
                        ("num", numeric_transformer, numeric_features)
                    ])

# Creating a preprocessing and modelling pipeline
model = Pipeline(steps=[("preprocessor", preprocessor),
                        ("model", RandomForestRegressor())])

# Split data
X = data.drop("Price", axis=1)
y = data["Price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit and score the model
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.22188417408787875

iT'S also possible to use GridSearchCV or RandomizedSearchCV with our Pipeline


In [27]:


# Use GridSearchCV with our regression Pipeline
from sklearn.model_selection import GridSearchCV

pipe_grid = {
    "preprocessor__num__imputer__strategy": ["mean", "median"],
    "model__n_estimators": [100, 1000],
    "model__max_depth": [None, 5],
#     "model__max_features": ["auto"],
    "model__min_samples_split": [2, 4]    
}

gs_model = GridSearchCV(model, pipe_grid, cv=5, verbose=2)
gs_model.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.6s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.5s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.5s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.5s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.5s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time=   0.5s
[CV] END model__max_depth=None, model__min_samples_spli

[CV] END model__max_depth=5, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time=   2.8s
[CV] END model__max_depth=5, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time=   2.7s
[CV] END model__max_depth=5, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time=   2.8s
[CV] END model__max_depth=5, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time=   2.7s
[CV] END model__max_depth=5, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.2s
[CV] END model__max_depth=5, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.2s
[CV] END model__max_depth=5, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=mean;

In [28]:
gs_model.score(X_test, y_test)

0.3339554263158365

In [16]:
preprocessor

In [17]:
X_train

Unnamed: 0,Make,Colour,Odometer (KM),Doors
986,Honda,White,71934.0,4.0
297,Toyota,Red,162665.0,4.0
566,Honda,White,42844.0,4.0
282,Honda,White,195829.0,4.0
109,Honda,Blue,219217.0,4.0
...,...,...,...,...
106,Toyota,,218803.0,4.0
277,BMW,Blue,245427.0,5.0
904,Toyota,White,196225.0,4.0
450,Honda,Blue,133117.0,
