In [None]:
print("Chapter Three")

In [None]:
import os
print("my path is ", os.getcwd())

### Setup
First, let's import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures. We also check that Python 3.5 or later is installed (although Python 2.x may work, it is deprecated so we strongly recommend you use Python 3 instead), as well as Scikit-Learn ≥0.20.

In [None]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required.
# Compare the numeric (major, minor) components: comparing the raw version
# strings is lexicographic, so e.g. "0.9" >= "0.20" would wrongly pass.
import re
import sklearn
_sklearn_version = tuple(int(part) for part in re.findall(r"\d+", sklearn.__version__)[:2])
assert _sklearn_version >= (0, 20), "Scikit-Learn >= 0.20 is required, found " + sklearn.__version__

# Common imports
import numpy as np
import os

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
# This notebook covers Chapter 3 (classification). "end_to_end_project" is the
# Chapter 2 folder name, which would mix this chapter's figures with that one's.
CHAPTER_ID = "classification"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
# exist_ok=True keeps the cell re-runnable once the folder exists
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current Matplotlib figure under IMAGES_PATH.

    Parameters:
        fig_id: base file name (without extension); also echoed in the log line.
        tight_layout: when true, call plt.tight_layout() before saving.
        fig_extension: file extension, also passed to savefig as the format.
        resolution: dots per inch passed to savefig.
    """
    file_name = "".join((fig_id, ".", fig_extension))
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

## Machine Learning Project Checklist
 
1. Look at the big picture.
2. Get the data.
3. Discover and visualize the data to gain insights.
4. Prepare the data for Machine Learning algorithms.
5. Select a model and train it.
6. Fine-tune your model.
7. Present your solution.
8. Launch, monitor, and maintain your system.

# Chapter 3
Using MNIST dataset

In [None]:
# Import the function to fetch datasets from OpenML, an online repository of well-documented datasets
from sklearn.datasets import fetch_openml

# Fetch MNIST dataset containing handwritten digits with 784 features
mnist = fetch_openml('mnist_784', version=1,as_frame=False)

# keys of the MNIST dataset. object contains the data, target (labels), a description of the dataset, and other metadata.
# keys: 'data': the feature matrix,'target': the label array,'feature_names': the names of the features,'DESCR': a full description of the dataset,
# 'categories': the category labels (for categorical features)
mnist_keys = mnist.keys()

In [None]:
# Examine the structure of the data (X) and labels (y)
X, y = mnist["data"], mnist["target"]
# Show the dimensions of the image array and of the label array together.
# Only a cell's last expression is displayed, so a bare `X.shape` on an
# earlier line would be silently discarded.
X.shape, y.shape

In [None]:
import matplotlib.pyplot as plt
import numpy as np
#an instance’s feature vector, reshape it to a 28 × 28 array, and display it using Matplotlib’s imshow() function:
some_digit = X[0]
some_digit_image = some_digit.reshape(28, 28)
plt.imshow(some_digit_image, cmap="binary")
plt.axis("off")
plt.show()


In [None]:
#double check the label to make sure it is what we think it is
y[0]
#cast the string label to an int since that is what most ML algorithms expect
y = y.astype(np.uint8)

In [None]:
#create a test set first (for this datsset using already given split)
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

## Training a Binary Classifier
Distinguishes between two classes

In [None]:
#target vectors for this classification task
y_train_5 = (y_train == 5) # True for all 5s, False for all other digits
y_test_5 = (y_test == 5)

In [None]:
# pick a classifier and train it
# stochastic gradient descent classifier, train it to differentiate between the digit '5' and other digits in the MNIST dataset 
#The X_train contains the training data features, while y_train_5 is the target variable for the binary classification task
from sklearn.linear_model import SGDClassifier
#random_state gives you reproducible results
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_5)
#detect images of the number 5
sgd_clf.predict([some_digit])

## Performance Measurements

In [None]:
# Measure accuracy using cross-validation, written out by hand:
# for each stratified fold, train a fresh clone of the classifier on the other
# folds, predict on the held-out fold, and print the fraction of correct predictions.
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

skfolds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
for train_index, test_index in skfolds.split(X_train, y_train_5):
    # clone() gives an unfitted copy with the same hyperparameters
    clone_clf = clone(sgd_clf)
    clone_clf.fit(X_train[train_index], y_train_5[train_index])
    y_pred = clone_clf.predict(X_train[test_index])
    n_correct = sum(y_pred == y_train_5[test_index])
    print(n_correct / len(y_pred))

In [None]:
#Use cross_val_score() to evaluate SGDClassifier. using K-fold cross-validation with three folds, ie splitting the training set into K folds 
#then making predictions and evaluating them on each fold using a model trained on the remaining folds
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring="accuracy")
# Above 93% accuracy

In [None]:
#very dumb classifier that just classifies every single image in the “not-5” class:
from sklearn.base import BaseEstimator
class Never5Classifier(BaseEstimator):
    """Baseline "classifier" that predicts False (not-5) for every instance."""

    def fit(self, X, y=None):
        """Nothing to learn; return self so the estimator can be chained."""
        return self

    def predict(self, X):
        """Return an all-False boolean column with one row per instance in X."""
        n_instances = len(X)
        return np.zeros((n_instances, 1), dtype=bool)
    
never_5_clf = Never5Classifier()
cross_val_score(never_5_clf, X_train, y_train_5, cv=3, scoring="accuracy")
#90% accuracy bc about 10% of the images are 5, so not too hard to do

In [None]:
#Great example as to why accuracy is not the preferred performance measure for classifiers
#A better way is a Confusion Matrix
#like the cross_val_score() function, cross_val_predict() performs K-fold cross-validation, but instead of returning the evaluation scores, 
#it returns the predictions made on each test fold.
from sklearn.model_selection import cross_val_predict
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)
# get the confusion matrix using the confusion_matrix() function. pass it the target classes (y_train_5) and the predicted classes (y_train_pred):
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train_5, y_train_pred)
#Each row in a confusion matrix represents an actual class, while each column represents a predicted class.

In [None]:
#comute classifier metrics, measurements used to evaluate the performance of a classification model
#compute the harmonic mean of precision and recall/(F_1) 
from sklearn.metrics import f1_score
f1_score(y_train_5, y_train_pred)

In [None]:
#this decision_function method returns the decision score for each instance
y_scores = sgd_clf.decision_function([some_digit])
y_scores
threshold = 0
y_some_digit_pred = (y_scores > threshold)

In [None]:
#The SGDClassifier uses a threshold equal to 0, so the previous code returns the same result as the predict() method (i.e., True). 
#raise the threshold:
threshold = 8000
y_some_digit_pred = (y_scores > threshold)
y_some_digit_pred
#raising threshold reduces recall and classifier no longer detects it as a 5 as it did when threshold = 0

In [None]:
#decide what threshold to use:
#use cross_val_predict() function to get the scores of all instances in the training set,
#specify that you want to return decision scores instead of predictions:
y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3, method="decision_function")

In [None]:
#Then with these scores, use the precision_recall_curve() function to compute precision and recall for all possible thresholds:
from sklearn.metrics import precision_recall_curve
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)

In [None]:
# Matplotlib to plot precision and recall as functions of the threshold value 
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall as functions of the decision threshold.

    The last element of `precisions` and `recalls` is dropped so both curves
    have the same length as `thresholds`.
    """
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g-", "Recall"),
    )
    for values, line_style, curve_label in curves:
        plt.plot(thresholds, values[:-1], line_style, label=curve_label, linewidth=2)
    plt.legend(loc="center right", fontsize=16)
    plt.xlabel("Threshold", fontsize=16)
    plt.grid(True)
    # fixed window: thresholds in [-50000, 50000], scores in [0, 1]
    plt.axis([-50000, 50000, 0, 1])

recall_90_precision = recalls[np.argmax(precisions >= 0.90)]
threshold_90_precision = thresholds[np.argmax(precisions >= 0.90)]

plt.figure(figsize=(8, 4))                                                                 
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.plot([threshold_90_precision, threshold_90_precision], [0., 0.9], "r:")                 
plt.plot([-50000, threshold_90_precision], [0.9, 0.9], "r:")                                
plt.plot([-50000, threshold_90_precision], [recall_90_precision, recall_90_precision], "r:")
plt.plot([threshold_90_precision], [0.9], "ro")                                            
plt.plot([threshold_90_precision], [recall_90_precision], "ro")                            
plt.show()

    
#     [...] # highlight the threshold and add the legend, axis label, and grid
#     # Highlight the threshold (e.g., with a red vertical line at the chosen threshold value)
#     chosen_threshold = 5000
#     plt.plot([chosen_threshold, chosen_threshold], [0, 1], "r:")     # Threshold line
#     plt.annotate('Threshold', xy=(chosen_threshold, 0.5), xytext=(chosen_threshold+500, 0.5), arrowprops=dict(facecolor='black', shrink=0.05), )
#     plt.xlabel("Threshold")                                           # x-axis label
#     plt.ylabel("Score")                                               # y-axis label
#     plt.legend()                                                      # Show legend
#     plt.grid(True)                                                    # Turn on the grid
#     plt.xlim([-50000, 50000]) 

# plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
# plt.show()

In [None]:
(y_train_pred == (y_scores > 0)).all()

In [None]:
def plot_precision_vs_recall(precisions, recalls):
    """Plot precision (y axis) against recall (x axis) on the unit square."""
    label_style = {"fontsize": 16}
    plt.plot(recalls, precisions, "b-", linewidth=2)
    plt.xlabel("Recall", **label_style)
    plt.ylabel("Precision", **label_style)
    plt.axis([0, 1, 0, 1])
    plt.grid(True)

plt.figure(figsize=(8, 6))
plot_precision_vs_recall(precisions, recalls)
plt.plot([recall_90_precision, recall_90_precision], [0., 0.9], "r:")
plt.plot([0.0, recall_90_precision], [0.9, 0.9], "r:")
plt.plot([recall_90_precision], [0.9], "ro")
plt.show()

In [None]:
#if we want a 90% precision
threshold_90_precision = thresholds[np.argmax(precisions >= 0.90)]

In [None]:
threshold_90_precision

In [None]:
#to make predictions with this threshold, compare the scores directly instead of calling the classifier's predict() method
y_train_pred_90 = (y_scores >= threshold_90_precision)

In [None]:
# precision_score / recall_score are needed from this point on and are not
# imported anywhere earlier in the notebook.
from sklearn.metrics import precision_score, recall_score

# Check the precision of the thresholded predictions
precision_score(y_train_5, y_train_pred_90)
# this confirms the 90% precision we wanted

In [None]:
# Check the recall of the same thresholded predictions
from sklearn.metrics import recall_score  # imported here so this cell runs on its own

recall_score(y_train_5, y_train_pred_90)

## The ROC Curve
Receiver Operating Characteristic. Another classifier metric used with binary classifiers. Plots TPR (recall) against FPR.

In [None]:
#Compute TPR and FPR
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)

In [None]:
#then plot TPR and FPR
def plot_roc_curve(fpr, tpr, label=None):
    """Plot a ROC curve (TPR against FPR) plus the dashed diagonal of a random classifier.

    `label` is forwarded to the curve so a later plt.legend() can show it.
    """
    axis_font = {"fontsize": 16}
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')  # dashed diagonal
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate (Fall-Out)', **axis_font)
    plt.ylabel('True Positive Rate (Recall)', **axis_font)
    plt.grid(True)

plt.figure(figsize=(8, 6))                                    
plot_roc_curve(fpr, tpr)
fpr_90 = fpr[np.argmax(tpr >= recall_90_precision)]           
plt.plot([fpr_90, fpr_90], [0., recall_90_precision], "r:")   
plt.plot([0.0, fpr_90], [recall_90_precision, recall_90_precision], "r:")  
plt.plot([fpr_90], [recall_90_precision], "ro")                                                  
plt.show()

In [None]:
#The further your ROC curve is above the random-guessing line, the better. Use the AUC (area under the curve) to quantify this: 0.5 is purely random, 1 is perfect
from sklearn.metrics import roc_auc_score

roc_auc_score(y_train_5, y_scores)

In [None]:
#Train RandomForestClassifer and compare it's ROC curve and ROC AUC score to the SGD Classifier
#The predict_proba() method returns an array containing a row per instance and a column per class, each containing the probability that the given instance belongs to the given class (e.g., 70% chance that the image represents a 5):

from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3,
                                    method="predict_proba")

In [None]:
#roc_curve() function expects labels and scores, but instead of scores you can give it class probabilities. Let’s use the positive class’s probability as the score:
y_scores_forest = y_probas_forest[:, 1] # score = proba of positive class
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5,y_scores_forest)

In [None]:
#plot ROC curve  to see how they compare
recall_for_forest = tpr_forest[np.argmax(fpr_forest >= fpr_90)]

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, "b:", linewidth=2, label="SGD")
plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
plt.plot([fpr_90, fpr_90], [0., recall_90_precision], "r:")
plt.plot([0.0, fpr_90], [recall_90_precision, recall_90_precision], "r:")
plt.plot([fpr_90], [recall_90_precision], "ro")
plt.plot([fpr_90, fpr_90], [0., recall_for_forest], "r:")
plt.plot([fpr_90], [recall_for_forest], "ro")
plt.grid(True)
plt.legend(loc="lower right", fontsize=16)
plt.show()

In [None]:
#RandomForest is better because its ROC curve looks much better than the SGDClassifier's: it comes much closer to the top-left corner. As a result its ROC AUC score is also significantly better
roc_auc_score(y_train_5, y_scores_forest)
# ROC AUC of about 0.99 (this is the area under the ROC curve, not precision)

In [None]:
y_train_pred_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3)
precision_score(y_train_5, y_train_pred_forest)

In [None]:
recall_score(y_train_5, y_train_pred_forest)

In [None]:
## Multiclass Classification

In [None]:
#Run a Support Vector Machine CLassifier (this algorithm is strictly binary)
#trains the SVC on the training set using the 0-9 target classes, not 5 versus the rest (y_train_5)
from sklearn.svm import SVC

svm_clf = SVC(gamma="auto", random_state=42)
svm_clf.fit(X_train[:1000], y_train[:1000]) # y_train, not y_train_5
svm_clf.predict([some_digit])
#this actually trained 45 binary classifiers (one per pair of digits), got their decision scores for the image, and selected the class that won the most duels.

In [None]:
#shown by returning the scores per instance
some_digit_scores = svm_clf.decision_function([some_digit])
some_digit_scores

In [None]:
#highest score is one correspeonding to class 5
np.argmax(some_digit_scores)
svm_clf.classes_
svm_clf.classes_[5]

In [None]:
#to force Scikit-Learn to use a particular strategy (OvO, OvR, etc.), wrap the estimator explicitly
#this creates a multiclass classifier using the OvR strategy based on SVC
from sklearn.multiclass import OneVsRestClassifier
ovr_clf = OneVsRestClassifier(SVC(gamma="auto", random_state=42))
ovr_clf.fit(X_train[:1000], y_train[:1000])
ovr_clf.predict([some_digit])
len(ovr_clf.estimators_)

In [None]:
#Or using the SGDClassifier
sgd_clf.fit(X_train, y_train)
sgd_clf.predict([some_digit])

In [None]:
#since Scikit-Learn used the OvR strategy, there are now 10 classes and it trained 10 binary classifiers. This now returns one value per class
#score that SGD Classifier assigned to each class
sgd_clf.decision_function([some_digit])

In [None]:
cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy")

## Error Analysis

In [None]:
y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)
conf_mx = confusion_matrix(y_train, y_train_pred)
conf_mx

In [None]:
# Note: sklearn.metrics.plot_confusion_matrix() (added in 0.22) was removed in
# scikit-learn 1.2; ConfusionMatrixDisplay is its replacement.
def plot_confusion_matrix(matrix):
    """Show `matrix` as a colour image with a colorbar (if you prefer color)."""
    fig, ax = plt.subplots(figsize=(8, 8))
    image = ax.matshow(matrix)
    fig.colorbar(image)

In [None]:
plt.matshow(conf_mx, cmap=plt.cm.gray)
save_fig("confusion_matrix_plot", tight_layout=False)
plt.show()

In [None]:
row_sums = conf_mx.sum(axis=1, keepdims=True)
norm_conf_mx = conf_mx / row_sums

In [None]:
np.fill_diagonal(norm_conf_mx, 0)
plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
save_fig("confusion_matrix_errors_plot", tight_layout=False)
plt.show()

In [None]:
def plot_digits(instances, images_per_row=10, **options):
    """Draw a batch of flattened 28x28 digit images as one tiled image on the current axes.

    Parameters:
        instances: 2-D array with one flattened 784-pixel image per row.
        images_per_row: maximum number of images per grid row.
        **options: extra keyword arguments forwarded to plt.imshow().
    """
    size = 28
    n_instances = len(instances)
    if n_instances == 0:
        # Nothing to tile (e.g. no misclassified examples): leave an empty panel.
        plt.axis("off")
        return
    images_per_row = min(n_instances, images_per_row)
    # ceil(n_instances / images_per_row) without floating point
    n_rows = (n_instances - 1) // images_per_row + 1
    # Pad with blank images so the last grid row is complete.
    n_empty = n_rows * images_per_row - n_instances
    padded = np.concatenate([instances, np.zeros((n_empty, size * size))], axis=0)
    # (rows, cols, 28, 28) -> (rows, 28, cols, 28) -> one big 2-D image
    grid = padded.reshape((n_rows, images_per_row, size, size))
    big_image = grid.transpose(0, 2, 1, 3).reshape(n_rows * size, images_per_row * size)
    plt.imshow(big_image, cmap="binary", **options)
    plt.axis("off")


# Compare how 3s and 5s are classified: rows = actual class, columns = predicted class.
cl_a, cl_b = 3, 5
X_aa = X_train[(y_train == cl_a) & (y_train_pred == cl_a)]
X_ab = X_train[(y_train == cl_a) & (y_train_pred == cl_b)]
X_ba = X_train[(y_train == cl_b) & (y_train_pred == cl_a)]
X_bb = X_train[(y_train == cl_b) & (y_train_pred == cl_b)]

plt.figure(figsize=(8,8))
for position, digits in zip((221, 222, 223, 224), (X_aa, X_ab, X_ba, X_bb)):
    plt.subplot(position)
    plot_digits(digits[:25], images_per_row=5)
save_fig("error_analysis_digits_plot")
plt.show()

## Multilabel Classification

In [None]:
from sklearn.neighbors import KNeighborsClassifier

y_train_large = (y_train >= 7)
y_train_odd = (y_train % 2 == 1)
y_multilabel = np.c_[y_train_large, y_train_odd]

knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_multilabel)

In [None]:
knn_clf.predict([some_digit])

In [None]:
y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3)
f1_score(y_multilabel, y_train_knn_pred, average="macro")

## Multioutput Classification

In [None]:
# Build a denoising task: inputs are digits with random pixel noise added,
# targets are the original clean images.
# A seeded generator keeps the noise reproducible, in line with the
# random_state=42 used for the estimators in this notebook.
noise_rng = np.random.RandomState(42)
noise = noise_rng.randint(0, 100, (len(X_train), 784))
X_train_mod = X_train + noise
noise = noise_rng.randint(0, 100, (len(X_test), 784))
X_test_mod = X_test + noise
y_train_mod = X_train
y_test_mod = X_test

In [None]:
def plot_digit(data):
    """Show one flattened 784-pixel image as a 28x28 picture on the current axes."""
    image = data.reshape(28, 28)
    plt.imshow(image, cmap="binary", interpolation="nearest")
    plt.axis("off")


# Left: noisy input; right: the clean target it should be mapped to.
some_index = 0
plt.subplot(121); plot_digit(X_test_mod[some_index])
plt.subplot(122); plot_digit(y_test_mod[some_index])
save_fig("noisy_digit_example_plot")
plt.show()

In [None]:
knn_clf.fit(X_train_mod, y_train_mod)
clean_digit = knn_clf.predict([X_test_mod[some_index]])
plot_digit(clean_digit)
save_fig("cleaned_digit_example_plot")

# Chapter 3, Exercise 1
 Try to build a classifier for the MNIST dataset that achieves over 97% accuracy
on the test set. Hint: the KNeighborsClassifier works quite well for this task;
you just need to find good hyperparameter values (try a grid search on the
weights and n_neighbors hyperparameters).

In [None]:
# GridSearchCV uses cross-validation to evaluate every combination of the
# hyperparameters below, so you don't have to tinker by hand.
from sklearn.model_selection import GridSearchCV

# Two hyperparameters are explored for KNeighborsClassifier:
# 'weights' ('uniform' or 'distance') and 'n_neighbors' (3, 4 or 5).
param_grid = [{'weights': ["uniform", "distance"], 'n_neighbors': [3, 4, 5]}]

# The model being tuned.
knn_clf = KNeighborsClassifier()
# 5-fold cross-validation; verbose=3 prints progress for each fit.
grid_search = GridSearchCV(knn_clf, param_grid, cv=5, verbose=3)
# Try all combinations in the grid and keep the best one according to the
# cross-validated score.
grid_search.fit(X_train, y_train)

# After fitting, grid_search holds the best score, parameters and model.
# (Only a cell's last expression is displayed, so the first two are printed.)
print("Best cross-validated score:", grid_search.best_score_)
print("Best parameters:", grid_search.best_params_)
# best model
grid_search.best_estimator_

In [None]:
# Import the function to compute accuracy score
from sklearn.metrics import accuracy_score

# Use the best model found by GridSearchCV to make predictions on the test set.
# This uses the model with the optimal hyperparameters found during the grid search.
y_pred = grid_search.predict(X_test)
# Calculate the accuracy of the model on the test set by comparing the predicted labels (y_pred)
# to the true labels (y_test). The accuracy score is the fraction of correct predictions over
# the total number of predictions, expressed as a float between 0 and 1, where 1 means perfect accuracy.
accuracy_score(y_test, y_pred)
# The value displayed above is the accuracy of the best model found by GridSearchCV
# when evaluated on unseen test data. This gives an estimate of the model's generalization ability.

# Chapter 3, Exercise 2
 Write a function that can shift an MNIST image in any direction (left, right, up,
or down) by one pixel. Then, for each image in the training set, create four shifted
copies (one per direction) and add them to the training set. Finally, train your
best model on this expanded training set and measure its accuracy on the test set.
You should observe that your model performs even better now! This technique of
artificially growing the training set is called data augmentation or training set
expansion.

In [None]:
# `shift` was used below without ever being imported. It lives in scipy.ndimage
# (the old scipy.ndimage.interpolation namespace is deprecated).
from scipy.ndimage import shift


def shift_image(image, dx, dy):
    """Return a copy of a flattened 28x28 image shifted by (dx, dy) pixels.

    Used for data augmentation: slightly shifted copies of the training images.

    Parameters:
        image: array of 784 pixel values (a flattened 28x28 image).
        dx: shift along the x axis (columns); positive moves the content right.
        dy: shift along the y axis (rows); positive moves the content down.

    Returns:
        The shifted image, flattened back to a 1-D array of 784 values.
    """
    image = image.reshape((28, 28))
    # scipy expects the shift per axis in (row, column) order, hence [dy, dx].
    # mode="constant" with cval=0 fills the area uncovered by the shift with 0s.
    shifted_image = shift(image, [dy, dx], cval=0, mode="constant")
    return shifted_image.reshape([-1])


In [None]:
image = X_train[1000]
shifted_image_down = shift_image(image, 0, 5)
shifted_image_left = shift_image(image, -5, 0)

plt.figure(figsize=(12,3))
plt.subplot(131)
plt.title("Original", fontsize=14)
plt.imshow(image.reshape(28, 28), interpolation="nearest", cmap="Greys")
plt.subplot(132)
plt.title("Shifted down", fontsize=14)
plt.imshow(shifted_image_down.reshape(28, 28), interpolation="nearest", cmap="Greys")
plt.subplot(133)
plt.title("Shifted left", fontsize=14)
plt.imshow(shifted_image_left.reshape(28, 28), interpolation="nearest", cmap="Greys")
plt.show()

In [None]:
# Start the augmented set with the original training images and labels.
X_train_augmented = list(X_train)
y_train_augmented = list(y_train)

# Add one shifted copy of every image per direction:
# right (1, 0), left (-1, 0), down (0, 1) and up (0, -1).
for dx, dy in ((1, 0), (-1, 0), (0, 1), (0, -1)):
    for image, label in zip(X_train, y_train):
        # Shift once and reuse the result; shifting is the costly step here.
        shifted_image = shift_image(image, dx, dy)
        X_train_augmented.append(shifted_image)
        # The label is unchanged: a shifted digit is still the same digit.
        y_train_augmented.append(label)

# Convert the lists to numpy arrays for the estimators.
X_train_augmented = np.array(X_train_augmented)
y_train_augmented = np.array(y_train_augmented)


In [None]:
import numpy as np

# Shuffle the augmented set so original and shifted images are mixed and the
# order of the data carries no information.
# A seeded generator makes the permutation reproducible, in line with the
# random_state=42 used for the estimators in this notebook.
shuffle_rng = np.random.RandomState(42)
shuffle_idx = shuffle_rng.permutation(len(X_train_augmented))

# Apply the same permutation to images and labels so they stay aligned.
X_train_augmented = X_train_augmented[shuffle_idx]
y_train_augmented = y_train_augmented[shuffle_idx]


In [None]:
knn_clf = KNeighborsClassifier(**grid_search.best_params_)
knn_clf.fit(X_train_augmented, y_train_augmented)

In [None]:
y_pred = knn_clf.predict(X_test)
accuracy_score(y_test, y_pred)

# Chapter 3, Exercise 3
 Tackle the Titanic dataset. A great place to start is on Kaggle (https://www.kaggle.com/c/titanic).

In [None]:
import os
import urllib.request

TITANIC_PATH = os.path.join("datasets", "titanic")
DOWNLOAD_URL = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/titanic/"

def fetch_titanic_data(url=DOWNLOAD_URL, path=TITANIC_PATH):
    """Download train.csv and test.csv from `url` into `path`, skipping files already present.

    The target directory is created if it does not exist yet.
    """
    os.makedirs(path, exist_ok=True)
    for filename in ("train.csv", "test.csv"):
        filepath = os.path.join(path, filename)
        if os.path.isfile(filepath):
            continue  # already downloaded
        print("Downloading", filename)
        urllib.request.urlretrieve(url + filename, filepath)

fetch_titanic_data() 

In [None]:
import pandas as pd

def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    """Read `filename` from the `titanic_path` directory and return it as a DataFrame."""
    return pd.read_csv(os.path.join(titanic_path, filename))

In [None]:
train_data = load_titanic_data("train.csv")
test_data = load_titanic_data("test.csv")

In [None]:
#look at attributes
train_data.head()

In [None]:
#set PassengerId as index column
train_data = train_data.set_index("PassengerId")
test_data = test_data.set_index("PassengerId")

In [None]:
#get more info - any missing data?
train_data.info()

In [None]:
#median age of females
train_data[train_data["Sex"]=="female"]["Age"].median()
#to replace null Age attributes with median age

In [None]:
#look at numerical attributes
train_data.describe()

In [None]:
#check in boolean
train_data["Survived"].value_counts()

In [None]:
#look at categorical attributes
train_data["Pclass"].value_counts()

In [None]:
train_data["Sex"].value_counts()

In [None]:
train_data["Embarked"].value_counts()

In [None]:
#build our preprocessing pipelines, starting with the pipeline for numerical attributes:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

In [None]:
# Build the pipeline for the categorical attributes:
# fill missing values with the most frequent category, then one-hot encode.
from sklearn.preprocessing import OneHotEncoder

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`,
# and the old name was removed in 1.4. Try the new name first and fall back to
# the old one so the cell works on both older and current versions.
try:
    cat_encoder = OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2
    cat_encoder = OneHotEncoder(sparse=False)

cat_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("cat_encoder", cat_encoder),
    ])

In [None]:
#join the two
from sklearn.compose import ColumnTransformer

num_attribs = ["Age", "SibSp", "Parch", "Fare"]
cat_attribs = ["Pclass", "Sex", "Embarked"]

preprocess_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ])

In [None]:
X_train = preprocess_pipeline.fit_transform(
    train_data[num_attribs + cat_attribs])
X_train

In [None]:
#get labels
y_train = train_data["Survived"]

In [None]:
#train classifier
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_clf.fit(X_train, y_train)

In [None]:
#make predictions
X_test = preprocess_pipeline.transform(test_data[num_attribs + cat_attribs])
y_pred = forest_clf.predict(X_test)

In [None]:
#use cross validation to determine how good our model is
from sklearn.model_selection import cross_val_score

forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
forest_scores.mean()

In [None]:
#try to get more accurate classifier with SvC
from sklearn.svm import SVC

svm_clf = SVC(gamma="auto")
svm_scores = cross_val_score(svm_clf, X_train, y_train, cv=10)
svm_scores.mean()

In [None]:
#plot the ten scores
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 4))
plt.plot([1]*10, svm_scores, ".")
plt.plot([2]*10, forest_scores, ".")
plt.boxplot([svm_scores, forest_scores], labels=("SVM","Random Forest"))
plt.ylabel("Accuracy", fontsize=14)
plt.show()
#can further improve

In [None]:
train_data["AgeBucket"] = train_data["Age"] // 15 * 15
train_data[["AgeBucket", "Survived"]].groupby(['AgeBucket']).mean()

In [None]:
train_data["RelativesOnboard"] = train_data["SibSp"] + train_data["Parch"]
train_data[["RelativesOnboard", "Survived"]].groupby(['RelativesOnboard']).mean()

# Chapter 3, Exercise 4
Build a spam classifier (a more challenging exercise):

• Download examples of spam and ham from Apache SpamAssassin’s public
dataset (https://homl.info/spamassassin).

• Unzip the datasets and familiarize yourself with the data format.

• Split the datasets into a training set and a test set.

• Write a data preparation pipeline to convert each email into a feature vector.
Your preparation pipeline should transform an email into a (sparse) vector that
indicates the presence or absence of each possible word. For example, if all
emails only ever contain four words, “Hello,” “how,” “are,” “you,” then the email
“Hello you Hello Hello you” would be converted into a vector [1, 0, 0, 1]
(meaning [“Hello” is present, “how” is absent, “are” is absent, “you” is
present]), or [3, 0, 0, 2] if you prefer to count the number of occurrences of
each word.

You may want to add hyperparameters to your preparation pipeline to control
whether or not to strip off email headers, convert each email to lowercase,
remove punctuation, replace all URLs with “URL,” replace all numbers with
“NUMBER,” or even perform stemming (i.e., trim off word endings; there are
Python libraries available to do this).
Finally, try out several classifiers and see if you can build a great spam classifier,
with both high recall and high precision.


In [None]:
#retrive data
import os
import tarfile
import urllib.request

DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
SPAM_PATH = os.path.join("datasets", "spam")

def fetch_spam_data(ham_url=HAM_URL, spam_url=SPAM_URL, spam_path=SPAM_PATH):
    """Download the ham and spam archives (if missing) and extract them into `spam_path`.

    Parameters:
        ham_url: URL of the ham .tar.bz2 archive.
        spam_url: URL of the spam .tar.bz2 archive.
        spam_path: directory that receives the archives and their contents;
            created if it does not exist.
    """
    os.makedirs(spam_path, exist_ok=True)
    for filename, url in (("ham.tar.bz2", ham_url), ("spam.tar.bz2", spam_url)):
        path = os.path.join(spam_path, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        # The context manager closes the archive even if extraction fails.
        with tarfile.open(path) as tar_bz2_file:
            if hasattr(tarfile, "data_filter"):
                # The archive comes from the network: where the running Python
                # supports extraction filters, use the "data" filter, which
                # rejects members that would be written outside spam_path.
                tar_bz2_file.extractall(path=spam_path, filter="data")
            else:
                tar_bz2_file.extractall(path=spam_path)
        
fetch_spam_data()

In [None]:
#list the extracted email files (names longer than 20 characters, which skips other entries such as index files)
HAM_DIR = os.path.join(SPAM_PATH, "easy_ham")
SPAM_DIR = os.path.join(SPAM_PATH, "spam")
ham_filenames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name) > 20]
spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name) > 20]

In [None]:
len(ham_filenames)

In [None]:
len(spam_filenames)

In [None]:
# Use Python's email module to parse these emails (this handles headers, encoding, etc.)
import email
import email.parser  # imported explicitly: `import email` alone does not load the submodule
import email.policy

def load_email(is_spam, filename, spam_path=SPAM_PATH):
    """Parse one email file and return it as a message object.

    Parameters:
        is_spam: read from the "spam" sub-directory when true, "easy_ham" otherwise.
        filename: name of the email file inside that sub-directory.
        spam_path: directory containing the "spam" and "easy_ham" folders.
    """
    directory = "spam" if is_spam else "easy_ham"
    # Read as bytes and let the parser handle decoding.
    with open(os.path.join(spam_path, directory, filename), "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

In [None]:
ham_emails = [load_email(is_spam=False, filename=name) for name in ham_filenames]
spam_emails = [load_email(is_spam=True, filename=name) for name in spam_filenames]

In [None]:
#look at one example of ham and one example of spam, to get a feel of what the data looks like:
print(ham_emails[1].get_content().strip())

In [None]:
#same for spam emails
print(spam_emails[6].get_content().strip())

In [None]:
#Some emails are actually multipart, with images and attachments. Look at the various types of structures we have:
def get_email_structure(email):
    """Describe the MIME layout of a message as a string.

    A plain string is returned unchanged. A message whose payload is a list
    yields ``"multipart(...)"`` with the comma-separated structures of its
    sub-parts (computed recursively); any other message yields its content type.
    """
    if isinstance(email, str):
        return email
    parts = email.get_payload()
    if not isinstance(parts, list):
        return email.get_content_type()
    inner = ", ".join(get_email_structure(part) for part in parts)
    return "multipart({})".format(inner)

In [None]:
from collections import Counter

def structures_counter(emails):
    """Count how many of the given emails share each structure string.

    Returns a ``Counter`` keyed by the value of ``get_email_structure`` for
    each email.
    """
    return Counter(get_email_structure(message) for message in emails)

In [None]:
structures_counter(ham_emails).most_common()

In [None]:
structures_counter(spam_emails).most_common()

In [None]:
#look at the email headers:
for header, value in spam_emails[0].items():
    print(header,":",value)

In [None]:
#focus on the Subject header:
spam_emails[0]["Subject"]

In [None]:
#split it into a training set and a test set:
import numpy as np
from sklearn.model_selection import train_test_split

# Ham messages first, then spam; dtype=object keeps each parsed message as a single array element.
X = np.array(ham_emails + spam_emails, dtype=object)
# Labels in the same order as X: 0 for every ham message, 1 for every spam message.
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

# Hold out 20% of the messages for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
#start writing the preprocessing functions. First,  a function to convert HTML to plain text.
#The following function first drops the <head> section, then converts all <a> tags to the word HYPERLINK, then it gets rid of all HTML tags, leaving only the plain text. For readability, it also replaces multiple newlines with single newlines, and finally it unescapes html entities (such as &gt; or &nbsp;):
import re
from html import unescape

def html_to_plain_text(html):
    """Reduce an HTML document to rough plain text.

    Steps, in order:
      1. drop the whole ``<head>...</head>`` section (case-insensitive);
      2. replace every opening ``<a ...>`` tag with the word ``HYPERLINK``;
      3. strip all remaining tags;
      4. collapse runs of blank lines / trailing whitespace into one newline;
      5. unescape HTML entities such as ``&gt;`` or ``&nbsp;``.

    Args:
        html: The HTML source as a string.

    Returns:
        The extracted text as a string.
    """
    # All patterns are raw strings so that regex escapes such as \s are not
    # interpreted (and warned about) as Python string escapes.
    text = re.sub(r'<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)
    text = re.sub(r'<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)
    text = re.sub(r'<.*?>', '', text, flags=re.M | re.S)
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
    return unescape(text)

In [None]:
#HTML spam
html_spam_emails = [email for email in X_train[y_train==1]
                    if get_email_structure(email) == "text/html"]
sample_html_spam = html_spam_emails[7]
print(sample_html_spam.get_content().strip()[:1000], "...")

In [None]:
#resulting plain text
print(html_to_plain_text(sample_html_spam.get_content())[:1000], "...")

In [None]:
#function that takes an email as input and returns its content as plain text, no matter what its format is:
def email_to_text(email):
    """Return the textual content of an email, whatever its structure.

    The message parts are walked in order. The first ``text/plain`` part is
    returned as is. If there is none, the last ``text/html`` part encountered
    is converted with ``html_to_plain_text``.

    Args:
        email: A parsed message exposing ``walk()``.

    Returns:
        The text as a string, or ``None`` when the message has no non-empty
        ``text/plain`` or ``text/html`` part.
    """
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if ctype not in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except Exception:
            # Typically an encoding problem: fall back to the raw payload.
            # Deliberately not a bare ``except`` so that KeyboardInterrupt and
            # SystemExit still propagate.
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        html = content
    if html:
        return html_to_plain_text(html)
    return None

In [None]:
print(email_to_text(sample_html_spam)[:100], "...")

In [None]:
#stemming
try:
    import nltk

    # Module-level stemmer; EmailToWordCounterTransformer.transform() checks it against None before use.
    stemmer = nltk.PorterStemmer()
    # Quick demonstration on a handful of related words.
    for word in ("Computations", "Computation", "Computing", "Computed", "Compute", "Compulsive"):
        print(word, "=>", stemmer.stem(word))
except ImportError:
    # NLTK is optional: report it and leave stemmer as None.
    print("Error: stemming requires the NLTK module.")
    stemmer = None

In [None]:
#way to replace URLs with the word "URL"
%pip install -q -U urlextract

In [None]:
try:
    import urlextract # may require an Internet connection to download root domain names
    
    # Module-level extractor; EmailToWordCounterTransformer.transform() checks it against None before use.
    url_extractor = urlextract.URLExtract()
    # Quick demonstration on a sentence containing a bare domain and a full URL.
    print(url_extractor.find_urls("Will it detect github.com and https://youtu.be/7Pq-S557XQU?t=3m32s"))
except ImportError:
    # urlextract is optional: report it and leave url_extractor as None.
    print("Error: replacing URLs requires the urlextract module.")
    url_extractor = None

In [None]:
#transformer
from sklearn.base import BaseEstimator, TransformerMixin

class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    """Turn parsed emails into per-email word counts.

    Each email is converted to text with ``email_to_text`` and then, depending
    on the options, lower-cased, has its URLs and numbers replaced by
    placeholder words, has punctuation removed, and is finally split on
    whitespace and counted (optionally after stemming).

    Parameters:
        strip_headers: Stored for API compatibility only; ``transform`` does
            not consult it.
        lower_case: Lower-case the text before any other processing.
        remove_punctuation: Replace every run of non-word characters with a
            single space.
        replace_urls: Replace each detected URL with the word ``URL``. Only
            effective when the module-level ``url_extractor`` is not None.
        replace_numbers: Replace each number with the word ``NUMBER``.
        stemming: Merge the counts of words that share a stem. Only effective
            when the module-level ``stemmer`` is not None.

    Relies on names defined in other cells of this notebook:
    ``email_to_text``, ``url_extractor`` and ``stemmer``.
    """
    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,
                 replace_urls=True, replace_numbers=True, stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming
    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` unchanged."""
        return self
    def _count_words(self, email):
        """Return the ``Counter`` of (optionally stemmed) words for one email."""
        # email_to_text() may return None when no text part exists.
        text = email_to_text(email) or ""
        if self.lower_case:
            text = text.lower()
        if self.replace_urls and url_extractor is not None:
            # Longest URLs first, so that a URL contained in a longer one does
            # not get replaced inside it and leave a partial remainder behind.
            for url in sorted(set(url_extractor.find_urls(text)), key=len, reverse=True):
                text = text.replace(url, " URL ")
        if self.replace_numbers:
            # Integers, decimals and scientific notation.
            text = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', 'NUMBER', text)
        if self.remove_punctuation:
            text = re.sub(r'\W+', ' ', text, flags=re.M)
        word_counts = Counter(text.split())
        if self.stemming and stemmer is not None:
            stemmed_word_counts = Counter()
            for word, count in word_counts.items():
                # Several words can map to the same stem: add their counts up.
                stemmed_word_counts[stemmer.stem(word)] += count
            word_counts = stemmed_word_counts
        return word_counts
    def transform(self, X, y=None):
        """Return a 1-D object array holding one ``Counter`` per email in ``X``."""
        X_transformed = [self._count_words(email) for email in X]
        # dtype=object keeps the result an array of Counters even when X is
        # empty (a bare np.array([]) would be float64).
        return np.array(X_transformed, dtype=object)

In [None]:
#test transformer
X_few = X_train[:3]
X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)
X_few_wordcounts

In [None]:
#convert word counts to vectors by building  another transformer 
#whose fit() method will build the vocabulary (an ordered list of the most common words) and whose transform() method will use the vocabulary to convert word counts to vectors. The output is a sparse matrix.
from scipy.sparse import csr_matrix

class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    """Map per-email word counts onto a fixed-width sparse count matrix.

    ``fit`` keeps the ``vocabulary_size`` words with the highest totals, where
    a single email contributes at most 10 per word, and numbers them from 1.
    ``transform`` writes each word's count into its vocabulary column; every
    word outside the vocabulary is accumulated in column 0.
    """
    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size
    def fit(self, X, y=None):
        """Build ``vocabulary_`` (word -> column index, starting at 1) from ``X``."""
        capped_totals = Counter()
        for word_count in X:
            # One email can add at most 10 to a word's total.
            capped_totals.update({word: min(count, 10) for word, count in word_count.items()})
        top_words = capped_totals.most_common()[:self.vocabulary_size]
        self.vocabulary_ = {word: rank for rank, (word, _) in enumerate(top_words, start=1)}
        return self
    def transform(self, X, y=None):
        """Return a CSR matrix of shape ``(len(X), vocabulary_size + 1)``."""
        row_idx, col_idx, values = [], [], []
        for row, word_count in enumerate(X):
            row_idx.extend([row] * len(word_count))
            # Unknown words share column 0; duplicate entries are summed by csr_matrix.
            col_idx.extend(self.vocabulary_.get(word, 0) for word in word_count)
            values.extend(word_count.values())
        n_columns = self.vocabulary_size + 1
        return csr_matrix((values, (row_idx, col_idx)), shape=(len(X), n_columns))

In [None]:
vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)
X_few_vectors

In [None]:
X_few_vectors.toarray()

In [None]:
vocab_transformer.vocabulary_

In [None]:
#train our first spam classifier! Let's transform the whole dataset:
from sklearn.pipeline import Pipeline

# Two-stage preprocessing: parsed emails -> word counts -> sparse count vectors.
# Both transformers are used with their default settings.
preprocess_pipeline = Pipeline([
    ("email_to_wordcount", EmailToWordCounterTransformer()),
    ("wordcount_to_vector", WordCounterToVectorTransformer()),
])

# Fit on the training emails only (this is where the vocabulary is built) and transform them.
X_train_transformed = preprocess_pipeline.fit_transform(X_train)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Logistic regression with a fixed seed for reproducibility.
log_clf = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42)
# 3-fold cross-validation on the training set only; verbose=3 requests progress output.
score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3, verbose=3)
# Mean score over the folds (presumably accuracy, the estimator's default score; confirm).
score.mean()

In [None]:
#print out the precision/recall we get on the test set:
from sklearn.metrics import precision_score, recall_score

# transform() only: the pipeline fitted on the training set is reused, so nothing is re-fitted on test data.
X_test_transformed = preprocess_pipeline.transform(X_test)

# Fresh classifier with the same hyperparameters as the cross-validated one, fitted on the full training set.
log_clf = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42)
log_clf.fit(X_train_transformed, y_train)

y_pred = log_clf.predict(X_test_transformed)

# Labels are 0 = ham, 1 = spam; presumably the metrics are computed for label 1 (spam), confirm against sklearn defaults.
print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))