# Ensemble Learning
You should build an end-to-end machine learning pipeline using an ensemble learning model. In particular, you should do the following:
- Load the `mnist` dataset using [Pandas](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html). You can find this dataset in the datasets folder.
- Split the dataset into training and test sets using [Scikit-Learn](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html).
- Build an end-to-end machine learning pipeline, including an ensemble model, such as [random forest](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html) or [gradient boosting](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html).
- Optimize your pipeline by cross-validating your design decisions. 
- Test the best pipeline on the test set and report various [evaluation metrics](https://scikit-learn.org/0.15/modules/model_evaluation.html).  
- Check the documentation to identify the most important hyperparameters, attributes, and methods of the model. Use them in practice.

In [1]:
import pandas as pd
import time

# for cross validations
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# to split the dataset into train and test set
from sklearn.model_selection import train_test_split

# models to be trained
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# for performing hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# for model evaluation 
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support as score



In [2]:
# Read the MNIST sample into a DataFrame; the path is resolved relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer="mnist.csv")

In [3]:
# Preview the first ten rows to check the column layout (id, class, pixel1..pixel784).
df.head(n=10)

Unnamed: 0,id,class,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,31953,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,34452,8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,60897,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,36953,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1981,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,61207,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,33799,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,5414,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,61377,6,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,1875,9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Train Test Split

In [4]:
# Split the frame into the label vector and the feature matrix.
# 'id' is removed together with the label so that it is not fed to the models
# as a predictor.
y = df['class']
X = df.drop(columns=['class', 'id'])

In [5]:
# Hold out 30% of the rows as the test set.
# random_state pins the shuffle so the split -- and every score computed from
# it further down -- is reproducible after a kernel restart.
# stratify=y keeps the digit-class proportions equal in both partitions, so
# no class is under-represented in the test set by chance.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print(f"Size of X train dataset is {x_train.shape}")
print(f"Size of y train dataset is {y_train.shape}")
print(f"Size of X test dataset is {x_test.shape}")
print(f"Size of y test dataset is {y_test.shape}")

Size of X train dataset is (2800, 784)
Size of y train dataset is (2800,)
Size of X test dataset is (1200, 784)
Size of y test dataset is (1200,)


### Training the model

### Random Forest

In [6]:
# Baseline random forest with default hyperparameters.
# random_state fixes the bootstrap and feature sampling so the fitted model is
# the same on every run.
forest = RandomForestClassifier(random_state=42)

# Time the fit with perf_counter: it is a monotonic, high-resolution clock
# intended for measuring durations, whereas time.time() follows the wall clock
# and can jump if the system time is adjusted.
start_time = time.perf_counter()

# training the model
forest.fit(x_train, y_train)

# Ending the timer
end_time = time.perf_counter()
elapsed_time = end_time - start_time

print("Elapsed time:", round(elapsed_time, 5), 'seconds')

Elapsed time: 1.38784 seconds


### Gradient Boosting

In [7]:
# Baseline gradient boosting classifier with default hyperparameters.
# random_state makes the fit repeatable across runs.
gradient = GradientBoostingClassifier(random_state=42)

# Time the fit with perf_counter: it is a monotonic, high-resolution clock
# intended for measuring durations, whereas time.time() follows the wall clock
# and can jump if the system time is adjusted.
start_time = time.perf_counter()

# training the model
gradient.fit(x_train, y_train)

# Ending the timer
end_time = time.perf_counter()
elapsed_time = end_time - start_time

print("Elapsed time:", round(elapsed_time, 5), "seconds")

Elapsed time: 100.26395 seconds


### Cross Validating the Models

In [8]:
# Hyperparameter grid for the random forest: 5 x 2 x 2 = 20 candidates.
# max_depth is written out explicitly; the previous range(20, 30, 5) produced
# exactly these two values, which was easy to misread as a wider sweep.
param_grid = {"n_estimators": [100, 200, 300, 400, 500],
              "criterion": ['gini', 'entropy'],
              "max_depth": [20, 25]}

# 5-fold cross-validated grid search scored on accuracy, using all CPU cores.
# The estimator is seeded so that the candidate scores, and therefore the
# selected hyperparameters, are reproducible from run to run.
model_1 = GridSearchCV(RandomForestClassifier(random_state=42), param_grid,
                       scoring='accuracy', cv=5, n_jobs=-1)

# Time the search with perf_counter (monotonic clock meant for durations).
start_time = time.perf_counter()

# training the model
model_1.fit(x_train, y_train)

# Ending the timer
end_time = time.perf_counter()
elapsed_time = end_time - start_time

print("Elapsed time:", round(elapsed_time, 5), "seconds")

# best_score_ is the mean cross-validated accuracy of the best candidate on the
# training folds; it is not a test-set score.
print("Accuracy of best random forest classifier = {:.2f}%".format(model_1.best_score_*100))
print("Best hyperparameter for the model = {}".format(model_1.best_params_))

Elapsed time: 95.86786 seconds
Accurary of best random forest classifier = 93.46%
Best hyperparameter for the model = {'criterion': 'gini', 'max_depth': 25, 'n_estimators': 300}


In [9]:
# Grid search for the gradient boosting model (model_2).
# The whole cell is kept commented out because the search is slow to run;
# uncomment every line below to execute it.
# # defining the hyperparameters
# param_grid = {"n_estimators":[100, 200, 300], 
#               "criterion":['friedman_mse', 'squared_error'], 
#               "max_depth":range(20, 30, 5)}

# # creating the grid search model and setting the number of cross validations to 5
# model_2 = GridSearchCV(GradientBoostingClassifier(), param_grid, scoring='accuracy', cv=5, n_jobs=-1)

# # timing how long it will take for the model to run
# start_time = time.time()

# # training the model
# model_2.fit(x_train, y_train)

# # Ending the timer
# end_time = time.time()
# elapsed_time = end_time - start_time

# print("Elapsed time:", round(elapsed_time, 5), "seconds")

# # Checking accuracy of the model and the best hyperparameters
# print("Accuracy of best gradient boosting classifier = {:.2f}%".format(model_2.best_score_*100))
# print("Best hyperparameter for the model = {}".format(model_2.best_params_))

The gradient boosting grid search above takes a long time to run, so it is left commented out. Uncomment it to run the search if needed.

### Testing the model on the test dataset

In [10]:
# Predict the held-out labels with the grid-searched random forest.
y_predicted = model_1.predict(x_test)

# Per-class precision, recall and F1 (one array entry per digit label),
# followed by the confusion matrix and the overall accuracy.
precision, recall, f1, support = score(y_test, y_predicted)
cm = confusion_matrix(y_test, y_predicted)
accuracy = accuracy_score(y_test, y_predicted)

# Report the metrics in a fixed order, then the confusion matrix.
report_rows = (
    ("Accuracy = ", accuracy),
    ("Precision = ", precision),
    ("Recall = ", recall),
    ("F1-Score = ", f1),
)
for metric_label, metric_value in report_rows:
    print(metric_label, metric_value)
print("\nConfusion Matrix = \n", cm)

Accuracy =  0.9375
Precision =  [0.98230088 0.93604651 0.93333333 0.9047619  0.93548387 0.96363636
 0.93220339 0.97247706 0.94736842 0.86792453]
Recall =  [0.98230088 0.97575758 0.91803279 0.97435897 0.90625    0.8907563
 0.94827586 0.92982456 0.91970803 0.91089109]
F1-Score =  [0.98230088 0.95548961 0.92561983 0.9382716  0.92063492 0.92576419
 0.94017094 0.95067265 0.93333333 0.88888889]

Confusion Matrix = 
 [[111   0   0   0   0   0   0   0   2   0]
 [  0 161   1   1   1   0   1   0   0   0]
 [  0   1 112   2   1   0   2   2   0   2]
 [  0   0   2 114   0   0   0   0   1   0]
 [  0   0   0   0  87   0   1   0   2   6]
 [  2   1   0   5   1 106   4   0   0   0]
 [  0   0   4   0   0   1 110   0   1   0]
 [  0   2   1   0   1   0   0 106   0   4]
 [  0   5   0   1   0   3   0   0 126   2]
 [  0   2   0   3   2   0   0   1   1  92]]


In [11]:
# Test-set evaluation of the gradient boosting grid search (model_2).
# Kept commented out because model_2 only exists once the commented-out
# gradient boosting grid-search cell has been uncommented and run.
# # making predictions using the test dataset
# y_predicted = model_2.predict(x_test)

# # calculating the accuracy
# accuracy = accuracy_score(y_test, y_predicted)

# # creating the confusion matrix
# cm = confusion_matrix(y_test, y_predicted)

# # calculating precision, recall, and f1 score
# precision, recall, f1, support = score(y_test, y_predicted)

# # printing the evaluation metrics
# print("Accuracy = ", accuracy)
# print("Precision = ", precision)
# print("Recall = ", recall)
# print("F1-Score = ", f1)
# print("\nConfusion Matrix = \n", cm)