# MNIST Classifier

In this notebook you will create both an MNIST tabular dataset and a classifier.

## 1.- Import the operating system (`os`) module in Python and any other libraries you need

In [2]:
import os
from PIL import Image
import numpy as np
import pandas as pd

## 2.- As you can see, each class has its own folder (do this only for the train set).

    - Iterate folder by folder ( os.listdir() )
    - Inside each folder: 
        1.- Read the image
        2.- Reshape it into a flat array (784,)
        3.- Save the data into a pandas DataFrame, appending the class label as an additional column
    - Save the data into a CSV

    Note: if it takes too long, try doing only 100 images per folder and ask the teacher for the CSV.

In [6]:
# Build the training table: one row per image, holding the flattened pixel
# values (expected to be 784 for 28x28 images) followed by the digit label,
# indexed by the image file name.
folder = np.arange(0, 10)
rows_by_file = {}
for i in folder:
    dir_path = f'trainingSet/trainingSet/{i}'
    # Sort so the row order does not depend on the filesystem's listing order.
    for file in sorted(os.listdir(dir_path)):
        # The context manager releases the file handle as soon as the pixels
        # have been copied into the array.
        with Image.open(os.path.join(dir_path, file)) as img:
            arr = np.array(img, dtype=int).flatten()
        # Keyed by file name: a repeated name overwrites the earlier row,
        # matching what column assignment on a DataFrame would do.
        rows_by_file[file] = np.append(arr, int(i))

# Create the frame in a single step. Inserting tens of thousands of columns
# one by one and transposing afterwards fragments the frame and is very slow.
img_csv = pd.DataFrame.from_dict(rows_by_file, orient='index')
img_csv.to_csv('train.csv')

In [None]:
def img_to_csv(parent_dir="11. Images", out_path="train.csv"):
    """Flatten every image under ``<parent_dir>/train/<label>/`` into one CSV.

    Each row holds the flattened pixel values of one image in columns 0-783
    and the integer value of its class-folder name in column 784. Values are
    stored as floats because the rows are staged in a ``np.zeros`` buffer.

    Args:
        parent_dir: Directory that contains the ``train`` folder.
        out_path: Destination CSV path. The file is written WITHOUT an index
            column, so read it back with the default ``index_col=None``;
            ``index_col=0`` would consume the first pixel column.

    Returns:
        The concatenated DataFrame (empty when no class folder is found).

    Raises:
        FileNotFoundError: If ``parent_dir`` does not exist.
    """
    frames = []
    for folder in os.listdir(parent_dir):

        print(folder)

        if folder != "train":
            continue

        train_dir = os.path.join(parent_dir, folder)
        for f in os.listdir(train_dir):
            class_dir = os.path.join(train_dir, f)
            # Skip stray entries (hidden files, notes, ...) that are not
            # number-named class folders instead of crashing on int()/listdir.
            if not (f.isdecimal() and os.path.isdir(class_dir)):
                continue

            # List the directory once so the buffer size and the loop below
            # are guaranteed to agree.
            img_names = os.listdir(class_dir)
            class_data = np.zeros((len(img_names), 785))
            print(class_data.shape)

            for i, img_name in enumerate(img_names):
                # Close each image file as soon as its pixels are copied.
                with Image.open(os.path.join(class_dir, img_name)) as img:
                    class_data[i, :784] = np.array(img, dtype=int).flatten()
                class_data[i, 784] = int(f)

            frames.append(pd.DataFrame(class_data))

    # Concatenate once at the end rather than growing a frame inside the loop.
    data = pd.concat(frames) if frames else pd.DataFrame()
    data.to_csv(out_path, index=False)
    return data


# Build the dataset with the function-based variant: it writes "train.csv"
# and keeps the resulting frame in `data`.
data = img_to_csv()

## 3.- Load the CSV

In [7]:
# Load the dataset from disk; the first CSV column becomes the row index.
images_df = pd.read_csv('train.csv', index_col=0)
# Fail fast if the layout is not 784 pixel columns plus one label column:
# the modelling cell slices features and target purely by position.
assert images_df.shape[1] == 785, (
    f"expected 785 columns (784 pixels + label), got {images_df.shape[1]}"
)
images_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,784
img_1.jpg,3,0,0,3,7,3,0,3,0,11,...,0,0,0,0,0,0,0,0,0,0
img_10007.jpg,0,0,0,0,0,0,0,0,8,0,...,0,0,0,0,0,0,0,0,0,0
img_10010.jpg,0,0,0,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
img_10017.jpg,0,0,0,0,0,0,0,0,0,9,...,0,0,0,0,0,0,0,0,0,0
img_10032.jpg,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
img_9943.jpg,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,9
img_9953.jpg,0,0,0,0,0,0,0,0,0,4,...,0,0,0,0,0,0,0,0,0,9
img_9961.jpg,0,3,6,1,0,2,2,0,0,7,...,7,0,0,1,0,0,0,0,0,9
img_997.jpg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9


## 4.- Create a dictionary of models (No preprocessing needed, it has already been done).
    
    Include both single-tree models and multi-tree (ensemble) models.

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [12]:
# Candidate classifiers keyed by display name. Every estimator gets the same
# seed (matching the seed used by the cross-validation splitter) so that the
# comparison is reproducible from one run to the next.
RANDOM_STATE = 0

models = {
  "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
  "Extra Trees": ExtraTreesClassifier(n_estimators=10, random_state=RANDOM_STATE),
  "Random Forest": RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
  "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=RANDOM_STATE),
  "Skl GBM": GradientBoostingClassifier(n_estimators=100, random_state=RANDOM_STATE),
  "Skl HistGBM": HistGradientBoostingClassifier(max_iter=100, random_state=RANDOM_STATE),
  "XGBoost": XGBClassifier(n_estimators=100, random_state=RANDOM_STATE),
  "LightGBM": LGBMClassifier(n_estimators=100, random_state=RANDOM_STATE),
  "CatBoost": CatBoostClassifier(n_estimators=100, random_state=RANDOM_STATE)}

## 5.- Using either cross-validation or stratification, find out which model is the best
    - Base your code on the examples from the previous two days

In [14]:
import time
from sklearn import model_selection
from sklearn import metrics


# Features are the first 784 columns (pixels); the target is the column at
# position 784 (the digit label).
x = images_df.iloc[:, :784]
y = images_df.iloc[:, 784]

skf = model_selection.StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# Collect one record per model and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on each call.
result_rows = []
for model_name, model in models.items():
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()
    # Out-of-fold predictions: every sample is predicted by a model that did
    # not see it during training.
    pred = model_selection.cross_val_predict(model, x, y, cv=skf)
    total_time = time.perf_counter() - start_time

    result_rows.append({"Model":    model_name,
                        "Accuracy": metrics.accuracy_score(y, pred)*100,
                        "Bal Acc.": metrics.balanced_accuracy_score(y, pred)*100,
                        "Time":     total_time})

# Explicit columns keep the expected schema even when `models` is empty.
results = pd.DataFrame(result_rows, columns=['Model', 'Accuracy', 'Bal Acc.', 'Time'])

results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1  # rank models starting from 1 instead of 0
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')


KeyboardInterrupt: 

## Optional: Can you rotate an image?