# Inteligencia Artificial Proyecto #2: Clasificación
## Integrantes
    - Luis Berrospi
    - Pedro Dominguez
    - Carlos Esteban Guerrero Robles

### Lectura de datos

In [43]:
import pandas as pd
import numpy as np
import os

#Librerías que permitirá reducir dimensiones
import pywt
import pywt.data

#Librerías para lectura/edición de imágenes
from skimage.io import imread, imshow
from skimage.transform import resize
from skimage.color import rgb2gray

#Librerías para generar gráficas
import matplotlib.pyplot as plt
from tabulate import tabulate

#Librerías de modelos de clasificación
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import tree
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score # k fold cross validation
from sklearn import metrics # k fold cross validation

In [11]:
train_csv_path = "../data/sign_mnist_train.csv"

df_train = pd.read_csv(train_csv_path)
df_train_x = df_train.loc[:, "pixel1":"pixel784"]
df_train_y = df_train.label

df_train = df_train.to_numpy()

test_csv_path = "../data/sign_mnist_test.csv"

df_test = pd.read_csv(test_csv_path)
df_test_x = df_test.loc[:, "pixel1":"pixel784"]
df_test_y = df_test.label

df_test = df_test.to_numpy()

### Reducción de dimensionalidad

In [81]:
def reduce_dimension(letter, cuts, wavelet):
  for i in range(cuts):
    (letter, cD) = pywt.dwt(letter, wavelet)
  return letter

def vectorizar(matrix):
  return matrix.flatten()

def proccess_letters(dataset, wavelet, cuts):
  
  data_X = []
  data_Y = []

  for letter_features in dataset:
      
      letter = letter_features[0]
      data_Y.append(letter)

      letter_features = reduce_dimension(letter_features[1:], cuts, wavelet)
      letter_features = vectorizar(letter_features)
      data_X.append(letter_features)

  return data_X, data_Y

#### Wavelet families

In [61]:
discrete_family_wavelets = ['haar', 'bior', 'coif', 'rbio', 'sym', 'db', 'dmey']
discrete_wavelets = {family:pywt.wavelist(family) for family in discrete_family_wavelets}

#### PCA

In [5]:
#print(df_train.shape)
#print(df_test.shape)
random_state = 0
pca = make_pipeline(StandardScaler(), PCA(n_components=2, random_state=random_state))

### Experiments

#### Wavelet experiment

In [80]:
def wavelet_experiment(classification, get_accuracy):
    experiment_wavelets = dict()

    for family, wavelets in discrete_wavelets.items():
        wavelets_accuracy_list = list()
        
        for wavelet in wavelets:
            df_train_x, df_train_y = proccess_letters(df_train, wavelet)
            df_test_x, df_test_y = proccess_letters(df_test, wavelet)

            prediction = classification(df_train_x, df_train_y, df_test_x)
            accuracy = get_accuracy(prediction)

            wavelets_accuracy_list.append((wavelet, accuracy))
        
        experiment_wavelets[family] = wavelets_accuracy_list
    
    return experiment_wavelets

#### Accuracy calculation

In [None]:
def get_max_accuracy(experiment_wavelets):
    max_results = list()

    for family, result in experiment_wavelets.items():
        max_result = max(result, key = lambda tuple: tuple[1])
        max_results.append(max_result)

    return max_results

### SVM

In [76]:
def SVM_classification(df_train_x, df_train_y, df_test_x):
    _svm = svm.SVC(kernel='linear')
    _svm.fit(df_train_x,df_train_y)
    svm_predicted = _svm.predict(df_test_x)

    return svm_predicted

#### Estimación del error

In [75]:
def get_SVM_error(svm_predicted):
    svm_success = 0.0
    for i, val in enumerate(svm_predicted):
        if val == df_test_y[i]:
            svm_success += 1
            
    return svm_success/len(svm_predicted)*100

#### Tabla de resumen

In [78]:
svm_experiment_wavelets = wavelet_experiment(SVM_classification, get_SVM_error)

svm_experiment_wavelets_max = get_max_accuracy(svm_experiment_wavelets)

print("SVM wavelets experiment:")
svm_experiment_wavelets_max

('haar', 80.77244841048523)
('bior1.5', 82.40379252649191)
('coif5', 83.58895705521472)
('rbio2.2', 83.4495259341885)
('sym5', 83.5750139431121)
('db4', 83.61684327941997)
('dmey', 81.51143335192414)


### KNN

In [7]:
knn = KNeighborsClassifier(n_neighbors=24)
knn.fit(df_train_x,df_train_y)
knn_predicted = knn.predict(df_test_x)

: 

: 

#### Estimación del error

In [12]:
knn_success = 0.0
for i, val in enumerate(knn_predicted):
    if val == df_test_y[i]:
        knn_success += 1
print(knn_success/len(knn_predicted)*100)

76.26882320133855


#### Tabla de resumen

### Decision tree

In [65]:
def DT_classification(df_train_x, df_train_y, df_test_x):
    dt = tree.DecisionTreeClassifier()
    dt.fit(df_train_x,df_train_y)
    dt_predicted = dt.predict(df_test_x)

    return dt_predicted

#### Estimación del error

In [66]:
def get_DT_error(dt_predicted):
    dt_success = 0.0
    for i, val in enumerate(dt_predicted):
        if val == df_test_y[i]:
            dt_success += 1
    return dt_success/len(dt_predicted)*100

#### Tabla de resumen

In [74]:
dt_experiment_wavelets = wavelet_experiment(DT_classification, get_DT_error)

dt_experiment_wavelets_max = get_max_accuracy(dt_experiment_wavelets)

print("Decision tree wavelets experiment:")
dt_experiment_wavelets_max

('haar', 45.133853876185164)
('bior1.3', 51.436140546569995)
('coif5', 50.76687116564417)
('rbio3.7', 53.471834913552705)
('sym11', 52.900167317345236)
('db31', 52.370329057445616)
('dmey', 49.28890128276631)


In [None]:
dt_experiment_dimension = dimension_experiment(DT_classification, get_DT_error)

dt_experiment_dimension_max = get_max_accuracy(dt_experiment_dimension)

print("Decision tree wavelets experiment:")
dt_experiment_dimension_max

### K Fold Cross Validation

In [12]:
def sklearn_k_fold_cross_validation (model, x_train, y_train, k)
    scores = cross_val_score(model, x_train, y_, cv = k, scoring = ‘accuracy’)
    errors = 1 - scores
    return scores, errors

SyntaxError: invalid syntax (3753608538.py, line 1)