In [1]:
import pickle as pkl
from scipy.linalg import cho_factor, cho_solve
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from utils import *

In [2]:
from kernels import RBF, LinearKernel, LaplacianRBFKernel
from classifiers import KernelSVC, MulticlassKernelSVC, OneVsOneKernelSVC, MultivariateKernelRidgeClassifier

In [3]:
data_path = 'data/'

# Every row of Xtr.csv / Xte.csv holds 3072 feature values followed by one
# extra column that is only an artifact of the file format; `usecols` keeps
# the real features and drops that trailing column.
n_features = 3072

# Read the training data. Ytr.csv is read with its header row and only
# column 1 (the labels) is kept; squeeze() turns the (n, 1) result into a
# flat vector.
Xtr = np.array(pd.read_csv(data_path + 'Xtr.csv', header=None, sep=',', usecols=range(n_features)))
Ytr = np.array(pd.read_csv(data_path + 'Ytr.csv', sep=',', usecols=[1])).squeeze()

# Read the test data. `data_path` already ends with a separator, so the file
# name must not start with '/' (that would yield 'data//Xte.csv').
Xte = np.array(pd.read_csv(data_path + 'Xte.csv', header=None, sep=',', usecols=range(n_features)))

# define your learning algorithm here 
# # for instance, define an object called ``classifier'' 
# # classifier.train(Ytr,Xtr) 
# # predict on the test data 
# # for instance, Yte = classifier.fit(Xte) 
# Yte = {'Prediction' : Yte} 
# dataframe = pd.DataFrame(Yte)
# dataframe.index += 1
# dataframe.to_csv('Yte_pred.csv',index_label='Id') 

In [4]:
# Hold out 20% of the labelled data as a validation set. Stratifying on the
# labels preserves the class proportions in both subsets, and the fixed seed
# makes the split reproducible.
X_train, X_val, Y_train, Y_val = train_test_split(
    Xtr,
    Ytr,
    test_size=0.2,
    stratify=Ytr,
    random_state=42,
)

In [5]:
# Sanity check: the stratified split should give the same label distribution
# in the training and validation subsets. Print the per-class sample counts
# of each subset so they can be compared.
train_classes, train_counts = np.unique(Y_train, return_counts=True)
train_distribution = dict(zip(train_classes, train_counts))
print(f"Training data classes count: {train_distribution}")

val_classes, val_counts = np.unique(Y_val, return_counts=True)
val_distribution = dict(zip(val_classes, val_counts))
print(f"Validation data classes count: {val_distribution}")

Training data classes count: {0: 400, 1: 400, 2: 400, 3: 400, 4: 400, 5: 400, 6: 400, 7: 400, 8: 400, 9: 400}
Validation data classes count: {0: 100, 1: 100, 2: 100, 3: 100, 4: 100, 5: 100, 6: 100, 7: 100, 8: 100, 9: 100}


In [6]:
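# Optional per-feature standardization (currently disabled).
# NOTE: if this is re-enabled, X_val should be scaled with the mean/std computed
# on the original X_train (not with its own statistics), so that both sets go
# through the same transformation.
# X_train = (X_train - np.mean(X_train, axis=0)) / np.std(X_train, axis=0)
# X_val = (X_val - np.mean(X_val, axis=0)) / np.std(X_val, axis=0)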

In [7]:
# Data augmentation, applied to the training split only (X_val / Y_val are
# left untouched). The flip augmentation runs first and its output is fed
# straight into the rotation step, so the rotated set is built on top of the
# flipped one.
X_train_augmented, Y_train_augmented = rotate_dataset(
    *flip_augmentation(X_train, Y_train, aug_ratio=1.0),
    ratio=1.0,
)

In [8]:
# Instantiate the HOG feature extractor (presumably Histogram of Oriented
# Gradients). `hog_feature_extractor` is not defined in this notebook; it is
# expected to come from the star import of `utils` -- TODO confirm.
hog_model = hog_feature_extractor()

In [9]:
# Extract the features used by the classifier:
#  - `fit_extract` is called on the augmented training images together with
#    their labels (what exactly gets fitted is not visible here -- see the
#    extractor's implementation);
#  - `extract_features` is then called on the validation images, reusing the
#    same `hog_model` object.
X_train_hog = hog_model.fit_extract(X_train_augmented, Y_train_augmented)
X_val_hog = hog_model.extract_features(X_val)

  0%|          | 0/16000 [00:00<?, ?it/s]

100%|██████████| 16000/16000 [00:06<00:00, 2285.77it/s]
100%|██████████| 1000/1000 [00:00<00:00, 1566.87it/s]


In [10]:
# Build the model used for validation: a multivariate kernel ridge classifier
# on top of an RBF kernel. The two hyper-parameters are named here so they are
# easy to spot and tune.
rbf_sigma = 1.01      # passed to the kernel as `sigma`
ridge_lambda = 1e-7   # passed to the classifier as `lmbda`

kernel = RBF(sigma=rbf_sigma)
ridgeClf = MultivariateKernelRidgeClassifier(kernel, lmbda=ridge_lambda)

In [11]:
# Train the kernel ridge classifier on the features of the augmented training
# split, using the matching augmented labels.
ridgeClf.fit(X_train_hog, Y_train_augmented)

In [12]:
# Predict the labels of the held-out validation samples from their features.
y_pred = ridgeClf.predict(X_val_hog)

In [13]:
# Validation accuracy: fraction of validation samples whose predicted label
# equals the true label. np.mean over the boolean comparison is vectorized,
# unlike the built-in sum(), which iterates over the array element by element.
np.mean(y_pred == Y_val)

0.604

In [72]:
# Final model: repeat the pipeline on the whole labelled set (training +
# validation samples). Flip augmentation runs first and its output is passed
# directly to the rotation step.
Xtr_augmented, Ytr_augmented = rotate_dataset(
    *flip_augmentation(Xtr, Ytr, aug_ratio=1.0),
    ratio=1.0,
)

# `fit_extract` is called again on the same `hog_model` object, this time
# with the full augmented data set.
Xtr_hog = hog_model.fit_extract(Xtr_augmented, Ytr_augmented)

100%|██████████| 20000/20000 [00:10<00:00, 1993.16it/s]


In [73]:
# Re-create the kernel and the classifier for the final model trained on the
# full data set. Note that `kernel` and `ridgeClf` are rebound here, so the
# model fitted on the training split is no longer reachable afterwards.
final_sigma = 1.01     # value given to the RBF kernel as `sigma`
final_lambda = 1e-7    # value given to the classifier as `lmbda`

kernel = RBF(sigma=final_sigma)
ridgeClf = MultivariateKernelRidgeClassifier(kernel, lmbda=final_lambda)

In [74]:
# Train the final classifier on the features of the full augmented data set.
ridgeClf.fit(Xtr_hog, Ytr_augmented)

In [75]:
# Compute the predictions on the test set and write the CSV file to upload
# for the challenge.
Xte_hog = hog_model.extract_features(Xte)
Yte = ridgeClf.predict(Xte_hog)

# One prediction per test sample is expected; fail loudly before writing a
# malformed submission file.
assert len(Yte) == len(Xte), "number of predictions does not match number of test samples"

# Build the submission frame directly from the predictions instead of
# rebinding `Yte` to a dict, so `Yte` keeps referring to the prediction array.
dataframe = pd.DataFrame({'Prediction': Yte})
dataframe.index += 1  # the 'Id' column written below starts at 1, not 0
dataframe.to_csv('./Yte_pred_hog_ridgeclf_full_augm.csv', index_label='Id')

100%|██████████| 2000/2000 [00:01<00:00, 1902.37it/s]
