In [1]:
# imports 

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
import pickle
from sklearn.svm import SVC

# Preprocessing 

## Data Augmentation 

In [None]:
import random
from scipy import ndarray
import skimage as sk
from skimage import transform
from skimage import util
import random
import os
from tqdm.notebook import tqdm 

def random_rotation(image_array: ndarray):
    """Rotate ``image_array`` by a random angle.

    The angle is drawn uniformly from [-25, 25] and handed to
    ``skimage.transform.rotate`` together with the image.
    """
    angle = random.uniform(-25, 25)
    rotated = sk.transform.rotate(image_array, angle)
    return rotated

def vertical_flip(image_array: ndarray):
    """Return ``image_array`` mirrored top-to-bottom (row order reversed).

    The previous implementation rotated the image by 180 degrees, which
    mirrors it along *both* axes and therefore is not a vertical flip.
    Reversing the first axis mirrors only top-to-bottom, matches the
    slicing approach used by ``horizontal_flip``, and leaves the dtype and
    value range of the input untouched.
    """
    return image_array[::-1, :]

def horizontal_flip(image_array: ndarray):
    """Return ``image_array`` mirrored left-to-right (column order reversed)."""
    mirrored = image_array[:, ::-1]
    return mirrored

def rotflip(image_array):
    """Rotate ``image_array`` by 90 degrees (from axis 1 towards axis 0), then mirror it along axis 1."""
    rotated = np.rot90(image_array, axes=(1, 0))
    # Reversing the second axis is the same operation as np.flip(..., axis=1).
    return rotated[:, ::-1]

# Lookup table: augmentation name -> function implementing it.
available_transformations = dict(
    rotate=random_rotation,
    vertical_flip=vertical_flip,
    horizontal_flip=horizontal_flip,
)

# To fill the NaN values we combine pandas' interpolate function with sklearn's
# KNN imputer. To enlarge the training set, every image is also augmented once:
# it is rotated by 90 degrees and then flipped along its second axis (see rotflip).

# Dataset layout used below: ./Training_Dataset/character_<i>/<j>.csv
N_CLASSES = 10            # class folders character_0 ... character_9
SAMPLES_PER_CLASS = 1000  # files 1.csv ... 1000.csv inside each folder

images = []
labels = []
imputer = KNNImputer(n_neighbors=3)
for i in range(N_CLASSES):
    for j in tqdm(range(1, SAMPLES_PER_CLASS + 1)):
        path = f'./Training_Dataset/character_{i}/{j}.csv'
        raw = pd.DataFrame(np.loadtxt(path, delimiter=','))

        # Step 1: linear interpolation (returns a new frame instead of mutating in place).
        # Step 2: KNN imputation for any NaNs that are still present afterwards.
        # NOTE(review): the imputer is re-fitted on every single image; confirm that no
        # image contains an entirely-NaN column, which would presumably change its shape.
        img = imputer.fit_transform(raw.interpolate(method='linear').to_numpy())

        # Keep the cleaned image plus its rotflip-augmented copy, both with label i.
        images.extend([img, rotflip(img)])
        labels.extend([i, i])

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




In [None]:
# Convert the accumulated Python lists into NumPy arrays for saving / modelling.
images, labels = np.array(images), np.array(labels)

In [None]:
# Flatten every image into a single row so the whole set fits in one 2-D CSV,
# then write the features and the labels to separate files.
flat_images = images.reshape((len(images), -1))
np.savetxt('x_train_augment.csv', flat_images, delimiter=',')
np.savetxt('y_train_augment.csv', labels, delimiter=',')

# Train Test split + PCA

In [None]:
# Read the augmented features and labels back from the CSV files written earlier.
x_train_, y_train_ = (
    np.loadtxt(csv_name, delimiter=',')
    for csv_name in ('x_train_augment.csv', 'y_train_augment.csv')
)

In [None]:
np.random.seed(0)
# Stratified split into training and validation sets with a test ratio of 0.2.
# The seed is also passed explicitly as random_state so the split no longer depends
# on the state of NumPy's global RNG (e.g. when cells are re-run or run out of order).
x_train, x_val, y_train, y_val = train_test_split(
    x_train_,
    y_train_,
    test_size=0.2,
    shuffle=True,
    stratify=y_train_,
    random_state=0,
)

# PCA + SVM

In [None]:
# Keep only as many principal components as are needed to explain 75% of the variance.
n_var_explained = 0.75

# Fit the projection on the training split only, then apply it to both splits.
pca = PCA(n_components=n_var_explained)
pca.fit(x_train)
x_train_trans, x_val_trans = pca.transform(x_train), pca.transform(x_val)

In [None]:
np.random.seed(0)
svc = SVC()

# Hyper-parameter grid for the SVM; every combination is tried by the grid search.
parameters = {
    'gamma': ['scale', 'auto'],
    'shrinking': [True, False],
    'class_weight': [None, 'balanced'],
    'C': [0.1, 1, 10],
}

# 3-fold cross-validated search scored by macro-averaged F1.
clf = GridSearchCV(estimator=svc, param_grid=parameters, scoring='f1_macro', cv=3, verbose=1)
clf.fit(x_train_trans, y_train)

best_params_dict_svc = clf.best_params_
print(best_params_dict_svc)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:  3.7min finished


{'C': 10, 'class_weight': 'balanced', 'gamma': 'scale', 'shrinking': True}


In [None]:
# Rebuild an SVC from the grid-search winners, fit it on the training split and
# report the macro-averaged F1 score on the validation split.
best_svc_kwargs = {
    name: best_params_dict_svc[name]
    for name in ('gamma', 'shrinking', 'class_weight', 'C')
}
clf_svc = SVC(**best_svc_kwargs)
clf_svc.fit(x_train_trans, y_train)
y_val_pred_svc = clf_svc.predict(x_val_trans)
f1_score(y_val, y_val_pred_svc, average='macro')

0.9647095758142326

In [None]:
# Final fit: refit the PCA and the SVM (with the best parameters found) on the
# entire training set, i.e. training and validation splits together.
pca = PCA(n_var_explained)
pca.fit(x_train_)
x_train_trans_ = pca.transform(x_train_)

clf_svc = SVC(
    C=best_params_dict_svc['C'],
    class_weight=best_params_dict_svc['class_weight'],
    gamma=best_params_dict_svc['gamma'],
    shrinking=best_params_dict_svc['shrinking'],
)
clf_svc.fit(x_train_trans_, y_train_)

# Both models are pickled back-to-back into one file: the classifier first, then the PCA.
models = [clf_svc, pca]
with open("saved_model.pickle", "wb") as f:
    for model in models:
        pickle.dump(model, f)

# Prediction on the test set (you can run this section directly, without the preprocessing/training cells)

In [2]:
def _load_saved_models(model_path="saved_model.pickle"):
    """Return every object that was pickled back-to-back into ``model_path``, in file order."""
    models = []
    # SECURITY: pickle.load can execute arbitrary code contained in the file.
    # Only load a model file that you produced yourself with the training cells.
    # NOTE(review): the AttributeError ('_probA') recorded in this notebook's output
    # presumably comes from unpickling with a different scikit-learn version than the
    # one used for training -- confirm and keep both environments in sync.
    with open(model_path, "rb") as f:
        while True:
            try:
                models.append(pickle.load(f))
            except EOFError:
                # End of file reached: all pickled objects have been read.
                break
    return models


def _predict_from_csv(csv_path):
    """Predict a class for every sample stored in ``csv_path``.

    The CSV is transposed after loading, so each of its columns is treated as one
    sample. ``ndmin=2`` keeps the data two-dimensional even when the file holds a
    single sample (one column), which would otherwise be loaded as a 1-D vector.

    Returns an array of shape (n_samples, 1) with the predicted classes.
    """
    test = np.loadtxt(csv_path, delimiter=',', ndmin=2)
    x_test = test.T
    models = _load_saved_models()
    # Order matches how the file is written: classifier first, PCA second.
    clf_model = models[0]
    pca_model = models[1]
    x_test_trans = pca_model.transform(x_test)
    return np.expand_dims(clf_model.predict(x_test_trans), 1)


def predict_public_dataset():
    """Output predictions on the public test set.

    Loads saved_model.pickle, reads public_test.csv and returns the predicted
    classes as a column array (1000x1 for the 1000-sample test set).
    """
    return _predict_from_csv('public_test.csv')


def predict_private_dataset():
    """Output predictions on the private test set.

    Loads saved_model.pickle, reads private_test.csv and returns the predicted
    classes as a column array (1000x1 for the 1000-sample test set).
    """
    return _predict_from_csv('private_test.csv')

In [5]:
# Swap predict_public_dataset() for predict_private_dataset() to score the private test set.
y_test_pred = predict_public_dataset()

# One row per sample: ids are Sample_1 ... Sample_N, predictions are stored as integers.
sample_list = [f'Sample_{idx}' for idx in range(1, len(y_test_pred) + 1)]
result = {'Id': sample_list, 'Expected': np.squeeze(y_test_pred)}
result_df = pd.DataFrame(data=result).astype({'Expected': int})

# Predictions are saved as y_svm_final.csv
result_df.to_csv('y_svm_final.csv', index=False)



AttributeError: 'SVC' object has no attribute '_probA'

In [4]:
!conda install -c anaconda scikit-learn 

^C
