# Import Libaries

In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
import time

In [2]:
df = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
submission = pd.read_csv("sample_submission.csv")

In [3]:
df

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41996,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41997,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41998,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
[col for col in df.columns if df[col].isnull().sum()>0]

[]

# Initial Evaluation

In [None]:
y = df["label"]
X = df.iloc[:, 1:]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30,random_state=42,stratify=y)

In [None]:
clf = SVC()
start = time.time()
clf.fit(X_train, y_train)
end = time.time()
print(end - start)
y_pred = clf.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred)

#### 97% Accuracy. This model is too good to be true :D

In [None]:
preds = clf.predict(test)
submission.iloc[:, 1] = preds
submission.to_csv("submission.csv", index=False)

### As expected, This model does not perform well on unseen data :/

## Data Preprocessing

In [None]:
###Check whether the training data is biased or not
df["label"].value_counts()

### Data for label 1 is 4684(which is max) and for label 5 is 3795(which is min) meaning data is biased towards 1

In [None]:
#As the data is pixel data so its values varies from 0-255 we to standardize all the features
scalar = StandardScaler()
scalar.fit(X_train)
X_train = scalar.transform(X_train)
X_test =  scalar.transform(X_test)
test = scalar.transform(test)

In [None]:
X_train

In [None]:
clf = SVC(kernel='poly') ## I am using poly kernel since it is popular is image
start = time.time()
clf.fit(X_train, y_train)
end = time.time()
print(end - start)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
preds = clf.predict(test)
submission.iloc[:, 1] = preds
submission.to_csv("submission_01.csv", index=False)

### Phew using kernel = 'Poly' did the trick. In one shot we reached from 0 to 0.83771 accuracy on kaggle

## Hyperparameter Tunning

In [None]:
clf = SVC(decision_function_shape='ovo', kernel='poly')
start = time.time()
clf.fit(X_train, y_train)
end = time.time()
print(end - start)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
preds = clf.predict(test)
submission.iloc[:, 1] = preds
submission.to_csv("submission_02.csv", index=False)

#### using OVO as decision_function_shape decreased accuracy of the model, i will not be using this one

In [None]:
parameters = {'C':[0.001, 0.1, 100, 10000], 'gamma':[10,1,0.1,0.01]}
grid = GridSearchCV(SVC(kernel='poly'), param_grid=parameters, cv =2)
grid.fit(X_train, y_train)
print("Score= {}").format(grid.score(X_test, y_test))

In [None]:
grid.best_params_ ##best parameters c=0.001 gamma=10

In [None]:
clf = SVC(C=0.001, kernel='poly', gamma=10)
start = time.time()
clf.fit(X_train, y_train)
end = time.time()
print(end - start)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
preds = clf.predict(test)
submission.iloc[:, 1] = preds
submission.to_csv("submission_03.csv", index=False)

#### I am changing Scaling also

In [5]:
#Instead of using StandardScalar lets try to create range of this data between 0 and 1
def scaling(col):
    return (col-col.min())/(col.max()-col.min())

In [6]:
y = df["label"]
X = df.iloc[:, 1:]
X = X.apply(scaling)
test = test.apply(scaling)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30,random_state=42,stratify=y)

In [7]:
X_train =X_train.fillna(value=0, axis=1)
X_test = X_test.fillna(value=0, axis=1)
test = test.fillna(value=0, axis=1)

In [8]:
clf = SVC(C=0.001, kernel='poly', gamma=10)
start = time.time()
clf.fit(X_train, y_train)
end = time.time()
print(end - start)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

221.66533994674683


0.9703174603174604

In [9]:
preds = clf.predict(test)
submission.iloc[:, 1] = preds
submission.to_csv("submission_04.csv", index=False)

In [None]:
test.shape

In [None]:
X_train.shape

In [None]:
X_test.shape
