# Import dependencies

In [3]:
import os
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder


from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier

In [4]:
# !pip install gdown

# Download and import dataset

In [5]:
# Download the training CSV from Google Drive by file ID. Per the log below,
# gdown stores it under the Drive-side name 'train_data (4).csv' (about 1 GB).
! gdown 1G93DGgD2Xw58be5PNOGT8HB_YSWrWll7

Downloading...
From: https://drive.google.com/uc?id=1G93DGgD2Xw58be5PNOGT8HB_YSWrWll7
To: /content/train_data (4).csv
100% 1.04G/1.04G [00:10<00:00, 95.8MB/s]


In [6]:
# Download the test CSV from Google Drive by file ID. Per the log below it is
# saved as 'test_data_flatten.csv'.
! gdown 1Ku1_9Y7zQaDmYv8jdDVAtD289cIrGuon

Downloading...
From: https://drive.google.com/uc?id=1Ku1_9Y7zQaDmYv8jdDVAtD289cIrGuon
To: /content/test_data_flatten.csv
100% 116M/116M [00:02<00:00, 44.1MB/s]


In [9]:
# Rename the download to drop the ' (4)' suffix, so the training file can be
# read as 'train_data.csv'.
! mv 'train_data (4).csv' 'train_data.csv'

In [10]:
# Load the renamed training CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer='train_data.csv')

In [8]:
# Load the downloaded test CSV into a DataFrame.
data_test = pd.read_csv(filepath_or_buffer='test_data_flatten.csv')

## Training set

In [12]:
# Separate the target from the inputs: 'Label' is the target column and every
# other column is kept as a feature.
y_train = data['Label']
X_train = data.drop(columns='Label')

In [13]:
X_train = X_train.to_numpy()
X_train = X_train / np.linalg.norm(X_train)

In [14]:
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)

## Testing set

In [15]:
X_test = data_test.drop(columns=['Label'])
y_test = data_test['Label']

In [16]:
X_test = X_test.to_numpy()
X_test = X_test / np.linalg.norm(X_test)

In [17]:
# Encode the test labels with the encoder fitted on the training labels, so
# both sets share the same label-to-integer mapping.
y_test = label_encoder.transform(y_test)

# GridSearch for RFC

In [18]:
# Random-forest search space: 3 x 8 x 5 = 120 candidate settings.
RFC_parameters = dict(
    n_estimators=[100, 150, 200],
    max_depth=[5, 10, 20, 50, 100, 200, 500, 1000],
    max_features=['sqrt', 'log2', 50, 100, 200],
)

In [22]:
# Base random forest handed to the grid search; n_jobs=-1 requests all
# available cores and verbose=1 turns on progress output.
RFC = RandomForestClassifier(verbose=1, n_jobs=-1)

In [29]:
# Exhaustive 5-fold cross-validated search over RFC_parameters.
RFC_GridSearch = GridSearchCV(
    estimator=RFC,
    param_grid=RFC_parameters,
    cv=5,
)

# GridSearch for KNN

In [32]:
# k-nearest-neighbours search space: 5 x 2 x 3 = 30 candidate settings.
KNN_parameters = dict(
    n_neighbors=[2, 3, 4, 5, 6],
    weights=['uniform', 'distance'],
    p=[1, 2, 3],
)

In [33]:
# Base k-nearest-neighbours classifier handed to the grid search; n_jobs=-1
# requests all available cores.
KNN = KNeighborsClassifier(n_jobs=-1)

In [34]:
# Exhaustive 5-fold cross-validated search over KNN_parameters, with very
# detailed progress logging (verbose=100).
KNN_GridSearch = GridSearchCV(
    estimator=KNN,
    param_grid=KNN_parameters,
    verbose=100,
    cv=5,
)

In [None]:
# Run the KNN search on the scaled training data (30 candidates x 5 folds).
KNN_GridSearch.fit(X=X_train, y=y_train)

# GridSearch for SGD Classifier

In [None]:
# SGD search space. Single-valued entries are fixed settings; the varying
# ones give 4 x 4 x 2 x 2 = 64 candidates.
# NOTE(review): random_state is searched like a hyper-parameter, and eta0 may
# have no effect under the 'optimal' schedule - confirm both are intended.
SGD_parameters = dict(
    max_iter=[2000],
    eta0=[0.001, 0.003, 0.005, 0.0005],
    random_state=[0, 1, 123, 42],
    penalty=['l2', 'l1'],
    n_jobs=[-1],
    learning_rate=['optimal', 'adaptive'],
    early_stopping=[True],
    validation_fraction=[0.2],
)

In [None]:
# Base linear classifier for the grid search. The voting ensemble built later
# uses voting='soft', which needs predict_proba from every member; the default
# hinge loss does not provide it, so use 'modified_huber', which does.
SGD_Classifier = SGDClassifier(loss='modified_huber')

In [None]:
# Exhaustive 5-fold cross-validated search over SGD_parameters.
SGD_GridSearch = GridSearchCV(
    estimator=SGD_Classifier,
    param_grid=SGD_parameters,
    cv=5,
)

# Voting Classifier

In [None]:
# Fit each grid search on the training data, skipping any that is already
# fitted. KNN_GridSearch is fitted in its own section above, so refitting it
# here would repeat the entire KNN search for no benefit. A fitted search is
# recognised by the best_estimator_ attribute that fit() sets (refit=True is
# the GridSearchCV default).
for grid_search in (RFC_GridSearch, KNN_GridSearch, SGD_GridSearch):
    if not hasattr(grid_search, 'best_estimator_'):
        grid_search.fit(X_train, y_train)

In [None]:
# Pull the winning, refitted model out of each completed grid search.
best_RFC_estimator, best_KNN_estimator, best_SGD_estimator = (
    fitted_search.best_estimator_
    for fitted_search in (RFC_GridSearch, KNN_GridSearch, SGD_GridSearch)
)

In [None]:
# Combine the three tuned models into one ensemble. Each member is a separate
# (name, estimator) tuple; the tuples must be comma-separated, otherwise
# Python tries to call the previous tuple and raises TypeError.
# Soft voting averages predicted class probabilities, so every member must
# expose predict_proba.
clf = VotingClassifier(
    estimators=[
        ('RFC', best_RFC_estimator),
        ('KNN', best_KNN_estimator),
        ('SGD', best_SGD_estimator),
    ],
    voting='soft',
)

In [None]:
# Train the voting ensemble on the full scaled training set.
clf.fit(X=X_train, y=y_train)