# 190705B Layer 8

In [36]:
%pip install --upgrade pip

Collecting pip
  Using cached https://files.pythonhosted.org/packages/50/c2/e06851e8cc28dcad7c155f4753da8833ac06a5c704c109313b8d5a62968a/pip-23.2.1-py3-none-any.whl
Installing collected packages: pip
  Found existing installation: pip 19.2.3
    Uninstalling pip-19.2.3:
      Successfully uninstalled pip-19.2.3
Successfully installed pip-23.2.1
Note: you may need to restart the kernel to use updated packages.


# Import all require libraries

In [32]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from catboost import CatBoostClassifier

from sklearn.metrics import accuracy_score

# Load datasets

In [2]:
train_data = pd.read_csv("train.csv")
valid_data = pd.read_csv("valid.csv")
test_data = pd.read_csv("test.csv")

In [3]:
train_data.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_763,feature_764,feature_765,feature_766,feature_767,feature_768,label_1,label_2,label_3,label_4
0,0.07181,0.068413,-0.022749,0.086143,0.026361,-0.028817,0.199237,-0.287368,-0.05956,-0.043694,...,0.004646,0.123011,0.04304,-0.042152,0.026225,0.072623,45,,1,6
1,0.03093,0.024088,0.057811,-0.230877,-0.146281,0.102807,0.128767,-0.146269,0.053893,0.055378,...,0.077742,0.081691,-0.004778,0.171727,-0.026027,0.171089,45,,1,6
2,-0.044019,-0.004626,-0.029383,-0.165376,-0.026611,-0.028142,-0.009649,-0.082088,0.018933,0.00683,...,0.076249,-0.046272,0.027831,0.028096,0.030994,0.009709,45,,1,6
3,-0.086241,0.129585,-0.013893,0.089885,-0.1003,-0.035184,0.24098,-0.128362,-0.072328,-0.019385,...,0.006934,-0.049213,0.078852,0.088323,0.168815,-0.049188,45,,1,6
4,0.126416,0.088338,0.088307,0.020371,0.174417,-0.03056,0.181163,-0.009382,0.085396,0.015823,...,-0.028883,0.110844,-0.041875,0.025686,0.003534,-0.104945,45,,1,6


# Getting train and valid data standardized after splitting features and labels

In [4]:
labels = ['label_1', 'label_2', 'label_3', 'label_4']
X_train = {}
X_valid = {}
y_train = {}
y_valid = {}
X_test = {}

for label in labels:
    robust_scaler = RobustScaler()
    scaler = StandardScaler()
    tr_df = train_data
    val_df = valid_data
    tst_df = test_data
    if label == 'label_2': # Remove NaN rows for label_2
      tr_df = train_data[train_data[label].notna()]
      val_df = valid_data[valid_data[label].notna()]

    X_train[label] = pd.DataFrame(scaler.fit_transform(tr_df.iloc[:, :-4]))
    X_valid[label] = pd.DataFrame(scaler.transform(val_df.iloc[:, :-4]))
    X_test = pd.DataFrame(scaler.transform(tst_df.iloc[:, 1:]))

    y_train[label] = tr_df[label]
    y_valid[label] = val_df[label]

# Cross Validation

In [5]:
def cross_validation(model, X_train, y_train):
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    print("Cross-validation scores: {}".format(scores))
    print("Mean Accuracy: {:.6f}".format(scores.mean()))
    print("Standard deviation of accuracy: {:.6f}".format(scores.std()))
    

In [6]:
cross_validation(SVC(), X_train['label_1'], y_train['label_1'])

Cross-validation scores: [0.9279453  0.93443198 0.93092567 0.93039972 0.93057504]
Mean Accuracy: 0.930856
Standard deviation of accuracy: 0.002076


In [7]:
cross_validation(SVC(), X_train['label_2'], y_train['label_2'])

Cross-validation scores: [0.37963623 0.59771755 0.64782454 0.63694722 0.45096291]
Mean Accuracy: 0.542618
Standard deviation of accuracy: 0.107672


In [8]:
cross_validation(SVC(), X_train['label_3'], y_train['label_3'])

Cross-validation scores: [0.98106592 0.99474053 0.98474755 0.97650771 0.98965638]
Mean Accuracy: 0.985344
Standard deviation of accuracy: 0.006382


In [9]:
cross_validation(SVC(), X_train['label_4'], y_train['label_4'])

Cross-validation scores: [0.91006311 0.91058906 0.93110098 0.93671108 0.92145863]
Mean Accuracy: 0.921985
Standard deviation of accuracy: 0.010698


# Dimensionality Reduction using PCA

In [10]:
PCA_X_train = {}
PCA_X_valid = {}
PCA_X_test = {}

def PCA_feature_selection(L, training_features, validation_features):
    variance_threshold = 0.99

    # Apply PCA with the determined number of components
    pca = PCA(n_components=variance_threshold, svd_solver='full')

    pca_train_result = pca.fit_transform(training_features[L])
    pca_valid_result = pca.transform(validation_features[L])
    pca_test_result = pca.transform(X_test)

    explained_variance_ratio_reduced = pca.explained_variance_ratio_
    
    # Plot explained variance ratio
    # plt.figure(figsize=(18, 10))
    # plt.bar(range(1, pca_train_result.shape[1] + 1), explained_variance_ratio_reduced)
    # plt.xlabel('Principal Component')
    # plt.ylabel('Explained Variance Ratio')
    # plt.title('Explained Variance Ratio per Principal Component (Reduced)')
    # plt.show()

    # Display the reduced train feature matrix
    print("Reduced Train feature matrix shape: {}".format(pca_train_result.shape))
    # Display the reduced valid feature matrix
    print("Reduced Valid feature matrix shape: {}".format(pca_valid_result.shape))

    PCA_X_train[L] = pd.DataFrame(pca_train_result)
    PCA_X_valid[L] = pd.DataFrame(pca_valid_result)
    PCA_X_test[L] = pd.DataFrame(pca_test_result)

In [11]:
for label in labels:
    PCA_feature_selection(label, X_train, X_valid)

Reduced Train feature matrix shape: (28520, 572)
Reduced Valid feature matrix shape: (750, 572)
Reduced Train feature matrix shape: (28040, 571)
Reduced Valid feature matrix shape: (736, 571)
Reduced Train feature matrix shape: (28520, 572)
Reduced Valid feature matrix shape: (750, 572)
Reduced Train feature matrix shape: (28520, 572)
Reduced Valid feature matrix shape: (750, 572)


# Checking Accuracy before Hyperparameter tuning

In [12]:
def get_accuracy_for_model(model, label, X_train, X_valid, y_train, y_valid):
    model.fit(X_train[label], y_train[label])
    y_pred = model.predict(X_valid[label])
    return accuracy_score(y_valid[label], y_pred)

Using SVC

In [13]:
for label in labels:
    print("Accuracy after PCA for label_{}: {}".format(label[-1], get_accuracy_for_model(SVC(),label, PCA_X_train, PCA_X_valid, y_train, y_valid)))

Accuracy after PCA for label_1: 0.932
Accuracy after PCA for label_2: 0.8953804347826086
Accuracy after PCA for label_3: 0.9986666666666667
Accuracy after PCA for label_4: 0.9386666666666666


Using CatBoost

In [33]:
for label in labels:
    print("Accuracy after PCA for label_{}: {}".format(label[-1], get_accuracy_for_model(CatBoostClassifier(iterations=500, depth=6, learning_rate=0.1, task_type='GPU'),label, PCA_X_train, PCA_X_valid, y_train, y_valid)))

0:	learn: 3.9356370	total: 258ms	remaining: 2m 8s
1:	learn: 3.8043421	total: 524ms	remaining: 2m 10s
2:	learn: 3.6895197	total: 773ms	remaining: 2m 8s
3:	learn: 3.5981463	total: 1.03s	remaining: 2m 7s
4:	learn: 3.5130503	total: 1.28s	remaining: 2m 6s
5:	learn: 3.4320803	total: 1.53s	remaining: 2m 6s
6:	learn: 3.3599464	total: 1.79s	remaining: 2m 5s
7:	learn: 3.2987481	total: 2.03s	remaining: 2m 4s
8:	learn: 3.2258114	total: 2.29s	remaining: 2m 5s
9:	learn: 3.1614114	total: 2.55s	remaining: 2m 5s
10:	learn: 3.1053630	total: 2.8s	remaining: 2m 4s
11:	learn: 3.0534044	total: 3.05s	remaining: 2m 3s
12:	learn: 2.9998137	total: 3.3s	remaining: 2m 3s
13:	learn: 2.9538309	total: 3.54s	remaining: 2m 2s
14:	learn: 2.9081157	total: 3.79s	remaining: 2m 2s
15:	learn: 2.8700934	total: 4.03s	remaining: 2m 1s
16:	learn: 2.8283825	total: 4.28s	remaining: 2m 1s
17:	learn: 2.7863698	total: 4.53s	remaining: 2m 1s
18:	learn: 2.7447649	total: 4.78s	remaining: 2m 1s
19:	learn: 2.6957187	total: 5.05s	remainin

# Hyperparameter Tuning

In [14]:
def random_search(model, param_dist, X_train, y_train):
    # Create a RandomizedSearchCV object
    random_search_best_model = RandomizedSearchCV(
        model, param_distributions=param_dist, n_iter=2, 
        scoring='accuracy', cv=2, random_state=42, n_jobs=-1, verbose=2
    )

    # Perform the random search on your training data
    random_search_best_model.fit(X_train, y_train)

    # Print the best hyperparameters and the corresponding accuracy
    print("Best Hyperparameters:", random_search_best_model.best_params_)
    print("Best Accuracy:", random_search_best_model.best_score_)

    # Get the best SVM model with the optimized hyperparameters
    return random_search_best_model

In [15]:
best_model_l1 = random_search(SVC(), 
              {'C': [100,10,1,0.1,0.01,0.001],     
               'kernel': ['rbf','linear'],       
               'gamma': [0.001, 0.01, 0.1, 1, 10]},       
               PCA_X_train['label_1'], y_train['label_1'])

Fitting 2 folds for each of 2 candidates, totalling 4 fits
Best Hyperparameters: {'kernel': 'rbf', 'gamma': 0.001, 'C': 100}
Best Accuracy: 0.938218793828892


In [16]:
print("Accuracy for label_1 after Hyperparameter tuning:", best_model_l1.score(PCA_X_valid['label_1'],y_valid['label_1']))

Accuracy for label_1 after Hyperparameter tuning: 0.972


In [17]:
best_model_l2 = random_search(SVC(),
                {'C': [100,10,1,.1,0.01,0.0010],
                    'kernel': ['poly','linear','rbf'],
                    'gamma': [0.001,0.01, 0.1, 1, 10],
                    'class_weight': ['balanced', None],},
                PCA_X_train['label_2'], y_train['label_2'])

Fitting 2 folds for each of 2 candidates, totalling 4 fits
Best Hyperparameters: {'kernel': 'poly', 'gamma': 10, 'class_weight': 'balanced', 'C': 10}
Best Accuracy: 0.2597004279600571


In [18]:
print("Accuracy for label_2 after Hyperparameter tuning:", best_model_l2.score(PCA_X_valid['label_2'],y_valid['label_2']))

Accuracy for label_2 after Hyperparameter tuning: 0.9347826086956522


In [19]:
best_model_l3 = random_search(SVC(),
                {'C': [100,10,1,0.1,0.01,0.001],
                    'kernel': ['rbf','linear'],
                    'gamma': [0.001, 0.01, 0.1, 1, 10],
                    'class_weight': ['balanced']},
                PCA_X_train['label_3'], y_train['label_3'])

Fitting 2 folds for each of 2 candidates, totalling 4 fits
Best Hyperparameters: {'kernel': 'rbf', 'gamma': 0.001, 'class_weight': 'balanced', 'C': 100}
Best Accuracy: 0.9726507713884993


In [20]:
print("Accuracy for label_3 after Hyperparameter tuning:", best_model_l3.score(PCA_X_valid['label_3'],y_valid['label_3']))

Accuracy for label_3 after Hyperparameter tuning: 0.9973333333333333


In [21]:
best_model_l4 = random_search(SVC(),
                {   'C': [100,10,1,0.1,0.01,0.001],
                    'kernel': ['rbf','linear'],
                    'gamma': [0.001, 0.01, 0.1, 1, 10],
                    'class_weight': ['balanced']},
                PCA_X_train['label_4'], y_train['label_4'])

Fitting 2 folds for each of 2 candidates, totalling 4 fits
Best Hyperparameters: {'kernel': 'rbf', 'gamma': 0.001, 'class_weight': 'balanced', 'C': 100}
Best Accuracy: 0.8127629733520336


In [22]:
print("Accuracy for label_4 after Hyperparameter tuning:", best_model_l4.score(PCA_X_valid['label_4'],y_valid['label_4']))

Accuracy for label_4 after Hyperparameter tuning: 0.9693333333333334


# Geeting Predictions

In [23]:
y_pred_l1 = best_model_l1.predict(PCA_X_test['label_1'])

In [24]:
y_pred_l2 = best_model_l2.predict(PCA_X_test['label_2'])

In [25]:
y_pred_l3 = best_model_l3.predict(PCA_X_test['label_3'])

In [26]:
y_pred_l4 = best_model_l4.predict(PCA_X_test['label_4'])

Making predications dataframes

In [27]:
id = pd.DataFrame(test_data.iloc[:, :1])
y_pred_l1 = pd.DataFrame(y_pred_l1)
y_pred_l2 = pd.DataFrame(y_pred_l2)
y_pred_l3 = pd.DataFrame(y_pred_l3)
y_pred_l4 = pd.DataFrame(y_pred_l4)

In [28]:
y_pred_l1.columns = ['label_1']
y_pred_l2.columns = ['label_2']
y_pred_l3.columns = ['label_3']
y_pred_l4.columns = ['label_4']

In [29]:
final_pred = pd.concat([id, y_pred_l1, y_pred_l2, y_pred_l3, y_pred_l4], axis=1)

In [30]:
final_pred

Unnamed: 0,ID,label_1,label_2,label_3,label_4
0,1,26,22.0,0,2
1,2,18,25.0,1,8
2,3,16,30.0,1,6
3,4,7,25.0,1,6
4,5,58,29.0,0,6
...,...,...,...,...,...
739,740,31,24.0,1,6
740,741,35,24.0,1,2
741,742,54,27.0,1,6
742,743,38,32.0,1,12


In [31]:
#create submission file
final_pred.to_csv('190705B_layer_8.csv', index=False)