In [29]:
%pip install catboost

Collecting catboost
  Downloading https://files.pythonhosted.org/packages/09/a8/6f6b224cf2efadf20ea47eac3a5d757cffb6452fac01f12ef28266edbb19/catboost-1.2.2-cp38-cp38-win_amd64.whl (101.1MB)
Collecting graphviz (from catboost)
  Downloading https://files.pythonhosted.org/packages/de/5e/fcbb22c68208d39edff467809d06c9d81d7d27426460ebc598e55130c1aa/graphviz-0.20.1-py3-none-any.whl (47kB)
Collecting plotly (from catboost)
  Using cached https://files.pythonhosted.org/packages/df/79/c80174d711ee26ee5da55a9cc3e248f1ec7a0188b5e4d6bbbbcd09b974b0/plotly-5.17.0-py2.py3-none-any.whl
Collecting tenacity>=6.2.0 (from plotly->catboost)
  Downloading https://files.pythonhosted.org/packages/f4/f1/990741d5bb2487d529d20a433210ffa136a367751e454214013b441c4575/tenacity-8.2.3-py3-none-any.whl
Installing collected packages: graphviz, tenacity, plotly, catboost
Successfully installed catboost-1.2.2 graphviz-0.20.1 plotly-5.17.0 tenacity-8.2.3
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'python -m pip install --upgrade pip' command.


# 190705B Layer 11

# Import all require libraries

In [30]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from catboost import CatBoostClassifier

from sklearn.metrics import accuracy_score

# Load datasets

In [31]:
train_data = pd.read_csv("train.csv")
valid_data = pd.read_csv("valid.csv")
test_data = pd.read_csv("test.csv")

In [32]:
train_data.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_763,feature_764,feature_765,feature_766,feature_767,feature_768,label_1,label_2,label_3,label_4
0,0.031138,0.079892,0.157382,-0.014636,-0.051778,-0.021332,-0.073593,-0.005386,-0.212557,0.099683,...,-0.085248,-0.096007,-0.000766,0.021399,-0.041432,0.094806,45,,1,6
1,0.11304,0.175731,0.217741,-0.196254,-0.010129,-0.030586,0.067114,-0.072412,-0.239192,0.104741,...,-0.090283,-0.053885,-0.010967,0.062209,-0.122958,0.192949,45,,1,6
2,0.04857,0.091281,0.160776,-0.150937,0.020115,0.044117,-0.050092,-0.045661,-0.155332,0.117206,...,-0.021524,-0.008411,-0.006248,0.031468,-0.056915,0.154731,45,,1,6
3,0.039212,0.118388,0.173831,-0.096659,-0.008702,0.061298,0.008974,-0.003277,-0.065046,0.09548,...,-0.071936,-0.02312,-0.007812,0.0576,-0.121892,0.072796,45,,1,6
4,0.056019,0.170639,0.157917,-0.228605,-0.065965,-0.088732,-0.082243,-0.080568,-0.3415,0.14243,...,-0.155621,-0.079447,0.015316,0.127726,-0.151966,0.169634,45,,1,6


# Getting train and valid data standardized after splitting features and labels

In [33]:
labels = ['label_1', 'label_2', 'label_3', 'label_4']
X_train = {}
X_valid = {}
y_train = {}
y_valid = {}
X_test = {}

for label in labels:
    robust_scaler = RobustScaler()
    scaler = StandardScaler()
    tr_df = train_data
    val_df = valid_data
    tst_df = test_data
    if label == 'label_2': # Remove NaN rows for label_2
      tr_df = train_data[train_data[label].notna()]
      val_df = valid_data[valid_data[label].notna()]

    X_train[label] = pd.DataFrame(scaler.fit_transform(tr_df.iloc[:, :-4]))
    X_valid[label] = pd.DataFrame(scaler.transform(val_df.iloc[:, :-4]))
    X_test = pd.DataFrame(scaler.transform(tst_df.iloc[:, 1:]))

    y_train[label] = tr_df[label]
    y_valid[label] = val_df[label]

# Dimensionality Reduction using PCA

In [34]:
PCA_X_train = {}
PCA_X_valid = {}
PCA_X_test = {}

def PCA_feature_selection(L, training_features, validation_features):
    variance_threshold = 0.99

    # Apply PCA with the determined number of components
    pca = PCA(n_components=variance_threshold, svd_solver='full')

    pca_train_result = pca.fit_transform(training_features[L])
    pca_valid_result = pca.transform(validation_features[L])
    pca_test_result = pca.transform(X_test)

    explained_variance_ratio_reduced = pca.explained_variance_ratio_
    
    # Plot explained variance ratio
    # plt.figure(figsize=(18, 10))
    # plt.bar(range(1, pca_train_result.shape[1] + 1), explained_variance_ratio_reduced)
    # plt.xlabel('Principal Component')
    # plt.ylabel('Explained Variance Ratio')
    # plt.title('Explained Variance Ratio per Principal Component (Reduced)')
    # plt.show()

    # Display the reduced train feature matrix
    print("Reduced Train feature matrix shape: {}".format(pca_train_result.shape))
    # Display the reduced valid feature matrix
    print("Reduced Valid feature matrix shape: {}".format(pca_valid_result.shape))

    PCA_X_train[L] = pd.DataFrame(pca_train_result)
    PCA_X_valid[L] = pd.DataFrame(pca_valid_result)
    PCA_X_test[L] = pd.DataFrame(pca_test_result)

In [35]:
for label in labels:
    PCA_feature_selection(label, X_train, X_valid)

Reduced Train feature matrix shape: (28520, 386)
Reduced Valid feature matrix shape: (750, 386)
Reduced Train feature matrix shape: (28040, 385)
Reduced Valid feature matrix shape: (736, 385)
Reduced Train feature matrix shape: (28520, 386)
Reduced Valid feature matrix shape: (750, 386)
Reduced Train feature matrix shape: (28520, 386)
Reduced Valid feature matrix shape: (750, 386)


# Checking Accuracy before Hyperparameter tuning

In [36]:
def get_accuracy_for_model(model, label, X_train, X_valid, y_train, y_valid):
    model.fit(X_train[label], y_train[label])
    y_pred = model.predict(X_valid[label])
    return accuracy_score(y_valid[label], y_pred)

Using SVC

In [8]:
for label in labels:
    print("Accuracy after PCA for label_{}: {}".format(label[-1], get_accuracy_for_model(SVC(),label, PCA_X_train, PCA_X_valid, y_train, y_valid)))

Accuracy after PCA for label_1: 0.9026666666666666
Accuracy after PCA for label_2: 0.84375
Accuracy after PCA for label_3: 0.9973333333333333
Accuracy after PCA for label_4: 0.8986666666666666


Using CatBoost

In [39]:
for label in labels:
    print("Accuracy after PCA for label_{}: {}".format(label[-1], get_accuracy_for_model(CatBoostClassifier(iterations=500, depth=6, learning_rate=0.1, task_type='GPU'),label, PCA_X_train, PCA_X_valid, y_train, y_valid)))

0:	learn: 3.9708365	total: 218ms	remaining: 1m 48s
1:	learn: 3.8647870	total: 407ms	remaining: 1m 41s
2:	learn: 3.7792520	total: 593ms	remaining: 1m 38s
3:	learn: 3.7030694	total: 775ms	remaining: 1m 36s
4:	learn: 3.6200224	total: 961ms	remaining: 1m 35s
5:	learn: 3.5421083	total: 1.15s	remaining: 1m 34s
6:	learn: 3.4783406	total: 1.33s	remaining: 1m 33s
7:	learn: 3.4116774	total: 1.51s	remaining: 1m 33s
8:	learn: 3.3521145	total: 1.69s	remaining: 1m 32s
9:	learn: 3.2904187	total: 1.88s	remaining: 1m 31s
10:	learn: 3.2318581	total: 2.06s	remaining: 1m 31s
11:	learn: 3.1760856	total: 2.24s	remaining: 1m 31s
12:	learn: 3.1173223	total: 2.43s	remaining: 1m 30s
13:	learn: 3.0651560	total: 2.61s	remaining: 1m 30s
14:	learn: 3.0144211	total: 2.8s	remaining: 1m 30s
15:	learn: 2.9582919	total: 2.99s	remaining: 1m 30s
16:	learn: 2.9196321	total: 3.17s	remaining: 1m 30s
17:	learn: 2.8755087	total: 3.35s	remaining: 1m 29s
18:	learn: 2.8293889	total: 3.53s	remaining: 1m 29s
19:	learn: 2.7868678	to

# Hyperparameter Tuning

In [9]:
def random_search(model, param_dist, X_train, y_train):
    # Create a RandomizedSearchCV object
    random_search_best_model = RandomizedSearchCV(
        model, param_distributions=param_dist, n_iter=2, 
        scoring='accuracy', cv=2, random_state=42, n_jobs=-1, verbose=2
    )

    # Perform the random search on your training data
    random_search_best_model.fit(X_train, y_train)

    # Print the best hyperparameters and the corresponding accuracy
    print("Best Hyperparameters:", random_search_best_model.best_params_)
    print("Best Accuracy:", random_search_best_model.best_score_)

    # Get the best SVM model with the optimized hyperparameters
    return random_search_best_model

In [10]:
best_model_l1 = random_search(SVC(), 
              {'C': [100,10,1,0.1,0.01,0.001],     
               'kernel': ['rbf','linear'],       
               'gamma': [0.001, 0.01, 0.1, 1, 10]},       
               PCA_X_train['label_1'], y_train['label_1'])

Fitting 2 folds for each of 2 candidates, totalling 4 fits
Best Hyperparameters: {'kernel': 'rbf', 'gamma': 0.001, 'C': 100}
Best Accuracy: 0.9143408134642357


In [11]:
print("Accuracy for label_1 after Hyperparameter tuning:", best_model_l1.score(PCA_X_valid['label_1'],y_valid['label_1']))

Accuracy for label_1 after Hyperparameter tuning: 0.9493333333333334


In [12]:
best_model_l2 = random_search(SVC(),
                {'C': [100,10,1,.1,0.01,0.0010],
                    'kernel': ['poly','linear','rbf'],
                    'gamma': [0.001,0.01, 0.1, 1, 10],
                    'class_weight': ['balanced', None],},
                PCA_X_train['label_2'], y_train['label_2'])

Fitting 2 folds for each of 2 candidates, totalling 4 fits
Best Hyperparameters: {'kernel': 'poly', 'gamma': 10, 'class_weight': 'balanced', 'C': 10}
Best Accuracy: 0.2402639087018545


In [13]:
print("Accuracy for label_2 after Hyperparameter tuning:", best_model_l2.score(PCA_X_valid['label_2'],y_valid['label_2']))

Accuracy for label_2 after Hyperparameter tuning: 0.8573369565217391


In [14]:
best_model_l3 = random_search(SVC(),
                {'C': [100,10,1,0.1,0.01,0.001],
                    'kernel': ['rbf','linear'],
                    'gamma': [0.001, 0.01, 0.1, 1, 10],
                    'class_weight': ['balanced']},
                PCA_X_train['label_3'], y_train['label_3'])

Fitting 2 folds for each of 2 candidates, totalling 4 fits
Best Hyperparameters: {'kernel': 'rbf', 'gamma': 0.001, 'class_weight': 'balanced', 'C': 100}
Best Accuracy: 0.9685483870967742


In [15]:
print("Accuracy for label_3 after Hyperparameter tuning:", best_model_l3.score(PCA_X_valid['label_3'],y_valid['label_3']))

Accuracy for label_3 after Hyperparameter tuning: 0.9973333333333333


In [16]:
best_model_l4 = random_search(SVC(),
                {   'C': [100,10,1,0.1,0.01,0.001],
                    'kernel': ['rbf','linear'],
                    'gamma': [0.001, 0.01, 0.1, 1, 10],
                    'class_weight': ['balanced']},
                PCA_X_train['label_4'], y_train['label_4'])

Fitting 2 folds for each of 2 candidates, totalling 4 fits
Best Hyperparameters: {'kernel': 'rbf', 'gamma': 0.001, 'class_weight': 'balanced', 'C': 100}
Best Accuracy: 0.7855539971949509


In [17]:
print("Accuracy for label_4 after Hyperparameter tuning:", best_model_l4.score(PCA_X_valid['label_4'],y_valid['label_4']))

Accuracy for label_4 after Hyperparameter tuning: 0.956


# Geeting Predictions

In [18]:
y_pred_l1 = best_model_l1.predict(PCA_X_test['label_1'])

In [19]:
y_pred_l2 = best_model_l2.predict(PCA_X_test['label_2'])

In [20]:
y_pred_l3 = best_model_l3.predict(PCA_X_test['label_3'])

In [21]:
y_pred_l4 = best_model_l4.predict(PCA_X_test['label_4'])

Making predications dataframes

In [22]:
id = pd.DataFrame(test_data.iloc[:, :1])
y_pred_l1 = pd.DataFrame(y_pred_l1)
y_pred_l2 = pd.DataFrame(y_pred_l2)
y_pred_l3 = pd.DataFrame(y_pred_l3)
y_pred_l4 = pd.DataFrame(y_pred_l4)

In [23]:
y_pred_l1.columns = ['label_1']
y_pred_l2.columns = ['label_2']
y_pred_l3.columns = ['label_3']
y_pred_l4.columns = ['label_4']

In [24]:
final_pred = pd.concat([id, y_pred_l1, y_pred_l2, y_pred_l3, y_pred_l4], axis=1)

In [25]:
final_pred

Unnamed: 0,ID,label_1,label_2,label_3,label_4
0,1,26,22.0,0,2
1,2,18,25.0,1,8
2,3,16,30.0,1,6
3,4,7,27.0,1,6
4,5,58,29.0,0,6
...,...,...,...,...,...
739,740,26,24.0,1,6
740,741,35,24.0,1,2
741,742,54,23.0,1,6
742,743,38,32.0,1,12


In [26]:
#create submission file
final_pred.to_csv('190705B_layer_11.csv', index=False)