# Combine all models
MSc in Statistical Science\
University of Oxford\
Group-assessed practical\
HT 2024

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from search_param.grid_search import read_data, decode_dict
from fit_models import fit_model
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score
import shutil
import json
import time

## Data processing

In [2]:
X_train, X_val, y_train, y_val = read_data()

In [3]:
# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit label encoder and return encoded labels
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)

# Convert labels to one-hot encoding
y_train_onehot = to_categorical(y_train_encoded)
y_val_onehot = to_categorical(y_val_encoded)

In [4]:
# Standard scaling
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)
X_val_sc = scaler.transform(X_val)

In [5]:
# LDA
lda = LinearDiscriminantAnalysis(n_components=None)
X_train_lda = lda.fit_transform(X_train, y_train)
X_val_lda = lda.transform(X_val)

In [6]:
# PCA
p_PCA = 25 # from notebook pictures
pca = PCA(n_components=p_PCA, random_state=42)  # Select top 25 components
X_train_pca = pca.fit_transform(X_train_sc)
X_val_pca = pca.transform(X_val_sc)

## Create dataframe

In [7]:
column_names = ['Model', 'Pre-processing', 'Training time (sec)', 'Training acc', 'Testing acc']

performance = pd.DataFrame({col: [np.nan] * 17 for col in column_names})
performance

Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,,,,,
1,,,,,
2,,,,,
3,,,,,
4,,,,,
5,,,,,
6,,,,,
7,,,,,
8,,,,,
9,,,,,


In [8]:
y_train_pred = []
y_val_pred = []

## Neural Network

In [9]:
performance.iloc[0, 0] = 'Neural Network'
performance.iloc[0, 1] = 'Scaling'

In [10]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model

In [11]:
# Load JSON file into Python dictionary
with open('search_nn/sc_param.json', 'r') as f:
    config = json.load(f)

In [12]:
nn_model = Sequential.from_config(config)
nn_model.summary()

In [13]:
nn_model.compile(optimizer="adam",
                 loss='categorical_crossentropy',  # Suitable for multi-class classification
                 metrics=['accuracy'])

In [14]:
start_time = time.time()

nn_model.fit(X_train_sc, y_train_onehot,
             validation_data=(X_val_sc, y_val_onehot),
             epochs=20)

end_time = time.time()
performance.iloc[0, 2] = end_time - start_time

Epoch 1/20
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.3086 - loss: 1.9313 - val_accuracy: 0.4675 - val_loss: 1.4540
Epoch 2/20
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.4994 - loss: 1.3809 - val_accuracy: 0.5358 - val_loss: 1.3157
Epoch 3/20
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.5657 - loss: 1.2397 - val_accuracy: 0.5592 - val_loss: 1.3178
Epoch 4/20
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.5535 - loss: 1.1999 - val_accuracy: 0.5725 - val_loss: 1.2600
Epoch 5/20
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.6077 - loss: 1.1031 - val_accuracy: 0.5658 - val_loss: 1.2766
Epoch 6/20
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.6337 - loss: 1.0520 - val_accuracy: 0.5900 - val_loss: 1.2259
Epoch 7/20
[1m150/150[0m 

In [15]:
y_train_pred.append(nn_model.predict(X_train_sc).argmax(axis=-1))
train_loss, performance.iloc[0, 3] = nn_model.evaluate(X_train_sc, y_train_onehot, verbose=0)

[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [16]:
y_val_pred.append(nn_model.predict(X_val_sc).argmax(axis=-1))
test_loss, performance.iloc[0, 4] = nn_model.evaluate(X_val_sc, y_val_onehot)

[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6377 - loss: 1.3422 


In [17]:
performance

Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,18.737374,0.900417,0.605833
1,,,,,
2,,,,,
3,,,,,
4,,,,,
5,,,,,
6,,,,,
7,,,,,
8,,,,,
9,,,,,


## XGB

In [18]:
i = 1
performance.iloc[i, 0] = 'XGBooster'
performance.iloc[i, 1] = 'Scaling'

In [19]:
from xgboost import XGBClassifier


model = XGBClassifier(
    objective='multi:softmax',   # for multiclass classification
    num_class=8,                 # specify the number of classes
)
performance.iloc[i, 2:5], y_train_pred_m, y_val_pred_m  = fit_model(model, 'search_param/xgb_param.json', 'scaler',
    X_train_sc, y_train_encoded, X_val_sc, y_val_encoded)

i += 1
performance

Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,18.737374,0.900417,0.605833
1,XGBooster,Scaling,114.592163,1.0,0.576667
2,,,,,
3,,,,,
4,,,,,
5,,,,,
6,,,,,
7,,,,,
8,,,,,
9,,,,,


In [20]:
y_train_pred.append(y_train_pred_m)
y_val_pred.append(y_val_pred_m)

## Random Forest

In [21]:
performance.iloc[i, 0] = 'Random Forest'
performance.iloc[i, 1] = 'Scaling'

In [22]:
from sklearn.ensemble import RandomForestClassifier


model = RandomForestClassifier()
performance.iloc[i, 2:5], y_train_pred_m, y_val_pred_m  = fit_model(model, 'search_param/rf_param.json', 'scaler',
    X_train_sc, y_train_encoded, X_val_sc, y_val_encoded)

i += 1
performance

Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,18.737374,0.900417,0.605833
1,XGBooster,Scaling,114.592163,1.0,0.576667
2,Random Forest,Scaling,45.279888,1.0,0.55
3,,,,,
4,,,,,
5,,,,,
6,,,,,
7,,,,,
8,,,,,
9,,,,,


In [23]:
y_train_pred.append(y_train_pred_m)
y_val_pred.append(y_val_pred_m)

## AdaBooster

In [24]:
performance.iloc[i, 0] = 'AdaBooster'
performance.iloc[i, 1] = 'Scaling'

In [25]:
from sklearn.ensemble import AdaBoostClassifier


model = AdaBoostClassifier(random_state=42)
performance.iloc[i, 2:5], y_train_pred_m, y_val_pred_m  = fit_model(model, 'search_param/abc_param.json', 'scaler',
    X_train_sc, y_train_encoded, X_val_sc, y_val_encoded)

i += 1
performance

Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,18.737374,0.900417,0.605833
1,XGBooster,Scaling,114.592163,1.0,0.576667
2,Random Forest,Scaling,45.279888,1.0,0.55
3,AdaBooster,Scaling,47.811196,0.482083,0.449167
4,,,,,
5,,,,,
6,,,,,
7,,,,,
8,,,,,
9,,,,,


In [26]:
y_train_pred.append(y_train_pred_m)
y_val_pred.append(y_val_pred_m)

## Logistic

In [27]:
performance.iloc[i, 0] = 'Logistic'
performance.iloc[i, 1] = 'Scaling'

In [28]:
from sklearn.linear_model import LogisticRegression


model = LogisticRegression()
performance.iloc[i, 2:5], y_train_pred_1, y_val_pred_1  = fit_model(model, 'search_param/logistic_param.json', 'scaler',
    X_train_sc, y_train_encoded, X_val_sc, y_val_encoded)

i += 1
performance



Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,18.737374,0.900417,0.605833
1,XGBooster,Scaling,114.592163,1.0,0.576667
2,Random Forest,Scaling,45.279888,1.0,0.55
3,AdaBooster,Scaling,47.811196,0.482083,0.449167
4,Logistic,Scaling,0.53318,0.692708,0.5675
5,,,,,
6,,,,,
7,,,,,
8,,,,,
9,,,,,


In [29]:
performance.iloc[i, 0] = 'Logistic'
performance.iloc[i, 1] = 'PCA'

In [30]:
model = LogisticRegression()
performance.iloc[i, 2:5], y_train_pred_2, y_val_pred_2  = fit_model(model, 'search_param/logistic_param.json', 'pca',
    X_train_pca, y_train_encoded, X_val_pca, y_val_encoded)

i += 1
performance



Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,18.737374,0.900417,0.605833
1,XGBooster,Scaling,114.592163,1.0,0.576667
2,Random Forest,Scaling,45.279888,1.0,0.55
3,AdaBooster,Scaling,47.811196,0.482083,0.449167
4,Logistic,Scaling,0.53318,0.692708,0.5675
5,Logistic,PCA,0.103984,0.491875,0.4675
6,,,,,
7,,,,,
8,,,,,
9,,,,,


In [31]:
performance.iloc[i, 0] = 'Logistic'
performance.iloc[i, 1] = 'LDA'

In [32]:
model = LogisticRegression()
performance.iloc[i, 2:5], y_train_pred_3, y_val_pred_3  = fit_model(model, 'search_param/logistic_param.json', 'lda',
    X_train_lda, y_train_encoded, X_val_lda, y_val_encoded)

i += 1
performance



Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,18.737374,0.900417,0.605833
1,XGBooster,Scaling,114.592163,1.0,0.576667
2,Random Forest,Scaling,45.279888,1.0,0.55
3,AdaBooster,Scaling,47.811196,0.482083,0.449167
4,Logistic,Scaling,0.53318,0.692708,0.5675
5,Logistic,PCA,0.103984,0.491875,0.4675
6,Logistic,LDA,0.041544,0.707708,0.549167
7,,,,,
8,,,,,
9,,,,,


In [33]:
y_train_pred.append(y_train_pred_1)
y_val_pred.append(y_val_pred_1)

## SVM

In [34]:
performance.iloc[i, 0] = 'SVM'
performance.iloc[i, 1] = 'Scaling'

In [35]:
from sklearn.svm import SVC


model = SVC()
performance.iloc[i, 2:5], y_train_pred_1, y_val_pred_1  = fit_model(model, 'search_param/svm_param.json', 'scaler',
    X_train_sc, y_train_encoded, X_val_sc, y_val_encoded)

i += 1
performance

Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,18.737374,0.900417,0.605833
1,XGBooster,Scaling,114.592163,1.0,0.576667
2,Random Forest,Scaling,45.279888,1.0,0.55
3,AdaBooster,Scaling,47.811196,0.482083,0.449167
4,Logistic,Scaling,0.53318,0.692708,0.5675
5,Logistic,PCA,0.103984,0.491875,0.4675
6,Logistic,LDA,0.041544,0.707708,0.549167
7,SVM,Scaling,8.515433,0.999167,0.615833
8,,,,,
9,,,,,


In [36]:
performance.iloc[i, 0] = 'SVM'
performance.iloc[i, 1] = 'PCA'

In [37]:
model = SVC()
performance.iloc[i, 2:5], y_train_pred_2, y_val_pred_2  = fit_model(model, 'search_param/svm_param.json', 'pca',
    X_train_pca, y_train_encoded, X_val_pca, y_val_encoded)

i += 1
performance

Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,18.737374,0.900417,0.605833
1,XGBooster,Scaling,114.592163,1.0,0.576667
2,Random Forest,Scaling,45.279888,1.0,0.55
3,AdaBooster,Scaling,47.811196,0.482083,0.449167
4,Logistic,Scaling,0.53318,0.692708,0.5675
5,Logistic,PCA,0.103984,0.491875,0.4675
6,Logistic,LDA,0.041544,0.707708,0.549167
7,SVM,Scaling,8.515433,0.999167,0.615833
8,SVM,PCA,1.244491,0.654375,0.534167
9,,,,,


In [38]:
performance.iloc[i, 0] = 'SVM'
performance.iloc[i, 1] = 'LDA'

In [39]:
model = SVC()
performance.iloc[i, 2:5], y_train_pred_3, y_val_pred_3  = fit_model(model, 'search_param/svm_param.json', 'lda',
    X_train_lda, y_train_encoded, X_val_lda, y_val_encoded)

i += 1
performance

Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,18.737374,0.900417,0.605833
1,XGBooster,Scaling,114.592163,1.0,0.576667
2,Random Forest,Scaling,45.279888,1.0,0.55
3,AdaBooster,Scaling,47.811196,0.482083,0.449167
4,Logistic,Scaling,0.53318,0.692708,0.5675
5,Logistic,PCA,0.103984,0.491875,0.4675
6,Logistic,LDA,0.041544,0.707708,0.549167
7,SVM,Scaling,8.515433,0.999167,0.615833
8,SVM,PCA,1.244491,0.654375,0.534167
9,SVM,LDA,0.803868,0.715208,0.561667


In [40]:
y_train_pred.append(y_train_pred_1)
y_val_pred.append(y_val_pred_1)

## KNN

In [41]:
performance.iloc[i, 0] = 'KNN'
performance.iloc[i, 1] = 'Scaling'

In [42]:
from sklearn.neighbors import KNeighborsClassifier


model = KNeighborsClassifier()
performance.iloc[i, 2:5], y_train_pred_1, y_val_pred_1  = fit_model(model, 'search_param/knn_param.json', 'scaler',
    X_train_sc, y_train_encoded, X_val_sc, y_val_encoded)

i += 1
performance

Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,18.737374,0.900417,0.605833
1,XGBooster,Scaling,114.592163,1.0,0.576667
2,Random Forest,Scaling,45.279888,1.0,0.55
3,AdaBooster,Scaling,47.811196,0.482083,0.449167
4,Logistic,Scaling,0.53318,0.692708,0.5675
5,Logistic,PCA,0.103984,0.491875,0.4675
6,Logistic,LDA,0.041544,0.707708,0.549167
7,SVM,Scaling,8.515433,0.999167,0.615833
8,SVM,PCA,1.244491,0.654375,0.534167
9,SVM,LDA,0.803868,0.715208,0.561667


In [43]:
performance.iloc[i, 0] = 'KNN'
performance.iloc[i, 1] = 'PCA'

In [44]:
model = KNeighborsClassifier()
performance.iloc[i, 2:5], y_train_pred_2, y_val_pred_2  = fit_model(model, 'search_param/knn_param.json', 'pca',
    X_train_pca, y_train_encoded, X_val_pca, y_val_encoded)

i += 1
performance

Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,18.737374,0.900417,0.605833
1,XGBooster,Scaling,114.592163,1.0,0.576667
2,Random Forest,Scaling,45.279888,1.0,0.55
3,AdaBooster,Scaling,47.811196,0.482083,0.449167
4,Logistic,Scaling,0.53318,0.692708,0.5675
5,Logistic,PCA,0.103984,0.491875,0.4675
6,Logistic,LDA,0.041544,0.707708,0.549167
7,SVM,Scaling,8.515433,0.999167,0.615833
8,SVM,PCA,1.244491,0.654375,0.534167
9,SVM,LDA,0.803868,0.715208,0.561667


In [45]:
performance.iloc[i, 0] = 'KNN'
performance.iloc[i, 1] = 'LDA'

In [46]:
model = KNeighborsClassifier()
performance.iloc[i, 2:5], y_train_pred_3, y_val_pred_3  = fit_model(model, 'search_param/knn_param.json', 'lda',
    X_train_lda, y_train_encoded, X_val_lda, y_val_encoded)

i += 1
performance

Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,18.737374,0.900417,0.605833
1,XGBooster,Scaling,114.592163,1.0,0.576667
2,Random Forest,Scaling,45.279888,1.0,0.55
3,AdaBooster,Scaling,47.811196,0.482083,0.449167
4,Logistic,Scaling,0.53318,0.692708,0.5675
5,Logistic,PCA,0.103984,0.491875,0.4675
6,Logistic,LDA,0.041544,0.707708,0.549167
7,SVM,Scaling,8.515433,0.999167,0.615833
8,SVM,PCA,1.244491,0.654375,0.534167
9,SVM,LDA,0.803868,0.715208,0.561667


In [47]:
y_train_pred.append(y_train_pred_3)
y_val_pred.append(y_val_pred_3)

## Naive Bayes

In [48]:
performance.iloc[i, 0] = 'Naive Bayes'
performance.iloc[i, 1] = 'Scaling'

In [49]:
from sklearn.neighbors import KNeighborsClassifier


model = KNeighborsClassifier()

start_time = time.time()
model.fit(X_train_sc, y_train_encoded)

end_time = time.time()
performance.iloc[i, 2] = end_time - start_time

y_train_pred_1 = model.predict(X_train_sc)
y_val_pred_1 = model.predict(X_val_sc)

performance.iloc[i, 3] = accuracy_score(y_train_encoded, y_train_pred_1)
performance.iloc[i, 4] = accuracy_score(y_val_encoded, y_val_pred_1)

i += 1
performance

Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,18.737374,0.900417,0.605833
1,XGBooster,Scaling,114.592163,1.0,0.576667
2,Random Forest,Scaling,45.279888,1.0,0.55
3,AdaBooster,Scaling,47.811196,0.482083,0.449167
4,Logistic,Scaling,0.53318,0.692708,0.5675
5,Logistic,PCA,0.103984,0.491875,0.4675
6,Logistic,LDA,0.041544,0.707708,0.549167
7,SVM,Scaling,8.515433,0.999167,0.615833
8,SVM,PCA,1.244491,0.654375,0.534167
9,SVM,LDA,0.803868,0.715208,0.561667


In [50]:
performance.iloc[i, 0] = 'Naive Bayes'
performance.iloc[i, 1] = 'PCA'

In [51]:
model = KNeighborsClassifier()

start_time = time.time()
model.fit(X_train_pca, y_train_encoded)

end_time = time.time()
performance.iloc[i, 2] = end_time - start_time

y_train_pred_2 = model.predict(X_train_pca)
y_val_pred_2 = model.predict(X_val_pca)

performance.iloc[i, 3] = accuracy_score(y_train_encoded, y_train_pred_2)
performance.iloc[i, 4] = accuracy_score(y_val_encoded, y_val_pred_2)

i += 1
performance

Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,18.737374,0.900417,0.605833
1,XGBooster,Scaling,114.592163,1.0,0.576667
2,Random Forest,Scaling,45.279888,1.0,0.55
3,AdaBooster,Scaling,47.811196,0.482083,0.449167
4,Logistic,Scaling,0.53318,0.692708,0.5675
5,Logistic,PCA,0.103984,0.491875,0.4675
6,Logistic,LDA,0.041544,0.707708,0.549167
7,SVM,Scaling,8.515433,0.999167,0.615833
8,SVM,PCA,1.244491,0.654375,0.534167
9,SVM,LDA,0.803868,0.715208,0.561667


In [52]:
performance.iloc[i, 0] = 'Naive Bayes'
performance.iloc[i, 1] = 'LDA'

In [53]:
model = KNeighborsClassifier()

start_time = time.time()
model.fit(X_train_lda, y_train_encoded)

end_time = time.time()
performance.iloc[i, 2] = end_time - start_time

y_train_pred_3 = model.predict(X_train_lda)
y_val_pred_3 = model.predict(X_val_lda)

performance.iloc[i, 3] = accuracy_score(y_train_encoded, y_train_pred_3)
performance.iloc[i, 4] = accuracy_score(y_val_encoded, y_val_pred_3)

i += 1
performance

Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,18.737374,0.900417,0.605833
1,XGBooster,Scaling,114.592163,1.0,0.576667
2,Random Forest,Scaling,45.279888,1.0,0.55
3,AdaBooster,Scaling,47.811196,0.482083,0.449167
4,Logistic,Scaling,0.53318,0.692708,0.5675
5,Logistic,PCA,0.103984,0.491875,0.4675
6,Logistic,LDA,0.041544,0.707708,0.549167
7,SVM,Scaling,8.515433,0.999167,0.615833
8,SVM,PCA,1.244491,0.654375,0.534167
9,SVM,LDA,0.803868,0.715208,0.561667


In [54]:
y_train_pred.append(y_train_pred_3)
y_val_pred.append(y_val_pred_3)

## Voting

In [55]:
from scipy.stats import mode


y_train_vote, _ = mode(np.array(y_train_pred), axis=0)
y_val_vote, _ = mode(np.array(y_val_pred), axis=0)

In [56]:
accuracy_score(y_train_encoded, y_train_vote)

0.92875

In [57]:
accuracy_score(y_val_encoded, y_val_vote)

0.6083333333333333