# Combine all models
MSc in Statistical Science\
University of Oxford\
Group-assessed practical\
HT 2024

In [1]:
# Import libraries
import json
import shutil
import time

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.utils import to_categorical

from fit_models import fit_model
from search_param.grid_search import read_data, decode_dict

## Data processing

In [2]:
# Load the train/validation features and labels through the project helper
# `read_data` (imported from search_param.grid_search).
X_train, X_val, y_train, y_val = read_data()

In [3]:
# Integer-encode the class labels. The encoder is fitted on the training
# labels only; the validation labels reuse that mapping.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)

# One-hot targets for the Keras network. The width is pinned to the number of
# classes known to the encoder: by default `to_categorical` infers it from the
# largest label present, so a split that happens to lack the highest class
# would otherwise produce a narrower matrix than the training targets.
n_classes = len(label_encoder.classes_)
y_train_onehot = to_categorical(y_train_encoded, num_classes=n_classes)
y_val_onehot = to_categorical(y_val_encoded, num_classes=n_classes)

In [4]:
# Standardise the features. The scaler is fitted on the training split only
# and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_sc, X_val_sc = scaler.transform(X_train), scaler.transform(X_val)

In [5]:
# LDA projection, fitted on the *unscaled* training features and the original
# (un-encoded) training labels, then applied to both splits.
lda = LinearDiscriminantAnalysis(n_components=None).fit(X_train, y_train)
X_train_lda = lda.transform(X_train)
X_val_lda = lda.transform(X_val)

In [6]:
# PCA on the standardised features: fitted on the training split, then the
# same projection is applied to the validation split.
p_PCA = 25 # number of components kept; NOTE(review): "from notebook pictures" presumably refers to plots in another notebook - cite it
pca = PCA(n_components=p_PCA, random_state=42)  # keep the top p_PCA components; seed fixed for reproducibility
X_train_pca = pca.fit_transform(X_train_sc)
X_val_pca = pca.transform(X_val_sc)

## Create dataframe

In [7]:
# Results table: one row per (model, pre-processing) combination.
column_names = ['Model', 'Pre-processing', 'Training time (sec)', 'Training acc', 'Testing acc']

# Number of pre-allocated rows. The sections below fill 16 of them;
# NOTE(review): the remaining row is presumably for the Voting section - confirm.
N_RESULT_ROWS = 17

# All cells start as NaN. The two text columns are cast to object dtype so the
# later `performance.iloc[row, 0] = '<label>'` writes do not trigger pandas'
# incompatible-dtype upcasting deprecation (an error in newer versions);
# the three metric columns stay float64.
performance = pd.DataFrame(
    np.nan, index=range(N_RESULT_ROWS), columns=column_names
).astype({'Model': object, 'Pre-processing': object})
performance

Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,,,,,
1,,,,,
2,,,,,
3,,,,,
4,,,,,
5,,,,,
6,,,,,
7,,,,,
8,,,,,
9,,,,,


In [8]:
# Accumulators for per-model predictions: each section below appends one array
# of training predictions and one array of validation predictions.
y_train_pred, y_val_pred = [], []

## Neural Network

In [9]:
# Row 0 of the results table: the neural network, trained on standardised features.
# Column 0 is 'Model', column 1 is 'Pre-processing'.
performance.iloc[0, 0] = 'Neural Network'
performance.iloc[0, 1] = 'Scaling'

In [10]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model

In [11]:
# Read the saved network configuration (JSON) into a Python dict.
with open('search_nn/sc_param.json') as config_file:
    config = json.load(config_file)

In [12]:
# Rebuild the Sequential model from the loaded config dict and print its layer summary.
nn_model = Sequential.from_config(config)
nn_model.summary()

In [13]:
# Categorical cross-entropy matches the one-hot multi-class targets; accuracy is
# the only tracked metric, so `evaluate` later returns (loss, accuracy).
nn_model.compile(
    loss='categorical_crossentropy',
    optimizer="adam",
    metrics=['accuracy'],
)

In [14]:
# Train the network for 20 epochs and record the wall-clock training time in
# column 2 ('Training time (sec)') of row 0.
# Note: validation data is passed to `fit`, so the timed span also covers the
# per-epoch validation pass reported in the training log.
# NOTE(review): no random seed is set in this cell, so repeated runs will
# presumably give slightly different accuracies - confirm whether one is set elsewhere.
start_time = time.time()

nn_model.fit(X_train_sc, y_train_onehot,
             validation_data=(X_val_sc, y_val_onehot),
             epochs=20)

end_time = time.time()
performance.iloc[0, 2] = end_time - start_time

Epoch 1/20
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.2315 - loss: 2.0943 - val_accuracy: 0.4825 - val_loss: 1.4987
Epoch 2/20
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.4731 - loss: 1.4480 - val_accuracy: 0.5408 - val_loss: 1.3425
Epoch 3/20
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.5724 - loss: 1.2141 - val_accuracy: 0.5375 - val_loss: 1.3093
Epoch 4/20
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.5945 - loss: 1.1606 - val_accuracy: 0.5633 - val_loss: 1.2681
Epoch 5/20
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.6448 - loss: 1.0297 - val_accuracy: 0.5650 - val_loss: 1.2681
Epoch 6/20
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.6434 - loss: 1.0010 - val_accuracy: 0.5708 - val_loss: 1.2447
Epoch 7/20
[1m150/150[0m 

In [15]:
# Store the network's training-set class predictions (argmax over the last axis
# of the predicted scores) as the first entry of y_train_pred.
y_train_pred.append(nn_model.predict(X_train_sc).argmax(axis=-1))
# evaluate returns (loss, accuracy); the accuracy goes to column 3 ('Training acc') of row 0.
train_loss, performance.iloc[0, 3] = nn_model.evaluate(X_train_sc, y_train_onehot, verbose=0)

[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


In [16]:
# Store the network's validation-set class predictions (argmax over the last axis
# of the predicted scores) as the first entry of y_val_pred.
y_val_pred.append(nn_model.predict(X_val_sc).argmax(axis=-1))
# evaluate returns (loss, accuracy); the accuracy goes to column 4 ('Testing acc') of row 0.
# Unlike the training-set evaluation, verbose is left at its default here, so a progress bar is printed.
test_loss, performance.iloc[0, 4] = nn_model.evaluate(X_val_sc, y_val_onehot)

[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6064 - loss: 1.5387 


In [17]:
# Display the results table after filling in the neural-network row.
performance

Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,17.566401,0.9025,0.580833
1,,,,,
2,,,,,
3,,,,,
4,,,,,
5,,,,,
6,,,,,
7,,,,,
8,,,,,
9,,,,,


## XGBoost

In [18]:
# `i` is the row cursor into the results table; row 0 holds the neural network,
# so the remaining models start at row 1. Each fitting cell below increments it.
i = 1
performance.iloc[i, 0] = 'XGBooster'
performance.iloc[i, 1] = 'Scaling'

In [19]:
from xgboost import XGBClassifier

# Multi-class XGBoost classifier with a softmax objective over 8 classes.
model = XGBClassifier(objective='multi:softmax', num_class=8)

# fit_model (project helper) returns three values: the entries for columns 2:5
# of the current row (training time, training acc, testing acc), followed by
# the training and validation predictions.
# Presumably the JSON file holds the tuned hyper-parameters - confirm in fit_models.
row_metrics, y_train_pred_m, y_val_pred_m = fit_model(
    model,
    'search_param/xgb_param.json',
    'scaler',
    X_train_sc, y_train_encoded,
    X_val_sc, y_val_encoded,
)
performance.iloc[i, 2:5] = row_metrics

# Advance the row cursor and show the table so far.
i += 1
performance

Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,17.566401,0.9025,0.580833
1,XGBooster,Scaling,115.175411,1.0,0.576667
2,,,,,
3,,,,,
4,,,,,
5,,,,,
6,,,,,
7,,,,,
8,,,,,
9,,,,,


In [20]:
# Keep the XGBoost predictions (left in y_*_pred_m by the fitting cell above).
# Presumably consumed by the Voting section later in the notebook.
y_train_pred.append(y_train_pred_m)
y_val_pred.append(y_val_pred_m)

## Random Forest

In [21]:
# Label the current row (cursor `i`): random forest on standardised features.
performance.iloc[i, 0] = 'Random Forest'
performance.iloc[i, 1] = 'Scaling'

In [22]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default constructor arguments (no random_state is set here).
model = RandomForestClassifier()

# fit_model (project helper) returns the entries for columns 2:5 of the current
# row (training time, training acc, testing acc) plus the training and
# validation predictions.
row_metrics, y_train_pred_m, y_val_pred_m = fit_model(
    model,
    'search_param/rf_param.json',
    'scaler',
    X_train_sc, y_train_encoded,
    X_val_sc, y_val_encoded,
)
performance.iloc[i, 2:5] = row_metrics

# Advance the row cursor and show the table so far.
i += 1
performance

Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,17.566401,0.9025,0.580833
1,XGBooster,Scaling,115.175411,1.0,0.576667
2,Random Forest,Scaling,48.204262,1.0,0.561667
3,,,,,
4,,,,,
5,,,,,
6,,,,,
7,,,,,
8,,,,,
9,,,,,


In [23]:
# Keep the random-forest predictions (y_*_pred_m was overwritten by the fitting cell above).
y_train_pred.append(y_train_pred_m)
y_val_pred.append(y_val_pred_m)

## AdaBoost

In [24]:
# Label the current row (cursor `i`): AdaBoost on standardised features.
performance.iloc[i, 0] = 'AdaBooster'
performance.iloc[i, 1] = 'Scaling'

In [25]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost classifier with a fixed seed.
model = AdaBoostClassifier(random_state=42)

# fit_model (project helper) returns the entries for columns 2:5 of the current
# row (training time, training acc, testing acc) plus the training and
# validation predictions.
row_metrics, y_train_pred_m, y_val_pred_m = fit_model(
    model,
    'search_param/abc_param.json',
    'scaler',
    X_train_sc, y_train_encoded,
    X_val_sc, y_val_encoded,
)
performance.iloc[i, 2:5] = row_metrics

# Advance the row cursor and show the table so far.
i += 1
performance

Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,17.566401,0.9025,0.580833
1,XGBooster,Scaling,115.175411,1.0,0.576667
2,Random Forest,Scaling,48.204262,1.0,0.561667
3,AdaBooster,Scaling,52.048564,0.482083,0.449167
4,,,,,
5,,,,,
6,,,,,
7,,,,,
8,,,,,
9,,,,,


In [26]:
# Keep the AdaBoost predictions (y_*_pred_m was overwritten by the fitting cell above).
y_train_pred.append(y_train_pred_m)
y_val_pred.append(y_val_pred_m)

## Logistic Regression

In [27]:
# Label the current row (cursor `i`): logistic regression on standardised features.
performance.iloc[i, 0] = 'Logistic'
performance.iloc[i, 1] = 'Scaling'

In [28]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the standardised features.
model = LogisticRegression()

# fit_model (project helper) returns the entries for columns 2:5 of the current
# row (training time, training acc, testing acc) plus the training and
# validation predictions. The `_1` suffix marks the scaled-feature variant.
row_metrics, y_train_pred_1, y_val_pred_1 = fit_model(
    model,
    'search_param/logistic_param.json',
    'scaler',
    X_train_sc, y_train_encoded,
    X_val_sc, y_val_encoded,
)
performance.iloc[i, 2:5] = row_metrics

# Advance the row cursor and show the table so far.
i += 1
performance



Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,17.566401,0.9025,0.580833
1,XGBooster,Scaling,115.175411,1.0,0.576667
2,Random Forest,Scaling,48.204262,1.0,0.561667
3,AdaBooster,Scaling,52.048564,0.482083,0.449167
4,Logistic,Scaling,0.56759,0.692708,0.5675
5,,,,,
6,,,,,
7,,,,,
8,,,,,
9,,,,,


In [29]:
# Label the current row (cursor `i`): logistic regression on the PCA components.
performance.iloc[i, 0] = 'Logistic'
performance.iloc[i, 1] = 'PCA'

In [30]:
# Logistic regression on the PCA components. The `_2` suffix marks the PCA variant.
# NOTE(review): the recorded output shows exactly the same accuracies for this
# row as for the 'Scaling' row - verify that fit_model really fits on the
# arrays passed in for this configuration.
model = LogisticRegression()

row_metrics, y_train_pred_2, y_val_pred_2 = fit_model(
    model,
    'search_param/logistic_param.json',
    'pca',
    X_train_pca, y_train_encoded,
    X_val_pca, y_val_encoded,
)
# First return value fills columns 2:5 (training time, training acc, testing acc).
performance.iloc[i, 2:5] = row_metrics

# Advance the row cursor and show the table so far.
i += 1
performance



Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,17.566401,0.9025,0.580833
1,XGBooster,Scaling,115.175411,1.0,0.576667
2,Random Forest,Scaling,48.204262,1.0,0.561667
3,AdaBooster,Scaling,52.048564,0.482083,0.449167
4,Logistic,Scaling,0.56759,0.692708,0.5675
5,Logistic,PCA,0.628999,0.692708,0.5675
6,,,,,
7,,,,,
8,,,,,
9,,,,,


In [31]:
# Label the current row (cursor `i`): logistic regression on the LDA projection.
performance.iloc[i, 0] = 'Logistic'
performance.iloc[i, 1] = 'LDA'

In [32]:
# Logistic regression on the LDA projection. The `_3` suffix marks the LDA variant.
# The recorded run emitted an iteration-limit (max_iter) convergence warning for this fit.
model = LogisticRegression()

row_metrics, y_train_pred_3, y_val_pred_3 = fit_model(
    model,
    'search_param/logistic_param.json',
    'lda',
    X_train_lda, y_train_encoded,
    X_val_lda, y_val_encoded,
)
# First return value fills columns 2:5 (training time, training acc, testing acc).
performance.iloc[i, 2:5] = row_metrics

# Advance the row cursor and show the table so far.
i += 1
performance

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,17.566401,0.9025,0.580833
1,XGBooster,Scaling,115.175411,1.0,0.576667
2,Random Forest,Scaling,48.204262,1.0,0.561667
3,AdaBooster,Scaling,52.048564,0.482083,0.449167
4,Logistic,Scaling,0.56759,0.692708,0.5675
5,Logistic,PCA,0.628999,0.692708,0.5675
6,Logistic,LDA,0.607732,0.774167,0.523333
7,,,,,
8,,,,,
9,,,,,


In [33]:
# Of the three logistic-regression variants, keep the predictions from the
# scaled-feature fit (`_1`). The `_1`/`_2`/`_3` names are reused by later
# sections, so this cell must run before the SVM cells overwrite them.
y_train_pred.append(y_train_pred_1)
y_val_pred.append(y_val_pred_1)

## SVM

In [34]:
# Label the current row (cursor `i`): SVM on standardised features.
performance.iloc[i, 0] = 'SVM'
performance.iloc[i, 1] = 'Scaling'

In [35]:
from sklearn.svm import SVC

# Support-vector classifier on the standardised features.
model = SVC()

# fit_model (project helper) returns the entries for columns 2:5 of the current
# row (training time, training acc, testing acc) plus the training and
# validation predictions. The `_1` suffix marks the scaled-feature variant.
row_metrics, y_train_pred_1, y_val_pred_1 = fit_model(
    model,
    'search_param/svm_param.json',
    'scaler',
    X_train_sc, y_train_encoded,
    X_val_sc, y_val_encoded,
)
performance.iloc[i, 2:5] = row_metrics

# Advance the row cursor and show the table so far.
i += 1
performance

Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,17.566401,0.9025,0.580833
1,XGBooster,Scaling,115.175411,1.0,0.576667
2,Random Forest,Scaling,48.204262,1.0,0.561667
3,AdaBooster,Scaling,52.048564,0.482083,0.449167
4,Logistic,Scaling,0.56759,0.692708,0.5675
5,Logistic,PCA,0.628999,0.692708,0.5675
6,Logistic,LDA,0.607732,0.774167,0.523333
7,SVM,Scaling,9.29433,0.999167,0.615833
8,,,,,
9,,,,,


In [36]:
# Label the current row (cursor `i`): SVM on the PCA components.
performance.iloc[i, 0] = 'SVM'
performance.iloc[i, 1] = 'PCA'

In [37]:
# Support-vector classifier on the PCA components. The `_2` suffix marks the PCA variant.
model = SVC()

row_metrics, y_train_pred_2, y_val_pred_2 = fit_model(
    model,
    'search_param/svm_param.json',
    'pca',
    X_train_pca, y_train_encoded,
    X_val_pca, y_val_encoded,
)
# First return value fills columns 2:5 (training time, training acc, testing acc).
performance.iloc[i, 2:5] = row_metrics

# Advance the row cursor and show the table so far.
i += 1
performance

Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,17.566401,0.9025,0.580833
1,XGBooster,Scaling,115.175411,1.0,0.576667
2,Random Forest,Scaling,48.204262,1.0,0.561667
3,AdaBooster,Scaling,52.048564,0.482083,0.449167
4,Logistic,Scaling,0.56759,0.692708,0.5675
5,Logistic,PCA,0.628999,0.692708,0.5675
6,Logistic,LDA,0.607732,0.774167,0.523333
7,SVM,Scaling,9.29433,0.999167,0.615833
8,SVM,PCA,8.810364,0.822292,0.5975
9,,,,,


In [38]:
# Label the current row (cursor `i`): SVM on the LDA projection.
performance.iloc[i, 0] = 'SVM'
performance.iloc[i, 1] = 'LDA'

In [39]:
# Support-vector classifier on the LDA projection. The `_3` suffix marks the LDA variant.
model = SVC()

row_metrics, y_train_pred_3, y_val_pred_3 = fit_model(
    model,
    'search_param/svm_param.json',
    'lda',
    X_train_lda, y_train_encoded,
    X_val_lda, y_val_encoded,
)
# First return value fills columns 2:5 (training time, training acc, testing acc).
performance.iloc[i, 2:5] = row_metrics

# Advance the row cursor and show the table so far.
i += 1
performance

Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,17.566401,0.9025,0.580833
1,XGBooster,Scaling,115.175411,1.0,0.576667
2,Random Forest,Scaling,48.204262,1.0,0.561667
3,AdaBooster,Scaling,52.048564,0.482083,0.449167
4,Logistic,Scaling,0.56759,0.692708,0.5675
5,Logistic,PCA,0.628999,0.692708,0.5675
6,Logistic,LDA,0.607732,0.774167,0.523333
7,SVM,Scaling,9.29433,0.999167,0.615833
8,SVM,PCA,8.810364,0.822292,0.5975
9,SVM,LDA,8.715013,0.536458,0.478333


In [40]:
# Of the three SVM variants, keep the predictions from the scaled-feature fit
# (`_1`). This must run before the KNN cells overwrite the `_1` names.
y_train_pred.append(y_train_pred_1)
y_val_pred.append(y_val_pred_1)

## KNN

In [41]:
# Label the current row (cursor `i`): k-nearest neighbours on standardised features.
performance.iloc[i, 0] = 'KNN'
performance.iloc[i, 1] = 'Scaling'

In [42]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier on the standardised features.
model = KNeighborsClassifier()

# fit_model (project helper) returns the entries for columns 2:5 of the current
# row (training time, training acc, testing acc) plus the training and
# validation predictions. The `_1` suffix marks the scaled-feature variant.
row_metrics, y_train_pred_1, y_val_pred_1 = fit_model(
    model,
    'search_param/knn_param.json',
    'scaler',
    X_train_sc, y_train_encoded,
    X_val_sc, y_val_encoded,
)
performance.iloc[i, 2:5] = row_metrics

# Advance the row cursor and show the table so far.
i += 1
performance

Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,17.566401,0.9025,0.580833
1,XGBooster,Scaling,115.175411,1.0,0.576667
2,Random Forest,Scaling,48.204262,1.0,0.561667
3,AdaBooster,Scaling,52.048564,0.482083,0.449167
4,Logistic,Scaling,0.56759,0.692708,0.5675
5,Logistic,PCA,0.628999,0.692708,0.5675
6,Logistic,LDA,0.607732,0.774167,0.523333
7,SVM,Scaling,9.29433,0.999167,0.615833
8,SVM,PCA,8.810364,0.822292,0.5975
9,SVM,LDA,8.715013,0.536458,0.478333


In [43]:
# Label the current row (cursor `i`): k-nearest neighbours on the PCA components.
performance.iloc[i, 0] = 'KNN'
performance.iloc[i, 1] = 'PCA'

In [44]:
# k-nearest-neighbours classifier on the PCA components. The `_2` suffix marks the PCA variant.
model = KNeighborsClassifier()

row_metrics, y_train_pred_2, y_val_pred_2 = fit_model(
    model,
    'search_param/knn_param.json',
    'pca',
    X_train_pca, y_train_encoded,
    X_val_pca, y_val_encoded,
)
# First return value fills columns 2:5 (training time, training acc, testing acc).
performance.iloc[i, 2:5] = row_metrics

# Advance the row cursor and show the table so far.
i += 1
performance

Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,17.566401,0.9025,0.580833
1,XGBooster,Scaling,115.175411,1.0,0.576667
2,Random Forest,Scaling,48.204262,1.0,0.561667
3,AdaBooster,Scaling,52.048564,0.482083,0.449167
4,Logistic,Scaling,0.56759,0.692708,0.5675
5,Logistic,PCA,0.628999,0.692708,0.5675
6,Logistic,LDA,0.607732,0.774167,0.523333
7,SVM,Scaling,9.29433,0.999167,0.615833
8,SVM,PCA,8.810364,0.822292,0.5975
9,SVM,LDA,8.715013,0.536458,0.478333


In [45]:
# Label the current row (cursor `i`): k-nearest neighbours on the LDA projection.
performance.iloc[i, 0] = 'KNN'
performance.iloc[i, 1] = 'LDA'

In [46]:
# k-nearest-neighbours classifier on the LDA projection. The `_3` suffix marks the LDA variant.
model = KNeighborsClassifier()

row_metrics, y_train_pred_3, y_val_pred_3 = fit_model(
    model,
    'search_param/knn_param.json',
    'lda',
    X_train_lda, y_train_encoded,
    X_val_lda, y_val_encoded,
)
# First return value fills columns 2:5 (training time, training acc, testing acc).
performance.iloc[i, 2:5] = row_metrics

# Advance the row cursor and show the table so far.
i += 1
performance

Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,17.566401,0.9025,0.580833
1,XGBooster,Scaling,115.175411,1.0,0.576667
2,Random Forest,Scaling,48.204262,1.0,0.561667
3,AdaBooster,Scaling,52.048564,0.482083,0.449167
4,Logistic,Scaling,0.56759,0.692708,0.5675
5,Logistic,PCA,0.628999,0.692708,0.5675
6,Logistic,LDA,0.607732,0.774167,0.523333
7,SVM,Scaling,9.29433,0.999167,0.615833
8,SVM,PCA,8.810364,0.822292,0.5975
9,SVM,LDA,8.715013,0.536458,0.478333


In [47]:
# Of the three KNN variants, keep the predictions from the scaled-feature fit
# (`_1`). This must run before the Naive Bayes cells overwrite the `_1` names.
y_train_pred.append(y_train_pred_1)
y_val_pred.append(y_val_pred_1)

## Naive Bayes

In [48]:
# Label the current row (cursor `i`): Naive Bayes on standardised features.
performance.iloc[i, 0] = 'Naive Bayes'
performance.iloc[i, 1] = 'Scaling'

In [49]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes on the standardised features.
# This row is labelled 'Naive Bayes', but the cell previously trained a
# KNeighborsClassifier (copied from the KNN section), so the table reported
# default-KNN numbers under the wrong name.
model = GaussianNB()

# Time only the fit call; the result goes to column 2 ('Training time (sec)').
start_time = time.time()
model.fit(X_train_sc, y_train_encoded)
end_time = time.time()
performance.iloc[i, 2] = end_time - start_time

# Predictions on both splits; the `_1` suffix marks the scaled-feature variant.
y_train_pred_1 = model.predict(X_train_sc)
y_val_pred_1 = model.predict(X_val_sc)

# Columns 3 and 4 are 'Training acc' and 'Testing acc'.
performance.iloc[i, 3] = accuracy_score(y_train_encoded, y_train_pred_1)
performance.iloc[i, 4] = accuracy_score(y_val_encoded, y_val_pred_1)

# Advance the row cursor and show the table so far.
i += 1
performance

Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,17.566401,0.9025,0.580833
1,XGBooster,Scaling,115.175411,1.0,0.576667
2,Random Forest,Scaling,48.204262,1.0,0.561667
3,AdaBooster,Scaling,52.048564,0.482083,0.449167
4,Logistic,Scaling,0.56759,0.692708,0.5675
5,Logistic,PCA,0.628999,0.692708,0.5675
6,Logistic,LDA,0.607732,0.774167,0.523333
7,SVM,Scaling,9.29433,0.999167,0.615833
8,SVM,PCA,8.810364,0.822292,0.5975
9,SVM,LDA,8.715013,0.536458,0.478333


In [50]:
# Label the current row (cursor `i`): Naive Bayes on the PCA components.
performance.iloc[i, 0] = 'Naive Bayes'
performance.iloc[i, 1] = 'PCA'

In [51]:
# Gaussian Naive Bayes on the PCA components.
# This row is labelled 'Naive Bayes', but the cell previously trained a
# KNeighborsClassifier (copied from the KNN section).
# GaussianNB is imported in the notebook's top import cell.
model = GaussianNB()

# Time only the fit call; the result goes to column 2 ('Training time (sec)').
start_time = time.time()
model.fit(X_train_pca, y_train_encoded)
end_time = time.time()
performance.iloc[i, 2] = end_time - start_time

# Predictions on both splits; the `_2` suffix marks the PCA variant.
y_train_pred_2 = model.predict(X_train_pca)
y_val_pred_2 = model.predict(X_val_pca)

# Columns 3 and 4 are 'Training acc' and 'Testing acc'.
performance.iloc[i, 3] = accuracy_score(y_train_encoded, y_train_pred_2)
performance.iloc[i, 4] = accuracy_score(y_val_encoded, y_val_pred_2)

# Advance the row cursor and show the table so far.
i += 1
performance

Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,17.566401,0.9025,0.580833
1,XGBooster,Scaling,115.175411,1.0,0.576667
2,Random Forest,Scaling,48.204262,1.0,0.561667
3,AdaBooster,Scaling,52.048564,0.482083,0.449167
4,Logistic,Scaling,0.56759,0.692708,0.5675
5,Logistic,PCA,0.628999,0.692708,0.5675
6,Logistic,LDA,0.607732,0.774167,0.523333
7,SVM,Scaling,9.29433,0.999167,0.615833
8,SVM,PCA,8.810364,0.822292,0.5975
9,SVM,LDA,8.715013,0.536458,0.478333


In [52]:
# Label the current row (cursor `i`): Naive Bayes on the LDA projection.
performance.iloc[i, 0] = 'Naive Bayes'
performance.iloc[i, 1] = 'LDA'

In [53]:
# Gaussian Naive Bayes on the LDA projection.
# This row is labelled 'Naive Bayes', but the cell previously trained a
# KNeighborsClassifier (copied from the KNN section).
# GaussianNB is imported in the notebook's top import cell.
model = GaussianNB()

# Time only the fit call; the result goes to column 2 ('Training time (sec)').
start_time = time.time()
model.fit(X_train_lda, y_train_encoded)
end_time = time.time()
performance.iloc[i, 2] = end_time - start_time

# Predictions on both splits; the `_3` suffix marks the LDA variant.
y_train_pred_3 = model.predict(X_train_lda)
y_val_pred_3 = model.predict(X_val_lda)

# Columns 3 and 4 are 'Training acc' and 'Testing acc'.
performance.iloc[i, 3] = accuracy_score(y_train_encoded, y_train_pred_3)
performance.iloc[i, 4] = accuracy_score(y_val_encoded, y_val_pred_3)

# Advance the row cursor and show the table so far.
i += 1
performance

Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,17.566401,0.9025,0.580833
1,XGBooster,Scaling,115.175411,1.0,0.576667
2,Random Forest,Scaling,48.204262,1.0,0.561667
3,AdaBooster,Scaling,52.048564,0.482083,0.449167
4,Logistic,Scaling,0.56759,0.692708,0.5675
5,Logistic,PCA,0.628999,0.692708,0.5675
6,Logistic,LDA,0.607732,0.774167,0.523333
7,SVM,Scaling,9.29433,0.999167,0.615833
8,SVM,PCA,8.810364,0.822292,0.5975
9,SVM,LDA,8.715013,0.536458,0.478333


In [54]:
# Keep the Naive Bayes predictions from the LDA variant (`_3`).
# NOTE(review): the Logistic, SVM and KNN sections append their scaled-feature
# (`_1`) predictions instead - confirm that choosing `_3` here is intentional.
y_train_pred.append(y_train_pred_3)
y_val_pred.append(y_val_pred_3)

## Voting