# Combine all models
MSc in Statistical Science\
University of Oxford\
Group-assessed practical\
HT 2024

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from search_param.grid_search import read_data, decode_dict
from fit_models import fit_model
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score
import shutil
import json
import time

## Data processing

In [2]:
X_train, X_val, y_train, y_val = read_data()

In [3]:
# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit label encoder and return encoded labels
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)

# Convert labels to one-hot encoding
y_train_onehot = to_categorical(y_train_encoded)
y_val_onehot = to_categorical(y_val_encoded)

In [4]:
# Standard scaling
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)
X_val_sc = scaler.transform(X_val)

In [5]:
# LDA
lda = LinearDiscriminantAnalysis(n_components=None)
X_train_lda = lda.fit_transform(X_train, y_train)
X_val_lda = lda.transform(X_val)

In [6]:
# PCA
p_PCA = 25 # from notebook pictures
pca = PCA(n_components=p_PCA, random_state=42)  # Select top 25 components
X_train_pca = pca.fit_transform(X_train_sc)
X_val_pca = pca.transform(X_val_sc)

## Create dataframe

In [7]:
column_names = ['Model', 'Pre-processing', 'Training time (sec)', 'Training acc', 'Testing acc']

performance = pd.DataFrame({col: [np.nan] * 17 for col in column_names})
performance

Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,,,,,
1,,,,,
2,,,,,
3,,,,,
4,,,,,
5,,,,,
6,,,,,
7,,,,,
8,,,,,
9,,,,,


In [23]:
y_train_pred = []
y_val_pred = []

## Neural Network

In [8]:
performance.iloc[0, 0] = 'Neural Network'
performance.iloc[0, 1] = 'Scaling'

In [9]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model

In [10]:
# Load JSON file into Python dictionary
with open('search_nn/sc_param.json', 'r') as f:
    config = json.load(f)

In [11]:
nn_model = Sequential.from_config(config)
nn_model.summary()

In [12]:
nn_model.compile(optimizer="adam",
                 loss='categorical_crossentropy',  # Suitable for multi-class classification
                 metrics=['accuracy'])

In [13]:
start_time = time.time()

nn_model.fit(X_train_sc, y_train_onehot,
             validation_data=(X_val_sc, y_val_onehot),
             epochs=20)

end_time = time.time()
performance.iloc[0, 2] = end_time - start_time

Epoch 1/20
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.3309 - loss: 1.8774 - val_accuracy: 0.5092 - val_loss: 1.4400
Epoch 2/20
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.5009 - loss: 1.4016 - val_accuracy: 0.5408 - val_loss: 1.3491
Epoch 3/20
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.5593 - loss: 1.2493 - val_accuracy: 0.5600 - val_loss: 1.2915
Epoch 4/20
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.6065 - loss: 1.1124 - val_accuracy: 0.5742 - val_loss: 1.2681
Epoch 5/20
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.6295 - loss: 1.0629 - val_accuracy: 0.5742 - val_loss: 1.2962
Epoch 6/20
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.6703 - loss: 0.9550 - val_accuracy: 0.5792 - val_loss: 1.2599
Epoch 7/20
[1m150/150[0m 

In [21]:
y_train_pred.append(nn_model.predict(X_train_sc).argmax(axis=-1))
train_loss, performance.iloc[0, 3] = nn_model.evaluate(X_train_sc, y_train_onehot, verbose=0)

[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


In [24]:
y_val_pred.append(nn_model.predict(X_val_sc).argmax(axis=-1))
test_loss, performance.iloc[0, 4] = nn_model.evaluate(X_val_sc, y_val_onehot)

[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6137 - loss: 1.5140


In [26]:
performance

Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,17.514085,0.96875,0.594167
1,,,,,
2,,,,,
3,,,,,
4,,,,,
5,,,,,
6,,,,,
7,,,,,
8,,,,,
9,,,,,


## XGB

In [17]:
performance.iloc[1, 0] = 'XGBooster'
performance.iloc[1, 1] = 'Scaling'

In [18]:
from xgboost import XGBClassifier

In [19]:
# Load JSON file into Python dictionary
with open('search_param/xgb_param.json', 'r') as f:
    param = json.load(f)

param['scaler'].popitem()
param = decode_dict(param['scaler'])

In [22]:
xgb_model = XGBClassifier(
    objective='multi:softmax',   # for multiclass classification
    num_class=8,                 # specify the number of classes
)
xgb_model.set_params(**param)

In [23]:
start_time = time.time()

xgb_model.fit(X_train_sc, y_train_encoded)

end_time = time.time()
performance.iloc[1, 2] = end_time - start_time

In [26]:
# Get train accuracy
performance.iloc[1, 3] = accuracy_score(y_train_encoded, xgb_model.predict(X_train_sc))

# Get test accuracy
performance.iloc[1, 4] = accuracy_score(y_val_encoded, xgb_model.predict(X_val_sc))

performance

Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,17.343971,0.974583,0.596667
1,XGBooster,Scaling,122.958627,1.0,0.576667
2,,,,,
3,,,,,
4,,,,,
5,,,,,
6,,,,,
7,,,,,
8,,,,,
9,,,,,


## Random Forest

In [27]:
performance.iloc[2, 0] = 'Random Forest'
performance.iloc[2, 1] = 'Scaling'

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
# Load JSON file into Python dictionary
with open('search_param/rf_param.json', 'r') as f:
    param = json.load(f)

param['scaler'].popitem()
param = decode_dict(param['scaler'])

In [31]:
rf_model = RandomForestClassifier()
rf_model.set_params(**param)

In [32]:
start_time = time.time()

rf_model.fit(X_train_sc, y_train)

end_time = time.time()
performance.iloc[1, 2] = end_time - start_time

In [33]:
# Get train accuracy
performance.iloc[1, 3] = accuracy_score(y_train, rf_model.predict(X_train_sc))

# Get test accuracy
performance.iloc[1, 4] = accuracy_score(y_val, rf_model.predict(X_val_sc))

performance

Unnamed: 0,Model,Pre-processing,Training time (sec),Training acc,Testing acc
0,Neural Network,Scaling,17.343971,0.974583,0.596667
1,XGBooster,Scaling,45.392582,1.0,0.55
2,Random Forest,Scaling,,,
3,,,,,
4,,,,,
5,,,,,
6,,,,,
7,,,,,
8,,,,,
9,,,,,
