# XGBoost classifier without dimensionality-reduction preprocessing (no LDA or PCA)
MSc in Statistical Science\
University of Oxford\
Group-assessed practical\
HT 2024


In [35]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, zero_one_loss, make_scorer, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier

## Load dataset

In [36]:
# Read the training inputs, the training labels and the test inputs from the
# working directory. Both feature files are parsed with the first column as the
# index and a three-row column header, so their read options are shared.
feature_csv_options = dict(index_col=0, header=[0, 1, 2])

X_train = pd.read_csv('X_train.csv', **feature_csv_options)  # training inputs
# The label file is squeezed to a Series and converted to a NumPy array.
y_train = pd.read_csv('y_train.csv', index_col=0).squeeze('columns').to_numpy()  # training labels
X_test = pd.read_csv('X_test.csv', **feature_csv_options)  # test inputs

In [3]:
# X_train is the DataFrame of training inputs (reported as 6,000 x 518 by the authors).
# Entry (i, j) is the j-th feature of observation i. The columns carry a three-level
# header (feature, statistics, number) because the file was read with header=[0, 1, 2].
X_train


feature,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,...,tonnetz,tonnetz,tonnetz,zcr,zcr,zcr,zcr,zcr,zcr,zcr
statistics,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,...,std,std,std,kurtosis,max,mean,median,min,skew,std
number,01,02,03,04,05,06,07,08,09,10,...,04,05,06,01,01,01,01,01,01,01
Id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
0,-0.266585,-0.984668,-0.729823,-0.895122,2.138628,0.935209,0.104089,-0.698659,-0.736408,-0.334376,...,0.065003,0.016522,0.015776,5.743597,0.307617,0.051370,0.042480,0.002441,1.976972,0.034533
1,-0.180061,0.260884,-0.069373,0.208734,-0.078855,-0.577818,0.583788,0.143781,0.291556,0.007314,...,0.087692,0.016355,0.016605,64.870987,0.812988,0.082784,0.069824,0.003906,7.374503,0.074870
2,-0.692900,0.356662,0.062617,0.248280,3.470037,0.166613,0.823874,0.181112,0.551939,0.357985,...,0.132387,0.025847,0.023922,34.251705,0.850098,0.058200,0.036621,0.010254,5.927942,0.117603
3,0.243339,0.214182,-0.049026,1.456255,-0.360826,-0.875256,-0.770200,0.315500,0.789956,0.448319,...,0.071478,0.019166,0.025535,1.364990,0.342285,0.081713,0.075195,0.000000,1.100437,0.041754
4,-0.968576,0.309255,0.223164,0.160960,0.919838,-0.111985,-1.012521,-0.665692,-0.316646,-0.264381,...,0.106220,0.023536,0.019742,3.589230,0.322266,0.073736,0.069336,0.004395,1.210593,0.036459
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,0.503490,-0.540720,-0.690117,-0.107338,-0.647856,-0.681969,-0.246245,-0.546552,0.062783,0.070393,...,0.084929,0.017250,0.020335,4.868783,0.668945,0.076452,0.044434,0.001465,2.045856,0.084214
5996,-0.600597,0.406386,-0.748409,-0.316157,-0.507428,-0.054214,-0.476804,-0.373120,-0.930158,-1.080690,...,0.075407,0.014998,0.020683,7.893681,0.584961,0.076210,0.048340,0.000000,2.561808,0.073010
5997,-1.014298,-0.950744,0.618304,0.204298,-0.788411,-0.794254,-0.586847,0.099172,-0.313476,-0.523417,...,0.138591,0.024969,0.023658,27.257378,0.373047,0.042598,0.037598,0.000000,3.778109,0.027813
5998,-0.002938,0.646034,-0.732819,1.205990,-0.898733,-0.684953,0.134642,-0.374792,-0.019524,-1.016032,...,0.137695,0.030371,0.029970,431.200500,0.384277,0.025731,0.025391,0.008301,10.260160,0.006870


In [4]:
# y_train contains the true class (genre) of each training observation:
# Electronic, Experimental, Folk, Hip-Hop, Instrumental, International, Pop or Rock.
y_train

array(['Electronic', 'Rock', 'Instrumental', ..., 'Pop', 'Instrumental',
       'Instrumental'], dtype=object)

In [5]:
# X_test is the DataFrame of test inputs, in the same format as X_train.
# The objective is to predict the class of each test observation (Electronic,
# Experimental, Folk, Hip-Hop, Instrumental, International, Pop or Rock).
X_test

feature,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,...,tonnetz,tonnetz,tonnetz,zcr,zcr,zcr,zcr,zcr,zcr,zcr
statistics,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,...,std,std,std,kurtosis,max,mean,median,min,skew,std
number,01,02,03,04,05,06,07,08,09,10,...,04,05,06,01,01,01,01,01,01,01
Id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
0,3.974762,4.354650,3.394523,0.033462,5.611623,2.592655,3.041094,2.815378,3.954026,2.365586,...,0.052970,0.013487,0.022031,70.844788,0.671387,0.035129,0.026367,0.010254,8.394708,0.067026
1,0.033636,-0.139950,0.678688,-0.553824,-0.165293,0.370275,-0.314710,-0.368706,-0.437181,-0.441662,...,0.093105,0.022865,0.028800,18.427612,0.538574,0.055975,0.039062,0.000000,3.638194,0.053879
2,0.044094,4.285359,0.977213,3.240997,0.400350,1.026224,0.772464,0.807625,1.942534,1.938970,...,0.081955,0.017371,0.016673,38.401405,0.405762,0.030685,0.028809,0.002930,2.620369,0.016835
3,-0.425218,0.793007,0.509624,-0.967103,-1.432252,-0.900761,-0.501279,-0.855886,-0.556825,7.404243,...,0.098877,0.019089,0.016183,4.129582,0.252441,0.037955,0.024902,0.000000,1.898847,0.034382
4,-1.120238,-0.503659,0.303515,-0.596549,-0.716761,-0.874363,-0.708101,-0.642351,-0.327327,-0.342220,...,0.102545,0.023823,0.025166,16.758356,0.380371,0.027851,0.019043,0.000000,3.244483,0.027128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,-0.166039,-0.660406,-0.955245,-0.689303,-0.916851,-0.136661,-0.367279,-1.135291,-1.174082,-0.609749,...,0.060857,0.022777,0.017722,8.467750,0.444824,0.055770,0.039062,0.001465,2.520271,0.056601
1996,2.010343,0.344572,2.738141,1.238262,2.815471,0.689118,4.040531,2.749969,2.658481,4.763800,...,0.050589,0.015169,0.013096,-0.649581,0.357422,0.126850,0.120117,0.000000,0.234551,0.055476
1997,-0.869148,-0.600280,0.105814,0.553810,-0.839182,-0.706434,-0.360566,0.053638,-0.791513,-0.669329,...,0.079556,0.025229,0.026858,1.563682,0.152344,0.039138,0.035156,0.002441,1.084503,0.020284
1998,-0.854251,-0.804227,1.347275,-0.748155,-0.408385,-0.909165,-0.870467,-0.077660,-0.538250,-0.108390,...,0.147899,0.023989,0.027429,22.050222,0.319824,0.032715,0.028809,0.000000,3.429668,0.022654


In [37]:
# Keep references to the full labelled data before the names X_train / y_train
# are rebound to the training part of the split below.
# NOTE: because this cell rebinds X_train and y_train, re-run the data-loading
# cell before re-running this one.
X = X_train
y = y_train

# Hold out 20% of the labelled data for validation.
# A fixed random_state makes the split (and every metric derived from it)
# reproducible; stratify=y keeps the class proportions equal in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

# Standardise the features. The scaler is fitted on the training part only and
# then applied unchanged to the validation part.
scalerx = StandardScaler()
X_train = scalerx.fit_transform(X_train)
X_val = scalerx.transform(X_val)

# Encode the genre names as integer class ids. The encoder is fitted on all
# labels so that every class is known to it regardless of how the split falls.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_train = label_encoder.transform(y_train)
y_val = label_encoder.transform(y_val)

## Classification with XGB and prediction on the test set

In [38]:
# Hyper-parameters of the baseline XGBoost classifier, gathered in one place.
xgb_settings = {
    'objective': 'multi:softmax',  # multiclass classification
    'num_class': 8,                # number of classes
    'max_depth': 6,
    'learning_rate': 0.3,
    'subsample': 1,
    'colsample_bytree': 0.8,
    'n_estimators': 50,            # number of boosting rounds
}
model = XGBClassifier(**xgb_settings)

# Fit on the training split; the validation split is passed as eval_set so that
# its metric is logged during training.
validation_pairs = [(X_val, y_val)]
model.fit(X_train, y_train, eval_set=validation_pairs)

# Predict the classes of the validation observations.
y_pred = model.predict(X_val)

# Misclassification rate (0-1 loss) on the validation split and the matching accuracy.
loss = zero_one_loss(y_val, y_pred)
accuracy = 1 - loss

[0]	validation_0-mlogloss:1.84172
[1]	validation_0-mlogloss:1.70272
[2]	validation_0-mlogloss:1.60646
[3]	validation_0-mlogloss:1.54191
[4]	validation_0-mlogloss:1.48632
[5]	validation_0-mlogloss:1.44015
[6]	validation_0-mlogloss:1.40626
[7]	validation_0-mlogloss:1.37522
[8]	validation_0-mlogloss:1.34862
[9]	validation_0-mlogloss:1.32735
[10]	validation_0-mlogloss:1.31051
[11]	validation_0-mlogloss:1.29491
[12]	validation_0-mlogloss:1.28136
[13]	validation_0-mlogloss:1.26790
[14]	validation_0-mlogloss:1.25897
[15]	validation_0-mlogloss:1.25061
[16]	validation_0-mlogloss:1.24285
[17]	validation_0-mlogloss:1.23376
[18]	validation_0-mlogloss:1.22744
[19]	validation_0-mlogloss:1.22124
[20]	validation_0-mlogloss:1.21643
[21]	validation_0-mlogloss:1.21214
[22]	validation_0-mlogloss:1.20804
[23]	validation_0-mlogloss:1.20619
[24]	validation_0-mlogloss:1.20489
[25]	validation_0-mlogloss:1.19826
[26]	validation_0-mlogloss:1.19572
[27]	validation_0-mlogloss:1.19052
[28]	validation_0-mlogloss:1.1

In [39]:
# `accuracy` was computed from predictions on the held-out validation split
# (X_val, y_val), so it is reported as validation accuracy, not training accuracy.
print('Accuracy of XGB on the validation set: ', accuracy)

Accuracy of XGB on the training set:  0.6158333333333333


In [14]:
# Unfitted XGBoost estimator used as the template for the grid search.
# It is deliberately not called `model`: that name refers to the classifier
# fitted earlier, which is used afterwards to predict on the test set. Rebinding
# it to an unfitted estimator here would break that prediction when the notebook
# is run from top to bottom.
base_model = XGBClassifier(
    objective='multi:softmax',   # multiclass classification
    num_class=8,                 # number of classes
)

# Hyper-parameter grid: 6 * 3 * 3 * 3 * 1 = 162 candidate combinations.
param_grid = {
    'max_depth': [3, 5, 6, 8, 10, 20],
    'learning_rate': [0.1, 0.3, 0.5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'n_estimators': [50]
}

# Score candidates by 0-1 loss. greater_is_better=False makes the scorer return
# the negated loss, so the search (which maximises the score) minimises the loss.
scorer = make_scorer(zero_one_loss, greater_is_better=False)

# Exhaustive search with 3-fold cross-validation (cv=3 chosen due to limited time).
grid_search = GridSearchCV(
    base_model,
    param_grid=param_grid,
    scoring=scorer,
    cv=3,  # number of cross-validation folds
    verbose=2,
)
# eval_set and verbose are forwarded to each underlying fit call; with verbose=10
# the validation metric is logged every 10 boosting rounds.
grid_search.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose =10)

# Best hyper-parameter combination and the corresponding estimator.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Print the best parameters
print("Best Parameters:", best_params)

# Evaluate the best model on the held-out validation split.
y_pred = best_model.predict(X_val)
accuracy = 1 - zero_one_loss(y_val, y_pred)

print(f'Validation Accuracy (0-1 Loss) with Best Model: {accuracy:.4f}')

# Output recorded from a previous run:
# Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.3, 'max_depth': 6, 'n_estimators': 50, 'subsample': 1.0}
# Validation Accuracy (0-1 Loss) with Best Model: 0.6008

Fitting 3 folds for each of 162 candidates, totalling 486 fits
[0]	validation_0-mlogloss:2.02084
[10]	validation_0-mlogloss:1.67813
[20]	validation_0-mlogloss:1.51556
[30]	validation_0-mlogloss:1.41994
[40]	validation_0-mlogloss:1.35964
[49]	validation_0-mlogloss:1.32178
[CV] END colsample_bytree=0.6, learning_rate=0.1, max_depth=3, n_estimators=50, subsample=0.6; total time=   2.5s
[0]	validation_0-mlogloss:2.02433
[10]	validation_0-mlogloss:1.68126
[20]	validation_0-mlogloss:1.52113
[30]	validation_0-mlogloss:1.42795
[40]	validation_0-mlogloss:1.36949
[49]	validation_0-mlogloss:1.33024
[CV] END colsample_bytree=0.6, learning_rate=0.1, max_depth=3, n_estimators=50, subsample=0.6; total time=   2.7s
[0]	validation_0-mlogloss:2.02363
[10]	validation_0-mlogloss:1.69516
[20]	validation_0-mlogloss:1.54297
[30]	validation_0-mlogloss:1.45305
[40]	validation_0-mlogloss:1.39206
[49]	validation_0-mlogloss:1.35227
[CV] END colsample_bytree=0.6, learning_rate=0.1, max_depth=3, n_estimators=50, su

In [40]:
# Apply the scaler fitted on the training split to the test inputs.
# The result is stored under a new name so that X_test keeps the raw data and
# re-running this cell does not scale the inputs a second time.
X_test_scaled = scalerx.transform(X_test)
y_pred = model.predict(X_test_scaled)  # integer-encoded class predictions for the test inputs
y_pred

array([2, 6, 5, ..., 7, 6, 6])

In [41]:
# Map the integer-encoded predictions back to genre names using the label
# encoder that produced the integer classes.
y_pred_dec = label_encoder.inverse_transform(y_pred)
y_pred_dec

array(['Folk', 'Pop', 'International', ..., 'Rock', 'Pop', 'Pop'],
      dtype=object)

## Export in csv format 

In [31]:
# Export the predictions on the test data in csv format
prediction = pd.DataFrame(y_pred, columns=['Genre'])
prediction.index.name='Id'
prediction.to_csv('myprediction.csv') # export to csv file

# The csv file should be of the form
#Id, Genre
#0, Folk
#1, Hip-Hop
#2, International
#...
#1998, Experimental
#1999, Pop