# FLAML to tune parameters
MSc in Statistical Science\
University of Oxford\
Group-assessed practical\
HT 2024


In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, zero_one_loss, make_scorer, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from flaml import AutoML
from flaml.ml import sklearn_metric_loss_score

## Load dataset

In [2]:
# Read the training inputs, the training labels and the test inputs from CSV.
# Both feature files are parsed with a three-row column header and their first column as index.
X_train = pd.read_csv('X_train.csv', header=[0, 1, 2], index_col=0)  # training inputs
y_train = (
    pd.read_csv('y_train.csv', index_col=0)
    .squeeze('columns')  # squeeze the column axis (a single-column frame becomes a Series)
    .to_numpy()          # training labels as a NumPy array
)
X_test = pd.read_csv('X_test.csv', header=[0, 1, 2], index_col=0)  # test inputs

In [8]:
# X_train is described as a 6,000 x 518 DataFrame:
# entry (i, j) is the j-th feature of observation i.
# NOTE(review): the output saved with this cell is a NumPy array, which suggests the cell was
# last executed after the scaling cell had rebound X_train -- re-run from a fresh kernel to
# see the raw DataFrame here.
X_train


array([[ 0.19681275,  0.6254612 ,  0.95287865, ..., -0.24135272,
        -0.60184891,  0.01167708],
       [ 0.2193385 , -0.01257184, -0.46847935, ...,  0.10675216,
        -1.0064726 , -0.95444705],
       [-0.26428755, -0.10758321, -0.20438099, ..., -0.58945759,
        -0.36409566, -0.55043354],
       ...,
       [ 1.10534076,  0.19574237,  0.68858455, ..., -0.35738767,
        -0.42491963,  0.3646974 ],
       [-0.34973988, -0.41117318, -0.20465631, ...,  0.10675216,
        -0.03484316,  0.38408757],
       [ 1.52698566, -0.64418351, -0.80023496, ...,  0.10675216,
        -0.47781726, -0.82483892]])

In [4]:
# y_train contains the true class of each training observation:
# Electronic, Experimental, Folk, Hip-Hop, Instrumental, International, Pop or Rock
y_train

array(['Electronic', 'Rock', 'Instrumental', ..., 'Pop', 'Instrumental',
       'Instrumental'], dtype=object)

In [5]:
# X_test holds the test inputs, in the same format as X_train.
# The objective is to predict the class of each test observation
# (Electronic, Experimental, Folk, Hip-Hop, Instrumental, International, Pop or Rock).
X_test

feature,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,...,tonnetz,tonnetz,tonnetz,zcr,zcr,zcr,zcr,zcr,zcr,zcr
statistics,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,...,std,std,std,kurtosis,max,mean,median,min,skew,std
number,01,02,03,04,05,06,07,08,09,10,...,04,05,06,01,01,01,01,01,01,01
Id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
0,3.974762,4.354650,3.394523,0.033462,5.611623,2.592655,3.041094,2.815378,3.954026,2.365586,...,0.052970,0.013487,0.022031,70.844788,0.671387,0.035129,0.026367,0.010254,8.394708,0.067026
1,0.033636,-0.139950,0.678688,-0.553824,-0.165293,0.370275,-0.314710,-0.368706,-0.437181,-0.441662,...,0.093105,0.022865,0.028800,18.427612,0.538574,0.055975,0.039062,0.000000,3.638194,0.053879
2,0.044094,4.285359,0.977213,3.240997,0.400350,1.026224,0.772464,0.807625,1.942534,1.938970,...,0.081955,0.017371,0.016673,38.401405,0.405762,0.030685,0.028809,0.002930,2.620369,0.016835
3,-0.425218,0.793007,0.509624,-0.967103,-1.432252,-0.900761,-0.501279,-0.855886,-0.556825,7.404243,...,0.098877,0.019089,0.016183,4.129582,0.252441,0.037955,0.024902,0.000000,1.898847,0.034382
4,-1.120238,-0.503659,0.303515,-0.596549,-0.716761,-0.874363,-0.708101,-0.642351,-0.327327,-0.342220,...,0.102545,0.023823,0.025166,16.758356,0.380371,0.027851,0.019043,0.000000,3.244483,0.027128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,-0.166039,-0.660406,-0.955245,-0.689303,-0.916851,-0.136661,-0.367279,-1.135291,-1.174082,-0.609749,...,0.060857,0.022777,0.017722,8.467750,0.444824,0.055770,0.039062,0.001465,2.520271,0.056601
1996,2.010343,0.344572,2.738141,1.238262,2.815471,0.689118,4.040531,2.749969,2.658481,4.763800,...,0.050589,0.015169,0.013096,-0.649581,0.357422,0.126850,0.120117,0.000000,0.234551,0.055476
1997,-0.869148,-0.600280,0.105814,0.553810,-0.839182,-0.706434,-0.360566,0.053638,-0.791513,-0.669329,...,0.079556,0.025229,0.026858,1.563682,0.152344,0.039138,0.035156,0.002441,1.084503,0.020284
1998,-0.854251,-0.804227,1.347275,-0.748155,-0.408385,-0.909165,-0.870467,-0.077660,-0.538250,-0.108390,...,0.147899,0.023989,0.027429,22.050222,0.319824,0.032715,0.028809,0.000000,3.429668,0.022654


In [6]:
# Keep handles on the full labelled data before the names below are rebound.
# NOTE: this cell rebinds X_train / y_train to the training part of the split, so it is
# meant to be run exactly once, directly after the data-loading cell.
X = X_train
y = y_train

# Hold out 20% of the labelled data for validation.
# The split is seeded so the reported validation accuracy is reproducible after a kernel
# restart, and stratified so that both parts keep the class proportions of y.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardise the features: the scaler is fitted on the training part only and then
# applied unchanged to the validation part.
scalerx = StandardScaler()
X_train = scalerx.fit_transform(X_train)
X_val = scalerx.transform(X_val)

# Encode the string labels as integer codes; the encoder is fitted on the training labels.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_val = label_encoder.transform(y_val)

## Classification with AutoML and prediction on the test set

In [7]:
# Initialize an AutoML instance
automl = AutoML()

# Goal and constraints of the search, forwarded to `automl.fit` below
automl_settings = {
    "time_budget": 600,  # in seconds
    "metric": 'accuracy',
    "task": 'classification',
    "log_file_name": "song-600.log",
    "early_stop": True,
    # Seed the hyperparameter search. The search is still bounded by wall-clock time,
    # so the selected model may vary between machines/runs.
    "seed": 42,
}

# Train with labeled input data (the scaled training split and its integer-encoded labels)
automl.fit(X_train=X_train, y_train=y_train,
           **automl_settings)

# Show the predicted class probabilities for the first few training rows only,
# instead of printing the whole probability matrix
print(automl.predict_proba(X_train)[:5])
# Print the best model
print(automl.model.estimator)

# Example of the estimator printed by an earlier run (the selected hyperparameters
# differ from run to run, see the log of the current run):
#LGBMClassifier(colsample_bytree=0.6888970622475563,
               #learning_rate=0.09951515716853933, max_bin=511,
               #min_child_samples=4, n_estimators=1, n_jobs=-1, num_leaves=13,
               #reg_alpha=0.007704104902643932, reg_lambda=9.757362285043104,
               #verbose=-1)

[flaml.automl.logger: 03-10 10:28:30] {1679} INFO - task = classification
[flaml.automl.logger: 03-10 10:28:30] {1690} INFO - Evaluation method: holdout
[flaml.automl.logger: 03-10 10:28:30] {1788} INFO - Minimizing error metric: 1-accuracy
[flaml.automl.logger: 03-10 10:28:30] {1900} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
[flaml.automl.logger: 03-10 10:28:30] {2218} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 03-10 10:28:31] {2344} INFO - Estimated sufficient time budget=3398s. Estimated necessary time budget=78s.
[flaml.automl.logger: 03-10 10:28:31] {2391} INFO -  at 0.5s,	estimator lgbm's best error=0.5799,	best estimator lgbm's best error=0.5799
[flaml.automl.logger: 03-10 10:28:31] {2218} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 03-10 10:28:31] {2391} INFO -  at 0.8s,	estimator lgbm's best error=0.5799,	best estimator lgbm's best error=0.5799
[flaml.automl.logger: 03-10 1



[flaml.automl.logger: 03-10 10:31:59] {2391} INFO -  at 208.5s,	estimator lgbm's best error=0.3975,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:31:59] {2218} INFO - iteration 71, current learner lrl1
[flaml.automl.logger: 03-10 10:32:32] {2391} INFO -  at 242.1s,	estimator lrl1's best error=0.4406,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:32:32] {2218} INFO - iteration 72, current learner lrl1




[flaml.automl.logger: 03-10 10:32:51] {2391} INFO -  at 260.8s,	estimator lrl1's best error=0.4406,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:32:51] {2218} INFO - iteration 73, current learner xgboost




[flaml.automl.logger: 03-10 10:32:52] {2391} INFO -  at 261.5s,	estimator xgboost's best error=0.5369,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:32:52] {2218} INFO - iteration 74, current learner lrl1
[flaml.automl.logger: 03-10 10:33:19] {2391} INFO -  at 289.4s,	estimator lrl1's best error=0.4406,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:33:19] {2218} INFO - iteration 75, current learner xgboost




[flaml.automl.logger: 03-10 10:33:20] {2391} INFO -  at 290.1s,	estimator xgboost's best error=0.5369,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:33:20] {2218} INFO - iteration 76, current learner extra_tree
[flaml.automl.logger: 03-10 10:33:20] {2391} INFO -  at 290.3s,	estimator extra_tree's best error=0.4980,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:33:20] {2218} INFO - iteration 77, current learner xgboost
[flaml.automl.logger: 03-10 10:33:22] {2391} INFO -  at 291.6s,	estimator xgboost's best error=0.4857,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:33:22] {2218} INFO - iteration 78, current learner xgboost
[flaml.automl.logger: 03-10 10:33:23] {2391} INFO -  at 292.9s,	estimator xgboost's best error=0.4857,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:33:23] {2218} INFO - iteration 79, current learner extra_tree
[flaml.automl.logger: 03-10 10:33:23] {2391} INFO -  at 293



[flaml.automl.logger: 03-10 10:33:47] {2391} INFO -  at 316.8s,	estimator xgboost's best error=0.4857,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:33:47] {2218} INFO - iteration 84, current learner extra_tree
[flaml.automl.logger: 03-10 10:33:47] {2391} INFO -  at 317.0s,	estimator extra_tree's best error=0.4980,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:33:47] {2218} INFO - iteration 85, current learner xgboost
[flaml.automl.logger: 03-10 10:33:48] {2391} INFO -  at 318.1s,	estimator xgboost's best error=0.4857,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:33:48] {2218} INFO - iteration 86, current learner extra_tree
[flaml.automl.logger: 03-10 10:33:49] {2391} INFO -  at 318.5s,	estimator extra_tree's best error=0.4980,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:33:49] {2218} INFO - iteration 87, current learner xgboost
[flaml.automl.logger: 03-10 10:33:49] {2391} INFO -  at 



[flaml.automl.logger: 03-10 10:34:16] {2391} INFO -  at 346.0s,	estimator xgboost's best error=0.4713,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:34:16] {2218} INFO - iteration 93, current learner lrl1
[flaml.automl.logger: 03-10 10:34:44] {2391} INFO -  at 374.0s,	estimator lrl1's best error=0.4406,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:34:44] {2218} INFO - iteration 94, current learner extra_tree




[flaml.automl.logger: 03-10 10:34:44] {2391} INFO -  at 374.4s,	estimator extra_tree's best error=0.4918,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:34:44] {2218} INFO - iteration 95, current learner extra_tree
[flaml.automl.logger: 03-10 10:34:45] {2391} INFO -  at 374.6s,	estimator extra_tree's best error=0.4918,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:34:45] {2218} INFO - iteration 96, current learner lgbm
[flaml.automl.logger: 03-10 10:34:51] {2391} INFO -  at 381.0s,	estimator lgbm's best error=0.3975,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:34:51] {2218} INFO - iteration 97, current learner lrl1
[flaml.automl.logger: 03-10 10:35:17] {2391} INFO -  at 406.6s,	estimator lrl1's best error=0.4406,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:35:17] {2218} INFO - iteration 98, current learner xgboost




[flaml.automl.logger: 03-10 10:35:19] {2391} INFO -  at 408.4s,	estimator xgboost's best error=0.4713,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:35:19] {2218} INFO - iteration 99, current learner xgboost
[flaml.automl.logger: 03-10 10:35:22] {2391} INFO -  at 411.8s,	estimator xgboost's best error=0.4713,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:35:22] {2218} INFO - iteration 100, current learner rf
[flaml.automl.logger: 03-10 10:35:23] {2391} INFO -  at 412.8s,	estimator rf's best error=0.5000,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:35:23] {2218} INFO - iteration 101, current learner rf
[flaml.automl.logger: 03-10 10:35:28] {2391} INFO -  at 417.7s,	estimator rf's best error=0.5000,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:35:28] {2218} INFO - iteration 102, current learner lrl1
[flaml.automl.logger: 03-10 10:35:48] {2391} INFO -  at 437.4s,	estimator lrl1's best er



[flaml.automl.logger: 03-10 10:35:48] {2391} INFO -  at 437.7s,	estimator extra_tree's best error=0.4918,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:35:48] {2218} INFO - iteration 104, current learner lgbm
[flaml.automl.logger: 03-10 10:35:58] {2391} INFO -  at 448.1s,	estimator lgbm's best error=0.3975,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:35:58] {2218} INFO - iteration 105, current learner extra_tree
[flaml.automl.logger: 03-10 10:35:59] {2391} INFO -  at 448.4s,	estimator extra_tree's best error=0.4918,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:35:59] {2218} INFO - iteration 106, current learner rf
[flaml.automl.logger: 03-10 10:36:01] {2391} INFO -  at 450.7s,	estimator rf's best error=0.5000,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:36:01] {2218} INFO - iteration 107, current learner xgboost
[flaml.automl.logger: 03-10 10:36:02] {2391} INFO -  at 452.1s,	estimat



[flaml.automl.logger: 03-10 10:36:23] {2391} INFO -  at 472.8s,	estimator rf's best error=0.5000,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:36:23] {2218} INFO - iteration 110, current learner lrl1
[flaml.automl.logger: 03-10 10:36:49] {2391} INFO -  at 499.3s,	estimator lrl1's best error=0.4406,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:36:49] {2218} INFO - iteration 111, current learner extra_tree




[flaml.automl.logger: 03-10 10:36:50] {2391} INFO -  at 499.5s,	estimator extra_tree's best error=0.4918,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:36:50] {2218} INFO - iteration 112, current learner lgbm
[flaml.automl.logger: 03-10 10:37:03] {2391} INFO -  at 513.2s,	estimator lgbm's best error=0.3975,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:37:03] {2218} INFO - iteration 113, current learner lrl1
[flaml.automl.logger: 03-10 10:37:29] {2391} INFO -  at 538.7s,	estimator lrl1's best error=0.4406,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:37:29] {2218} INFO - iteration 114, current learner extra_tree




[flaml.automl.logger: 03-10 10:37:29] {2391} INFO -  at 538.9s,	estimator extra_tree's best error=0.4918,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:37:29] {2218} INFO - iteration 115, current learner lrl1
[flaml.automl.logger: 03-10 10:37:59] {2391} INFO -  at 569.3s,	estimator lrl1's best error=0.4406,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:37:59] {2218} INFO - iteration 116, current learner lgbm




[flaml.automl.logger: 03-10 10:38:29] {2391} INFO -  at 599.2s,	estimator lgbm's best error=0.3975,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:38:29] {2218} INFO - iteration 117, current learner extra_tree
[flaml.automl.logger: 03-10 10:38:30] {2391} INFO -  at 600.1s,	estimator extra_tree's best error=0.4918,	best estimator lgbm's best error=0.3975
[flaml.automl.logger: 03-10 10:38:50] {2627} INFO - retrain lgbm for 20.0s
[flaml.automl.logger: 03-10 10:38:50] {2630} INFO - retrained model: LGBMClassifier(colsample_bytree=0.6874570413763783,
               learning_rate=0.0516599331111777, max_bin=511,
               min_child_samples=8, n_estimators=1, n_jobs=-1, num_leaves=26,
               reg_alpha=0.0009765625, reg_lambda=2.5682014397894393,
               verbose=-1)
[flaml.automl.logger: 03-10 10:38:50] {1930} INFO - fit succeeded
[flaml.automl.logger: 03-10 10:38:50] {1931} INFO - Time taken to find the best model: 151.2748064994812
[[0.73553154 0.0

In [9]:
# Score the AutoML model on the held-out validation split.
x_pred = automl.predict(X_val)
val_loss = sklearn_metric_loss_score('accuracy', x_pred, y_val)  # loss reported for the 'accuracy' metric
print('accuracy', '=', 1 - val_loss)
# An earlier run reported 0.615; the value differs between runs.

accuracy = 0.5733333333333334


In [10]:
# Standardise the test inputs with the scaler fitted on the training split.
# The scaled copy gets its own name so that X_test keeps the raw inputs: rebinding X_test
# would make this cell non-idempotent, and any later cell that scales X_test again would
# then scale it twice.
X_test_scaled = scalerx.transform(X_test)
y_pred = automl.predict(X_test_scaled)  # predictions on the test inputs, as label-encoder codes
y_pred_dec = label_encoder.inverse_transform(y_pred)  # map the codes back to the genre names
y_pred_dec

array(['Experimental', 'Pop', 'International', ..., 'Experimental', 'Pop',
       'Pop'], dtype=object)

In [None]:
# Fit one LightGBM model using the hyperparameters suggested by `flaml.default`
from flaml.default import preprocess_and_suggest_hyperparams

(
    hyperparams,
    estimator_class,
    X_transformed,
    y_transformed,
    feature_transformer,
    label_transformer,
) = preprocess_and_suggest_hyperparams("classification", X_train, y_train, "lgbm")

model = estimator_class(**hyperparams)  # estimator_class is lightgbm.LGBMClassifier
model.fit(X_transformed, y_train)  # y_train already holds the integer codes from label_encoder

# Preprocess the validation inputs the same way as the training inputs.
# The result is stored under a new name: overwriting X_val would change the input of the
# AutoML evaluation cell if that cell is re-run, and would transform X_val again each time
# this cell is re-run.
X_val_transformed = feature_transformer.transform(X_val)
x_pred = model.predict(X_val_transformed)

In [None]:
# Show the fitted model, then its accuracy on the validation split
# (x_pred holds its validation predictions).
print(model)
tuned_val_loss = sklearn_metric_loss_score('accuracy', x_pred, y_val)
print('accuracy', '=', 1 - tuned_val_loss)

In [13]:
# Predict the test set with `model`.
# X_test may already have been standardised by an earlier cell; `scalerx.transform` returns
# a NumPy array, so the inputs are scaled here only if X_test is still the raw DataFrame.
# This avoids standardising the test inputs twice.
if isinstance(X_test, pd.DataFrame):
    X_test_scaled = scalerx.transform(X_test)
else:
    X_test_scaled = X_test
# Apply the same feature preprocessing that was applied to the validation inputs
# before calling `model.predict`.
y_pred = model.predict(feature_transformer.transform(X_test_scaled))  # predictions on the test inputs
y_pred

NameError: name 'model' is not defined

In [41]:
# Map the predictions in y_pred back to the original genre names
# using the label encoder fitted on the training labels.
y_pred_dec = label_encoder.inverse_transform(y_pred)
y_pred_dec

array(['Folk', 'Pop', 'International', ..., 'Rock', 'Pop', 'Pop'],
      dtype=object)

## Export in csv format 

In [31]:
# Export the predictions on the test data in csv format.
# The decoded genre names (y_pred_dec) are written, not y_pred: y_pred holds the integer
# codes of the label encoder, whereas the expected file lists genre names.
prediction = pd.DataFrame(y_pred_dec, columns=['Genre'])
prediction.index.name = 'Id'
prediction.to_csv('myprediction.csv')  # export to csv file

# The csv file should be of the form
#Id, Genre
#0, Folk
#1, Hip-Hop
#2, International
#...
#1998, Experimental
#1999, Pop