In [18]:
import pandas as pd
import numpy as np
import pickle

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, make_scorer

In [6]:
def load_pickle(path):
    '''Return the object stored in the pickle file at `path`.

    The file is opened in binary mode and is closed again before the
    function returns.
    '''
    # NOTE: pickle.load can execute arbitrary code while deserialising, so
    # only point this at files produced by this project's own pipeline.
    with open(path, 'rb') as in_file:
        return pickle.load(in_file)


# Loading training data (features)
X_train = load_pickle('../data/processed/X_arr_train.pickle')

# Loading test data (features)
X_test = load_pickle('../data/processed/X_arr_test.pickle')

# Loading training labels
y_train = load_pickle('../data/processed/y_arr_train.pickle')

# Loading test labels
y_test = load_pickle('../data/processed/y_arr_test.pickle')

In [7]:
# Function definition for getting smaller random samples for modelling and testing
def get_sample(df_features = None, df_labels = None, size=100_000, random_state = 1812):
    '''Randomly select `size` rows of the features together with their labels.

    Parameters
    ----------
    df_features : pandas DataFrame, optional
        Feature rows to sample from. Defaults to the notebook-level `X_train`,
        looked up when the function is called.
    df_labels : pandas Series or DataFrame, optional
        Labels stored in the same row order as `df_features`. Defaults to the
        notebook-level `y_train`, looked up when the function is called.
    size : int
        Number of rows to draw (without replacement).
    random_state : int
        Seed passed to pandas' sampler, so the draw is reproducible.

    Returns
    -------
    tuple
        `(features_sample, labels_sample)`, both holding the same `size` row
        positions, kept in their original order.

    Raises
    ------
    ValueError
        Raised by pandas when `size` is larger than the number of rows.
    '''
    # Resolve the defaults at call time: binding the globals in the signature
    # would freeze whatever objects existed when this cell was executed and
    # miss any later re-assignment of X_train / y_train.
    if df_features is None:
        df_features = X_train
    if df_labels is None:
        df_labels = y_train

    n_rows = len(df_features)

    # Sample row *positions*, not index labels: matching on labels with
    # `index.isin` returns more than `size` rows when the index contains
    # duplicate labels. The draw depends only on the number of rows and the
    # seed, so it picks the same rows as `df_features.sample(...)` would.
    positions = pd.Series(np.arange(n_rows)).sample(n = size, random_state = random_state).to_numpy()

    # A boolean mask keeps the selected rows in their original order and is
    # applied positionally to both the features and the labels.
    training_samples_filter = np.zeros(n_rows, dtype = bool)
    training_samples_filter[positions] = True

    return df_features.iloc[training_samples_filter], df_labels.iloc[training_samples_filter]

In [8]:
# Edges of the arrival-delay intervals; each pair of consecutive edges delimits one class
# (presumably minutes, like ActArrDelay — TODO confirm the unit).
# NOTE(review): 1140.0 and 2280.0 may have been meant as 1440.0 / 2880.0
# (24 h / 48 h in minutes) — confirm before changing, since it alters the classes.
bins = [
    -np.inf,
    -240.0, -180.0, -120.0, -90.0, -60.0, -45.0, -30.0, -15.0,
    0,
    15.0, 30.0, 45.0, 60.0, 90.0, 120.0, 180.0, 360.0, 720.0, 1140.0, 2280.0,
    np.inf,
]
# One integer class id per interval: 0 .. len(bins) - 2
labels = np.arange(len(bins) - 1)

# Wrap the targets in DataFrames so the delay column can be selected by name
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)

# Transformation of the original continuous target variable into multi-class labels:
# every row gets exactly one class id, the interval its delay falls into
# (intervals are right-closed, which is the pd.cut default).
y_train_cat, y_test_cat = (
    pd.cut(target['ActArrDelay'].astype('float32'), bins = bins, labels = labels)
    for target in (y_train, y_test)
)

In [34]:
# Draw a reproducible 100,000-row subset of the training split (features + binned labels)
X_train_sample, y_train_sample = get_sample(
    df_features = X_train,
    df_labels = y_train_cat,
    size = 100_000,
)

# Same for the test split
X_test_sample, y_test_sample = get_sample(
    df_features = X_test,
    df_labels = y_test_cat,
    size = 100_000,
)

In [35]:
# Candidate values for each hyper-parameter; the search draws combinations from these lists.
# `random_state` is listed with a single value so every candidate model uses the same seed.
# NOTE(review): learning rates above 1 are unusual for gradient boosting — confirm they are intended.
param_dist = {
    'learning_rate': [0.01, 0.1, 1, 2, 3, 4, 5],
    'n_estimators': [10, 20, 25, 50, 100],
    'max_depth': [3, 5, 10, 20, 30],
    'random_state': [1812],
}

gbc = GradientBoostingClassifier()

# Randomized search: 10 sampled parameter combinations, scored by macro-averaged F1,
# run in parallel (n_jobs = -1), with a fixed seed for the sampling of combinations.
rs_cv = RandomizedSearchCV(
    estimator = gbc,
    param_distributions = param_dist,
    scoring = 'f1_macro',
    n_iter = 10,
    n_jobs = -1,
    random_state = 1812,
    verbose = 2,
)

rs_cv.fit(X_train_sample, y_train_sample)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END learning_rate=5, max_depth=10, n_estimators=10, random_state=1812; total time= 3.5min
[CV] END learning_rate=5, max_depth=10, n_estimators=10, random_state=1812; total time= 3.5min
[CV] END learning_rate=5, max_depth=10, n_estimators=10, random_state=1812; total time= 3.6min
[CV] END learning_rate=5, max_depth=10, n_estimators=10, random_state=1812; total time= 3.6min
[CV] END learning_rate=5, max_depth=10, n_estimators=10, random_state=1812; total time= 3.6min
[CV] END learning_rate=0.1, max_depth=5, n_estimators=25, random_state=1812; total time= 4.2min
[CV] END learning_rate=0.1, max_depth=5, n_estimators=25, random_state=1812; total time= 4.2min
[CV] END learning_rate=0.1, max_depth=5, n_estimators=25, random_state=1812; total time= 4.2min
[CV] END learning_rate=0.1, max_depth=5, n_estimators=25, random_state=1812; total time= 4.9min
[CV] END learning_rate=0.1, max_depth=5, n_estimators=25, random_state=1812; tot



[CV] END learning_rate=1, max_depth=20, n_estimators=100, random_state=1812; total time=100.5min
[CV] END learning_rate=1, max_depth=20, n_estimators=100, random_state=1812; total time=103.2min
[CV] END learning_rate=1, max_depth=20, n_estimators=100, random_state=1812; total time=106.0min
[CV] END learning_rate=4, max_depth=30, n_estimators=50, random_state=1812; total time=104.8min
[CV] END learning_rate=1, max_depth=20, n_estimators=100, random_state=1812; total time=111.8min
[CV] END learning_rate=4, max_depth=30, n_estimators=50, random_state=1812; total time=108.2min
[CV] END learning_rate=0.01, max_depth=20, n_estimators=25, random_state=1812; total time=45.7min
[CV] END learning_rate=0.01, max_depth=20, n_estimators=25, random_state=1812; total time=45.4min
[CV] END learning_rate=0.01, max_depth=20, n_estimators=25, random_state=1812; total time=46.7min
[CV] END learning_rate=0.01, max_depth=20, n_estimators=25, random_state=1812; total time=46.3min
[CV] END learning_rate=0.01,

In [36]:
# Report the best cross-validated score found by the search and the parameters that produced it
print(
    'Best score: {:.6f} \nParameters: {}'.format(
        rs_cv.best_score_,
        rs_cv.best_params_,
    )
)

Best score: 0.070471 
Parameters: {'random_state': 1812, 'n_estimators': 50, 'max_depth': 30, 'learning_rate': 0.1}
