In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, date

from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

from lightgbm import LGBMClassifier
import preprocessing

In [3]:
#Import data
xtrain = pd.read_csv("Xtrain.csv")
ytrain = pd.read_csv("Ytrain.csv")
records = xtrain.copy()
records['ChargeOff'] = ytrain['ChargeOff']

In [4]:
processed_records = preprocessing.all_preprocess_with_label_encoding(records)

In [5]:
x = processed_records.drop(columns='ChargeOff')
y = processed_records['ChargeOff']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state=0)

In [6]:

#Initialize LightGBM hyper parameters
params = {'boosting_type':'dart',
         'num_leaves':100, 
         'objective':'binary',
         'max_depth':-1,
         'learning_rate':.05,
         'max_bin':200,}
params['metric'] = ['auc', 'binary_logloss']

lgb = LGBMClassifier()
lgb.fit(x_train,y_train)
accuracy=accuracy_score(y_test, lgb.predict(x_test))
print(accuracy)
print(lgb.get_params())

lgb.set_params(**params)
lgb.fit(x_train,y_train)
accuracy=accuracy_score(y_test, lgb.predict(x_test))
print(accuracy)
print(lgb.get_params())

0.92656
{'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.1, 'max_depth': -1, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': True, 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 0}
0.92024
{'boosting_type': 'dart', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.05, 'max_depth': -1, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': -1, 'num_leaves': 100, 'objective': 'binary', 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': True, 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 0, 'max_bin': 200, 'metric': ['auc', 'binary_logloss']}


In [7]:
# Boosting type in LightGBM
boosting_type = ['gbdt', 'dart', 'goss']
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 120, num = 20)]
num_leaves = [int(x) for x in np.linspace(start = 70, stop = 150, num = 20)]
learning_rate=[.05, .07, .09, .1, .12]
max_depth = [5,6,-1]
# Create the random grid
random_grid = {'boosting_type':boosting_type,
               'n_estimators': n_estimators,
               'num_leaves': num_leaves,
               'learning_rate': learning_rate,
               'max_depth': max_depth, }
# Use the random grid to search for best hyperparameters
# First create the base model to tune
lgb = LGBMClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
lgb_random = RandomizedSearchCV(estimator = lgb, param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = -1)
lgb_random.fit(x_train, y_train)

print(lgb_random.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   20.9s
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:   50.5s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  1.2min finished


{'num_leaves': 74, 'n_estimators': 116, 'max_depth': -1, 'learning_rate': 0.09, 'boosting_type': 'gbdt'}


In [8]:
def evaluate(model, x_test, y_test):
    accuracy=accuracy_score(y_test, model.predict(x_test))
    print('Model Performance')
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    return accuracy

base_model = LGBMClassifier(random_state = 42)
base_model.fit(x_train, y_train)
base_accuracy = evaluate(base_model, x_test, y_test)

best_random = lgb_random.best_estimator_
random_accuracy = evaluate(best_random, x_test, y_test)

print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

Model Performance
Accuracy = 0.93%.
Model Performance
Accuracy = 0.93%.
Improvement of 0.47%.


In [9]:
lgb.set_params(**lgb_random.best_params_)

LGBMClassifier(learning_rate=0.09, n_estimators=116, num_leaves=74)

In [10]:
lgb.fit(x_train,y_train)
accuracy=accuracy_score(y_test, lgb.predict(x_test))
accuracy

0.93088

In [11]:
Xtest = pd.read_csv('Xtest.csv')
Xtest_processed = preprocessing.all_preprocess_with_label_encoding(Xtest)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [12]:
submission = pd.DataFrame()
submission['Id'] = Xtest['Id']
submission['ChargeOff'] = best_random.predict(Xtest_processed)
submission.to_csv('submission_lightgbm.csv', index=False)