# Step 1: Import helpful libraries

In [1]:
# Familiar imports

#basic tools 
import time
import sys
import os
import numpy as np
import pandas as pd

#graph, plots
import matplotlib.pyplot as plt
import seaborn as sns

# For ordinal encoding categorical variables, splitting data
import sklearn as sk
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.model_selection import KFold, GridSearchCV, train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import mean_squared_error, roc_auc_score, roc_curve

# For training LGBM
from lightgbm import LGBMRegressor
from tqdm import tqdm
import xgboost as xgb

#tuning hyperparameters
from skopt  import BayesSearchCV 

import optuna
from functools import partial
from termcolor import colored

import shap
import warnings
#warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

print("set up complete")

set up complete


In [2]:
#Python libraries and their versions used for this problem
print('SciKit Learn:',sk.__version__)
print('Pandas:',pd.__version__)
print('Numpy:',np.__version__)
print('Seaborn:',sns.__version__)

SciKit Learn: 0.23.2
Pandas: 1.3.1
Numpy: 1.20.3
Seaborn: 0.11.2


# Step 2: Load the data

In [3]:
#https://towardsdatascience.com/make-working-with-large-dataframes-easier-at-least-for-your-memory-6f52b5f4b5c4
    
def reduce_mem_usage(df, verbose=True):
    numerics = ['int8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [4]:
%%time
#local
train = reduce_mem_usage(pd.read_csv("./dataset/train.csv",encoding='utf-8', index_col=0, low_memory=False))
test = reduce_mem_usage(pd.read_csv("./dataset/test.csv",encoding='utf-8', index_col=0, low_memory=False))

#Internet
#train = reduce_mem_usage(pd.read_csv("../input/30-days-of-ml/train.csv",encoding='utf-8', index_col=0, low_memory=False))
#test = reduce_mem_usage(pd.read_csv("../input/30-days-of-ml/test.csv",encoding='utf-8', index_col=0, low_memory=False))

#Sem redução de espaço
#train = pd.read_csv("../input/30-days-of-ml/train.csv",encoding='utf-8', index_col=0, low_memory=False)
#test = pd.read_csv("../input/30-days-of-ml/test.csv",encoding='utf-8', index_col=0, low_memory=False)

print("\nShape of train set: ",train.shape)
print("Shape of test set: ",test.shape)

print("\nload complete")

Mem. usage decreased to 33.76 Mb (43.3% reduction)
Mem. usage decreased to 22.13 Mb (42.0% reduction)

Shape of train set:  (300000, 25)
Shape of test set:  (200000, 24)

load complete
Wall time: 1.93 s


In [5]:
# Preview data
train.head(3)

Unnamed: 0_level_0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,B,B,B,C,B,B,A,E,C,N,...,0.400391,0.160278,0.311035,0.389404,0.267578,0.237305,0.37793,0.32251,0.869629,8.117188
2,B,B,A,A,B,D,A,F,A,O,...,0.533203,0.559082,0.516113,0.594727,0.341553,0.90625,0.921875,0.261963,0.465088,8.484375
3,A,A,A,C,B,D,A,D,A,F,...,0.650391,0.375244,0.902344,0.555176,0.84375,0.749023,0.620117,0.541504,0.763672,8.367188


In [6]:
train.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 300000 entries, 1 to 499999
Data columns (total 25 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   cat0    300000 non-null  object 
 1   cat1    300000 non-null  object 
 2   cat2    300000 non-null  object 
 3   cat3    300000 non-null  object 
 4   cat4    300000 non-null  object 
 5   cat5    300000 non-null  object 
 6   cat6    300000 non-null  object 
 7   cat7    300000 non-null  object 
 8   cat8    300000 non-null  object 
 9   cat9    300000 non-null  object 
 10  cont0   300000 non-null  float16
 11  cont1   300000 non-null  float16
 12  cont2   300000 non-null  float16
 13  cont3   300000 non-null  float16
 14  cont4   300000 non-null  float16
 15  cont5   300000 non-null  float16
 16  cont6   300000 non-null  float16
 17  cont7   300000 non-null  float16
 18  cont8   300000 non-null  float16
 19  cont9   300000 non-null  float16
 20  cont10  300000 non-null  float16
 21  cont11  30

In [7]:
print('Info about train data: ')
print('Number of rows:',colored(train.shape[0],'green'))
print('Number of columns:',colored(train.shape[1],'green'))
print('Number of missing values:',colored(sum(train.isna().sum()),'green'))

Info about train data: 
Number of rows: [32m300000[0m
Number of columns: [32m25[0m
Number of missing values: [32m0[0m


In [8]:
num_col = list(train.select_dtypes(include='float16').columns)
cat_cols = list(train.select_dtypes(include='object').columns)
num_col.remove('target')
print('Number of numerical columns is:',colored(len(num_col),'green'),
      '\nNumber of categorical columsn is:',colored(len(cat_cols),'green'))

Number of numerical columns is: [32m14[0m 
Number of categorical columsn is: [32m10[0m


In [9]:
print('target column basic statistics:')
target=train['target'].copy()
target.describe()

target column basic statistics:


count    300000.000000
mean               NaN
std           0.000000
min           0.140381
25%           7.742188
50%           8.187500
75%           8.726562
max          10.414062
Name: target, dtype: float64

In [10]:
# Separate target from features
y_train = train['target'].copy()
X_train = train.drop(['target'], axis=1).copy()

In [11]:
# Assuring that test data and whether or not it has the same columns as the train
if list(test.columns) == list(X_train.columns):
    print(colored('True', 'green'))  
else:
    print(colored('False', 'red'))  


[32mTrue[0m


# Step 3: Prepare the data

In [12]:
# Checking if there are missing values in the datasets
#Train
print(f'Train null values:',colored(X_train.isna().sum().sum(), 'green'))

#Test
print(f'Test null values:',colored(test.isna().sum().sum(), 'green'))

Train null values: [32m0[0m
Test null values: [32m0[0m


In [13]:
categorical_feature = np.where(X_train.dtypes != 'float16')[0].tolist()
categorical_feature_columns = X_train.select_dtypes(exclude=['float16']).columns
#categorical_feature_columns = [feature for feature in train.columns if 'cat' in feature]

In [14]:
#Checking if test categorical unique values are all subsets of their train peers

lis = []
for i in X_train[categorical_feature_columns].columns:
    test_vals = set(test[i].unique())
    X_vals = set(X_train[i].unique())
    lis.append(test_vals.issubset(X_vals))

print(colored(all(lis),'green'))

[32mTrue[0m


### Features Standarization

In [15]:
#cat_cols = [feature for feature in train.columns if 'cat' in feature]
cat_cols = categorical_feature_columns.tolist()

def label_encoder(df):
    for feature in cat_cols:
        le = LabelEncoder()
        le.fit(df[feature])
        df[feature] = le.transform(df[feature])
    return df

X_train = label_encoder(X_train)
X_test = label_encoder(test)

print('Info about train data: ')
print('Number of rows:',colored(X_train.shape[0],'green'))
print('Number of columns:',colored(X_train.shape[1],'green'))

print('\nInfo about test data: ')
print('Number of rows:',colored(test.shape[0],'green'))
print('Number of columns:',colored(test.shape[1],'green'))

Info about train data: 
Number of rows: [32m300000[0m
Number of columns: [32m24[0m

Info about test data: 
Number of rows: [32m200000[0m
Number of columns: [32m24[0m


In [16]:
# Extreme Fine Tuning LGBM using 7-step training
# https://www.kaggle.com/awwalmalhi/extreme-fine-tuning-lgbm-using-7-step-training#Extreme-Fine-Tuning-of-LGBM-using-Incremental-training

def objective(trial, X, y, name='xgb'):
        
    params = {'max_depth':trial.suggest_int('max_depth', 5, 50),
              'n_estimators':200000,
              #'boosting':trial.suggest_categorical('boosting', ['gbdt', 'dart', 'goss']),
              'subsample': trial.suggest_uniform('subsample', 0.2, 1.0),
              'colsample_bytree':trial.suggest_uniform('colsample_bytree', 0.2, 1.0),
              'learning_rate':trial.suggest_uniform('learning_rate', 0.007, 0.02),
              'reg_lambda':trial.suggest_uniform('reg_lambda', 0.01, 50),
              'reg_alpha':trial.suggest_uniform('reg_alpha', 0.01, 50),
              'min_child_samples':trial.suggest_int('min_child_samples', 5, 100),
              'num_leaves':trial.suggest_int('num_leaves', 10, 200),
              'n_jobs' : -1,
              'metric':'rmse',
              'max_bin':trial.suggest_int('max_bin', 300, 1000),
              'cat_smooth':trial.suggest_int('cat_smooth', 5, 100),
              'cat_l2':trial.suggest_loguniform('cat_l2', 1e-3, 100)}

    model = LGBMRegressor(**params)
                  
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)
    

    model.fit(X_train, y_train, eval_set=[(X_val, y_val)],
              eval_metric=['rmse'],
              early_stopping_rounds=250, 
              categorical_feature=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
              #callbacks=[optuna.integration.LightGBMPruningCallback(trial, metric='rmse')],
              verbose=0)

    train_score = np.round(np.sqrt(mean_squared_error(y_train, model.predict(X_train))), 5)
    test_score = np.round(np.sqrt(mean_squared_error(y_val, model.predict(X_val))), 5)
                  
    print(f'TRAIN RMSE : {train_score} || TEST RMSE : {test_score}')
                  
    return test_score

In [17]:
%%time

optimize = partial(objective, X=X_train, y=y_train)

study_lgbm = optuna.create_study(direction='minimize')
#study_lgbm.optimize(optimize, n_trials=50)

# i have commented out the trials so as to cut short the notebook execution time.

[32m[I 2021-08-26 10:10:19,483][0m A new study created in memory with name: no-name-08faaa32-4785-4a65-b80f-d3dd877f558c[0m


Wall time: 2 ms


In [18]:
#From the above optuna trials the best parameters i could find were the following ones!
#study_lgbm.best_params

lgbm_params = {
 'max_depth': 44,
 'subsample': 0.394545907670217,
 'colsample_bytree': 0.20198138209747638,
 'learning_rate': 0.009310766402801046,
 'reg_lambda': 6.237661450596901,
 'reg_alpha': 22.879691155166864,
 'min_child_samples': 32,
 'num_leaves': 17,
 'max_bin': 797,
 'cat_smooth': 81,
 'cat_l2': 3.716241852773303,
 'metric': 'rmse', 
 'n_jobs': -1, 
 'n_estimators': 20000
}

# Step 4: Train a model

In [None]:
%%time
split = KFold(n_splits=10, shuffle=True)
#split = KFold(n_splits=5, random_state=2, shuffle=True)

preds_list_base = []
preds_list_final_iteration = []
preds_list_all = []

for train_idx, val_idx in split.split(X_train):
            X_tr = X_train.iloc[train_idx]
            X_val = X_train.iloc[val_idx]
            y_tr = y_train.iloc[train_idx]
            y_val = y_train.iloc[val_idx]
            
            Model = LGBMRegressor(**lgbm_params).fit(X_tr, y_tr, eval_set=[(X_val, y_val)],
                          eval_metric=['rmse'],
                          early_stopping_rounds=250, 
                          categorical_feature=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                          #callbacks=[optuna.integration.LightGBMPruningCallback(trial, metric='rmse')],
                          verbose=0)
            
            preds_list_base.append(Model.predict(X_test))
            preds_list_all.append(Model.predict(X_test))
            print(f'RMSE for Base model is {np.sqrt(mean_squared_error(y_val, Model.predict(X_val)))}')
            first_rmse = np.sqrt(mean_squared_error(y_val, Model.predict(X_val)))
            params = lgbm_params.copy()
            
            for i in range(1, 8):
                if i >2:    
                    
                    # reducing regularizing params if 
                    
                    params['reg_lambda'] *= 0.9
                    params['reg_alpha'] *= 0.9
                    params['num_leaves'] += 40
                    
                params['learning_rate'] = 0.003
                Model = LGBMRegressor(**params).fit(X_tr, y_tr, eval_set=[(X_val, y_val)],
                          eval_metric=['rmse'],
                          early_stopping_rounds=200, 
                          categorical_feature=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                          #callbacks=[optuna.integration.LightGBMPruningCallback(trial, metric='rmse')],
                          verbose=0,
                          init_model=Model)
                
                preds_list_all.append(Model.predict(X_test))
                print(f'RMSE for Incremental trial {i} model is {np.sqrt(mean_squared_error(y_val, Model.predict(X_val)))}')
            last_rmse = np.sqrt(mean_squared_error(y_val, Model.predict(X_val)))
            print('',end='\n\n')
            print(f'Improvement of : {first_rmse - last_rmse}')
            print('-' * 100)
            preds_list_final_iteration.append(Model.predict(X_test))

Using categorical_feature in Dataset.
categorical_feature in Dataset is overridden.
New categorical_feature is [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Overriding the parameters from Reference Dataset.
categorical_column in param dict is overridden.


RMSE for Base model is 0.7150945895852655


Using categorical_feature in Dataset.
categorical_feature in Dataset is overridden.
New categorical_feature is [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Overriding the parameters from Reference Dataset.
categorical_column in param dict is overridden.


RMSE for Incremental trial 1 model is 0.7150912380025591


Using categorical_feature in Dataset.
categorical_feature in Dataset is overridden.
New categorical_feature is [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Overriding the parameters from Reference Dataset.
categorical_column in param dict is overridden.


RMSE for Incremental trial 2 model is 0.7150585221872836


Using categorical_feature in Dataset.
categorical_feature in Dataset is overridden.
New categorical_feature is [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Overriding the parameters from Reference Dataset.
categorical_column in param dict is overridden.


RMSE for Incremental trial 3 model is 0.7150295932633853


Using categorical_feature in Dataset.
categorical_feature in Dataset is overridden.
New categorical_feature is [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Overriding the parameters from Reference Dataset.
categorical_column in param dict is overridden.


RMSE for Incremental trial 4 model is 0.7150222531174983


Using categorical_feature in Dataset.
categorical_feature in Dataset is overridden.
New categorical_feature is [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Overriding the parameters from Reference Dataset.
categorical_column in param dict is overridden.


RMSE for Incremental trial 5 model is 0.7150090516998127


Using categorical_feature in Dataset.
categorical_feature in Dataset is overridden.
New categorical_feature is [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Overriding the parameters from Reference Dataset.
categorical_column in param dict is overridden.


RMSE for Incremental trial 6 model is 0.7150030474578102


Using categorical_feature in Dataset.
categorical_feature in Dataset is overridden.
New categorical_feature is [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Overriding the parameters from Reference Dataset.
categorical_column in param dict is overridden.


RMSE for Incremental trial 7 model is 0.7149994029251712


Improvement of : 9.51866600943374e-05
----------------------------------------------------------------------------------------------------


# Step 5: Submit to the competition

In [None]:
y_preds_base = np.array(preds_list_base).mean(axis=0)
y_preds_base

In [None]:
y_preds_all = np.array(preds_list_all).mean(axis=0)
y_preds_all

In [None]:
y_preds_final_iteration = np.array(preds_list_final_iteration).mean(axis=0)
y_preds_final_iteration

In [None]:
# Use the model to generate predictions
#predictions = model.predict(test)

# Save the predictions to a CSV file
output = pd.DataFrame({'Id': test.index,
                       'target': y_preds_final_iteration})
output.to_csv('submission.csv', index=False)