In [None]:
#pip install -q info_gain

In [None]:
import numpy as np
import pandas as pd
from google.colab import drive #to use Google Drive
from sklearn.model_selection import train_test_split #to split transet ans testset
from sklearn.cluster import KMeans #to cluster with missing values - https://pypi.org/project/missingpy/
from sklearn.impute import SimpleImputer #to impute missing values
from sklearn.experimental import enable_iterative_imputer #to impute missing values
from sklearn.impute import IterativeImputer #to impute missing values
import xgboost #extreme gradient boosting
import lightgbm #light gradient boosting
from sklearn.metrics import mean_squared_error #error measurement
from math import sqrt #math functions
import pickle #to save and load models

In [None]:
# Make Google Drive reachable from the Colab runtime under /gdrive
drive.mount('/gdrive')
# Load the comma-separated dataset stored on Drive
dataset_path = '/gdrive/My Drive/My Developments/ReggresionProblem/dataset_00_with_header.csv'
dataset = pd.read_csv(dataset_path, sep=',')


In [None]:
##### FUNCTIONS - DATA TRANSFORMATION

### Function to log-transform numerical variables

def LogTransform(pd_data, variables):
  """Log-transform the given columns after shifting them so their minimum maps to 0.

  Each column x is replaced by log(1 - min(x) + x), so the smallest observed
  value becomes log(1) = 0. Missing values stay missing.

  Args:
    pd_data: DataFrame holding the columns to transform (not modified).
    variables: Iterable of column names to transform.

  Returns:
    A copy of pd_data with the listed columns log-transformed.
  """
  dataset = pd_data.copy()
  for variable in variables:
    column = dataset[variable]
    # Series.min() skips NaN. The builtin min() compares element by element and
    # returns NaN whenever the first value is NaN, which would turn the whole
    # column into NaN.
    dataset[variable] = np.log(1 - column.min() + column)
  return dataset

### Function to transform categorical variables into dummies

def CategoricalToDummies(pd_data, variables):
  """Replace each listed categorical column by its indicator (dummy) columns.

  Dummy columns are named '<variable>_<category>', where the category value is
  rounded and cast to int, so the categories must be numeric. No category is
  dropped; the original column is removed once its dummies are appended.

  Args:
    pd_data: DataFrame holding the categorical columns (not modified).
    variables: Iterable of column names to encode.

  Returns:
    A copy of pd_data with the listed columns replaced by their dummies,
    appended on the right in the order the variables were given.
  """
  encoded = pd_data.copy()
  for variable in variables:
    def dummy_name(category, prefix=variable):
      # e.g. category 3.0 of 'x018' becomes 'x018_3'
      return prefix + '_' + str(int(round(category,0)))

    indicators = pd.get_dummies(encoded[variable]).rename(columns=dummy_name)
    encoded = pd.concat([encoded, indicators], axis=1).drop(variable, axis=1)
  return encoded

### Function to standardize with Z score (x-u)/s

def Standardize(pd_data, columns):
  """Convert the given columns to Z scores, (x - mean) / std.

  Args:
    pd_data: DataFrame holding the columns (not modified).
    columns: Iterable of column names to standardize.

  Returns:
    A copy of pd_data in which each listed column has been centred on its
    mean and divided by its standard deviation (pandas' Series.std()).
  """
  standardized = pd_data.copy()
  for name in columns:
    values = standardized[name]
    centre, spread = values.mean(), values.std()
    standardized[name] = (values - centre) / spread
  return standardized

### Function to scale data

def Scaler(ttype, pd_data, columns):
  """Rescale the given columns by their value range.

  Args:
    ttype: 'minmax' for (x - min) / (max - min), or 'meanscale' for
      (x - mean) / (max - min). Any other value leaves the data unscaled.
    pd_data: DataFrame holding the columns (not modified).
    columns: Iterable of column names to scale.

  Returns:
    A copy of pd_data with the listed columns rescaled.
  """
  scaled = pd_data.copy()
  if ttype not in ('minmax', 'meanscale'):
    # Unknown scaling types fall through and return the untouched copy
    return scaled
  for name in columns:
    values = scaled[name]
    # Both variants divide by the range; they differ only in the offset removed
    offset = values.min() if ttype == 'minmax' else values.mean()
    scaled[name] = (values - offset) / (values.max() - values.min())
  return scaled

### Function to undo the scaling of a single column (inverse of Scaler)

def InverseScaler(ttype, data_vector, params):
  """Map scaled values back to the original units (inverse of Scaler).

  Args:
    ttype: 'minmax' or 'meanscale', the scaling that was applied.
    data_vector: Scaled values of a single column (array-like; lists and
      tuples are converted to a float NumPy array first).
    params: Statistics of the original column: [xmin, xmax] for 'minmax',
      [xmin, xmax, xmean] for 'meanscale'.

  Returns:
    The values in original units: xmin + v*(xmax-xmin) for 'minmax',
    xmean + v*(xmax-xmin) for 'meanscale'.

  Raises:
    ValueError: If ttype is not one of the supported scaling types.
  """
  if isinstance(data_vector, (list, tuple)):
    # A plain list cannot be multiplied by a float element-wise
    data_vector = np.asarray(data_vector, dtype=float)
  if ttype == 'minmax':
    xmin, xmax = params[0], params[1]
    return xmin + data_vector*(xmax-xmin)
  if ttype == 'meanscale':
    xmin, xmax, xmean = params[0], params[1], params[2]
    return xmean + data_vector*(xmax-xmin)
  # Previously an unknown type hit an unbound local variable at the return
  raise ValueError("Unknown scaling type: " + repr(ttype) + " (expected 'minmax' or 'meanscale')")

### Function to identify outliers based on Z score

def Z_outliers(pd_data, proportion_to_flag):
  """Flag the rows with the largest summed absolute Z score.

  Every column is standardized with Standardize, the absolute Z scores are
  summed per row, and rows whose score is at or above the
  (100 - proportion_to_flag*100)th percentile are flagged.

  Args:
    pd_data: DataFrame to score; all of its columns are used.
    proportion_to_flag: Fraction of rows (0-1) to flag as outliers.

  Returns:
    A list with one int per row: 1 for flagged rows, 0 otherwise.
  """
  standardized = Standardize(pd_data, pd_data.columns)
  score = abs(standardized).sum(axis=1)
  if len(score) == 0:
    # np.percentile cannot handle an empty input; there is nothing to flag
    return []
  # The cut-off is the same for every row, so compute it once rather than
  # once per row as before
  threshold = np.percentile(score, 100-proportion_to_flag*100)
  return (score >= threshold).astype(int).tolist()

### Function to remove outliers based on Z score

def RemoveZ_outliers(pd_data, proportion_to_remove):
  """Drop the rows that Z_outliers flags as outliers.

  Args:
    pd_data: DataFrame to filter (not modified).
    proportion_to_remove: Fraction of rows (0-1) passed to Z_outliers.

  Returns:
    A DataFrame with the non-flagged rows and the original columns.
  """
  flagged = pd_data.copy()
  # Temporary helper column holding the 0/1 outlier flag of each row
  flagged['outlier_Z'] = Z_outliers(pd_data, proportion_to_remove)
  kept = flagged[flagged['outlier_Z'] == 0]
  return kept.drop(columns='outlier_Z')

### Function to cluster with kmeans (NaN allowed)
#source: https://stackoverflow.com/questions/35611465/python-scikit-learn-clustering-with-missing-data
def kmeans_missing(X, n_clusters, max_iter=10):
    """Perform K-Means clustering on data with missing values.

    Missing (non-finite) entries start at their column mean and are then
    repeatedly replaced by the matching coordinate of the centroid their row
    was assigned to, until the labels stop changing or max_iter is reached.

    Args:
      X: An [n_samples, n_features] array-like of numeric data to cluster.
      n_clusters: Number of clusters to form.
      max_iter: Maximum number of EM iterations to perform (at least 1).

    Returns:
      labels: An [n_samples] vector of integer labels.

    Raises:
      ValueError: If max_iter is smaller than 1.
    """
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1")

    # Work on a plain float array so boolean-mask indexing behaves the same
    # whether the caller passes an ndarray or a DataFrame
    X = np.asarray(X, dtype=float)

    # Initialize missing values to their column means
    missing = ~np.isfinite(X)
    mu = np.nanmean(X, 0, keepdims=1)
    X_hat = np.where(missing, mu, X)

    for i in range(max_iter):
        if i > 0:
            # initialize KMeans with the previous set of centroids. this is much
            # faster and makes it easier to check convergence (since labels
            # won't be permuted on every iteration), but might be more prone to
            # getting stuck in local minima. n_init=1 because several restarts
            # from identical explicit centroids would be redundant.
            cls = KMeans(n_clusters, init=prev_centroids, n_init=1)
        else:
            # first pass: let KMeans pick its own initial centroids. The
            # former n_jobs argument is gone from current scikit-learn
            # releases, so it is no longer passed.
            cls = KMeans(n_clusters)

        # perform clustering on the filled-in data
        labels = cls.fit_predict(X_hat)
        centroids = cls.cluster_centers_

        # fill in the missing values based on their cluster centroids
        X_hat[missing] = centroids[labels][missing]

        # when the labels have stopped changing then we have converged
        if i > 0 and np.all(labels == prev_labels):
            break

        prev_labels = labels
        prev_centroids = cls.cluster_centers_
    return labels
  
### Function to impute missing values
def kmImputer(pd_data, n_clusters, strategy = 'most_frequent'):
  """Impute missing values after grouping the rows with kmeans_missing.

  Args:
    pd_data: DataFrame with missing values (not modified).
    n_clusters: Number of clusters handed to kmeans_missing.
    strategy: SimpleImputer strategy used to fill the gaps.

  Returns:
    A DataFrame with the imputed values and the original columns. Rows are
    ordered cluster by cluster (in order of first appearance of each cluster)
    and keep their original index labels.
  """
  # n_clusters and strategy are now honoured; they used to be overridden by
  # the hard-coded values 100 and 'most_frequent'
  labels = list(kmeans_missing(pd_data, n_clusters=n_clusters, max_iter=10))
  X_labels = pd_data.copy()
  X_labels['labels'] = labels

  # NOTE(review): the imputer is fitted on the whole table, so every cluster
  # receives the same fill values; fitting per cluster was probably intended.
  imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)
  imputer.fit(X_labels)

  imputed_groups = []
  for x in X_labels['labels'].unique():
    group = X_labels.loc[X_labels['labels'] == x]
    # Keeping group.index lets callers align the result with the source rows
    imputed_groups.append(
        pd.DataFrame(imputer.transform(group), columns=X_labels.columns, index=group.index))

  # pd.concat replaces DataFrame.append, which is gone from pandas 2.x
  X_imputed = pd.concat(imputed_groups) if imputed_groups else X_labels[0:0]
  return X_imputed.drop(columns='labels')


In [None]:
##### FUNCTIONS - MODELING

### Function to predict

def PredictValues(model, X_test, scale = None):
  """Predict with a fitted model and round the predictions.

  Args:
    model: Object exposing predict(X).
    X_test: Feature data handed to model.predict.
    scale: None when the target was not scaled; otherwise a pair
      (ttype, params) forwarded to InverseScaler to undo the scaling
      before rounding.

  Returns:
    A list with the rounded predictions.
  """
  predictions = model.predict(X_test)
  if scale is not None:
    ttype, params = scale[0], scale[1]
    predictions = InverseScaler(ttype, predictions, params)
  return [round(value) for value in predictions]
  

### Function to fit a model and calculate its error on the test set
def FitModel(model, eval_set, X_train, Y_train, eval_metric, error_flag, scale = None, verbose=False):
  """Fit a boosting model with early stopping and score it on the test split.

  Args:
    model: Unfitted regressor whose fit() accepts eval_metric, eval_set,
      verbose and early_stopping_rounds.
    eval_set: List of (X, y) pairs monitored during training. The LAST pair
      is used as the hold-out split for the returned scores.
    X_train: Training features.
    Y_train: Training target.
    eval_metric: Metric name forwarded to model.fit.
    error_flag: Largest relative error for a prediction to count as correct.
    scale: Optional (ttype, params) pair forwarded to PredictValues.
    verbose: Forwarded to model.fit to control per-iteration logging.

  Returns:
    (Error, correct_predictions_rate): the RMSE on the hold-out split and the
    share of predictions whose relative error is below error_flag.
  """
  import time  # local import keeps this function free of notebook magics

  # Take the hold-out split from eval_set instead of relying on notebook
  # globals named X_test / Y_test being defined by an earlier cell
  X_test, Y_test = eval_set[-1]

  # Train and report how long fitting took
  start = time.perf_counter()
  model.fit(X_train, Y_train, eval_metric=eval_metric, eval_set=eval_set, verbose=verbose, early_stopping_rounds=10)
  print("Wall time: {:.2f} s".format(time.perf_counter() - start))

  # Predict on the hold-out split and compute the RMSE
  Y_pred = PredictValues(model, X_test, scale)
  Error = sqrt(mean_squared_error(Y_test, Y_pred))

  # Share of predictions within error_flag of the true value. Dividing by the
  # absolute target keeps the ratio non-negative for negative targets; a zero
  # target yields inf/NaN, which never counts as correct.
  predicted = np.asarray(Y_pred, dtype=float)
  actual = np.asarray(Y_test, dtype=float)
  with np.errstate(divide='ignore', invalid='ignore'):
    relative_errors = np.abs(predicted - actual) / np.abs(actual)
  correct_predictions_rate = float(np.mean(relative_errors < error_flag))

  return Error, correct_predictions_rate

### Function to calibrate parameters for XGB

def XGBoostCalibration(n_estimators,eval_metric,eval_set,LR,SS,CSBT,MD,L,A,G):
  """Greedy, one-parameter-at-a-time calibration of an XGBoost regressor.

  Parameters are tuned in a fixed order (learning rate, subsample, column
  sample by tree, max depth, lambda, alpha, gamma). Within a stage every
  candidate is fitted with FitModel while the other parameters keep their
  current best value; the candidate with the lowest error wins and is used
  in all later stages. Progress is printed whenever a candidate improves on
  the best error of its stage.

  Reads the notebook-level globals X_train, Y_train and error_flag, which
  are forwarded to FitModel.

  Args:
    n_estimators: Number of trees (doubled during the learning-rate stage).
    eval_metric: Metric name passed to the model and to FitModel.
    eval_set: List of (X, y) pairs passed to FitModel.
    LR, SS, CSBT, MD, L, A, G: Candidate values for learning_rate,
      subsample, colsample_bytree, max_depth, reg_lambda, reg_alpha and
      gamma respectively.

  Returns:
    (learning_rate, subsample, colsample_bytree, max_depth, reg_lambda,
    reg_alpha, gamma) with the best value found for each parameter.
  """
  # Starting point of the search; each stage overwrites exactly one entry
  best = {
      'learning_rate': 0.01,
      'subsample': 0.8,
      'colsample_bytree': 0.6,
      'max_depth': 5,
      'reg_lambda': 0.5,
      'reg_alpha': 0.5,
      'gamma': 0,
      'min_child_weight': 1.5,  # never tuned, kept fixed throughout
  }

  def _tune(param, candidates, label, n_trees):
    """Fit one model per candidate of `param` and keep the lowest-error one."""
    # inf instead of a fixed 1000000 ceiling, so targets on a large scale
    # (RMSE >= 1e6) no longer cause every candidate to be rejected silently
    min_error = float('inf')
    for candidate in candidates:
      trial = dict(best)
      trial[param] = candidate
      model = xgboost.XGBRegressor(silent=True, eval_metric=eval_metric, n_estimators=n_trees, seed=42, **trial)
      RMSE, correct_predictions_rate = FitModel(model, eval_set, X_train, Y_train, eval_metric, error_flag)
      if RMSE<min_error:
        min_error = RMSE
        best[param] = candidate
        print(label + ": "+str(candidate))
        print(eval_metric + ": "+str(min_error))
        print("Correct predictions rate: "+str(correct_predictions_rate))

  # NOTE(review): only the learning-rate stage trains with twice the number
  # of trees; kept as in the original, confirm it is intentional.
  _tune('learning_rate', LR, "Learning Rate", 2*n_estimators)
  _tune('subsample', SS, "Subsample", n_estimators)
  _tune('colsample_bytree', CSBT, "Column sample by tree", n_estimators)
  _tune('max_depth', MD, "Maximum deep", n_estimators)
  _tune('reg_lambda', L, "Lambda (ridge)", n_estimators)
  _tune('reg_alpha', A, "Alpha (lasso)", n_estimators)
  _tune('gamma', G, "Gamma", n_estimators)

  return (best['learning_rate'], best['subsample'], best['colsample_bytree'], best['max_depth'],
          best['reg_lambda'], best['reg_alpha'], best['gamma'])

### Function to calibrate parameters for LGB

def LGBoostCalibration(n_estimators,eval_metric,eval_set,BT,LR,SS,CSBT,MCW,MCS,L,A):
  """Greedy, one-parameter-at-a-time calibration of a LightGBM regressor.

  Parameters are tuned in a fixed order (boosting type, learning rate,
  subsample, column sample by tree, min child weight, min child samples,
  lambda, alpha). Within a stage every candidate is fitted with FitModel
  while the other parameters keep their current best value; the candidate
  with the lowest error wins and is used in all later stages. Progress is
  printed whenever a candidate improves on the best error of its stage.

  Reads the notebook-level globals X_train, Y_train and error_flag, which
  are forwarded to FitModel.

  Args:
    n_estimators: Number of trees of every fitted model.
    eval_metric: Metric name passed to FitModel.
    eval_set: List of (X, y) pairs passed to FitModel.
    BT, LR, SS, CSBT, MCW, MCS, L, A: Candidate values for boosting_type,
      learning_rate, subsample, colsample_bytree, min_child_weight,
      min_child_samples, reg_lambda and reg_alpha respectively.

  Returns:
    (boosting_type, learning_rate, subsample, colsample_bytree,
    min_child_weight, min_child_samples, reg_lambda, reg_alpha) with the
    best value found for each parameter.
  """
  # Starting point of the search; each stage overwrites exactly one entry
  best = {
      'boosting_type': 'gbdt',
      'learning_rate': 0.08,
      'subsample': 0.8,
      'colsample_bytree': 0.6,
      'min_child_weight': 1.5,
      'min_child_samples': 10,
      'reg_lambda': 0.5,
      'reg_alpha': 0.5,
  }

  def _tune(param, candidates, label):
    """Fit one model per candidate of `param` and keep the lowest-error one."""
    # inf instead of a fixed 1000000 ceiling, so targets on a large scale
    # (error >= 1e6) no longer cause every candidate to be rejected silently
    min_error = float('inf')
    for candidate in candidates:
      trial = dict(best)
      trial[param] = candidate
      model = lightgbm.LGBMRegressor(silent=True, n_estimators=n_estimators, seed=42, **trial)
      Error, correct_predictions_rate = FitModel(model, eval_set, X_train, Y_train, eval_metric, error_flag)
      if Error<min_error:
        min_error = Error
        best[param] = candidate
        print(label + ": "+str(candidate))
        print(eval_metric + ": "+str(min_error))
        print("Correct predictions rate: "+str(correct_predictions_rate))

  _tune('boosting_type', BT, "Boosting type")
  _tune('learning_rate', LR, "Learning Rate")
  _tune('subsample', SS, "Subsample")
  _tune('colsample_bytree', CSBT, "Colsample by tree")
  _tune('min_child_weight', MCW, "Min child weight")
  _tune('min_child_samples', MCS, "Min child samples")
  _tune('reg_lambda', L, "Lambda")
  _tune('reg_alpha', A, "Alpha")

  return (best['boosting_type'], best['learning_rate'], best['subsample'], best['colsample_bytree'],
          best['min_child_weight'], best['min_child_samples'], best['reg_lambda'], best['reg_alpha'])

In [None]:
# Summary statistics of every column
desc = dataset.describe()

# Number of distinct (non-missing) values of every column
col_len = [dataset[variable].nunique() for variable in dataset]

# Information gain of each predictor with respect to the target 'y'
from info_gain import info_gain as ig
variables = list(dataset.columns.values)[1:304]
target = dataset['y']
InformationGain = []
for variable in variables:
    dataX = dataset[variable]
    gain = ig.info_gain(dataX, target)
    InformationGain.append([variable, gain])
    print(str(variable)+": "+str(gain))
InformationGain = pd.DataFrame(InformationGain)
InformationGain.to_csv("/gdrive/My Drive/My Developments/ReggresionProblem/InformationGains.csv", sep = ";", decimal=',', header=False, index=False)

# Pairwise correlations, reshaped to one (Var1, Var2, Correlation) row per pair
CorM = dataset.corr()
Correlations = (
    CorM.rename_axis(None)
        .rename_axis(None, axis=1)
        .stack()
        .reset_index()
)
Correlations.columns = ['Var1','Var2','Correlation']
Correlations.to_csv("/gdrive/My Drive/My Developments/ReggresionProblem/Correlations.csv", sep = ";", decimal=',', header=True, index=False)

In [None]:
#lists of variables
datatypes = {'x001': 'int64','x002': 'int64','x003': 'int64','x004': 'int64','x005': 'int64','x006': 'int64','x007': 'int64','x008': 'int64','x009': 'int64','x010': 'int64','x011': 'int64','x012': 'int64','x013': 'int64','x014': 'int64','x015': 'int64','x016': 'int64','x017': 'int64','x018': 'int64','x019': 'int64','x020': 'int64','x021': 'int64','x022': 'int64','x023': 'int64','x024': 'int64','x025': 'int64','x026': 'int64','x027': 'int64','x028': 'int64','x029': 'int64','x030': 'int64','x031': 'int64','x032': 'int64','x033': 'int64','x034': 'int64','x035': 'int64','x036': 'int64','x037': 'int64','x038': 'int64','x039': 'int64','x040': 'int64','x041': 'float64','x042': 'int64','x043': 'int64','x044': 'int64','x045': 'int64','x046': 'int64','x047': 'int64','x048': 'int64','x049': 'int64','x050': 'int64','x051': 'int64','x052': 'int64','x053': 'int64','x054': 'int64','x055': 'int64','x056': 'int64','x057': 'float64','x058': 'float64','x059': 'int64','x060': 'int64','x061': 'int64','x062': 'int64','x063': 'int64','x064': 'int64','x065': 'int64','x066': 'int64','x067': 'int64','x068': 'int64','x069': 'int64','x070': 'int64','x071': 'int64','x072': 'int64','x073': 'int64','x074': 'int64','x075': 'int64','x076': 'int64','x077': 'int64','x078': 'int64','x079': 'int64','x080': 'int64','x081': 'int64','x082': 'int64','x083': 'int64','x084': 'int64','x085': 'int64','x086': 'int64','x087': 'int64','x088': 'int64','x089': 'int64','x090': 'int64','x091': 'int64','x092': 'int64','x093': 'int64','x094': 'int64','x095': 'int64','x096': 'int64','x097': 'int64','x098': 'int64','x099': 'int64','x100': 'int64','x101': 'int64','x102': 'int64','x103': 'int64','x104': 'int64','x105': 'int64','x106': 'int64','x107': 'int64','x108': 'int64','x109': 'int64','x110': 'int64','x111': 'int64','x112': 'int64','x113': 'int64','x114': 'int64','x115': 'int64','x116': 'int64','x117': 'int64','x118': 'int64','x119': 'int64','x120': 'int64','x121': 'int64','x122': 'int64','x123': 'int64','x124': 
'int64','x125': 'int64','x126': 'int64','x127': 'int64','x128': 'int64','x129': 'int64','x130': 'int64','x131': 'int64','x132': 'int64','x133': 'int64','x134': 'int64','x135': 'int64','x136': 'int64','x137': 'int64','x138': 'int64','x139': 'int64','x140': 'int64','x141': 'int64','x142': 'int64','x143': 'int64','x144': 'int64','x145': 'int64','x146': 'int64','x147': 'int64','x148': 'int64','x149': 'int64','x150': 'int64','x151': 'int64','x152': 'int64','x153': 'int64','x154': 'int64','x155': 'int64','x156': 'int64','x157': 'int64','x158': 'int64','x159': 'int64','x160': 'int64','x161': 'int64','x162': 'int64','x163': 'int64','x164': 'int64','x165': 'int64','x166': 'int64','x167': 'int64','x168': 'int64','x169': 'int64','x170': 'int64','x171': 'int64','x172': 'int64','x173': 'int64','x174': 'int64','x175': 'int64','x176': 'int64','x177': 'int64','x178': 'int64','x179': 'int64','x180': 'int64','x181': 'int64','x182': 'int64','x183': 'int64','x184': 'int64','x185': 'int64','x186': 'int64','x187': 'int64','x188': 'int64','x189': 'int64','x190': 'int64','x191': 'int64','x192': 'int64','x193': 'int64','x194': 'int64','x195': 'int64','x196': 'int64','x197': 'int64','x198': 'int64','x199': 'int64','x200': 'int64','x201': 'int64','x202': 'int64','x203': 'int64','x204': 'int64','x205': 'int64','x206': 'int64','x207': 'int64','x208': 'int64','x209': 'int64','x210': 'int64','x211': 'int64','x212': 'int64','x213': 'int64','x214': 'int64','x215': 'int64','x216': 'int64','x217': 'int64','x218': 'int64','x219': 'int64','x220': 'int64','x221': 'int64','x222': 'float64','x223': 'float64','x224': 'int64','x225': 'int64','x226': 'int64','x227': 'int64','x228': 'int64','x229': 'int64','x230': 'int64','x231': 'int64','x232': 'int64','x233': 'int64','x234': 'int64','x235': 'int64','x236': 'int64','x237': 'int64','x238': 'int64','x239': 'float64','x240': 'int64','x241': 'int64','x242': 'float64','x243': 'int64','x244': 'int64','x245': 'int64','x246': 'int64','x247': 'int64','x248': 
'int64','x249': 'int64','x250': 'int64','x251': 'int64','x252': 'int64','x253': 'int64','x254': 'int64','x255': 'int64','x256': 'int64','x257': 'float64','x258': 'int64','x259': 'float64','x260': 'int64','x261': 'int64','x262': 'int64','x263': 'int64','x264': 'int64','x265': 'int64','x266': 'int64','x267': 'float64','x268': 'float64','x269': 'int64','x270': 'int64','x271': 'int64','x272': 'float64','x273': 'int64','x274': 'int64','x275': 'float64','x276': 'int64','x277': 'int64','x278': 'int64','x279': 'int64','x280': 'int64','x281': 'int64','x282': 'int64','x283': 'int64','x284': 'int64','x285': 'int64','x286': 'int64','x287': 'int64','x288': 'int64','x289': 'int64','x290': 'float64','x291': 'int64','x292': 'int64','x293': 'float64','x294': 'int64','x295': 'float64','x296': 'int64','x297': 'float64','x298': 'int64','x299': 'int64','x300': 'int64','x301': 'int64','x302': 'int64','x303': 'int64','x304': 'float64','y': 'int64'}
numerical = ['x002','x003','x004','x005','x007','x008','x009','x010','x011','x012','x013','x014','x015','x016','x017','x020','x021','x024','x028','x029','x030','x031','x032','x033','x034','x035','x036','x040','x041','x042','x043','x044','x045','x056','x057','x058','x059','x062','x063','x064','x065','x066','x070','x071','x072','x073','x074','x075','x076','x081','x097','x098','x099','x103','x104','x105','x106','x109','x110','x111','x113','x114','x115','x116','x117','x118','x119','x120','x121','x124','x125','x126','x127','x128','x129','x130','x131','x132','x133','x134','x135','x136','x137','x138','x139','x140','x141','x142','x143','x144','x145','x146','x149','x150','x151','x152','x153','x157','x158','x159','x160','x164','x165','x166','x167','x168','x170','x171','x172','x173','x181','x184','x185','x186','x187','x188','x189','x190','x191','x192','x193','x194','x195','x196','x197','x198','x199','x200','x201','x202','x203','x204','x205','x206','x207','x208','x209','x210','x211','x212','x213','x214','x215','x216','x217','x218','x219','x220','x221','x222','x223','x224','x225','x226','x227','x230','x231','x232','x233','x234','x235','x236','x237','x238','x239','x240','x242','x243','x250','x255','x256','x257','x258','x259','x264','x265','x266','x267','x268','x272','x273','x274','x275','x276','x277','x278','x279','x280','x281','x285','x286','x288','x289','x290','x291','x292','x293','x294','x295','x296','x297','x303','x304']
categorical = ['x018','x019','x022','x023','x037','x038','x039','x046','x047','x048','x049','x050','x051','x052','x053','x054','x055','x061','x068','x069','x077','x078','x079','x080','x100','x101','x102','x107','x108','x112','x122','x123','x148','x155','x156','x162','x163','x169','x174','x175','x176','x177','x178','x179','x182','x183','x228','x229','x241','x251','x252','x253','x254','x287','x302']
withNaN = ['x242','x295','x304','x098','x155','x259','x255','x256','x257','x302','x268','x162','x265','x266','x267','x253','x297','x275','x293','x288','x289','x290','x148','x223','x222','x041','x057','x058','x237','x238','x239','x287','x002','x003','x004','x235','x044','x045','x234','x272','x005']
# "useless" variables: identifier-like columns, columns with a single category,
# and columns with more than 50% of NaN
useless = ['x001','x067','x094','x095','x096','x242','x295','x304','x098','x155','x259','x255','x256','x257','x302','x268','x162','x265','x266','x267','x253','x297','x275','x293']
# Every remaining column, in alphabetical order
selected = sorted(set(dataset.columns).difference(useless))
#set1
selected = ['x002','x004','x005','x006','x008','x014','x015','x017','x018','x019','x020','x021','x023','x024','x025','x031','x035','x036','x040','x041','x043','x044','x046','x048','x051','x052','x054','x055','x056','x057','x059','x063','x064','x065','x075','x076','x079','x080','x081','x082','x085','x086','x087','x088','x089','x097','x099','x102','x104','x105','x113','x118','x129','x138','x139','x146','x147','x148','x154','x168','x169','x170','x171','x172','x173','x175','x180','x181','x182','x185','x186','x187','x192','x193','x194','x196','x202','x203','x204','x205','x206','x208','x209','x210','x213','x214','x215','x220','x221','x222','x224','x229','x232','x233','x234','x235','x236','x238','x239','x240','x241','x243','x245','x246','x247','x249','x250','x251','x252','x254','x258','x260','x261','x262','x263','x272','x274','x276','x277','x278','x280','x281','x282','x283','x286','x287','x291','x294','x296','x298','x299','x300','x301','x303','y']
#set2
#selected = ['x003','x005','x014','x015','x016','x018','x019','x022','x024','x031','x041','x042','x043','x044','x045','x046','x056','x059','x075','x098','x111','x148','x155','x162','x168','x186','x223','x225','x234','x235','x236','x239','x242','x243','x245','x249','x250','x257','x259','x264','x267','x268','x272','x273','x274','x280','x287','x291','x294','x295','x302','x303','x304','y']
#set3
#selected = ['x002','x005','x006','x008','x014','x015','x018','x019','x020','x023','x024','x031','x035','x041','x043','x044','x046','x048','x051','x052','x054','x055','x056','x059','x063','x065','x075','x076','x079','x081','x082','x085','x086','x088','x089','x099','x102','x104','x105','x113','x118','x129','x138','x147','x148','x168','x170','x171','x172','x173','x175','x180','x181','x182','x186','x192','x193','x194','x196','x203','x205','x206','x208','x215','x224','x232','x234','x235','x236','x238','x239','x240','x241','x243','x245','x246','x249','x250','x251','x252','x254','x263','x272','x274','x278','x280','x281','x283','x287','x291','x294','x300','x301','x303','y']

# Split the selected variables by type, each list in alphabetical order
num_selected = sorted(set(numerical) & set(selected))
cat_selected = sorted(set(categorical) & set(selected))

In [None]:
### Find most important features

# Keep the selected variables and one-hot encode the categorical ones
DF = CategoricalToDummies(dataset[selected], cat_selected)

# Hold out 10% of the rows for evaluation
TestSize = 0.1
DF_train, DF_test = train_test_split(DF, test_size=TestSize, random_state=42)

# Outliers are removed from the training rows only
DF_train = RemoveZ_outliers(DF_train, proportion_to_remove = 0.01)

# Separate the predictors from the target 'y'
X_train, Y_train = DF_train.drop(columns='y'), DF_train['y']
X_test, Y_test = DF_test.drop(columns='y'), DF_test['y']
eval_set = [(X_train, Y_train), (X_test, Y_test)]

# Train model
eval_metric='rmse'
error_flag = 0.03
xgb_params = dict(
    silent=True,
    eval_metric=eval_metric,
    n_estimators=150,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.5,
    max_depth=10,
    reg_lambda=0.5,
    reg_alpha=0.25,
    min_child_weight=1.5,
    gamma=0,
)
model = xgboost.XGBRegressor(**xgb_params)

RMSE, correct_predictions_rate = FitModel(model, eval_set, X_train, Y_train, eval_metric, error_flag, verbose = True)
print("RMSE: "+str(RMSE))
print("Correct predictions rate: "+str(correct_predictions_rate))

# Gain-based importance of every variable the booster actually used
var_dict = model.get_booster().get_score(importance_type= 'gain')
importance_ranking = sorted(var_dict.items(), key = lambda x : x[1], reverse=True)

# Keep the variables whose gain is at or above the P-th percentile
P = 70 #99 is the top 1%
ValueP = np.percentile(list(var_dict.values()),P)
top_var = sorted(variable for variable, score in var_dict.items() if score >= ValueP)
print(top_var)
print(ValueP)

In [None]:
### Calibrate hyperparameters for XGB

# Work on a 25% sample of the selected variables to keep the search fast.
# The sample is seeded (like the train/test split below) so that re-running
# the cell calibrates on the same rows.
DF = dataset[selected]
DF = DF.sample(frac=0.25, random_state=42)
#Transform categorical variables to dummies
DF = CategoricalToDummies(DF, cat_selected)
#split train set and test set
TestSize = 0.2
DF_train, DF_test = train_test_split(DF, test_size=TestSize, random_state=42)
# remove outliers from trainset
DF_train = RemoveZ_outliers(DF_train, proportion_to_remove = 0.01)
#split predictors and target
X_train = DF_train.loc[:, DF_train.columns != 'y']
Y_train = DF_train['y']
X_test = DF_test.loc[:, DF_test.columns != 'y']
Y_test = DF_test['y']
eval_set = [(X_train, Y_train), (X_test, Y_test)]

# Error flag defined by evaluator
error_flag = 0.03

# initialization
n_estimators=100
eval_metric='rmse'
# not consumed by XGBoostCalibration, which fixes its own min_child_weight
min_child_weight=1.5
# Candidate grids. The learning rates advance in steps of 0.025; the former
# entries 1.25 and 1.5 broke that progression and exceeded 1, so they are
# corrected to 0.125 and 0.15.
LR = [0.05,0.075,0.1,0.125,0.15]
SS = [0.3,0.4,0.5,0.6,0.7,0.8]
CSBT = [0.3,0.4,0.5,0.6,0.7,0.8]
MD = [4,5,6,7,8,10]
L = [0,0.25,0.50,0.75,1]
A = [0,0.25,0.50,0.75,1]
G = [0,1,2,5]

learning_rate, subsample, colsample_bytree, max_depth, reg_lambda, reg_alpha, gamma = XGBoostCalibration(n_estimators,eval_metric,eval_set,LR,SS,CSBT,MD,L,A,G)
print('learning_rate: '+str(learning_rate)+' subsample: '+str(subsample)+' colsample_bytree: '+str(colsample_bytree)+' max_depth: '+str(max_depth)+' reg_lambda: '+str(reg_lambda)+' reg_alpha: '+str(reg_alpha)+' gamma: '+str(gamma))

In [None]:
### Data formatting
# Produces DF_train / DF_test, the notebook-level frames that the cells below
# (final XGB/LGB training and LGB calibration) read.

#select initial variables
DF = dataset[selected]
#Transform categorical variables to dummies (disabled: the selection is used as-is)
#DF = CategoricalToDummies(DF, cat_selected)
#split train set and test set: 5% held out, fixed seed for a reproducible split
TestSize = 0.05
DF_train, DF_test = train_test_split(DF, test_size=TestSize, random_state=42)
# scale to minmax (disabled)
#DF_train = Scaler(ttype='minmax', pd_data=DF_train, columns=DF_train.columns)
#DF_test = Scaler(ttype='minmax', pd_data=DF_test, columns=DF_test.columns)
# remove outliers from trainset only; DF_test is left as split
DF_train = RemoveZ_outliers(DF_train, proportion_to_remove = 0.03)

In [None]:
### Train final model with XGB
# Trains on DF_train / DF_test as left by the data-formatting cell.
# The earlier experiments with cluster-based imputation (kmImputer) and
# min-max scaling (ScaleMinMax) are not applied here.

# features = every column except 'y'; target = 'y'
X_train, Y_train = DF_train.loc[:, DF_train.columns != 'y'], DF_train['y']
X_test, Y_test = DF_test.loc[:, DF_test.columns != 'y'], DF_test['y']
eval_set = [(X_train, Y_train), (X_test, Y_test)]

# Error flag defined by evaluator
error_flag = 0.03

# hyperparameters (trailing comments record previously tried values)
eval_metric = 'rmse'
n_estimators = 500
learning_rate = 0.1 #0.1
max_depth = 8 #10
min_child_weight = 1.5
subsample = 0.6 #0.8
colsample_bytree = 0.5
reg_lambda = 0.5
reg_alpha = 0.25
gamma = 1 #0

### Train model
model = xgboost.XGBRegressor(
    silent=True,
    eval_metric=eval_metric,
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    max_depth=max_depth,
    min_child_weight=min_child_weight,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    reg_lambda=reg_lambda,
    reg_alpha=reg_alpha,
    gamma=gamma,
)

# FitModel returns two values, bound here to the error and the hit rate
RMSE, correct_predictions_rate = FitModel(
    model, eval_set, X_train, Y_train, eval_metric, error_flag, verbose=True
)
print(f"{eval_metric}: {RMSE!s}")
print(f"Correct predictions rate: {correct_predictions_rate!s}")

In [None]:
### Calibrate hyperparameters for LGB
# Hands candidate value lists to LGBoostCalibration, which returns one chosen
# value per hyperparameter. Uses DF_train / DF_test from the data-formatting
# cell, so that cell must have been run first.

#separate the features (every column except 'y') from the target 'y'
X_train = DF_train.loc[:, DF_train.columns != 'y']
Y_train = DF_train['y']
X_test = DF_test.loc[:, DF_test.columns != 'y']
Y_test = DF_test['y']
eval_set = [(X_train, Y_train), (X_test, Y_test)]

# Error flag defined by evaluator
error_flag = 0.03

n_estimators=1000 #100
eval_metric='rmse'

BT= ['gbdt'] #boosting_type
# The last two learning rates were 1.25 and 1.5, which break the 0.025 step of
# the grid and are far above any rate used elsewhere in this notebook; they
# are corrected to 0.125 and 0.15.
LR = [0.05,0.075,0.1,0.125,0.15] #learning_rate
SS = [0.3,0.4,0.5,0.6,0.7,0.8] #subsample
MCW = [0.001, 0.01, 0.1, 1, 2, 3, 5] #min_child_weight
MCS = [3,5,8,10,15,20] #min_child_samples
CSBT = [0.5,0.6,0.7,0.8,1] #colsample_bytree
# NOTE(review): L and A are passed in the same positions as in the XGB
# calibration, where they stand for lambda and alpha. Presumably the same here;
# confirm against LGBoostCalibration. Both grids are identical, so the
# labelling does not affect the result.
L = [0,0.25,0.50,0.75,1] #reg_lambda (presumably)
A = [0,0.25,0.50,0.75,1] #reg_alpha (presumably)

boosting_type, learning_rate, subsample, colsample_bytree, min_child_weight, min_child_samples, reg_lambda, reg_alpha = LGBoostCalibration(n_estimators,eval_metric,eval_set,BT,LR,SS,CSBT,MCW,MCS,L,A)
print('boosting_type: '+str(boosting_type)+
      ' learning_rate: '+str(learning_rate)+
      ' subsample: '+str(subsample)+
      ' colsample_bytree: '+str(colsample_bytree)+
      ' min_child_weight: '+str(min_child_weight)+
      ' min_child_samples: '+str(min_child_samples)+
      ' reg_lambda: '+str(reg_lambda)+
      ' reg_alpha: '+str(reg_alpha))

In [None]:
### Train final model with LGB
# Trains a LightGBM regressor on DF_train / DF_test from the data-formatting
# cell and reports the two values returned by FitModel.

#separate the features (every column except 'y') from the target 'y'
X_train = DF_train.loc[:, DF_train.columns != 'y']
Y_train = DF_train['y']
X_test = DF_test.loc[:, DF_test.columns != 'y']
Y_test = DF_test['y']
eval_set = [(X_train, Y_train), (X_test, Y_test)]
# Error flag defined by evaluator
error_flag = 0.03
# min/max of the target over the whole dataset.
# NOTE(review): this is computed but FitModel below is called with
# scale = None, so it currently has no effect - confirm that is intended.
scale = ['minmax',[dataset['y'].min(),dataset['y'].max()]]
# initialization
n_estimators=5000
eval_metric='rmse'
boosting_type='gbdt'
learning_rate=0.08
subsample=0.5
colsample_bytree=0.5
min_child_weight=0.001
min_child_samples=8
reg_alpha=0.5
reg_lambda=0.5

### Train model
# reg_alpha used to be fed from reg_lambda, so the reg_alpha setting above was
# silently ignored; each regulariser now receives its own variable.
model = lightgbm.LGBMRegressor(silent=True,
                               n_estimators=n_estimators,
                               boosting_type=boosting_type,
                               learning_rate=learning_rate,
                               subsample=subsample,
                               colsample_bytree=colsample_bytree,
                               min_child_weight=min_child_weight,
                               min_child_samples=min_child_samples,
                               reg_lambda=reg_lambda,
                               reg_alpha=reg_alpha)

RMSE, correct_predictions_rate = FitModel(model, eval_set, X_train, Y_train, eval_metric, error_flag, scale = None, verbose = True)
print(eval_metric+": "+str(RMSE))
print("Correct predictions rate: "+str(correct_predictions_rate))

In [None]:
# Predict on the held-out features; `model` is whichever model was fitted last
Y_pred = PredictValues(model, X_test)

# Test features, true target and prediction side by side, written as a
# semicolon-separated CSV with decimal commas and no index column
output = pd.concat([X_test, Y_test], axis=1).assign(Prediction=Y_pred)
output.to_csv(
    "/gdrive/My Drive/My Developments/ReggresionProblem/output_lgb.csv",
    sep=";",
    decimal=',',
    header=True,
    index=False,
)

In [None]:
# save the model to disk (pickle is already imported in the notebook's import cell)
filename = '/gdrive/My Drive/My Developments/ReggresionProblem/model_lgb_rmse_26.09_62.1_cat_set1.sav'
# the context manager closes the file even if pickling fails, so the handle
# is not leaked and the bytes are flushed to Drive
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)