The goal of this notebook is to get started with XGBoost and apply it to our data.

## Library imports

In [1]:
# Copy the HMLasso source file from the mounted Drive folder into the working directory under the name HMLasso.py, so that `import HMLasso` resolves.
!cp "/content/drive/MyDrive/Statapp/file_04_HMLasso.py" "HMLasso.py"

In [2]:
# Copy the helper module manipulate_data.py from the mounted Drive folder into the working directory, so that `import manipulate_data` resolves.
!cp "/content/drive/MyDrive/Statapp/manipulate_data.py" "manipulate_data.py"

In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler # To standardize the data
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

import xgboost as xgb # eXtreme Gradient Boosting
import HMLasso as hml # Lasso with High Missing Rate
import manipulate_data as manip # Useful functions

## Data imports

In [4]:
# Folder (on the mounted Drive) that holds the project data files.
DATA_DIR = "/content/drive/MyDrive/Statapp"

# Table describing the type of each column; it is passed to manip.get_columns_types later on.
columns_types = pd.read_csv(f"{DATA_DIR}/data_03_columns_types.csv")

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns and the
# DtypeWarning that the chunked parser emits for them.
data = pd.read_csv(f"{DATA_DIR}/data_03.csv", low_memory=False)
# data = pd.read_csv(f"{DATA_DIR}/data_04.csv", low_memory=False)

  data = pd.read_csv("/content/drive/MyDrive/Statapp/data_03.csv")


In [5]:
# Print the shape, the dtype counts and the memory footprint of the raw table
# (memory_usage="deep" also measures the content of object columns).
data.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42232 entries, 0 to 42231
Columns: 4161 entries, HHIDPN to GHI14
dtypes: float64(4061), int64(99), object(1)
memory usage: 1.3 GB


## Trying XGBoost

This section is dedicated to the use of XGBoost as a regressor to predict the index.

### Using HMLasso

To speed up the calculations, we use the HMLasso to select a small subset of potentially useful variables. To do so, we first train the HMLasso on (X, y), where each row of X is one (HHIDPN, wave) observation and y is the corresponding GHIw.

In [6]:
# Build the wave-generic table used to fit the HMLasso.
# NOTE(review): per the notebook text, manip.drop_time should return one row per (HHIDPN, wave)
# with the target in the GHIw column, and keep_genetic=False presumably leaves the genetic_*
# columns out — confirm in manipulate_data.py.
untimed_data = manip.drop_time(data, keep_genetic=False)
# Print the shape, the dtype counts and the memory footprint of the resulting table.
untimed_data.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264618 entries, 0 to 264617
Columns: 192 entries, HHIDPN to GHIw
dtypes: float64(190), int64(2)
memory usage: 387.6 MB


In [7]:
# Predictors: every column except the HHIDPN identifier and the GHIw target.
X = (
  untimed_data
  .drop(columns=["HHIDPN", "GHIw"])
  .to_numpy()
)
# Target: the GHIw column.
y = untimed_data["GHIw"].to_numpy()

In [8]:
# Standardize the predictors; the fitted scaler is kept in scaler_X.
scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(X)

# The target is only centered (its scale is left untouched).
y_scaled = y - np.mean(y)

In [None]:
# Module-level switch of HMLasso.
# NOTE(review): "ignore" presumably silences the module's error reporting — confirm in HMLasso.py.
hml.ERRORS_HANDLING = "ignore"

# Settings handed to the estimator.
# NOTE(review): mu looks like the penalty strength — confirm in HMLasso.py.
hmlasso_options = {"mu": 100, "verbose": True}
lasso = hml.HMLasso(**hmlasso_options)
lasso.fit(X_scaled, y_scaled)

In [16]:
# Names of the predictors, obtained with the same column drop that produced X.
columns_for_lasso = untimed_data.drop(columns=["HHIDPN", "GHIw"]).columns

# True where the fitted coefficient is larger than 1e-9 in absolute value.
criteria = pd.Series(abs(lasso.beta_opt) > 1e-9)

# Positions flagged True, then the names found at those positions.
selected_positions = criteria.index[criteria]
columns_to_keep = pd.Series(columns_for_lasso)[selected_positions].tolist()

In [34]:
# Loading data
waves = list(range(1, 15))

# Turn every selected wave-generic name into its wave-specific names:
# each 'w' of the name is replaced by the wave number, variable by variable.
wave_specific_columns = [
  var.replace('w', str(wave))
  for var in columns_to_keep
  for wave in waves
]
# Every column of the raw table whose name contains 'genetic_'.
all_genetic_columns = [var for var in data.columns if 'genetic_' in var]
columns_to_keep_for_each_wave = wave_specific_columns + all_genetic_columns

# One GHI column per wave.
ghi_columns_all_waves = [f'GHI{wave}' for wave in waves]

# NOTE(review): manip.get_sample presumably restricts the table to a sample usable across the given waves — confirm in manipulate_data.py.
working_data = manip.get_sample(data, waves=waves)
working_data = working_data[['HHIDPN'] + columns_to_keep_for_each_wave + ghi_columns_all_waves]
working_data.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3396 entries, 0 to 3395
Columns: 1500 entries, HHIDPN to GHI14
dtypes: float64(1498), int64(1), object(1)
memory usage: 39.0 MB


In [35]:
# Formatting the database
variables_per_type = manip.get_columns_types(working_data, columns_types)

# Cast the columns listed under "Char", then those listed under "Categ", to the pandas category dtype.
for type_key in ("Char", "Categ"):
  typed_columns = variables_per_type[type_key]
  working_data[typed_columns] = working_data[typed_columns].astype('category')

working_data.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3396 entries, 0 to 3395
Columns: 1500 entries, HHIDPN to GHI14
dtypes: category(855), float64(644), int64(1)
memory usage: 19.7 MB


In [36]:
# One flag per column of working_data, in column order:
# 'c' for categorical columns, 'q' for quantitative ones.
FeatureTypes = ['c' if col == "category" else 'q' for col in working_data.dtypes]

### Training with different data

In [87]:
def train_model(working_data, data_to_use="all", simulation="short", params_grid=None, random_state=None, verbose=False):
  """
  Train an XGBoost regressor that predicts GHI14 and tune it with a cross-validated grid search.

  inputs:
  - working_data: the database (pandas DataFrame) on which the estimator will be trained and tested.
    It must contain the columns "genetic_VERSION", "genetic_Section_A_or_E", "HHIDPN" and "GHI14",
    plus "GHI1" ... "GHI13" for the subsets that exclude the previous GHI.
  - data_to_use:
     > 'all' = socioeconomic data, genetic data, previous GHI are used for prediction
     > 'socio' = only socioeconomic data are used
     > 'sociogenetic' = only socioeconomic and genetic data are used
     > 'socioghi' = only socioeconomic data and previous GHI are used
  - simulation:
     > 'short' = only a few hyperparameters will be tested. Does not take more than 10 minutes.
     > 'long' = a lot of hyperparameters will be tested. Can take up to 3h.
  - params_grid = the parameters to cross validate. If this option is specified, simulation is ignored.
  - random_state: seed given to train_test_split (80% train / 20% test split).
  - verbose: if True, prints the data used, the best hyperparameters and the R2 scores.
    The value is also forwarded to GridSearchCV.

  returns:
  a dict with the keys
  - "data": the value of data_to_use
  - "best_parameters": the best hyperparameters, as a list of (name, value) pairs
  - "r2_train": R2 of the refitted best model on the training set
  - "r2_test": R2 of the refitted best model on the testing set

  raises:
  - ValueError if data_to_use is not one of the four values above, or if params_grid is None
    and simulation is neither 'short' nor 'long'.
  """

  message = {'all' : "Socioeconomic data, genetic data, precedent GHI will be used for prediction.",
             'socio' : "Only socioeconomic data will be used for prediction.",
             'sociogenetic' : "Only socioeconomic data and genetic data will be used for prediction.",
             'socioghi' : "Only socioeconomic data and precedent GHI will be used for prediction."}

  # Validate the options first, so that a typo fails immediately with a clear
  # message instead of an UnboundLocalError / an obscure GridSearchCV error.
  if data_to_use not in message:
    raise ValueError(f"Unknown data_to_use {data_to_use!r}; expected one of {sorted(message)}.")

  if params_grid is None:
    default_grids = {
      'long' : {"eta" : [0.1, 0.05, 0.03, 0.01], # learning rate
                "lambda" : [1, 0.5, 2], # coefficient for L2 penalization
                "alpha" : [0, 0.5, 1], # coefficient for L1 penalization
                "max_depth" : [3, 4, 5], # max depth of trees
                "n_estimators" : [100, 200] # number of trees
                },
      'short' : {"eta" : [0.05, 0.03], # learning rate
                 "lambda" : [1, 0.5], # coefficient for L2 penalization
                 "alpha" : [0.5, 1], # coefficient for L1 penalization
                 "max_depth" : [3, 4], # max depth of trees
                 "n_estimators" : [100] # number of trees
                 }
    }
    if simulation not in default_grids:
      raise ValueError(f"Unknown simulation {simulation!r}; expected one of {sorted(default_grids)}.")
    params_grid = default_grids[simulation]

  # Creating (X, y)
  # Columns that are never used as predictors (two genetic metadata columns, the identifier, the target).
  basic_columns = ["genetic_VERSION", "genetic_Section_A_or_E", "HHIDPN", "GHI14"]
  genetic_columns = [col for col in working_data.columns if 'genetic_' in col and col != 'genetic_VERSION' and col != 'genetic_Section_A_or_E']
  # GHI of the waves before the target one (GHI1 ... GHI13).
  GHI_columns = [f'GHI{wave}' for wave in range(1, 14)]

  columns_to_delete_per_subset = {'all' : basic_columns,
                                  'socio' : basic_columns + genetic_columns + GHI_columns,
                                  'sociogenetic' : basic_columns + GHI_columns,
                                  'socioghi' : basic_columns + genetic_columns}
  columns_to_delete = columns_to_delete_per_subset[data_to_use]

  if verbose:
    print(message[data_to_use])

  X = working_data.drop(columns = columns_to_delete)
  y = working_data["GHI14"]

  # Splitting into Training and Testing sets
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

  # Performing cross-validation to train and fine-tune the model.
  # No explicit feature-type list is passed: with enable_categorical=True the
  # categorical features are recognised from the 'category' dtype of the
  # DataFrame columns. The former `FeatureTypes=` keyword is not an
  # XGBRegressor parameter (the real one is `feature_types`), and the global
  # list described all columns of working_data rather than the columns of X.
  # NOTE(review): tree_method='gpu_hist' is deprecated from XGBoost 2.0 on
  # (device='cuda' with tree_method='hist') — adapt when upgrading.
  model = xgb.XGBRegressor(tree_method='gpu_hist', enable_categorical=True)

  grid = GridSearchCV(model, params_grid, refit = True, verbose = verbose, n_jobs=-1, scoring="r2")
  grid.fit(X_train, y_train)

  # Score the refitted best model once per split and reuse the values below.
  r2_train = grid.score(X_train, y_train)
  r2_test = grid.score(X_test, y_test)

  if verbose:
    print("Model refitted with best hyperparameters.")
    print("Best parameters : " + str(grid.best_params_))
    print("R2 score on train : ", str(r2_train))
    print("R2 score on test : ", str(r2_test))

  # Storing results
  final_results = {}
  final_results["data"] = data_to_use
  final_results["best_parameters"] = list(grid.best_params_.items())
  final_results["r2_train"] = r2_train
  final_results["r2_test"] = r2_test

  return final_results

In [85]:
# # QUICK SIMULATION (2 combinations: only the learning rate varies)
# params_grid = {"eta" : [0.05, 0.03], # learning rate
#               "lambda" : [1], # coefficient for L2 penalization
#               "alpha" : [0.5], # coefficient for L1 penalization
#               "max_depth" : [3], # max depth of trees
#               "n_estimators" : [100] # number of trees
#               }

# Grid used by the simulation loop below: 2 x 2 x 2 x 2 x 1 = 16 combinations.
# NOTE: this cell used to be labelled "LONG SIMULATION", but these values are
# identical to the 'short' preset defined inside train_model.
params_grid = {"eta" : [0.05, 0.03], # learning rate
               "lambda" : [1, 0.5], # coefficient for L2 penalization
               "alpha" : [0.5, 1], # coefficient for L1 penalization
               "max_depth" : [3, 4], # max depth of trees
               "n_estimators" : [100] # number of trees
              }

In [None]:
# Repeat the experiment for several train/test splits (one per random_state)
# and for each of the four feature subsets, keeping the test score of every run.
number_of_simulations = 100
feature_subsets = ['all', 'socio', 'sociogenetic', 'socioghi']
result_columns = ["Random_state", "Data_used", "best_parameters", "r2_test"]

simulation_records = []
for random_state in range(number_of_simulations):
  for data_to_use in feature_subsets:
    result = train_model(working_data, data_to_use=data_to_use, params_grid=params_grid, random_state=random_state)
    simulation_records.append({
      "Random_state" : random_state,
      "Data_used" : data_to_use,
      "best_parameters" : result["best_parameters"],
      "r2_test" : result["r2_test"],
    })

# One row per run, best test score first.
results = pd.DataFrame(simulation_records, columns=result_columns).sort_values("r2_test", ascending=False)

In [97]:
# Save the simulation table to a CSV file in the working directory, without the row index.
results.to_csv("XGBoost_simulation.csv", index=False)

In [101]:
# Mean, standard deviation and number of runs of the test R2 for each feature subset.
# The statistics are given as a list (not a set) so that the output columns
# always come out in the same order.
results.groupby(["Data_used"])["r2_test"].agg(["mean", "std", "count"])

Unnamed: 0_level_0,mean,count,std
Data_used,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
all,0.36652,5,0.036591
socio,0.207468,5,0.023647
sociogenetic,0.20557,5,0.023094
socioghi,0.364475,5,0.036558
