In [2]:
import matplotlib.pyplot as plt 
plt.style.use("ggplot") 
import numpy as np 
import pandas as pd  
import seaborn as sns 
import xgboost as xgb 
from scipy.stats import spearmanr 
from sklearn.impute import SimpleImputer 
from sklearn.model_selection import KFold  
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet 
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [3]:
X = pd.read_csv("data/X_train_NHkHMNU.csv")
y = pd.read_csv("data/y_train_ZAN5mwg.csv")
data = pd.merge(X, y, on="ID")

In [14]:
def preprocess(X, imputer_func, scaler_func=None):
    X = pd.get_dummies(X)
    X.loc[:, X.isna().any()] = imputer_func(X.loc[:, X.isna().any()])
    if scaler_func is not None:
        X = scaler_func(X)
    return X

def train(model, data, n_splits, imputer=SimpleImputer(), scaler=StandardScaler()):
    X, y = data.drop(["TARGET", "ID", "DAY_ID"], axis=1), data["TARGET"]
    kf = KFold(n_splits=n_splits)
    results = []
    for train_idx, test_idx in tqdm(kf.split(data)):
        X_train, y_train = X.copy().loc[train_idx], y[train_idx]
        X_test, y_test = X.copy().loc[test_idx], y[test_idx]
        X_train = preprocess(X_train, imputer.fit_transform, scaler.fit_transform)
        X_test = preprocess(X_test, imputer.transform, scaler.transform)
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        rank = evaluate(preds, y_test)
        results.append(rank)
    return results

def evaluate(preds, y_test):
    rank = spearmanr(preds, y_test.values).correlation
    return rank

In [6]:
help(xgb.XGBRegressor)

Help on class XGBRegressor in module xgboost.sklearn:

class XGBRegressor(XGBModel, sklearn.base.RegressorMixin)
 |  XGBRegressor(*, objective: Union[str, Callable[[numpy.ndarray, numpy.ndarray], Tuple[numpy.ndarray, numpy.ndarray]], NoneType] = 'reg:squarederror', **kwargs: Any) -> None
 |  
 |  Implementation of the scikit-learn API for XGBoost regression.
 |  
 |  
 |  Parameters
 |  ----------
 |  
 |      n_estimators : int
 |          Number of gradient boosted trees.  Equivalent to number of boosting
 |          rounds.
 |  
 |      max_depth :  Optional[int]
 |          Maximum tree depth for base learners.
 |      max_leaves :
 |          Maximum number of leaves; 0 indicates no limit.
 |      max_bin :
 |          If using histogram-based algorithm, maximum number of bins per feature
 |      grow_policy :
 |          Tree growing policy. 0: favor splitting at nodes closest to the node, i.e. grow
 |          depth-wise. 1: favor splitting at nodes with highest loss change.
 | 

In [20]:
xgb_reg = xgb.XGBRegressor()
results = train(xgb_reg, data, 5)
sum(results) / len(results)

5it [00:08,  1.73s/it]


0.17202297044289652

In [8]:
xgb_reg.__dict__

{'n_estimators': 100,
 'objective': 'reg:squarederror',
 'max_depth': 4,
 'max_leaves': None,
 'max_bin': None,
 'grow_policy': None,
 'learning_rate': None,
 'verbosity': None,
 'booster': None,
 'tree_method': None,
 'gamma': None,
 'min_child_weight': None,
 'max_delta_step': None,
 'subsample': None,
 'sampling_method': None,
 'colsample_bytree': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'scale_pos_weight': None,
 'base_score': None,
 'missing': nan,
 'num_parallel_tree': None,
 'random_state': None,
 'n_jobs': None,
 'monotone_constraints': None,
 'interaction_constraints': None,
 'importance_type': None,
 'gpu_id': None,
 'validate_parameters': None,
 'predictor': None,
 'enable_categorical': False,
 'feature_types': None,
 'max_cat_to_onehot': None,
 'max_cat_threshold': None,
 'eval_metric': None,
 'early_stopping_rounds': None,
 'callbacks': None,
 '_Booster': <xgboost.core.Booster at 0x11c933250>}