In [1]:
import pandas as pd
from sklearn import model_selection

url = 'https://raw.githubusercontent.com/rabiibouhestine/Kaggle-cat-in-the-dat/master/train.csv'
df = pd.read_csv(url)

# fixed seed so that the shuffle below -- and therefore the fold assignment
# and every score computed from it -- is identical on each Restart & Run All
SEED = 42

# we create a new column called kfold and fill it with -1
# (-1 marks a row that has not been assigned to a fold yet)
df["kfold"] = -1

# the next step is to randomize the rows of the data;
# the index is reset so the positional indices returned by the splitter
# can be used directly as labels in df.loc below
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# fetch labels
y = df.target.values

# initiate the kfold class from model_selection module
# (stratified: the splitter is given the labels y when splitting)
kf = model_selection.StratifiedKFold(n_splits=5)

# fill the new kfold column: each row gets the number of the fold
# in which it is part of the validation split
for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
    df.loc[v_, 'kfold'] = f


In [2]:
import xgboost as xgb
from sklearn import metrics
from sklearn import preprocessing

def run_xgboost_label_encoded(fold, data):
    """Train and validate an XGBoost classifier on one cross-validation fold.

    Every feature column is turned into strings, label encoded, and an
    ``XGBClassifier`` is fitted on the rows whose ``kfold`` value differs
    from ``fold`` and scored on the rows whose ``kfold`` value equals it.

    Parameters
    ----------
    fold : int
        Value of the ``kfold`` column that selects the validation rows.
    data : pandas.DataFrame
        Frame holding the feature columns together with ``target`` and
        ``kfold`` columns (an ``id`` column, if present, is ignored).
        The frame is copied first, so the caller's object is not modified.

    Returns
    -------
    The ROC AUC of the validation fold, as returned by
    ``metrics.roc_auc_score``. The score is also printed.
    """
    df = data.copy()

    # all columns are features except id, target and kfold columns
    features = [f for f in df.columns if f not in ("id", "target", "kfold")]

    # fill all NaN values with NONE and convert every column to strings
    # (all features are treated as categories).
    # The fill has to happen BEFORE the cast: astype(str) turns NaN into the
    # string "nan", after which fillna has nothing left to fill.
    # The column is replaced via df[col] = ... rather than df.loc[:, col] = ...
    # so that it takes the dtype of the new values instead of being written
    # into the existing column.
    for col in features:
        df[col] = df[col].fillna("NONE").astype(str)

    # now it's time to label encode the features
    for col in features:
        # initialize LabelEncoder for each feature column
        lbl = preprocessing.LabelEncoder()
        # fit label encoder on all data (training and validation rows alike)
        lbl.fit(df[col])
        # replace the column with its integer codes
        df[col] = lbl.transform(df[col])

    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)

    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # get training data
    x_train = df_train[features].values
    # get validation data
    x_valid = df_valid[features].values

    # initialize xgboost model
    model = xgb.XGBClassifier(n_jobs=-1, max_depth=7, n_estimators=200)

    # fit model on training data
    model.fit(x_train, df_train.target.values)

    # predict on validation data
    # we need the probability values as we are calculating AUC
    # we will use the probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]

    # get roc auc score
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)

    # print auc
    print(f"Fold = {fold}, AUC = {auc}")

    # hand the score back as well, so callers can aggregate across folds
    return auc

In [3]:
# run one train/validate cycle per fold (folds 0 through 4);
# each call prints the AUC obtained on that fold
for fold in (0, 1, 2, 3, 4):
    run_xgboost_label_encoded(fold, df)

Fold = 0, AUC = 0.7651353896345039
Fold = 1, AUC = 0.7693134854960211
Fold = 2, AUC = 0.7663161657864546
Fold = 3, AUC = 0.7655048207000421
Fold = 4, AUC = 0.7680425346019139


In [None]:
#!pip install xgboost

Note that we modified the XGBoost parameters a bit:

* The default max_depth of the XGBoost scikit-learn wrapper was 3 in older releases (current releases default to 6), and we use 7.

* We increased the number of estimators (n_estimators) from the default of 100 to 200.