In [55]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV

# Load the data
train = pd.read_csv('wineq_train.csv')
val = pd.read_csv('../wineq_validation.csv')


# split train into x, y
x_train = train.drop(columns=['quality'])
y_train = train['quality']




# feature engineering
def fe(df):
    df['mso2'] = df['free sulfur dioxide']/(1+ 10**(df['pH'] -1.81))

    df['acidity_ratio'] = df['fixed acidity'] / df['volatile acidity']
    df['total_acid'] = df['fixed acidity'] + df['volatile acidity'] + df['citric acid']
    df['mean_acid'] = df[['fixed acidity','volatile acidity','citric acid']].mean(axis=1)
    df['std_acid'] =  df[['fixed acidity','volatile acidity','citric acid']].std(axis=1)

    df['free_sulfur/total_sulfur'] = df['free sulfur dioxide'] / df['total sulfur dioxide']
    df['sugar/alcohol'] = df['residual sugar'] / df['alcohol']

   # df['sugar/citric'] = df['residual sugar'] / df['citric acid']+1e-4

    df['BSO2'] = df['total sulfur dioxide'] - df['free sulfur dioxide']
    df['FSO2/alcohol'] = df['free sulfur dioxide'] / df['alcohol']
    df['TSO2/alcohol'] = df['total sulfur dioxide'] / df['alcohol']
    df['BSO2/alcohol'] = df['BSO2'] / df['alcohol']

    df['chlorides/TSO2'] = df['chlorides'] / df['total sulfur dioxide']
    df['sulphates/pH'] = df['sulphates'] / df['pH']

    df['alcohol/density'] = df['alcohol'] / df['density']
    df['alcohol_density'] = df['alcohol']  * df['density']
    df['sulphates/chlorides'] = df['sulphates'] / df['chlorides']
    df['alcohol/pH'] = df['alcohol'] / df['pH']
    df['alcohol/acidity'] = df['alcohol'] / df['total_acid']
    df['alkalinity'] = df['pH'] + df['alcohol']
    df['mineral'] = df['chlorides'] + df['sulphates'] + df['residual sugar']
    df['density/pH'] = df['density'] / df['pH']
    df['total_alcohol'] = df['alcohol'] + df['residual sugar']

    df['acid/density'] = df['total_acid']  / df['density']
    df['sulphate/density'] = df['sulphates']  / df['density']
    df['sulphates/acid'] = df['sulphates'] / df['volatile acidity']
    df['sulphates*alcohol'] = df['sulphates'] * df['alcohol']
    return df

x_train = fe(x_train)
x_val = fe(val)
columns = x_train.columns
# Standardize the data
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_val = scaler.transform(x_val)

x_train = pd.DataFrame(x_train, columns=columns)
x_val = pd.DataFrame(x_val, columns=columns)

# Dmatrix
trainx = xgb.DMatrix(data=x_train,label=y_train)
valx = xgb.DMatrix(data=x_val)




In [35]:
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import cohen_kappa_score
from sklearn.feature_selection import SelectFromModel
# Feature selection using SelectFromModel
# Expanded parameter optimization using RandomizedSearchCV
def optimize_parameters(X, y):
    param_grid = {
        'max_depth': [3, 5, 7, 9, 11, 15, 19],  # broader range
        'eta': [0.01, 0.03, 0.05, 0.07, 0.1],  # broader range
        'subsample': [0.4, 0.5, 0.6, 0.7, 0.8],  # broader range
        'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9],  # broader range
        'min_child_weight': [1, 5, 10, 15, 20, 25],  # broader range
        'gamma': [1e-8, 1e-7, 1e-6, 1e-5, 1e-4],  # broader range
        'lambda': [1e-7, 1e-6, 1e-5, 1e-4, 1e-3],  # broader range
        'alpha': [1e-4, 1e-3, 1e-2, 0.1, 1],  # broader range
        'max_delta_step': [0, 5, 10, 20, 40, 44, 50]  # broader range
    }
    
    model = xgb.XGBRegressor(tree_method='hist', objective='reg:squarederror')
    grid_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=1000, scoring='neg_mean_squared_error', cv=3, verbose=1, random_state=42, n_jobs=-1)
    grid_search.fit(X, y)
    return grid_search.best_params_

best_params = optimize_parameters(x_train, y_train)
print("Best parameters found: ", best_params)

# Feature selection using LOFO with optimized parameters
def lofo_feature_selection(X, y, params):
    features = X.columns
    baseline_model = xgb.XGBRegressor(**params)
    baseline_score = cross_val_score(baseline_model, X, y, cv=5, scoring='neg_mean_squared_error').mean()
    feature_scores = {}
    
    for feature in features:
        X_new = X.drop(columns=[feature])
        score = cross_val_score(baseline_model, X_new, y, cv=5, scoring='neg_mean_squared_error').mean()
        feature_scores[feature] = baseline_score - score
    
    selected_features = [feature for feature in features if feature_scores[feature] > 0]
    return selected_features

selected_features = lofo_feature_selection(x_train, y_train, best_params)
x_train_selected = x_train[selected_features]
x_val_selected = x_val[selected_features]

best_params = optimize_parameters(x_train_selected, y_train)
print("Best parameters found: ", best_params)


Fitting 3 folds for each of 1000 candidates, totalling 3000 fits


Best parameters found:  {'subsample': 0.5, 'min_child_weight': 10, 'max_depth': 5, 'max_delta_step': 0, 'lambda': 1e-06, 'gamma': 0.0001, 'eta': 0.03, 'colsample_bytree': 0.5, 'alpha': 0.0001}
Fitting 3 folds for each of 1000 candidates, totalling 3000 fits
Best parameters found:  {'subsample': 0.4, 'min_child_weight': 10, 'max_depth': 3, 'max_delta_step': 5, 'lambda': 0.001, 'gamma': 1e-06, 'eta': 0.07, 'colsample_bytree': 0.8, 'alpha': 0.1}


In [39]:
def cross_valid(model, train, target, test, params, num_folds=5, random_state=42):
    train_oof = np.zeros((len(train)))
    test_preds = 0

    kf = StratifiedKFold(n_splits=num_folds, random_state=random_state, shuffle=True)
    scores = []

    num_rounds = 1000
    columns = train.columns
    
    for f, (train_ind, val_ind) in enumerate(kf.split(train, target)):
        train_df, val_df = train.iloc[train_ind], train.iloc[val_ind]
        train_target, val_target = target.iloc[train_ind], target.iloc[val_ind]

        trn_data = xgb.DMatrix(train_df, label=train_target)
        val_data = xgb.DMatrix(val_df, label=val_target)

        model = xgb.train(params, 
                          trn_data,
                          num_rounds,
                          evals=[(val_data, "val_data")], 
                          verbose_eval=False, 
                          early_stopping_rounds=100
                          )

        xgb_valid_preds = model.predict(val_data)
        
        train_oof[val_ind] = xgb_valid_preds
    
        test_oof_preds = model.predict(xgb.DMatrix(test[columns]))
        
        test_preds += test_oof_preds / num_folds
        
        scores.append(cohen_kappa_score(val_target, xgb_valid_preds.round(), weights='quadratic'))
        
        print("Fold ", f, " ", cohen_kappa_score(val_target, xgb_valid_preds.round(), weights="quadratic"))
                                
    print("Mean Kappa Score: ", np.mean(scores))
    print("Kappa Score OOF: ", cohen_kappa_score(target, train_oof.round(), weights='quadratic'))

    return train_oof, test_preds, np.mean(scores)

# Prepare the test data (x_val in this case)
test_data = x_val_selected

# Call the cross-validation function with optimized parameters
train_oof, test_preds, mean_score = cross_valid(None, x_train_selected, y_train, test_data, best_params)

# Output results
print("Out-of-Fold Predictions: ", train_oof)
print("Test Predictions: ", test_preds)
print("Mean Kappa Score: ", mean_score)



Fold  0   0.5943950846483668
Fold  1   0.5331020809906648
Fold  2   0.6086648716731253
Fold  3   0.5246774214624529
Fold  4   0.5704761960116477
Mean Kappa Score:  0.5662631309572514
Kappa Score OOF:  0.5661364843007817
Out-of-Fold Predictions:  [5.25515938 5.0920186  5.84769917 ... 5.32508421 5.30393505 6.63248968]
Test Predictions:  [5.482795  4.983465  4.879381  ... 5.942912  6.0797014 5.7331095]
Mean Kappa Score:  0.5662631309572514


In [38]:
#test_preds to txt
np.savetxt('test_preds-xvbg5.txt', test_preds, fmt='%f')

In [56]:
import optuna
import warnings
warnings.filterwarnings('ignore')
# Hyperparameter tuning with Optuna
def optimize_parameters(X, y, n_trials=50):
    def objective(trial):
        params = {
            'max_depth': trial.suggest_int('max_depth', 3, 15),
            'eta': trial.suggest_loguniform('eta', 0.005, 0.1),
            'subsample': trial.suggest_uniform('subsample', 0.4, 1.0),
            'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.4, 1.0),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 30),
            'gamma': trial.suggest_loguniform('gamma', 1e-8, 1e-3),
            'lambda': trial.suggest_loguniform('lambda', 1e-8, 1e-2),
            'alpha': trial.suggest_loguniform('alpha', 1e-8, 1e-2),
            'max_delta_step': trial.suggest_int('max_delta_step', 0, 50),
            'objective': 'reg:squarederror',
            'tree_method': 'hist'
        }

        model = xgb.XGBRegressor(**params)
        score = cross_val_score(model, X, y, cv=3, scoring='neg_mean_squared_error').mean()
        return -score

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)
    return study.best_params

# Initial hyperparameter tuning
initial_params = optimize_parameters(x_train, y_train, n_trials=50)
print("Initial best parameters found: ", initial_params)

# Feature selection using LOFO with optimized parameters
def lofo_feature_selection(X, y, params):
    features = X.columns
    baseline_model = xgb.XGBRegressor(**params)
    baseline_score = cross_val_score(baseline_model, X, y, cv=5, scoring='neg_mean_squared_error').mean()
    feature_scores = {}
    
    for feature in features:
        X_new = X.drop(columns=[feature])
        score = cross_val_score(baseline_model, X_new, y, cv=5, scoring='neg_mean_squared_error').mean()
        feature_scores[feature] = baseline_score - score
    
    selected_features = [feature for feature in features if feature_scores[feature] > 0]
    return selected_features

selected_features = lofo_feature_selection(x_train, y_train, initial_params)
x_train_selected = x_train[selected_features]
x_val_selected = x_val[selected_features]

# Refined hyperparameter tuning with Optuna
refined_params = optimize_parameters(x_train_selected, y_train, n_trials=50)
print("Refined best parameters found: ", refined_params)

[I 2024-06-05 03:39:43,516] A new study created in memory with name: no-name-0a7ca7a8-fc37-475e-bbd7-c479ff277ec5


[I 2024-06-05 03:39:45,625] Trial 0 finished with value: 0.5658870399380255 and parameters: {'max_depth': 12, 'eta': 0.05540505625297317, 'subsample': 0.8520849927247719, 'colsample_bytree': 0.48933693093387376, 'min_child_weight': 4, 'gamma': 2.9248307424203113e-08, 'lambda': 3.9840458363852824e-07, 'alpha': 5.114239340680318e-07, 'max_delta_step': 6}. Best is trial 0 with value: 0.5658870399380255.
[I 2024-06-05 03:39:46,457] Trial 1 finished with value: 0.5530962130876532 and parameters: {'max_depth': 14, 'eta': 0.015302173651670367, 'subsample': 0.9672989480319307, 'colsample_bytree': 0.5717637923171891, 'min_child_weight': 28, 'gamma': 0.0002423564041141227, 'lambda': 1.9718564259842665e-08, 'alpha': 0.007431788221125561, 'max_delta_step': 10}. Best is trial 1 with value: 0.5530962130876532.
[I 2024-06-05 03:39:47,044] Trial 2 finished with value: 0.5497153820236258 and parameters: {'max_depth': 10, 'eta': 0.015938422993670216, 'subsample': 0.5242757535646948, 'colsample_bytree': 

Initial best parameters found:  {'max_depth': 6, 'eta': 0.028546820830845254, 'subsample': 0.449151194937244, 'colsample_bytree': 0.8408489863012947, 'min_child_weight': 15, 'gamma': 2.8676803041924297e-05, 'lambda': 0.000955406293127171, 'alpha': 0.00016533121641312633, 'max_delta_step': 9}


[I 2024-06-05 03:40:37,978] A new study created in memory with name: no-name-fb208836-8dc5-4c03-947f-a6734cb2a17d
[I 2024-06-05 03:40:38,252] Trial 0 finished with value: 0.5374617634771842 and parameters: {'max_depth': 4, 'eta': 0.04550539191204351, 'subsample': 0.7989322801688503, 'colsample_bytree': 0.9182535750678096, 'min_child_weight': 3, 'gamma': 1.2167409478901456e-07, 'lambda': 2.8634140679954307e-07, 'alpha': 1.9381627361667508e-05, 'max_delta_step': 21}. Best is trial 0 with value: 0.5374617634771842.
[I 2024-06-05 03:40:38,714] Trial 1 finished with value: 0.5703135337080407 and parameters: {'max_depth': 9, 'eta': 0.0812006073332263, 'subsample': 0.8645625291140631, 'colsample_bytree': 0.5869347503025408, 'min_child_weight': 23, 'gamma': 1.2997700618367427e-07, 'lambda': 2.7489640145856335e-06, 'alpha': 0.0002598485564737093, 'max_delta_step': 27}. Best is trial 0 with value: 0.5374617634771842.
[I 2024-06-05 03:40:39,700] Trial 2 finished with value: 0.5558553671474885 and

Refined best parameters found:  {'max_depth': 6, 'eta': 0.02549950389425285, 'subsample': 0.7623914272368713, 'colsample_bytree': 0.6363102427080771, 'min_child_weight': 6, 'gamma': 3.721909783006037e-07, 'lambda': 4.997519006656428e-06, 'alpha': 0.003825456168305369, 'max_delta_step': 27}


In [64]:
# Prepare the test data (x_val in this case)
test_data = x_val_selected
def cross_valid(train, target, test, params, num_folds=2, random_state=9):
    train_oof = np.zeros((len(train)))
    test_preds = 0

    kf = StratifiedKFold(n_splits=num_folds, random_state=random_state, shuffle=True)
    scores = []

    num_rounds = 2000
    columns = train.columns
    
    for f, (train_ind, val_ind) in enumerate(kf.split(train, target)):
        train_df, val_df = train.iloc[train_ind], train.iloc[val_ind]
        train_target, val_target = target.iloc[train_ind], target.iloc[val_ind]

        trn_data = xgb.DMatrix(train_df, label=train_target)
        val_data = xgb.DMatrix(val_df, label=val_target)

        model = xgb.train(params, 
                          trn_data,
                          num_rounds,
                          evals=[(val_data, "val_data")], 
                          verbose_eval=False, 
                          early_stopping_rounds=50
                          )

        xgb_valid_preds = model.predict(val_data)
        
        train_oof[val_ind] = xgb_valid_preds
    
        test_oof_preds = model.predict(xgb.DMatrix(test[columns]))
        
        test_preds += test_oof_preds / num_folds
        
        scores.append(cohen_kappa_score(val_target, xgb_valid_preds.round(), weights='quadratic'))
        
        print("Fold ", f, " ", cohen_kappa_score(val_target, xgb_valid_preds.round(), weights="quadratic"))
                                
    print("Mean Kappa Score: ", np.mean(scores))
    print("Kappa Score OOF: ", cohen_kappa_score(target, train_oof.round(), weights='quadratic'))

    return train_oof, test_preds, np.mean(scores)
# Call the cross-validation function with refined parameters and selected features
train_oof, test_preds, mean_score = cross_valid(x_train_selected, y_train, test_data, initial_params)

# Output results
print("Out-of-Fold Predictions: ", train_oof)
print("Test Predictions: ", test_preds)
print("Mean Kappa Score: ", mean_score)

Fold  0   0.5841133772139612
Fold  1   0.554685684260047
Mean Kappa Score:  0.5693995307370041
Kappa Score OOF:  0.5694230603626815
Out-of-Fold Predictions:  [5.4837904  5.08870697 5.92732477 ... 5.36398411 5.33497381 6.64464235]
Test Predictions:  [5.36245   5.3525195 5.0218    ... 5.9182253 6.104315  5.461135 ]
Mean Kappa Score:  0.5693995307370041


In [61]:
selected_features

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'acidity_ratio',
 'total_acid',
 'mean_acid',
 'std_acid',
 'free_sulfur/total_sulfur',
 'sugar/alcohol',
 'BSO2',
 'FSO2/alcohol',
 'TSO2/alcohol',
 'BSO2/alcohol',
 'chlorides/TSO2',
 'sulphates/pH',
 'alcohol/density',
 'alcohol_density',
 'alcohol/pH',
 'alcohol/acidity',
 'alkalinity',
 'mineral',
 'density/pH',
 'total_alcohol',
 'acid/density',
 'sulphate/density',
 'sulphates*alcohol']

In [65]:
np.savetxt('test_preds-xvbg6.txt', test_preds, fmt='%f')

In [50]:
# Define the Optuna optimization function with feature selection
def optimize_parameters(X, y, n_trials=1000):
    def objective(trial):
        params = {
            'max_depth': trial.suggest_int('max_depth', 3, 30),
            'eta': trial.suggest_loguniform('eta', 0.005, 0.1),
            'subsample': trial.suggest_uniform('subsample', 0.4, 1.0),
            'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.4, 1.0),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 30),
            'gamma': trial.suggest_loguniform('gamma', 1e-8, 1e-3),
            'lambda': trial.suggest_loguniform('lambda', 1e-8, 1e-2),
            'alpha': trial.suggest_loguniform('alpha', 1e-8, 1e-2),
            'max_delta_step': trial.suggest_int('max_delta_step', 0, 50),
            'objective': 'reg:squarederror', # other options
            'tree_method': 'hist'
        }

        # Train the model with all features
        model = xgb.XGBRegressor(**params)
        model.fit(X, y)
        
        # Get feature importances and select top features
        feature_importances = model.feature_importances_
        feature_importance_threshold = trial.suggest_uniform('feature_importance_threshold', 0.001, 0.2)
        selected_features = X.columns[feature_importances > feature_importance_threshold]
        
        if len(selected_features) == 0:
            return float('inf')  # Avoid having no features selected
        
        # Evaluate the model with selected features using cross-validation
        X_selected = X[selected_features]
        score = cross_val_score(model, X_selected, y, cv=10, scoring='neg_mean_squared_error').mean()
        return -score

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)
    best_params = study.best_params
    
    # Retrieve the best trial and selected features
    best_model = xgb.XGBRegressor(**best_params)
    best_model.fit(X, y)
    feature_importances = best_model.feature_importances_
    selected_features = X.columns[feature_importances > best_params['feature_importance_threshold']]
    
    return best_params, selected_features

# Hyperparameter tuning with integrated feature selection
best_params, selected_features = optimize_parameters(x_train, y_train, n_trials=50)
print("Best parameters found: ", best_params)
print("Selected features: ", selected_features)

# Cross-validation function without rounding, using refined parameters and selected features
def cross_valid(train, target, test, params, num_folds=10, random_state=42):
    train_oof = np.zeros((len(train)))
    test_preds = 0

    kf = StratifiedKFold(n_splits=num_folds, random_state=random_state, shuffle=True)
    scores = []

    num_rounds = 5000
    
    for f, (train_ind, val_ind) in enumerate(kf.split(train, target)):
        train_df, val_df = train.iloc[train_ind][selected_features], train.iloc[val_ind][selected_features]
        train_target, val_target = target.iloc[train_ind], target.iloc[val_ind]

        trn_data = xgb.DMatrix(train_df, label=train_target)
        val_data = xgb.DMatrix(val_df, label=val_target)

        model = xgb.train(params, 
                          trn_data,
                          num_rounds,
                          evals=[(val_data, "val_data")], 
                          verbose_eval=False, 
                          early_stopping_rounds=100
                          )

        xgb_valid_preds = model.predict(val_data)
        
        train_oof[val_ind] = xgb_valid_preds
    
        test_oof_preds = model.predict(xgb.DMatrix(test[selected_features]))
        
        test_preds += test_oof_preds / num_folds
        
        scores.append(cohen_kappa_score(val_target, xgb_valid_preds.round(), weights='quadratic'))
        
        print("Fold ", f, " ", cohen_kappa_score(val_target, xgb_valid_preds.round(), weights="quadratic"))
                                
    print("Mean Kappa Score: ", np.mean(scores))
    print("Kappa Score OOF: ", cohen_kappa_score(target, train_oof.round(), weights='quadratic'))

    return train_oof, test_preds, np.mean(scores)

# Prepare the test data (x_val in this case)
x_val_selected = x_val[selected_features]

# Call the cross-validation function with refined parameters and selected features
train_oof, test_preds, mean_score = cross_valid(x_train[selected_features], y_train, x_val, best_params)

# Output results
print("Out-of-Fold Predictions: ", train_oof)
print("Test Predictions: ", test_preds)
print("Mean Kappa Score: ", mean_score)

[I 2024-06-05 03:03:55,274] A new study created in memory with name: no-name-332ba0d1-80e1-45f6-9ec7-466fc8a0f6b3


[I 2024-06-05 03:03:56,719] Trial 0 finished with value: 0.6499825904259289 and parameters: {'max_depth': 9, 'eta': 0.013166107471762247, 'subsample': 0.4331931154680867, 'colsample_bytree': 0.950337661205207, 'min_child_weight': 27, 'gamma': 8.777866407922e-08, 'lambda': 0.004550145120160577, 'alpha': 1.946930851732922e-06, 'max_delta_step': 40, 'feature_importance_threshold': 0.17126369944864508}. Best is trial 0 with value: 0.6499825904259289.
[I 2024-06-05 03:03:58,788] Trial 1 finished with value: 0.6682187956340482 and parameters: {'max_depth': 12, 'eta': 0.020647312712678955, 'subsample': 0.9815864530975658, 'colsample_bytree': 0.9944736066391745, 'min_child_weight': 12, 'gamma': 8.424004845660697e-08, 'lambda': 1.7617881062379077e-07, 'alpha': 0.0006121648841024943, 'max_delta_step': 31, 'feature_importance_threshold': 0.12787370423286534}. Best is trial 0 with value: 0.6499825904259289.
[I 2024-06-05 03:03:59,950] Trial 2 finished with value: 0.6652587982171608 and parameters:

Best parameters found:  {'max_depth': 14, 'eta': 0.026668677231640125, 'subsample': 0.8056154954425139, 'colsample_bytree': 0.7030887645087476, 'min_child_weight': 22, 'gamma': 1.1870104702912256e-05, 'lambda': 0.005464351931761784, 'alpha': 0.00013316860384637023, 'max_delta_step': 5, 'feature_importance_threshold': 0.018305190562467363}
Selected features:  Index(['fixed acidity', 'volatile acidity', 'citric acid', 'chlorides',
       'free sulfur dioxide', 'pH', 'sulphates', 'alcohol', 'mso2',
       'acidity_ratio', 'std_acid', 'free_sulfur/total_sulfur', 'BSO2',
       'FSO2/alcohol', 'BSO2/alcohol', 'chlorides/TSO2', 'alcohol/density',
       'alcohol_density', 'sulphates/chlorides', 'alcohol/acidity',
       'alkalinity', 'mineral', 'density/pH', 'total_alcohol',
       'sulphates/acid'],
      dtype='object')
Fold  0   0.7181410413492579
Fold  1   0.6586143264552927
Fold  2   0.6341732857568962
Fold  3   0.6156873892982877
Fold  4   0.7091162985035241
Fold  5   0.679859786129476

In [52]:

# Cross-validation function without rounding, using refined parameters and selected features
def cross_valid(train, target, test, params, num_folds=5, random_state=42):
    train_oof = np.zeros((len(train)))
    test_preds = 0

    kf = StratifiedKFold(n_splits=num_folds, random_state=random_state, shuffle=True)
    scores = []

    num_rounds = 5000
    
    for f, (train_ind, val_ind) in enumerate(kf.split(train, target)):
        train_df, val_df = train.iloc[train_ind][selected_features], train.iloc[val_ind][selected_features]
        train_target, val_target = target.iloc[train_ind], target.iloc[val_ind]

        trn_data = xgb.DMatrix(train_df, label=train_target)
        val_data = xgb.DMatrix(val_df, label=val_target)

        model = xgb.train(params, 
                          trn_data,
                          num_rounds,
                          evals=[(val_data, "val_data")], 
                          verbose_eval=False, 
                          early_stopping_rounds=100
                          )

        xgb_valid_preds = model.predict(val_data)
        
        train_oof[val_ind] = xgb_valid_preds
    
        test_oof_preds = model.predict(xgb.DMatrix(test[selected_features]))
        
        test_preds += test_oof_preds / num_folds
        
        scores.append(cohen_kappa_score(val_target, xgb_valid_preds.round(), weights='quadratic'))
        
        print("Fold ", f, " ", cohen_kappa_score(val_target, xgb_valid_preds.round(), weights="quadratic"))
                                
    print("Mean Kappa Score: ", np.mean(scores))
    print("Kappa Score OOF: ", cohen_kappa_score(target, train_oof.round(), weights='quadratic'))

    return train_oof, test_preds, np.mean(scores)

# Prepare the test data (x_val in this case)
x_val_selected = x_val[selected_features]

# Call the cross-validation function with refined parameters and selected features
train_oof, test_preds, mean_score = cross_valid(x_train[selected_features], y_train, x_val, best_params)

# Output results
print("Out-of-Fold Predictions: ", train_oof)
print("Test Predictions: ", test_preds)
print("Mean Kappa Score: ", mean_score)

Fold  0   0.68569009222218
Fold  1   0.6039636402360271
Fold  2   0.6803031150383088
Fold  3   0.5990041666432375
Fold  4   0.6664813724249146
Mean Kappa Score:  0.6470884773129335
Kappa Score OOF:  0.6472237020581466
Out-of-Fold Predictions:  [5.4786191  5.22046518 5.96701193 ... 5.370543   5.54881954 6.91273594]
Test Predictions:  [5.336463  5.4114547 4.9628396 ... 5.891341  6.152432  5.6637335]
Mean Kappa Score:  0.6470884773129335
