In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from tqdm import tqdm
from itertools import combinations

#import os
#os.environ['KMP_DUPLICATE_LIB_OK']='True'

from warnings import filterwarnings
filterwarnings('ignore')

import gc
gc.collect()

0

In [2]:
are_you_on_kaggle = False 

# Load data from kaggle
if are_you_on_kaggle: 
    train = pd.read_csv('/kaggle/input/playground-series-s5e4/train.csv', index_col='id')
    test  = pd.read_csv('/kaggle/input/playground-series-s5e4/test.csv', index_col='id')
    subm  = pd.read_csv('/kaggle/input/playground-series-s5e4/sample_submission.csv')
else: # load data from local
    train = pd.read_csv('train.csv',index_col='id')
    test  = pd.read_csv('test.csv', index_col='id')
    subm  = pd.read_csv('sample_submission.csv')

In [3]:
def feature_eng(df,train=True):
    le = LabelEncoder() 

    # New Columns
    df['Episode_Title'] = df['Episode_Title'].str.replace('Episode ', '', regex=False).astype('category')
    #df['Ad_Density'] = df['Number_of_Ads'] / df['Episode_Length_minutes']
    #df['Ad_Density'].replace([np.inf, -np.inf], 0, inplace=True)

    if train:
        # Get rid of outliers
        df = df[df['Number_of_Ads']<10]

    # Fill NULL values with median
    df['Number_of_Ads'].fillna(df['Number_of_Ads'].median(), inplace=True) 
    df['Guest_Popularity_percentage'].fillna(df['Guest_Popularity_percentage'].median(), inplace=True)
    df['Episode_Length_minutes'].fillna(df['Episode_Length_minutes'].median(), inplace=True)
    
    # Preprocess Categorical Columns
    categorical_cols = ['Podcast_Name','Genre','Publication_Day','Publication_Time','Episode_Sentiment']
    for c in categorical_cols:
        df[c]=le.fit_transform(df[c]) # Converts categorical column into int format
        df[c] = df[c].astype('category') # Define column type as category 

    
    # Adding 2 by 2 combinations of categorical cols as columns
    gc.collect()
    categorical_cols.append('Episode_Title')
    pair_size = [2]

    for r in pair_size:
        for cols in tqdm(list(combinations(categorical_cols, r))):
            new_col_name = '_'.join(cols)
            
            df[new_col_name] = df[list(cols)].astype(str).agg('_'.join, axis=1)
            df[new_col_name] = df[new_col_name].astype('category')
    
    return df


# Apply

train = feature_eng(train)
test = feature_eng(test,train=False)

100%|██████████| 15/15 [00:33<00:00,  2.22s/it]
100%|██████████| 15/15 [00:11<00:00,  1.33it/s]


In [None]:
def optimize_data(df):
    # Reduce Data sizes
    for col in df.columns:
        if df[col].dtype == 'float64':
            df[col] = df[col].astype('float32')
        #elif pd.api.types.is_categorical_dtype(df[col]):
            #df[col] = df[col].cat.codes.astype('int16')
        elif df[col].dtype == 'int64':
            df[col] = df[col].astype('int32')
    return df

def train_model(params,rounds=5000):
    # Train final model on full data
    full_dtrain = xgb.DMatrix(X, label=y, enable_categorical=True)
    final_model = xgb.train(
        params,
        full_dtrain,
        num_boost_round=rounds,
        verbose_eval=250
    )
    return final_model

def create_submission():
    test_processed = optimize_data(test)
    xgb_test = xgb.DMatrix(test_processed, enable_categorical=True)
    test_results = final_model.predict(xgb_test)

    submission = pd.DataFrame({
        "id": subm["id"],
        "Listening_Time_minutes": test_results
    })
    submission.to_csv("submission.csv", index=False)


In [None]:
def optuna_search(X, y, custom_param_range=False, trial_count=10):       
    def objective(trial):
        if not custom_param_range: # wider search range
            params = {
                'max_bin': trial.suggest_int('max_bin', 512, 2048),
                'max_depth': trial.suggest_int('max_depth', 5, 15),
                'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.05, log=True),
                'subsample': trial.suggest_float('subsample', 0.7, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
                'gamma': trial.suggest_float('gamma', 0.0, 0.5),
                'min_child_weight': trial.suggest_int('min_child_weight', 3, 10),
                'reg_alpha': trial.suggest_float('reg_alpha', 1e-2, 1.0, log=True),
                'reg_lambda': trial.suggest_float('reg_lambda', 1e-2, 1.0, log=True),
            }
        else: # For Experiments
            params = {
                'device': 'cuda',
                'max_depth': 15,
                'booster': 'gbtree',
                'objective': 'reg:squarederror',
                'max_bin': trial.suggest_int('max_bin', 700, 1600),
                'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.015, log=True),
                'gamma': trial.suggest_float('gamma', 0.2, 0.35),
                'min_child_weight': trial.suggest_int('min_child_weight', 3, 5),
                'subsample': trial.suggest_float('subsample', 0.7, 0.8),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.8, 0.95),
                'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 0.5, log=True),
                'reg_lambda': trial.suggest_float('reg_lambda', 0.3, 0.8, log=True),
            }
            
        
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
        
        dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
        dval = xgb.DMatrix(X_val, label=y_val, enable_categorical=True)
    
        model = xgb.train(
            params,
            dtrain,
            num_boost_round=10000, 
            evals=[(dval, 'eval')],
            early_stopping_rounds=50,  
            verbose_eval=False
        )
        
        del dtrain, dval
        gc.collect()
        
        return model.best_score
    
    study = optuna.create_study(
        direction='minimize',
        sampler=optuna.samplers.TPESampler(n_startup_trials=10),
        pruner=optuna.pruners.MedianPruner(
            n_warmup_steps=10,
            n_min_trials=5
        )
    )
    
    study.optimize(objective, n_trials=trial_count, show_progress_bar=True)
    
    return study.best_params 

In [None]:
y = train["Listening_Time_minutes"]
X = optimize_data(train.drop("Listening_Time_minutes", axis=1))

result_params = optuna_search(X, y, custom_param_range=True, trial_count=30)