In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e4/sample_submission.csv
/kaggle/input/playground-series-s5e4/train.csv
/kaggle/input/playground-series-s5e4/test.csv
/kaggle/input/orginal-podcast-dataset/podcast_dataset.csv


In [2]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import pandas as pd
pd.options.mode.copy_on_write = True
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from cuml.preprocessing import TargetEncoder
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm
from itertools import combinations
import warnings
warnings.simplefilter('ignore')

In [3]:
def process_combinations_fast(df, columns_to_encode, pair_size, max_batch_size=2000):
    # Precompute string versions of all columns once
    str_df = df[columns_to_encode]
    le = LabelEncoder()
    str_df = str_df.astype(str)
    total_new_cols = 0
    
    for r in pair_size:
        print(f"Processing {r}-combinations")
        
        # Count total combinations for this r-value
        n_combinations = np.math.comb(len(columns_to_encode), r)
        print(f"Total {r}-combinations to process: {n_combinations}")
        
        # Process combinations in batches to manage memory
        combos_iter = combinations(columns_to_encode, r)
        batch_cols = []
        batch_names = []
        
        with tqdm(total=n_combinations) as pbar:
            while True:
                # Collect a batch of combinations
                batch_cols.clear()
                batch_names.clear()
                
                # Fill the current batch
                for _ in range(max_batch_size):
                    try:
                        cols = next(combos_iter)
                        batch_cols.append(list(cols))
                        batch_names.append('+'.join(cols))
                    except StopIteration:
                        break
                
                if not batch_cols:  # No more combinations
                    break
                
                # Process this batch vectorized
                for i, (cols, new_name) in enumerate(zip(batch_cols, batch_names)):
                    # Fast vectorized concatenation
                    result = str_df[cols[0]].copy()
                    for col in cols[1:]:
                        result += '' + str_df[col]
                    
                    df[new_name] = le.fit_transform(result) + 1
                    pbar.update(1)
                
                total_new_cols += len(batch_cols)
                if len(batch_cols) == max_batch_size:  # Only print on full batches
                    print(f"Progress: {total_new_cols}/{n_combinations} combinations processed")
        
        print(f"Completed all {r}-combinations. Total columns now: {len(df.columns)}")
    
    return df

In [4]:
# Load data
df_train = pd.read_csv("/kaggle/input/playground-series-s5e4/train.csv")
df_orginal = pd.read_csv("/kaggle/input/orginal-podcast-dataset/podcast_dataset.csv")
df_test = pd.read_csv('/kaggle/input/playground-series-s5e4/test.csv')

df = pd.concat([df_train, df_orginal, df_test], axis=0, ignore_index=True)

df.drop(columns=['id'], inplace=True)
df = df.drop_duplicates()

# outlier removal
df['Episode_Length_minutes'] = np.maximum(0, np.minimum(120, df['Episode_Length_minutes']))
df['Host_Popularity_percentage'] = np.maximum(20, np.minimum(100, df['Host_Popularity_percentage']))
df['Guest_Popularity_percentage'] = np.maximum(0, np.minimum(100, df['Guest_Popularity_percentage']))
df.loc[df['Number_of_Ads'] > 3, 'Number_of_Ads'] = 0

# Encode categorical features
day_mapping = {'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4, 'Friday': 5, 'Saturday': 6, 'Sunday': 7}
df['Publication_Day'] = df['Publication_Day'].map(day_mapping)

time_mapping = {'Morning': 1, 'Afternoon': 2, 'Evening': 3, 'Night': 4}
df['Publication_Time'] = df['Publication_Time'].map(time_mapping)

sentiment_map = {'Negative': 1, 'Neutral': 2, 'Positive': 3}
df['Episode_Sentiment'] = df['Episode_Sentiment'].map(sentiment_map)

df['Episode_Title'] = df['Episode_Title'].str.replace('Episode ', '', regex=True)
df['Episode_Title'] = df['Episode_Title'].astype('int')

le = LabelEncoder()
for col in df.select_dtypes('object').columns:
    df[col] = le.fit_transform(df[col]) + 1

# Some Feature engineering
for col in ['Episode_Length_minutes']:
    df[col + '_sqrt'] = np.sqrt(df[col])
    df[col + '_squared'] = df[col] ** 2

for col in tqdm(['Episode_Sentiment', 'Genre', 'Publication_Day', 'Podcast_Name', 'Episode_Title',
                 'Guest_Popularity_percentage', 'Host_Popularity_percentage', 'Number_of_Ads']):
    df[col + '_EP'] = df.groupby(col)['Episode_Length_minutes'].transform('mean')

df = process_combinations_fast(df, ['Episode_Length_minutes', 'Episode_Title', 'Publication_Time', 'Host_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment', 
                     'Publication_Day', 'Podcast_Name','Genre','Guest_Popularity_percentage'], [2,3,5,7], 1000)

df = df.astype('float32')

df_train = df.iloc[:-len(df_test)]
df_test = df.iloc[-len(df_test):].reset_index(drop=True)

df_train = df_train[df_train['Listening_Time_minutes'].notnull()]

target = df_train.pop('Listening_Time_minutes')
df_test.pop('Listening_Time_minutes')

df_train.shape, df_test.shape

  0%|          | 0/8 [00:00<?, ?it/s]

Processing 2-combinations
Total 2-combinations to process: 45


  0%|          | 0/45 [00:00<?, ?it/s]

Completed all 2-combinations. Total columns now: 66
Processing 3-combinations
Total 3-combinations to process: 120


  0%|          | 0/120 [00:00<?, ?it/s]

Completed all 3-combinations. Total columns now: 186
Processing 5-combinations
Total 5-combinations to process: 252


  0%|          | 0/252 [00:00<?, ?it/s]

Completed all 5-combinations. Total columns now: 438
Processing 7-combinations
Total 7-combinations to process: 120


  0%|          | 0/120 [00:00<?, ?it/s]

Completed all 7-combinations. Total columns now: 558


((794868, 557), (250000, 557))

In [5]:
seed1 = 42
cv = KFold(7, random_state=seed1, shuffle=True)
pred_test = np.zeros((250000,))

# XGBoost learning_rate schedule
def lr_decay(epoch):
    if epoch < 115:
        return 0.05
    else:
        return 0.01
callbacks = xgb.callback.LearningRateScheduler(lr_decay)

# XGBoost parameters
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'seed': seed1,
    'max_depth': 19,
    'learning_rate': 0.03,
    'min_child_weight': 50,
    'reg_alpha': 5,
    'reg_lambda': 1,
    'subsample': 0.85,
    'colsample_bytree': 0.6,
    'colsample_bynode': 0.5,
    'device': "cuda"
}

for idx_train, idx_valid in cv.split(df_train):
    
    X_train, y_train = df_train.iloc[idx_train], target.iloc[idx_train]
    X_valid, y_valid = df_train.iloc[idx_valid], target.iloc[idx_valid]
    X_test = df_test[X_train.columns].copy()

    features = df_train.columns
    
    encoder1 = TargetEncoder(n_folds=5, seed=seed1, stat="mean")

    for col in tqdm(features[:20]):
        X_train[col+'_te1'] = encoder1.fit_transform(X_train[[col]], y_train)
        X_valid[col+'_te1'] = encoder1.transform(X_valid[[col]])
        X_test[col+'_te1'] = encoder1.transform(X_test[[col]])

    for col in tqdm(features[20:]):
        X_train[col] = encoder1.fit_transform(X_train[[col]], y_train)
        X_valid[col] = encoder1.transform(X_valid[[col]])
        X_test[col] = encoder1.transform(X_test[[col]])

    # Create DMatrix for XGBoost
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_valid, label=y_valid)
    dtest = xgb.DMatrix(X_test)

    # Train the model with early stopping
    model = xgb.train(
        params, 
        dtrain, 
        num_boost_round=1000000, 
        evals=[(dtrain, 'train'), (dval, 'validation')], 
        early_stopping_rounds=30, 
        verbose_eval=500,
        callbacks=[callbacks]
    )

    # Evaluate on validation set
    predictions = model.predict(dval)

    # Generate predictions for test set and save submission
    pred_test += np.maximum(0, np.minimum(120, model.predict(dtest)))
    print("----------------------------------------------------------------")

pred_test /= 7

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/537 [00:00<?, ?it/s]

[0]	train-rmse:26.49828	validation-rmse:26.46079
[411]	train-rmse:8.93218	validation-rmse:11.83156
----------------------------------------------------------------


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/537 [00:00<?, ?it/s]

[0]	train-rmse:26.49152	validation-rmse:26.50453
[334]	train-rmse:9.07550	validation-rmse:11.86320
----------------------------------------------------------------


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/537 [00:00<?, ?it/s]

[0]	train-rmse:26.49228	validation-rmse:26.50544
[135]	train-rmse:9.56676	validation-rmse:11.80707
----------------------------------------------------------------


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/537 [00:00<?, ?it/s]

[0]	train-rmse:26.49129	validation-rmse:26.50651
[136]	train-rmse:9.56759	validation-rmse:11.91929
----------------------------------------------------------------


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/537 [00:00<?, ?it/s]

[0]	train-rmse:26.48028	validation-rmse:26.57198
[135]	train-rmse:9.52735	validation-rmse:11.87880
----------------------------------------------------------------


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/537 [00:00<?, ?it/s]

[0]	train-rmse:26.49995	validation-rmse:26.45177
[278]	train-rmse:9.15073	validation-rmse:11.90420
----------------------------------------------------------------


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/537 [00:00<?, ?it/s]

[0]	train-rmse:26.49363	validation-rmse:26.49499
[480]	train-rmse:8.74329	validation-rmse:11.89639
----------------------------------------------------------------


In [6]:
df_sub = pd.read_csv("/kaggle/input/playground-series-s5e4/sample_submission.csv")
df_sub.Listening_Time_minutes = pred_test
df_sub.to_csv('submission.csv', index=False)