In [None]:
import pandas as pd
import numpy as np
from rmsle import ag_rmsle_clamped_scorer, ag_rmsle_scorer
from autogluon.tabular import TabularPredictor
from itertools import combinations

In [None]:
# Toggle: set to True to fetch and unpack the competition archive into data/.
download = False

if download:
    # IPython shell escape: download the playground-series-s5e5 archive into data/.
    !kaggle competitions download -c playground-series-s5e5 -p data/

    import zipfile, pathlib, os

    # Extract the archive into the directory that holds it, then delete the zip.
    zip_path = pathlib.Path('data/playground-series-s5e5.zip')
    with zipfile.ZipFile(zip_path) as z:
        z.extractall(zip_path.parent)
    os.remove(zip_path)

In [None]:
def preprocess(train_df, test_df):
    """Run the full preparation pipeline on the raw train and test frames.

    Steps, in order:
      1. ``handle_columns`` builds the engineered feature set for both frames.
      2. ``handle_outliers`` is applied once per listed feature column
         (training rows are filtered, test values are clipped).
      3. ``remove_outliers`` filters training rows on the ``Calories`` target.

    Returns:
        tuple: ``(train_df, test_df)`` after all steps.
    """
    train_df, test_df = handle_columns(train_df, test_df)

    outlier_features = (
        'Age', 'Weight', 'Height', 'Duration', 'Heart_Rate', 'Body_Temp', 'BMI',
    )
    for feature in outlier_features:
        train_df, test_df = handle_outliers(train_df, test_df, feature)

    # The target only exists on the training side, so only that frame is filtered.
    return remove_outliers(train_df, 'Calories'), test_df

def handle_columns(train_df, test_df):
    """Engineer features on the stacked train/test rows and split them back.

    The two frames are concatenated so that every feature is computed the same
    way for both. The function then:

      * replaces ``Sex`` with ``female`` / ``male`` int16 indicator columns;
      * adds derived columns (``BMI``, ``BSA``, ``FFM``, ``HRMax``, ``%HRMax``,
        ``HRR``, ``TRIMP``, ``Thermal_Load``, ``BMR``, ``Keytel``);
      * adds ``^2``, ``^3`` and ``log1p`` transforms of each numeric feature and
        ``+``, ``-``, ``*``, ``/`` combinations of each pair of numeric features;
      * downcasts float64/int64 columns to the smallest fitting width;
      * drops columns whose standard deviation is not above ``1e-4``.

    The ``id`` column and the ``Calories`` target are never used as inputs to
    the generated transforms, and are never removed by the variance filter.

    Args:
        train_df: Raw training frame; must contain ``id``, ``Sex``, the numeric
            measurement columns used below, and ``Calories``.
        test_df: Raw test frame with the same columns except ``Calories``.

    Returns:
        tuple: ``(train_df, test_df)`` indexed by ``id``. ``train_df`` keeps the
        ``Calories`` column; ``test_df`` does not have it.
    """
    target = 'Calories'
    row_id = 'id'
    # Capture the split point up front; ``train_df`` is rebound further down.
    n_train = len(train_df)

    combined_df = pd.concat([train_df, test_df], ignore_index=True)

    combined_df['female'] = (combined_df['Sex'] == 'female').astype('int16')
    combined_df['male']   = (combined_df['Sex'] == 'male').astype('int16')
    combined_df = combined_df.drop(columns='Sex')
    combined_df['Age'] = combined_df['Age'].astype('float64')

    combined_df['BMI']  = combined_df['Weight'] / ((combined_df['Height'] / 100) ** 2)
    combined_df['BSA']  = 0.007184 * combined_df['Weight'] ** 0.425 * combined_df['Height'] ** 0.725
    combined_df['FFM']  = 0.407 * combined_df['Weight'] + 0.267 * combined_df['Height'] - 19.2
    combined_df['HRMax'] = 220.0 - combined_df['Age']
    combined_df['%HRMax'] = combined_df['Heart_Rate'] / combined_df['HRMax']
    combined_df['HRR']   = combined_df['HRMax'] - combined_df['Heart_Rate']
    combined_df['TRIMP'] = combined_df['Duration'] * combined_df['%HRMax']
    combined_df['Thermal_Load'] = (combined_df['Body_Temp'] - 37.0) * combined_df['Duration']
    combined_df['BMR'] = (
        10 * combined_df['Weight']
        + 6.25 * combined_df['Height']
        - 5 * combined_df['Age']
        + 5 * combined_df['male']
        - 161 * combined_df['female']
    )
    combined_df['Keytel'] = combined_df['Duration'] * (
        combined_df['Weight']     * (0.0475 * combined_df['male'] - 0.0302 * combined_df['female'])
      + combined_df['Heart_Rate'] * (0.151  * combined_df['male'] + 0.107  * combined_df['female'])
      - 13.17 * combined_df['male'] - 4.88 * combined_df['female']
    )

    # Base columns for the generated transforms. The int16 sex indicators are
    # excluded by dtype; the target and the row identifier are excluded by name
    # (``id`` is an arbitrary key, so powers/interactions of it are pure noise).
    num_cols = [
        col
        for col in combined_df.select_dtypes(['float64', 'int64']).columns
        if col not in (target, row_id)
    ]
    new_cols = {}

    for col1 in num_cols:
        s1 = combined_df[col1]

        new_cols[f'{col1}^2'] = s1 ** 2
        new_cols[f'{col1}^3'] = s1 ** 3
        # log1p is only defined for values > -1; skip columns that would yield
        # NaN / -inf (same spirit as the zero-divisor guard below).
        if s1.min() > -1:
            new_cols[f'log1p({col1})'] = np.log1p(s1)

    for col1, col2 in combinations(num_cols, 2):
        s1, s2 = combined_df[col1], combined_df[col2]

        new_cols[f'{col1}+{col2}'] = s1 + s2
        new_cols[f'{col1}-{col2}'] = s1 - s2
        new_cols[f'{col1}*{col2}'] = s1 * s2

        # Only emit the ratio when no row would divide by zero.
        if not (s2 == 0).any():
            new_cols[f'{col1}/{col2}'] = s1 / s2

    # Attach all generated columns in one concat instead of column-by-column.
    combined_df = pd.concat([combined_df, pd.DataFrame(new_cols, index=combined_df.index)], axis=1)

    for c in combined_df.select_dtypes('float64'):
        combined_df[c] = pd.to_numeric(combined_df[c], downcast='float')
    for c in combined_df.select_dtypes('int64'):
        combined_df[c] = pd.to_numeric(combined_df[c], downcast='integer')

    # Drop (near-)constant columns, but never the key or the target: both are
    # required to split the frame back below.
    keep = combined_df.std() > 1e-4
    keep.loc[[row_id, target]] = True
    combined_df = combined_df.loc[:, keep]

    train_df = combined_df.iloc[:n_train].copy().set_index(row_id)
    test_df  = combined_df.iloc[n_train:].copy().drop(columns=target).set_index(row_id)

    return train_df, test_df

def handle_outliers(train_df, test_df, column):
    """Apply 1.5 x IQR outlier handling for one column to both splits.

    The bounds ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` are computed from ``train_df``
    only. Training rows outside the bounds are dropped; test rows are all kept
    and their values are clipped into the bounds instead.

    Neither input frame is modified; new frames are returned.

    Args:
        train_df: Training frame containing ``column``.
        test_df: Test frame containing ``column``.
        column: Name of the numeric column to process.

    Returns:
        tuple: ``(train_df, test_df)`` with the filter / clip applied.
    """
    q1 = train_df[column].quantile(0.25)
    q3 = train_df[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    train_df = train_df[(train_df[column] >= lower_bound) & (train_df[column] <= upper_bound)]
    # assign() returns a new frame, so the caller's test_df is not written to
    # (the previous in-place column assignment mutated the argument).
    test_df = test_df.assign(**{column: test_df[column].clip(lower_bound, upper_bound)})
    return train_df, test_df

def remove_outliers(df, column):
    """Return the rows of ``df`` whose ``column`` value lies within 1.5 x IQR.

    The accepted range is ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` (both ends
    inclusive), with the quartiles taken from ``df[column]`` itself.
    """
    values = df[column]
    first_quartile, third_quartile = values.quantile(0.25), values.quantile(0.75)
    spread = third_quartile - first_quartile
    low = first_quartile - 1.5 * spread
    high = third_quartile + 1.5 * spread
    # Series.between is inclusive on both ends by default.
    return df[values.between(low, high)]

In [None]:
# Load the raw competition files and run both through the preprocessing pipeline.
train, test = pd.read_csv('data/train.csv'), pd.read_csv('data/test.csv')
train, test = preprocess(train, test)

# Train on calories per unit of Duration rather than on raw Calories; the
# prediction cell multiplies by Duration again to recover Calories.
target = 'Calories/Duration'
train[target] = train['Calories'] / train['Duration']
train = train.drop(columns=['Calories'])

categorical_cols = [col for col in train.select_dtypes(include=['category']).columns.to_list() if col != target]
# handle_columns downcasts float64/int64 columns to narrower widths, so selecting
# only 'float64'/'int64' here would miss almost every feature; 'number' matches
# every numeric width.
numerical_cols = [col for col in train.select_dtypes(include='number').columns.to_list() if col != target]

In [None]:
"""sample = train.sample(frac=0.1, random_state=0)
unused  = train.drop(sample.index)""" 

# Fit AutoGluon on the full training frame with `target` as the label.
# time_limit = 3*60*60, i.e. the "3 hrs" quoted in the submission message below.
# NOTE(review): the eval metric is presumably scored on the label column
# (Calories/Duration), not on Calories itself — confirm this is the intended objective.
pred = TabularPredictor(
    label=target,
    eval_metric=ag_rmsle_clamped_scorer,
    problem_type='regression'
).fit(
    train_data=train,#sample,
    presets='high_quality',
    time_limit=3*60*60
)
# The bare string literal below is disabled code (a refit on the held-out rows
# of the sampling experiment); it is evaluated as a no-op expression.
"""
pred_ref = pred.refit_full(
    model='best',
    train_data_extra=unused
)"""

# Predict the per-duration target for the test rows. This adds columns to the
# global `test` frame, so re-running the cell operates on an already-extended frame.
test[target] = pred.predict(
    data=test
)
# Clamp predictions to [0, 1000] before converting back to Calories.
test[target] = test[target].clip(0, 1000)
test['Calories'] = test[target] * test['Duration']
# The index (id) is written alongside the Calories column.
test[['Calories']].to_csv('data/submission11.csv', index=True)
# IPython shell escape: submits the file to the competition every time this cell runs.
!kaggle competitions submit -c playground-series-s5e5 -f data/submission11.csv -m "3 hrs autogluon with high quality model"