In [287]:
# normal imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# cleanlab
from cleanlab.regression.learn import CleanLearning

In [288]:
# Load the training split and lower-case every column name; keep an untouched
# copy so later cells can always get back to the raw training data.
train = pd.read_csv('train.csv').rename(columns=str.lower)
orig_train = train.copy()
train.head(n=1)

Unnamed: 0,id,podcast_name,episode_title,episode_length_minutes,genre,host_popularity_percentage,publication_day,publication_time,guest_popularity_percentage,number_of_ads,episode_sentiment,listening_time_minutes
0,0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,Positive,31.41998


In [289]:
# Kaggle only: uncomment the next line to load the cudf.pandas extension.
# %load_ext cudf.pandas
# Render up to 500 columns when displaying wide frames.
pd.options.display.max_columns = 500

In [290]:
# Load the test split with the same lower-cased column names as the training data.
test = pd.read_csv('test.csv').rename(columns=str.lower)
test.head(n=1)

Unnamed: 0,id,podcast_name,episode_title,episode_length_minutes,genre,host_popularity_percentage,publication_day,publication_time,guest_popularity_percentage,number_of_ads,episode_sentiment
0,750000,Educational Nuggets,Episode 73,78.96,Education,38.11,Saturday,Evening,53.33,1.0,Neutral


In [291]:
# Separate the regression target from the training features, then stack the
# train and test features so every transformation below is applied to both.
# Both are derived from `orig_train` (the untouched copy of the training data)
# so this cell can be re-run without failing on an already-dropped target column.
y_train = orig_train["listening_time_minutes"]
train = orig_train.drop(columns=["listening_time_minutes"])
# ignore_index=True: train and test are each read with their own 0..n-1 index,
# so a plain concat would leave duplicate row labels in `combined`, and any
# label-aligned operation on it could misalign or raise.
combined = pd.concat([train, test], ignore_index=True)

In [292]:
# A missing ad count is replaced with 0.
combined = combined.fillna({"number_of_ads": 0})

In [293]:
# encode day cyclically  
days_dict = {
    "Monday": 0,
    "Tuesday": 1,
    "Wednesday": 2,
    "Thursday": 3,
    "Friday": 4,
    "Saturday": 5,
    "Sunday": 6
}
def periodic_encode(data, map_dict, num_periods):
    """Encode cyclic values as a (cosine, sine) pair on the unit circle.

    Parameters
    ----------
    data : iterable
        Values to encode. When ``map_dict`` is given these are its keys
        (e.g. weekday names); otherwise they must already be numeric positions.
    map_dict : dict or None
        Optional mapping from raw value to numeric position within the cycle.
    num_periods : int or float
        Length of one full cycle (e.g. 7 for days of the week).

    Returns
    -------
    tuple of numpy.ndarray
        ``(cos, sin)`` of ``2 * pi * position / num_periods``, one element per
        input value.

    Raises
    ------
    KeyError
        If ``map_dict`` is given and ``data`` contains a value that is not one
        of its keys (this includes missing values such as NaN).
    """
    if map_dict is not None:
        try:
            data = [map_dict[x] for x in data]
        except KeyError as exc:
            # Same exception type as a plain dict lookup, but say which value
            # was the problem instead of surfacing a bare key.
            raise KeyError(
                f"periodic_encode: value {exc.args[0]!r} has no entry in map_dict"
            ) from exc
    # Compute the angle once and reuse it for both components.
    angle = 2 * np.pi * np.asarray(data) / num_periods
    return np.cos(angle), np.sin(angle)

# Add the weekday's position on a 7-step cycle as cosine/sine features.
day_cos, day_sin = periodic_encode(combined["publication_day"], days_dict, 7)
combined["day_cos"] = day_cos
combined["day_sin"] = day_sin

In [294]:
# Cyclic time-of-day encoding: each publication slot is mapped to a number
# that is then placed on a 24-step cycle.
time_dict = dict(Morning=9, Afternoon=12, Evening=18, Night=22)

time_cos, time_sin = periodic_encode(combined["publication_time"], time_dict, 24)
combined["time_cos"] = time_cos
combined["time_sin"] = time_sin

In [295]:
# Ordinal sentiment score: Negative -> -1, Neutral -> 0, Positive -> 1.
# Labels without an entry in the mapping become NaN.
sentiment_dict = dict(zip(("Negative", "Neutral", "Positive"), (-1, 0, 1)))

sentiment_labels = combined["episode_sentiment"]
combined["sentiment_num"] = sentiment_labels.map(sentiment_dict)

In [296]:
# Encode the episode title numerically by extracting its episode number
# (e.g. "Episode 98" -> 98).
# The dtype guard keeps the cell re-runnable: once the column is numeric there
# is nothing left to parse.
if not pd.api.types.is_numeric_dtype(combined["episode_title"]):
    # Match the "Episode <number>" pattern instead of slicing at a hard-coded
    # offset of 8. A title that does not match yields NaN, which makes the
    # integer conversion below fail loudly rather than mis-parse silently.
    episode_number = combined["episode_title"].str.extract(
        r"^Episode\s+(\d+)\s*$", expand=False
    )
    combined["episode_title"] = episode_number.astype("int64")

In [297]:
# Names of the columns that are still stored with object dtype; these are
# treated as the categorical columns in the cells below.
object_frame = combined.select_dtypes(include=object)
cat_cols = object_frame.columns
print(cat_cols)

Index(['podcast_name', 'genre', 'publication_day', 'publication_time',
       'episode_sentiment'],
      dtype='object')


In [298]:
# One-hot encode every categorical column as 0/1 integers and append the
# indicator columns to the right of the existing frame (the original
# categorical columns are kept).
dummies = pd.get_dummies(combined.loc[:, cat_cols], dtype=int)
combined = pd.concat((combined, dummies), axis="columns")

In [299]:
# Missing-value features.
# "NaNs" accumulates 2**i on every row where the i-th NaN-containing column is
# missing. In addition, each such column gets three interaction features: a
# base feature shifted by 100 on the rows where that column is missing.
combined["NaNs"] = np.float32(0)

nan_cols = combined.columns[combined.isna().any()]

COMBO = ["NaNs"]
nan_partners = (
    ("ads", "number_of_ads"),
    ("sent", "sentiment_num"),
    ("host_pop", "host_popularity_percentage"),
)
for bit, column in enumerate(nan_cols):
    is_missing = combined[column].isna()
    combined["NaNs"] += is_missing * 2**bit

    for suffix, partner in nan_partners:
        feature = f"{column}_nan_{suffix}"
        combined[feature] = is_missing * 100 + combined[partner]
        COMBO.append(feature)

In [300]:
# Categorical x numeric interaction features: every categorical column is
# integer-coded with pd.factorize, the code is scaled by 100, and the result
# is added to each of three base features.
for cat_col in cat_cols:
    shifted_codes = pd.factorize(combined[cat_col])[0] * 100

    interactions = {
        f"{cat_col}_ads": shifted_codes + combined["number_of_ads"],
        f"{cat_col}_sent": shifted_codes + combined["sentiment_num"],
        f"{cat_col}_host_pop": shifted_codes + combined["host_popularity_percentage"],
    }
    for feature_name, feature_values in interactions.items():
        combined[feature_name] = feature_values

    # Record the new feature names in creation order.
    COMBO.extend(interactions)

In [301]:
# Pairwise categorical combos: for every unordered pair of categorical columns,
# add the sum and the product of their integer codes as two new columns.
# Each column is factorized once up front. The codes do not change inside the
# loop, so calling pd.factorize again for every pair was redundant work.
# NOTE(review): neither feature identifies a category pair uniquely -- codes
# (0, 1) and (1, 0) share a sum, and any pair containing code 0 has product 0.
# Confirm this is intended.
cat_codes = {c: pd.factorize(combined[c])[0] for c in cat_cols}

for i, c1 in enumerate(cat_cols[:-1]):
    for c2 in cat_cols[i+1:]:
        combined[f"{c1}_sum_{c2}"] = cat_codes[c1] + cat_codes[c2]
        combined[f"{c1}_mul_{c2}"] = cat_codes[c1] * cat_codes[c2]

In [302]:
# Report every column that still contains missing values, with its NaN count.
missing_counts = combined.isna().sum()
for column, n_missing in missing_counts[missing_counts > 0].items():
    print(f"{column} has {n_missing} missing values")

episode_length_minutes has 115829 missing values
guest_popularity_percentage has 194862 missing values


In [303]:
# Boolean row masks for the two columns that still contain NaNs:
# True marks a row whose value is missing, False a row whose value is known.
elm_nan_idx = combined["episode_length_minutes"].isnull()
gpp_nan_idx = combined["guest_popularity_percentage"].isnull()

In [304]:
def display_results(model, data):
    """Plot and print the ten largest feature importances of a fitted model.

    Parameters
    ----------
    model : object
        Must expose ``feature_importances_``; its values are labelled with
        ``data.columns`` in order.
    data : pandas.DataFrame
        Frame whose column names label the importances.

    Returns
    -------
    None
        Draws a horizontal bar chart and prints the same ten values.
    """
    ranked = pd.Series(model.feature_importances_, index=data.columns).sort_values(ascending=False)
    top_ten = ranked.head(10)

    # Horizontal bar chart of the ten largest importances.
    top_ten.plot(kind='barh', figsize=(10, 6))

    print("\n🔍 Top 10 Feature Importances:")
    print(top_ten)


In [305]:
from sklearn.model_selection import train_test_split

def show_score(data, target, model):
    """Fit ``model`` on an 80/20 split of the data and print its hold-out score.

    Parameters
    ----------
    data : array-like or pandas.DataFrame
        Feature matrix.
    target : array-like or pandas.Series
        Target values aligned with ``data``.
    model : estimator
        Object with ``fit(X, y)`` and ``score(X, y)``; it is fitted in place.

    Returns
    -------
    None
        The value returned by ``model.score`` on the held-out 20% is printed,
        labelled as R^2.
    """
    # Fixed seed so the split is the same on every call.
    fit_features, holdout_features, fit_target, holdout_target = train_test_split(
        data, target, test_size=0.2, random_state=42
    )
    model.fit(fit_features, fit_target)
    holdout_score = model.score(holdout_features, holdout_target)
    print(f"R^2 is {holdout_score}")

In [306]:
# imputing with xgboost
from xgboost import XGBRegressor

# Feature frame for the imputation models: everything except the raw
# categorical columns (they were one-hot encoded earlier) and the row id.
drop_cols = cat_cols.to_list()
drop_cols.append("id")
combined_wo_cats = combined.drop(drop_cols, axis=1)

# Predictors = the remaining columns minus the NaN-containing columns that are
# being imputed. Built once and shared by both models: each `.drop` copies the
# whole frame, so rebuilding it per model doubled that cost.
# NOTE(review): the predictors still include the missingness features ("NaNs",
# "<col>_nan_*"). For a model's own target column these never take their
# "missing" value on the training rows (target observed) but always do on the
# rows predicted later, so the model sees feature values it was never trained
# on. Consider excluding them -- in this cell and the prediction cell together.
imp_features = combined_wo_cats.drop(nan_cols, axis=1)

elm_imp_model = XGBRegressor(random_state=42)
gpp_imp_model = XGBRegressor(random_state=42)

# Each model is trained only on the rows where its own target is observed.
elm_imp_model.fit(imp_features.loc[~elm_nan_idx], combined_wo_cats.loc[~elm_nan_idx, "episode_length_minutes"])
gpp_imp_model.fit(imp_features.loc[~gpp_nan_idx], combined_wo_cats.loc[~gpp_nan_idx, "guest_popularity_percentage"])

In [307]:
# Hold-out validation of the imputation approach: for each target column a
# fresh XGBRegressor is fitted and scored on the rows where the target is known.
validation_features = combined_wo_cats.drop(nan_cols, axis=1)
for known_rows, target_col in (
    (~elm_nan_idx, "episode_length_minutes"),
    (~gpp_nan_idx, "guest_popularity_percentage"),
):
    show_score(
        validation_features[known_rows],
        combined_wo_cats.loc[known_rows, target_col],
        XGBRegressor(random_state=42),
    )

R^2 is 0.017993241717660258
R^2 is 0.004951337764688257


In [308]:
# Impute missing values: every NaN is replaced by the prediction of the model
# trained for that column.
# NOTE(review): the validation cell's output in this notebook shows R^2 of
# about 0.018 and 0.005 for these models, i.e. barely better than predicting
# the mean -- confirm that model-based imputation is worth keeping.
predictors = combined_wo_cats.drop(nan_cols, axis=1)
combined.loc[elm_nan_idx, "episode_length_minutes"] = elm_imp_model.predict(predictors[elm_nan_idx])
combined.loc[gpp_nan_idx, "guest_popularity_percentage"] = gpp_imp_model.predict(predictors[gpp_nan_idx])

In [None]:
# Re-check after imputation: list any column that still has missing values.
remaining_nans = combined.isna().sum()
for column in remaining_nans.index[remaining_nans.gt(0)]:
    print(f"{column} has {remaining_nans[column]} missing values")