In [94]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [95]:
train_df = pd.read_csv('train.csv')
train_df.columns = train_df.columns.str.lower()
train_df.head(1)

Unnamed: 0,id,podcast_name,episode_title,episode_length_minutes,genre,host_popularity_percentage,publication_day,publication_time,guest_popularity_percentage,number_of_ads,episode_sentiment,listening_time_minutes
0,0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,Positive,31.41998


In [96]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 12 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           750000 non-null  int64  
 1   podcast_name                 750000 non-null  object 
 2   episode_title                750000 non-null  object 
 3   episode_length_minutes       662907 non-null  float64
 4   genre                        750000 non-null  object 
 5   host_popularity_percentage   750000 non-null  float64
 6   publication_day              750000 non-null  object 
 7   publication_time             750000 non-null  object 
 8   guest_popularity_percentage  603970 non-null  float64
 9   number_of_ads                749999 non-null  float64
 10  episode_sentiment            750000 non-null  object 
 11  listening_time_minutes       750000 non-null  float64
dtypes: float64(5), int64(1), object(6)
memory usage: 68.7+ MB


In [97]:
from sklearn.model_selection import train_test_split
train_df = train_df.dropna()
X_train, X_test, y_train, y_test = train_test_split(train_df.drop("listening_time_minutes", axis=1), train_df["listening_time_minutes"], test_size=0.2, random_state=42)

In [98]:
days_dict = {
    "Monday": 0,
    "Tuesday": 1,
    "Wednesday": 2,
    "Thursday": 3,
    "Friday": 4,
    "Saturday": 5,
    "Sunday": 6
}

sentiment_dict = {
    "Negative": -1, 
    "Neutral" : 0,
    "Positive": 1
}

def periodic_encode(data, max_val):
    return np.cos(2 * np.pi * np.array(data) / max_val), np.sin(2 * np.pi * np.array(data) / max_val) 

def basic_transform(df):
    df_copy = df.copy()
    df_copy.columns = df.columns.str.lower()
    # replace string episode title with int
    df_copy["episode_title"] = [int(x[8:]) for x in df["episode_title"]]
    # periodically encode day of the week
    df_copy["day_numerical_cos"] = periodic_encode([days_dict[x] for x in df["publication_day"]], 6)[0]
    df_copy["day_numerical_sin"] = periodic_encode([days_dict[x] for x in df["publication_day"]], 6)[1]
    df_copy = df_copy.drop(columns=["publication_day"])
    # convert episode sentiment to numerical
    df_copy["episode_sentiment"] = [sentiment_dict[x] for x in df["episode_sentiment"]]
    # podcast name get dummies
    df_copy = pd.get_dummies(df_copy, columns=["podcast_name"], dtype=int)
    # genre get dummies
    df_copy = pd.get_dummies(df_copy, columns=["genre"], dtype=int)
    # time get dummies
    df_copy = pd.get_dummies(df_copy, columns=["publication_time"], dtype=int)

    return df_copy

In [99]:
from sklearn.preprocessing import StandardScaler

# y train normalization 
y_scaler = StandardScaler()
y_train = y_scaler.fit_transform(np.array(y_train).reshape(-1, 1))

In [100]:
X_train = basic_transform(X_train)
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 431238 entries, 345353 to 170019
Data columns (total 71 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   id                                431238 non-null  int64  
 1   episode_title                     431238 non-null  int64  
 2   episode_length_minutes            431238 non-null  float64
 3   host_popularity_percentage        431238 non-null  float64
 4   guest_popularity_percentage       431238 non-null  float64
 5   number_of_ads                     431238 non-null  float64
 6   episode_sentiment                 431238 non-null  int64  
 7   day_numerical_cos                 431238 non-null  float64
 8   day_numerical_sin                 431238 non-null  float64
 9   podcast_name_Athlete's Arena      431238 non-null  int32  
 10  podcast_name_Brain Boost          431238 non-null  int32  
 11  podcast_name_Business Briefs      431238 non-null  i

In [115]:
from lightgbm import LGBMRegressor

# Initialize the model with RMSE as the evaluation metric
lgbm_model = LGBMRegressor(
    random_state=42,
    objective='regression',
    metric='rmse'  # Set metric to RMSE
)

# Convert y_train to the correct format if it's a Series
if isinstance(y_train, pd.Series):
    y_train = y_train.values

# Fit the model
lgbm_model.fit(
    X_train, 
    y_train,
    eval_set=[(X_train, y_train)],  # Optional: add validation set to monitor training
    eval_metric='rmse',             # Specify RMSE for evaluation during training                 # Show training progress
)

# For scoring, ensure X_test is properly transformed and y_test is in the right format
X_test_transformed = np.array(basic_transform(X_test))
y_test_transformed = y_scaler.transform(y_test.values.reshape(-1, 1) if isinstance(y_test, pd.Series) else y_test.reshape(-1, 1))

#
# Calculate RMSE explicitly
from sklearn.metrics import mean_squared_error
import numpy as np
y_pred = y_scaler.inverse_transform(lgbm_model.predict(X_test_transformed).reshape(-1, 1))
rmse = np.sqrt(mean_squared_error(y_scaler.inverse_transform(y_test_transformed.reshape(-1, 1)), y_pred))
print(f"RMSE: {rmse}")



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015917 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1263
[LightGBM] [Info] Number of data points in the train set: 431238, number of used features: 71
[LightGBM] [Info] Start training from score -0.000000
RMSE: 10.424221545488448


In [116]:
test_df = pd.read_csv('test.csv')
test_df = test_df.dropna()
preds = y_scaler.inverse_transform(lgbm_model.predict(np.array(basic_transform(test_df))).reshape(-1, 1))
preds

KeyError: 'episode_title'