In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings as w
w.filterwarnings('ignore')

In [4]:
train=pd.read_csv("../Dataset/podcast_listen_time_train.csv")
test=pd.read_csv("../Dataset/podcast_listen_time_test.csv")

In [5]:
# For train
train['Number_of_Ads'].fillna(train['Number_of_Ads'].mean(),inplace=True)
train['Guest_Popularity_percentage'].fillna(train['Guest_Popularity_percentage'].mean(),inplace=True)
train['Episode_Length_minutes'].fillna(train['Episode_Length_minutes'].median(),inplace=True)
train.drop(columns=['id'], inplace=True)
# For test
test['Number_of_Ads'].fillna(test['Number_of_Ads'].mean(),inplace=True)
test['Guest_Popularity_percentage'].fillna(test['Guest_Popularity_percentage'].mean(),inplace=True)
test['Episode_Length_minutes'].fillna(test['Episode_Length_minutes'].median(),inplace=True)
test.drop(columns=['id'], inplace=True)

In [6]:
num_cols=train.select_dtypes(include=(['int64','float64'])).columns.tolist()
cat_cols=train.select_dtypes(include=(['object'])).columns.tolist()

In [7]:
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline,make_pipeline
import lightgbm as lgb
from sklearn.metrics import mean_squared_error,make_scorer,r2_score
target='Listening_Time_minutes'
train.drop(columns=['id'],inplace=True,errors='ignore')
x=train.drop(target,axis=1)
y=train[target]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

num_cols=x.select_dtypes(include=(['int64','float64'])).columns.tolist()
cat_cols=x.select_dtypes(include=(['object'])).columns.tolist()
num_pipeline=Pipeline([('Impute',SimpleImputer(strategy='mean')),
                       ('scaler',StandardScaler())])

cat_pipeline=Pipeline([('Impute',SimpleImputer(strategy='most_frequent')),
                       ('scaler',OneHotEncoder(handle_unknown='ignore',drop='first',
                                              sparse_output=False))])

col_transformer=ColumnTransformer([('num',num_pipeline,num_cols),
                        ('cat',cat_pipeline,cat_cols)])

lg=lgb.LGBMRegressor(n_estimators=6000,
    max_depth=15,
    learning_rate=0.1,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=6,
    n_jobs=-1,
    verbose=-1)
model=Pipeline([('pre',col_transformer),
               ('lg',lg)])
model.fit(x_train,y_train)
y_pred=model.predict(x_test)
print(f'MSE: {mean_squared_error(y_test,y_pred) :.2f}')
print(f'R2 score {r2_score(y_test,y_pred) * 100 :.2f}')
rmsc=np.sqrt(mean_squared_error(y_test,y_pred))
print(f'RMSC = {rmsc :.4f}')
for actual,pred in zip(y_test[:10],y_pred[:10]):
    print(f'Actual: {actual :.2f}   | Predicted: {pred :.2f}')

MSE: 164.80
R2 score 77.60
RMSC = 12.8374
Actual: 20.92   | Predicted: 32.19
Actual: 42.66   | Predicted: 37.40
Actual: 43.30   | Predicted: 36.05
Actual: 48.56   | Predicted: 48.77
Actual: 48.54   | Predicted: 53.14
Actual: 61.73   | Predicted: 61.62
Actual: 56.90   | Predicted: 53.55
Actual: 29.46   | Predicted: 32.01
Actual: 45.41   | Predicted: 32.99
Actual: 55.05   | Predicted: 68.47


In [8]:
test_predictions = model.predict(test)

test_ids = pd.read_csv('../Dataset/podcast_listen_time_test.csv')['id']
# Prepare submission DataFrame
submission = pd.DataFrame({
    'id': test_ids,
    'Listening_Time_minutes': test_predictions
})

# Save to CSV for submission
submission.to_csv('submission.csv', index=False)