# Initial non linear model (xgb)

In [3]:
#pip install --upgrade pyarrow pandas scikit-learn

In [6]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error
from sklearn import preprocessing 
import sklearn as sk
import xgboost as xgb

In [7]:
# Load DataFrames
Podcast_Train_df = pd.read_csv('Data/train.csv')
Podcast_Test_df = pd.read_csv('Data/test.csv')

In [8]:
# to make the preprocessing at the same time
Podcast_Train_df['is_train'] = 1
Podcast_Test_df['is_train'] = 0

In [9]:
Podcast_Train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 13 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           750000 non-null  int64  
 1   Podcast_Name                 750000 non-null  object 
 2   Episode_Title                750000 non-null  object 
 3   Episode_Length_minutes       662907 non-null  float64
 4   Genre                        750000 non-null  object 
 5   Host_Popularity_percentage   750000 non-null  float64
 6   Publication_Day              750000 non-null  object 
 7   Publication_Time             750000 non-null  object 
 8   Guest_Popularity_percentage  603970 non-null  float64
 9   Number_of_Ads                749999 non-null  float64
 10  Episode_Sentiment            750000 non-null  object 
 11  Listening_Time_minutes       750000 non-null  float64
 12  is_train                     750000 non-null  int64  
dtyp

In [10]:
Podcast_Test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 12 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           250000 non-null  int64  
 1   Podcast_Name                 250000 non-null  object 
 2   Episode_Title                250000 non-null  object 
 3   Episode_Length_minutes       221264 non-null  float64
 4   Genre                        250000 non-null  object 
 5   Host_Popularity_percentage   250000 non-null  float64
 6   Publication_Day              250000 non-null  object 
 7   Publication_Time             250000 non-null  object 
 8   Guest_Popularity_percentage  201168 non-null  float64
 9   Number_of_Ads                250000 non-null  float64
 10  Episode_Sentiment            250000 non-null  object 
 11  is_train                     250000 non-null  int64  
dtypes: float64(4), int64(2), object(6)
memory usage: 22.9+ MB


In [11]:
Podcast_df = pd.concat([Podcast_Train_df,Podcast_Test_df])

In [12]:
Podcast_df

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,is_train
0,0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,Positive,31.41998,1
1,1,Joke Junction,Episode 26,119.80,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01241,1
2,2,Study Sessions,Episode 16,73.90,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.92531,1
3,3,Digital Digest,Episode 45,67.17,Technology,57.22,Monday,Morning,78.70,2.0,Positive,46.27824,1
4,4,Mind & Body,Episode 86,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral,75.61031,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
249995,999995,Mind & Body,Episode 100,21.05,Health,65.77,Saturday,Evening,96.40,3.0,Negative,,0
249996,999996,Joke Junction,Episode 85,85.50,Comedy,41.47,Saturday,Night,30.52,2.0,Negative,,0
249997,999997,Joke Junction,Episode 63,12.11,Comedy,25.92,Thursday,Evening,73.69,1.0,Neutral,,0
249998,999998,Market Masters,Episode 46,113.46,Business,43.47,Friday,Night,93.59,3.0,Positive,,0


### The features 
* 'Episode_Length_minutes' has NaN values (average this time)
* 'Guest_Popularity_percentage' has NaN values "no guest" (=0 this time)
* 'Number_of_Ads' has one NaN, but also error values. (goes only from 0 to 3 'int'. NaN by 0, missing values by 1)
* xgb accepts categorical features. 

In [13]:
Podcast_df["Number_of_Ads"].value_counts(dropna=False)

Number_of_Ads
0.00       290455
1.00       285084
3.00       213729
2.00       210720
103.25          2
53.37           1
NaN             1
103.91          1
103.00          1
53.42           1
103.75          1
12.00           1
103.88          1
89.12           1
2063.00         1
Name: count, dtype: int64

In [14]:
Podcast_df["Number_of_Ads"] = Podcast_df["Number_of_Ads"].fillna(0)

In [15]:
Podcast_df["Number_of_Ads"].value_counts(dropna=False)

Number_of_Ads
0.00       290456
1.00       285084
3.00       213729
2.00       210720
103.25          2
53.37           1
103.91          1
103.00          1
53.42           1
103.75          1
12.00           1
103.88          1
89.12           1
2063.00         1
Name: count, dtype: int64

In [16]:
# Define the list of correct values
correct_values = [0.00, 1.00, 2.00, 3.00]

# Create a boolean mask to identify erroneous values
# Values that are NOT in the correct list 
erroneous_mask = ~Podcast_df["Number_of_Ads"].isin(correct_values) 
Podcast_df.loc[erroneous_mask, "Number_of_Ads"] = 1.00


In [17]:
Podcast_df["Number_of_Ads"].value_counts(dropna=False)

Number_of_Ads
0.0    290456
1.0    285095
3.0    213729
2.0    210720
Name: count, dtype: int64

In [18]:
# Change episode number to a number
Podcast_df["Episode_Number"] = Podcast_df["Episode_Title"].str.extract(r"(\d+)").astype(float)
Podcast_df['Guest_Popularity_percentage'] = Podcast_df['Guest_Popularity_percentage'].fillna(0) 

In [19]:
Podcast_df["Episode_Length_minutes"] = Podcast_df["Episode_Length_minutes"].fillna(Podcast_df["Episode_Length_minutes"].median())

In [20]:
Podcast_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000000 entries, 0 to 249999
Data columns (total 14 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   id                           1000000 non-null  int64  
 1   Podcast_Name                 1000000 non-null  object 
 2   Episode_Title                1000000 non-null  object 
 3   Episode_Length_minutes       1000000 non-null  float64
 4   Genre                        1000000 non-null  object 
 5   Host_Popularity_percentage   1000000 non-null  float64
 6   Publication_Day              1000000 non-null  object 
 7   Publication_Time             1000000 non-null  object 
 8   Guest_Popularity_percentage  1000000 non-null  float64
 9   Number_of_Ads                1000000 non-null  float64
 10  Episode_Sentiment            1000000 non-null  object 
 11  Listening_Time_minutes       750000 non-null   float64
 12  is_train                     1000000 non-null  i

In [21]:
# encode categorical
cat_cols =["Podcast_Name", "Genre", "Publication_Day", "Publication_Time", "Episode_Sentiment"]
for col in cat_cols:
    Podcast_df[col] = Podcast_df[col].astype('category')

In [22]:
Podcast_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000000 entries, 0 to 249999
Data columns (total 14 columns):
 #   Column                       Non-Null Count    Dtype   
---  ------                       --------------    -----   
 0   id                           1000000 non-null  int64   
 1   Podcast_Name                 1000000 non-null  category
 2   Episode_Title                1000000 non-null  object  
 3   Episode_Length_minutes       1000000 non-null  float64 
 4   Genre                        1000000 non-null  category
 5   Host_Popularity_percentage   1000000 non-null  float64 
 6   Publication_Day              1000000 non-null  category
 7   Publication_Time             1000000 non-null  category
 8   Guest_Popularity_percentage  1000000 non-null  float64 
 9   Number_of_Ads                1000000 non-null  float64 
 10  Episode_Sentiment            1000000 non-null  category
 11  Listening_Time_minutes       750000 non-null   float64 
 12  is_train                     10000

In [23]:
# Drop unused
Podcast_df.drop(columns=["Episode_Title"], inplace=True)

In [24]:
# Split back Podcast_df
Podcast_Train_df = Podcast_df[Podcast_df['is_train']== 1].drop(columns=["is_train"])
Podcast_Test_df = Podcast_df[Podcast_df['is_train']== 0].drop(columns=["is_train", "Listening_Time_minutes"])

In [25]:
# Model introduction.
X = Podcast_Train_df.drop(columns=["Listening_Time_minutes", "id"])
y = Podcast_Train_df["Listening_Time_minutes"]

In [26]:
model = xgb.XGBRegressor(
        max_depth=6,  
        colsample_bytree=0.5, 
        subsample=0.8,  
        n_estimators=10_000,  
        learning_rate=0.02,  
        enable_categorical=True,
        min_child_weight=10,
    )

In [27]:
model.fit(X,y)

AttributeError: 'super' object has no attribute '__sklearn_tags__'

AttributeError: 'super' object has no attribute '__sklearn_tags__'

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.5, device=None, early_stopping_rounds=None,
             enable_categorical=True, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.02, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=6, max_leaves=None,
             min_child_weight=10, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=10000, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)

In [28]:
y_hat = model.predict(X)

In [27]:
root_mean_squared_error(y, y_hat)

np.float64(12.08390804635549)

In [28]:
# test submission
X_test = Podcast_Test_df.drop(columns = ['id'])

In [29]:
prediction = model.predict(X_test)

In [30]:
Podcast_Test_df['prediction']=prediction

In [31]:
Podcast_Test_df

Unnamed: 0,id,Podcast_Name,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Episode_Number,prediction
0,750000,Educational Nuggets,78.96,Education,38.11,Saturday,Evening,53.33,1.0,Neutral,73.0,53.445232
1,750001,Sound Waves,27.87,Music,71.29,Sunday,Morning,0.00,0.0,Neutral,23.0,17.593576
2,750002,Joke Junction,69.10,Comedy,67.89,Friday,Evening,97.51,0.0,Positive,11.0,49.516323
3,750003,Comedy Corner,115.39,Comedy,23.40,Sunday,Morning,51.75,2.0,Positive,73.0,77.965012
4,750004,Life Lessons,72.32,Lifestyle,58.10,Wednesday,Morning,11.30,2.0,Neutral,50.0,47.876492
...,...,...,...,...,...,...,...,...,...,...,...,...
249995,999995,Mind & Body,21.05,Health,65.77,Saturday,Evening,96.40,3.0,Negative,100.0,13.814168
249996,999996,Joke Junction,85.50,Comedy,41.47,Saturday,Night,30.52,2.0,Negative,85.0,58.153175
249997,999997,Joke Junction,12.11,Comedy,25.92,Thursday,Evening,73.69,1.0,Neutral,63.0,7.422309
249998,999998,Market Masters,113.46,Business,43.47,Friday,Night,93.59,3.0,Positive,46.0,76.507927


In [32]:
Submission = Podcast_Test_df[['id','prediction']]

In [None]:
Submission

In [33]:
Submission.to_csv('Data/Submission_Jor.xgb.csv', index=False)

In [2]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

In [29]:
num_cols = X.select_dtypes(include=['number']).columns.tolist()
cat_cols = X.select_dtypes(include=['category', 'object']).columns.tolist()

In [None]:
X_encoded = pd.get_dummies(X, columns=cat_cols)
X_test_encoded = pd.get_dummies(X_test, columns=cat_cols)

X_encoded, X_test_encoded = X_encoded.align(X_test_encoded, join='left', axis=1, fill_value=0)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)
X_test_scaled = scaler.transform(X_test_encoded)


In [None]:

# --- Fit SVR Models as before ---
from sklearn.svm import SVR

svr_rbf = SVR(kernel="rbf", C=100, gamma=0.1, epsilon=0.1)
svr_rbf.fit(X_scaled, y)
y_pred_rbf = svr_rbf.predict(X_scaled)

from sklearn.metrics import root_mean_squared_error

rbf_rmse = root_mean_squared_error(y, y_pred_rbf)
print(f"SVR RBF Kernel RMSE: {rbf_rmse:.4f}")

# --- Predict on test data ---
Podcast_Test_df['prediction'] = svr_rbf.predict(X_test_scaled)

Submission = Podcast_Test_df[['id', 'prediction']]
Submission.to_csv('Data/Submission_Jor_SVR.csv', index=False)