In [68]:
# normal imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# cleanlab
from cleanlab.regression.learn import CleanLearning

In [69]:
train = pd.read_csv('train.csv')
train.columns = train.columns.str.lower()
train.head(1)

Unnamed: 0,id,podcast_name,episode_title,episode_length_minutes,genre,host_popularity_percentage,publication_day,publication_time,guest_popularity_percentage,number_of_ads,episode_sentiment,listening_time_minutes
0,0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,Positive,31.41998


In [70]:
# for kaggle 
# %load_ext cudf.pandas 
pd.set_option('display.max_columns', 500)

In [71]:
test = pd.read_csv('test.csv')
test.columns = test.columns.str.lower()
test.head(1)

Unnamed: 0,id,podcast_name,episode_title,episode_length_minutes,genre,host_popularity_percentage,publication_day,publication_time,guest_popularity_percentage,number_of_ads,episode_sentiment
0,750000,Educational Nuggets,Episode 73,78.96,Education,38.11,Saturday,Evening,53.33,1.0,Neutral


In [72]:
y_train = train["listening_time_minutes"]
train = train.drop(["listening_time_minutes"], axis=1)
combined = pd.concat([train, test])

In [73]:
# encode day cyclically  
days_dict = {
    "Monday": 0,
    "Tuesday": 1,
    "Wednesday": 2,
    "Thursday": 3,
    "Friday": 4,
    "Saturday": 5,
    "Sunday": 6
}
def periodic_encode(data, map_dict, num_periods):
    if map_dict is not None:    
        data = [map_dict[x] for x in data]
    return np.cos(2 * np.pi * np.array(data) / num_periods), np.sin(2 * np.pi * np.array(data) / num_periods) 

combined["day_cos"], combined["day_sin"] = periodic_encode(combined["publication_day"], days_dict, 7) 

In [74]:
# encode time cyclically
time_dict = {
    "Morning": 9, 
    "Afternoon": 12, 
    "Evening": 18,
    "Night": 22
}

combined["time_cos"], combined["time_sin"] = periodic_encode(combined["publication_time"], time_dict, 24)

In [75]:
# encode sentiment numerically 
sentiment_dict = {
    "Negative": -1, 
    "Neutral" : 0,
    "Positive": 1
}

combined["sentiment_num"] = combined["episode_sentiment"].map(sentiment_dict)

In [76]:
# encode epsiode title numerically
combined["episode_title"] = [int(x[8:]) for x in combined["episode_title"]]

In [77]:
# get categorical columns
cat_cols = combined.select_dtypes(include=['object']).columns
print(cat_cols)

Index(['podcast_name', 'genre', 'publication_day', 'publication_time',
       'episode_sentiment'],
      dtype='object')


In [78]:
# encode categorical data
dummies = pd.get_dummies(train, columns=cat_cols, dtype=int)
combined = pd.concat([combined, dummies], axis=1)

# drop unnecessary columns
combined = combined.drop(columns=cat_cols)

In [None]:
# Nan column 

In [79]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000000 entries, 0 to 249999
Data columns (total 89 columns):
 #   Column                            Non-Null Count    Dtype  
---  ------                            --------------    -----  
 0   id                                1000000 non-null  int64  
 1   episode_title                     1000000 non-null  int64  
 2   episode_length_minutes            884171 non-null   float64
 3   host_popularity_percentage        1000000 non-null  float64
 4   guest_popularity_percentage       805138 non-null   float64
 5   number_of_ads                     999999 non-null   float64
 6   day_cos                           1000000 non-null  float64
 7   day_sin                           1000000 non-null  float64
 8   time_cos                          1000000 non-null  float64
 9   time_sin                          1000000 non-null  float64
 10  sentiment_num                     1000000 non-null  int64  
 11  id                                1000000 n