# Predicting Podcast Time
**Author: Benson Wainaina**

In this notebook, I aim to predict the amount of time someone listens to a podcast. I'll be exploring various techniques such as linear regression, k-nearest neighbors, support vector machines, decision trees, random forests, neural networks, and ensemble methods.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
# Use matplotlib's "bmh" style sheet for the figures drawn after this cell.
plt.style.use("bmh")

In [None]:
# Directory holding the CSV files (relative path).
data_path = "../../../Data/playground-series-s5e4/"

# train and test are indexed by their "id" column; the sample submission
# is read with the default integer index.
train, test = (
    pd.read_csv(data_path + file_name, index_col=["id"])
    for file_name in ("train.csv", "test.csv")
)
sample_submission = pd.read_csv(data_path + "sample_submission.csv")

Now that I have the **data** I want to quickly explore it to understand what I'm working with.

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Preview the first rows of the test data.
test.head()

In [None]:
# Preview the first rows of the sample submission file.
sample_submission.head()

In [None]:
# Column dtypes and non-null counts for the training data.
train.info()

In [None]:
# Column dtypes and non-null counts for the test data.
test.info()

In [None]:
# Summary statistics for the numeric columns of the training data.
train.describe()

In [None]:
# Summary (count / unique / top / freq) for the object-dtype columns of the training data.
train.describe(include="O")

In [None]:
# Summary statistics for the numeric columns of the test data.
test.describe()

In [None]:
# Summary (count / unique / top / freq) for the object-dtype columns of the test data.
test.describe(include="O")

**Quick Observations:**
- Some podcast episodes have a length of zero minutes.
- There seem to be outliers in the number of ads.
- Listening time has zero values.
- Episode length seems to differ by a huge margin between train and test.

## Exploratory Data Analysis

In [None]:
# Column groups used throughout the exploration and modelling cells.
numeric_att = [
    "Episode_Length_minutes",
    "Host_Popularity_percentage",
    "Guest_Popularity_percentage",
    "Number_of_Ads",
]
categorical_att = [
    "Podcast_Name",
    "Episode_Title",
    "Genre",
    "Publication_Day",
    "Publication_Time",
    "Episode_Sentiment",
]
# Kept as a one-element list so it can be concatenated with the lists above.
target_att = ["Listening_Time_minutes"]

In [None]:
# 2x2 grid: each numeric feature plotted against the target.
fig, ax = plt.subplots(2, 2, figsize=(10, 8))
target_name = target_att[0]
# ax.flat walks the grid row by row, matching the order of numeric_att.
for panel, feature in zip(ax.flat, numeric_att):
    panel.scatter(train[feature], train[target_name])
    panel.set_title(f"{feature} scatter plot.")
    panel.set_xlabel(f"{feature}")
    panel.set_ylabel("Listening_Time_minutes")

fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the numeric features and the target,
# shown as a heatmap.
corr_columns = [*numeric_att, *target_att]
corr_matrix = train[corr_columns].corr()

fig, ax = plt.subplots()
sns.heatmap(data=corr_matrix, ax=ax)
plt.show()

In [None]:
# Correlation of every column with Episode_Length_minutes, highest value first.
corr_matrix["Episode_Length_minutes"].sort_values(ascending=False)

In [None]:
# Training rows whose recorded episode length is exactly zero.
is_zero_length = train["Episode_Length_minutes"].eq(0)
zero_episode_length = train.loc[is_zero_length]
zero_episode_length

In [None]:
# Mean listening time per category value, one panel per categorical feature
# (Podcast_Name and Episode_Title are skipped via the [2:] slice).
# Categories listed here are drawn in this explicit order; the others keep
# the order produced by groupby.
display_order = {
    "Publication_Day": ["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"],
    "Publication_Time": ["Morning", "Afternoon", "Evening", "Night"],
}

fig, ax = plt.subplots(2, 2, figsize=(10, 10))
for panel, att in zip(ax.flat, categorical_att[2:]):
    result = train.groupby(att)["Listening_Time_minutes"].mean()
    if att in display_order:
        result = result[display_order[att]]
    x, y = result.index, result.values
    panel.plot(x, y, marker="o")
    panel.set_title(f"{att} plot.")
    panel.set_xlabel(f"{att}")
    panel.set_ylabel("Mean_Listening_Time_minutes")
    panel.tick_params(axis="x", rotation=90)

fig.tight_layout()
plt.show()

In [None]:
# Share of training rows with no guest popularity value, and their mean listening time.
guest_missing = train["Guest_Popularity_percentage"].isna()
no_guest = train.loc[guest_missing]
no_guest_pct = (len(no_guest) / len(train)) * 100
no_guest_mean = no_guest["Listening_Time_minutes"].mean()
print(
    f"{no_guest_pct:.2f}% of the podcasts didn't have a "
    f"guest and listening time was {no_guest_mean:.2f}"
)

In [None]:
# Share of training rows that do have a guest popularity value, and their mean listening time.
guest_present = train["Guest_Popularity_percentage"].notna()
yes_guest = train.loc[guest_present]
yes_guest_pct = (len(yes_guest) / len(train)) * 100
yes_guest_mean = yes_guest["Listening_Time_minutes"].mean()
print(
    f"{yes_guest_pct:.2f}% of the podcasts had a guest "
    f"and listening time was {yes_guest_mean:.2f}"
)

In [None]:
# Training rows with a missing episode length, and their mean listening time.
length_missing = train["Episode_Length_minutes"].isna()
no_episode_time = train.loc[length_missing]
null_length_mean = no_episode_time["Listening_Time_minutes"].mean()
print(f"Null Length episodes had listening time of {null_length_mean:.2f}")

In [None]:
# Object-column summary for the rows that lack an episode length.
no_episode_time.describe(include="O")

In [None]:
# Training rows with a recorded episode length, and their mean listening time.
length_present = train["Episode_Length_minutes"].notna()
yes_episode_time = train.loc[length_present]
known_length_mean = yes_episode_time["Listening_Time_minutes"].mean()
print(f"Episodes with length minutes have listening time of {known_length_mean:.2f}")

In [None]:
# Percentage of training rows whose episode length is missing.
missing_length_pct = (len(no_episode_time) / len(train)) * 100
print(f"{missing_length_pct:.2f}% of the podcasts didn't have episode length.")

In [None]:
# Object-column summary for the rows that do have an episode length.
yes_episode_time.describe(include="O")

In [None]:
# Number of training rows per genre, most frequent first, as a bar chart.
genre_counts = train["Genre"].value_counts()
genre_popularity = genre_counts.sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(genre_popularity.index, genre_popularity.values)
ax.set_title("Genre Popularity", size=20)
ax.set_ylabel("Genre Frequency")
ax.tick_params(axis="x", rotation=90)
plt.show()

In [None]:
# Mean episode length and mean listening time per genre, drawn as paired bars.
# The genres are placed in this explicit order along the x-axis.
genre_order = ["Sports", "Technology", "True Crime", "Lifestyle", "Comedy",
               "Business", "Health", "News", "Music", "Education"]
genre_episode_listening = (
    train.groupby("Genre")[["Episode_Length_minutes", "Listening_Time_minutes"]]
    .mean()
    .loc[genre_order]
)

fig, ax = plt.subplots(figsize=(10, 5))
x = np.arange(10)  # one x position per genre in genre_order
# Each series is shifted by half a bar width so the pair sits around the tick.
for column, label, offset in (
    ("Episode_Length_minutes", "Episode Length", -0.2),
    ("Listening_Time_minutes", "Listening Time", 0.2),
):
    ax.bar(x + offset, genre_episode_listening[column].values, label=label, width=0.4)

# Legend is anchored outside the axes, to the right of the plot.
ax.legend(loc="upper left", bbox_to_anchor=(1.1, 0.9))
ax.set_title("Episode Length vs Listening Time")
ax.set_xticks(x, genre_episode_listening.index)
ax.set_ylabel("Time in minutes")
ax.tick_params(axis="x", rotation=90)
# NOTE(review): 46.578394 is a hard-coded reference level whose origin is not
# shown in this part of the notebook - confirm what it represents and
# consider computing it from the data instead.
ax.axhline(y=46.578394)
plt.show()

In [None]:
# Working copy of the training rows that have an episode length and a
# non-zero listening time, so the ratio below never divides by zero.
#
# The two conditions are built separately on purpose: in Python `&` binds
# tighter than `!=`, so writing `mask & column != 0` without parentheses is
# parsed as `(mask & column) != 0`, i.e. the boolean mask is AND-ed with the
# raw float column instead of with the comparison result.
has_episode_length = train["Episode_Length_minutes"].notna()
has_listening_time = train["Listening_Time_minutes"] != 0
play_df = train.loc[has_episode_length & has_listening_time, :].copy()

# Ratio of episode length to listening time per row, averaged by genre.
play_df["Episode_Listening_Quotent"] = play_df["Episode_Length_minutes"]/play_df["Listening_Time_minutes"]
play_df.groupby("Genre")["Episode_Listening_Quotent"].mean()

*I had planned to use the correlation between Episode Length and Listening Time to fill the null values in Episode Length; however, since the test data doesn't have Listening Time, this idea is null and void.*

In [None]:
# Mean guest popularity per episode sentiment, highest first.
play_df.groupby("Episode_Sentiment")["Guest_Popularity_percentage"].mean().sort_values(ascending=False)

In [None]:
# Mean host popularity per episode sentiment, highest first.
play_df.groupby("Episode_Sentiment")["Host_Popularity_percentage"].mean().sort_values(ascending=False)

In [None]:
# Sentiment counts among rows that have a guest popularity value.
play_df.loc[~play_df["Guest_Popularity_percentage"].isna(), "Episode_Sentiment"].value_counts(ascending=False)

In [None]:
# Sentiment counts among rows whose guest popularity value is missing.
play_df.loc[play_df["Guest_Popularity_percentage"].isna(), "Episode_Sentiment"].value_counts(ascending=False)

In [None]:
def _episode_number(title):
    """Return the second whitespace-separated token of ``title`` as an int."""
    return int(title.split()[1])


# Overwrites Episode_Title in place with its numeric part. After the first run
# the column holds ints, so re-running this cell fails on `.split()`.
play_df["Episode_Title"] = play_df["Episode_Title"].apply(_episode_number)

# Correlation between the episode number and the episode length.
length_known = play_df["Episode_Length_minutes"].notna()
play_df.loc[length_known, ["Episode_Title", "Episode_Length_minutes"]].corr()

**Observations:**
- Episode length and listening time have a strong positive correlation.
- The instance with zero episode length seems to be incorrect, since its listening time is 9 minutes.
- Early days of the week seem to have higher podcast listening times.
- Having a guest slightly improved listening time.
- Some podcasts didn't have an episode length for some reason. These podcasts have a different listening time compared to podcasts that had an episode length.
- Genre popularity doesn't affect listening time.
- Episode sentiment does affect listening time.
- There seems to be a mismatch between genre and podcast name for some instances.
- Host popularity is linked to episode sentiment but guest popularity doesn't follow this trend.
- Looks like a guest is invited to a podcast based on episode sentiment. Negative sentiment podcasts tend to attract guests.

## Model Building

I'll be creating a lot of models, as mentioned earlier. To start off, I'll build a linear regression model. While building this model I want to understand how feature selection, feature engineering, and regularization affect predictions from this model.

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

#### Data Processing

Here, I prepare the data to ensure it's appropriate for model building as follows:
- Filling null values.
- Creating train, test, and validation data.
- Converting categorical variables to numeric.
- Scaling the data.

In [None]:
# Assumption: a missing guest popularity means the episode had no guest.
# Those rows get the sentinel value -1 in both train and test.
for frame in (train, test):
    frame["Guest_Popularity_percentage"] = frame["Guest_Popularity_percentage"].fillna(-1)

In [None]:
# Replace missing Number_of_Ads values in train with 0. Per the author, train
# has a single null here and 0 is the most frequent number of ads.
# Only train is filled in this cell.
train["Number_of_Ads"] = train["Number_of_Ads"].fillna(0)

In [None]:
# Impute missing Episode_Length_minutes with the mean episode length of the
# row's (Publication_Time, Publication_Day) group. The group means are learned
# from the training data only and reused for the test data.
group_keys = ["Publication_Time", "Publication_Day"]
publication_time_day = train.groupby(group_keys)["Episode_Length_minutes"].mean()

# Filling a group's nulls with that group's own mean leaves the mean unchanged,
# so recomputing the lookup after imputing train is unnecessary. The second
# name is kept so any later cell that refers to it still works.
publication_time_day_test = publication_time_day

for frame in (train, test):
    # Left-join the per-group mean onto every row (index and row order of
    # `frame` are preserved), then use it only where the length is missing.
    group_mean = frame[group_keys].join(
        publication_time_day.rename("group_mean"), on=group_keys
    )["group_mean"]
    frame["Episode_Length_minutes"] = frame["Episode_Length_minutes"].fillna(group_mean)

In [None]:
# Re-check dtypes and non-null counts of the test data after the fills above.
test.info()