Target Encoding and Feature Variances
=====================================

This notebook will explore variance within the Podcast Listening time dataset. Given that the dataset is so large, there is a possibility of identifying combinations of features and using target encoding to train the model.

In [1]:
# import libraries and set constants

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sklearn.preprocessing as pre
import sklearn.impute as imp
import sklearn.metrics as ms
import sklearn.tree as tr
import sklearn.model_selection as mds
import sklearn.ensemble as en


DATA_DIR = "../data/kaggle"
ORIG_DIR = "../data/original"

TARGET1 = "Listening_Time_minutes"
TARGET2 = "Episode_Completion_percentage"

In [2]:
# load data

df_train = pd.read_csv(f"{DATA_DIR}/train.csv")
df_test  = pd.read_csv(f"{DATA_DIR}/test.csv")

# add the target column for the test data
df_test[TARGET1] = np.nan

df_ = pd.concat([df_train, df_test])
print(f"Shape of train and test data: {df_.shape}")

Shape of train and test data: (1000000, 12)


# Fill in Missing Values

In [3]:
df_["Number_of_Ads"] = df_["Number_of_Ads"].fillna(0).copy()

# Create Completion Percentage Target

In [4]:
df_[ TARGET2 ] = df_[TARGET1] / df_["Episode_Length_minutes"]
df_[ TARGET2 ] = df_[ TARGET2 ].replace([np.inf, -np.inf], np.nan)

median_impute = imp.SimpleImputer(missing_values=np.nan, strategy='median')
median_impute.fit(df_[[TARGET2]])

df_[ TARGET2 ] = median_impute.transform(df_[[TARGET2]])

# Bins for Episode Length

In [5]:
df_length_filled = df_["Episode_Length_minutes"].fillna(0)

df_["Length_Minutes"] = (df_length_filled.astype(np.int64)).astype('category')
df_["Length_Minutes"].head(5)

0      0
1    119
2     73
3     67
4    110
Name: Length_Minutes, dtype: category
Categories (124, int64): [0, 1, 2, 3, ..., 120, 325, 7575, 78486264]

In [6]:
df_["Length_Seconds"] = ((df_length_filled * 60).astype(np.int64)).astype('category')
df_["Length_Seconds"].head(5)

0       0
1    7188
2    4434
3    4030
4    6630
Name: Length_Seconds, dtype: category
Categories (6916, int64): [0, 74, 88, 110, ..., 7255, 19514, 454500, 4709175840]

# Additional Bins

In [7]:
df_["Name_And_Episode"] = df_["Podcast_Name"] + " " + df_["Episode_Title"]
df_["Episode_Bin"] = df_["Name_And_Episode"].astype('category')

In [8]:
df_["Pub_Time"] = (df_["Publication_Day"] + df_["Publication_Time"] + df_["Podcast_Name"]).astype('category')

In [9]:
df_["Ads_And_Seconds"] = df_["Number_of_Ads"].astype(str) + " " + df_["Length_Seconds"].astype(str)
df_["Ads_And_Seconds"] = df_["Ads_And_Seconds"].astype('category')

df_["Ads_And_Minutes"] = df_["Number_of_Ads"].astype(str) + " " + df_["Length_Minutes"].astype(str)

df_host_popularity = (df_["Host_Popularity_percentage"] * 10).astype(int)
df_["HostPop_And_Minutes"] = df_host_popularity.astype(str) + " " + df_["Length_Minutes"].astype(str)

# Split Test and Train Data

In [10]:
df_.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000000 entries, 0 to 249999
Data columns (total 21 columns):
 #   Column                         Non-Null Count    Dtype   
---  ------                         --------------    -----   
 0   id                             1000000 non-null  int64   
 1   Podcast_Name                   1000000 non-null  object  
 2   Episode_Title                  1000000 non-null  object  
 3   Episode_Length_minutes         884171 non-null   float64 
 4   Genre                          1000000 non-null  object  
 5   Host_Popularity_percentage     1000000 non-null  float64 
 6   Publication_Day                1000000 non-null  object  
 7   Publication_Time               1000000 non-null  object  
 8   Guest_Popularity_percentage    805138 non-null   float64 
 9   Number_of_Ads                  1000000 non-null  float64 
 10  Episode_Sentiment              1000000 non-null  object  
 11  Listening_Time_minutes         750000 non-null   float64 
 12  Episod

In [11]:
# prep the cleaned training data to write
df_train_clean = df_[ df_[ TARGET1 ].notna() ].copy()

# prep the cleaned test data to write
df_test_clean = df_[ df_[ TARGET1 ].isna() ].copy()
df_test_clean.drop( TARGET1, axis=1, inplace=True )

# Target Encode the Bins

In [12]:
encoder = pre.TargetEncoder( cv=3 )

In [13]:
new_feature_list = []

for f in ["Length_Minutes", "Length_Seconds", "Episode_Bin", "Pub_Time", "Ads_And_Minutes"]:
    i=0
    
    for t in [TARGET1, TARGET2]:

        i += 1
        encoder.fit( df_train_clean[[f]], df_train_clean[t] )
        feature_name = f"TE{i}_{f}"
        df_train_clean[feature_name] = encoder.transform( df_train_clean[[f]] )
        df_test_clean[feature_name] = encoder.transform( df_test_clean[[f]] )

        new_feature_list.append(feature_name)

In [14]:
df_train_clean.head(5)

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,...,TE1_Length_Minutes,TE2_Length_Minutes,TE1_Length_Seconds,TE2_Length_Seconds,TE1_Episode_Bin,TE2_Episode_Bin,TE1_Pub_Time,TE2_Pub_Time,TE1_Ads_And_Minutes,TE2_Ads_And_Minutes
0,0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,...,43.149177,0.698387,43.149177,0.698387,38.762954,0.656602,47.796859,0.688175,46.914682,0.698387
1,1,Joke Junction,Episode 26,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,...,90.204426,0.754779,87.336579,0.731069,43.306115,0.684215,44.051021,0.679131,88.413416,0.739964
2,2,Study Sessions,Episode 16,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,...,52.73837,0.717923,53.511654,0.72407,53.760709,0.700617,46.241317,0.678773,54.712794,0.74485
3,3,Digital Digest,Episode 45,67.17,Technology,57.22,Monday,Morning,78.7,2.0,...,48.231742,0.714823,46.787632,0.696531,53.295212,0.714867,45.097041,0.687802,47.601645,0.705516
4,4,Mind & Body,Episode 86,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,...,78.389098,0.709666,86.177368,0.780497,45.500195,0.693175,49.420869,0.741055,72.731342,0.658564


# Check Variability of New Features

In [15]:
print(new_feature_list)

['TE1_Length_Minutes', 'TE2_Length_Minutes', 'TE1_Length_Seconds', 'TE2_Length_Seconds', 'TE1_Episode_Bin', 'TE2_Episode_Bin', 'TE1_Pub_Time', 'TE2_Pub_Time', 'TE1_Ads_And_Minutes', 'TE2_Ads_And_Minutes']


In [16]:
stats_df = pd.DataFrame()
#stats_funcs = [ np.unique ]
#stats_df["col_name"] = new_feature_list
#stats_df["nunique", "variance"] = None

for f in new_feature_list:

    #variance = df_train_clean[f].var()
    #unique = df_train_clean[f].nunique()
    #print(f"{f}: {unique}, {variance}")

    stats_df.at[f, "nunique"] = df_train_clean[f].nunique()
    stats_df.at[f, "variance"] = df_train_clean[f].var()
    stats_df.at[f, "std"] = df_train_clean[f].std()
    stats_df.at[f, "mean"] = df_train_clean[f].mean()
    stats_df.at[f, "median"] = df_train_clean[f].mean()
    stats_df.at[f, "min"] = df_train_clean[f].min()
    stats_df.at[f, "max"] = df_train_clean[f].max()

stats_df.sort_values(by="nunique", ascending=False)

Unnamed: 0,nunique,variance,std,mean,median,min,max
TE2_Length_Seconds,6909.0,0.033408,0.182778,0.683267,0.683267,0.038844,94.177419
TE1_Length_Seconds,6908.0,556.12025,23.582202,45.413698,45.413698,0.255305,119.73
TE1_Episode_Bin,4800.0,13.422385,3.663657,45.436502,45.436502,25.423071,65.323744
TE2_Episode_Bin,4800.0,0.000609,0.024672,0.682834,0.682834,0.371216,0.844036
TE1_Pub_Time,1344.0,4.765441,2.182989,45.437383,45.437383,35.700667,51.361138
TE2_Pub_Time,1344.0,0.000284,0.016842,0.682975,0.682975,0.552753,0.808068
TE2_Ads_And_Minutes,478.0,0.020666,0.143757,0.6829,0.6829,0.155748,94.177419
TE1_Ads_And_Minutes,476.0,560.502072,23.674925,45.435916,45.435916,0.855854,119.688319
TE1_Length_Minutes,120.0,554.219029,23.541857,45.436952,45.436952,1.494666,110.680653
TE2_Length_Minutes,120.0,0.004958,0.070411,0.682625,0.682625,0.197761,0.933313


# Build the Model

In [17]:
columns_to_drop = ["id", TARGET1, TARGET2, "Episode_Length_minutes", "Guest_Popularity_percentage" ]
float_cols = [i for i in df_train_clean.columns if df_train_clean[i].dtype == "float64" ]
x_colnames = [i for i in float_cols if i not in columns_to_drop]

X = df_train_clean[ x_colnames ]
y = df_train_clean[ TARGET1 ].values

print(X.info())

<class 'pandas.core.frame.DataFrame'>
Index: 750000 entries, 0 to 749999
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Host_Popularity_percentage  750000 non-null  float64
 1   Number_of_Ads               750000 non-null  float64
 2   TE1_Length_Minutes          750000 non-null  float64
 3   TE2_Length_Minutes          750000 non-null  float64
 4   TE1_Length_Seconds          750000 non-null  float64
 5   TE2_Length_Seconds          750000 non-null  float64
 6   TE1_Episode_Bin             750000 non-null  float64
 7   TE2_Episode_Bin             750000 non-null  float64
 8   TE1_Pub_Time                750000 non-null  float64
 9   TE2_Pub_Time                750000 non-null  float64
 10  TE1_Ads_And_Minutes         750000 non-null  float64
 11  TE2_Ads_And_Minutes         750000 non-null  float64
dtypes: float64(12)
memory usage: 74.4 MB
None


In [18]:
# Assuming you have your features in X and your target variable in y
X_train, X_test, y_train, y_test = mds.train_test_split(X, y, test_size=0.2)

reg = en.GradientBoostingRegressor( n_estimators=9,
                                    learning_rate=0.3,
                                    criterion='squared_error' )
reg.fit(X_train, y_train)

In [19]:
y_train_pred = reg.predict(X_train)

train_rmse = ms.root_mean_squared_error(y_train, y_train_pred)
print(f"Train Set Mean Squared Error: {train_rmse:.4f}")

Train Set Mean Squared Error: 13.1938


In [20]:
importances = reg.feature_importances_
feature_imp_df = pd.DataFrame(
    {'Feature': x_colnames, 'Gini Importance': importances} ).sort_values(
         'Gini Importance', ascending=False)

feature_imp_df

Unnamed: 0,Feature,Gini Importance
10,TE1_Ads_And_Minutes,0.614359
4,TE1_Length_Seconds,0.37967
0,Host_Popularity_percentage,0.002367
11,TE2_Ads_And_Minutes,0.001283
5,TE2_Length_Seconds,0.0011
6,TE1_Episode_Bin,0.000617
7,TE2_Episode_Bin,0.000389
1,Number_of_Ads,0.000216
2,TE1_Length_Minutes,0.0
3,TE2_Length_Minutes,0.0


In [21]:
y_pred = reg.predict(X_test)

rmse = ms.root_mean_squared_error(y_test, y_pred)
print(f"Test Set Mean Squared Error: {rmse:.4f}")

Test Set Mean Squared Error: 13.1997


# Write Submission File

In [22]:
# Write predictions to submissions file
df_test_clean["Listening_Time_minutes"] = reg.predict(df_test_clean[x_colnames])
df_test_clean[["id", "Listening_Time_minutes"]].to_csv("../data/kaggle/gb_submission.csv", index=False)