In [1]:
import pandas as pd
import statsmodels.api as sm

# Load Data

In [2]:
# Locations of the exported model predictions and the joined validation data
caption_file_path = 'data_exports/recommendations_caption_val_full.csv'
random_file_path = 'data_exports/recommendations_random_val_full.csv'
ncf_file_path = 'data_exports/w_clustering_batch_size512_num_epochs20_lr0_001_embedding_dim64.csv'
val_file_path = 'data_exports/joined_val_data.csv'

# Load each CSV into its own DataFrame; files are read in the order listed.
# NOTE(review): random_df is not referenced by the cells shown here -- confirm it is still needed.
caption_df, random_df, ncf_df, val_df = (
    pd.read_csv(csv_path)
    for csv_path in (caption_file_path, random_file_path, ncf_file_path, val_file_path)
)

In [3]:
# Keep only the interaction keys and the observed target from the validation data
val_df = val_df.loc[:, ['user_id', 'video_id', 'watch_ratio']]

# Quick look at the first rows to confirm the selection
print(val_df.head())

   user_id  video_id  watch_ratio
0       14      8825     0.734829
1       14      2739     1.081279
2       14      7328     0.430886
3       14      2677     0.655448
4       14      8773     1.130397


# Filter Prediction Datasets Based on Validation Set

The goal is to align the predictions from each model with the corresponding entries in the validation set. Specifically, we want to ensure that for each (`user_id`, `video_id`) pair in `val_df`, we have the corresponding predictions from each model being compared. This alignment is crucial for an accurate regression analysis to assess the significance and contribution of each model's predictions.

In [4]:
# Extract unique (user_id, video_id) pairs from the validation set
val_pairs = val_df[['user_id', 'video_id']].drop_duplicates()

# Left-merge onto the validation pairs: every pair is kept, and pairs a model
# did not score come back with NaN in that model's columns.
filtered_ncf_df = val_pairs.merge(ncf_df, on=['user_id', 'video_id'], how='left')
filtered_caption_df = val_pairs.merge(caption_df, on=['user_id', 'video_id'], how='left')

# Guard against silent row fan-out: since val_pairs has unique keys, a left
# merge returns more rows than val_pairs only when a prediction file holds
# several rows for the same (user_id, video_id) pair.
assert len(filtered_ncf_df) == len(val_pairs), \
    "ncf_df has more than one prediction for some (user_id, video_id) pair"
assert len(filtered_caption_df) == len(val_pairs), \
    "caption_df has more than one prediction for some (user_id, video_id) pair"

In [5]:
# (rows, columns) of the validation frame after column selection
val_df.shape

(1376299, 3)

In [6]:
# (rows, columns) of the NCF predictions after the left merge onto the validation pairs
filtered_ncf_df.shape

(1376299, 4)

In [7]:
# (rows, columns) of the caption-based predictions after the left merge onto the validation pairs
filtered_caption_df.shape

(1376299, 4)

# Merge Predictions with the Validation Set

Next, we'll merge the filtered prediction DataFrames with `val_df` on both `user_id` and `video_id`. This ensures that we're comparing each prediction with the actual value for the same interaction.

In [8]:
# Attach each model's predicted watch_ratio to the observed validation rows.
# Both joins are left joins on (user_id, video_id), so every validation row is
# kept; the suffix is applied only to the incoming prediction column.
merged_df = (
    val_df
    # Neural Collaborative Filtering predictions -> 'watch_ratio_ncf'
    .merge(
        filtered_ncf_df[['user_id', 'video_id', 'watch_ratio']],
        how='left',
        on=['user_id', 'video_id'],
        suffixes=('', '_ncf'),
    )
    # Caption-based Video Filtering predictions -> 'watch_ratio_caption'
    .merge(
        filtered_caption_df[['user_id', 'video_id', 'watch_ratio']],
        how='left',
        on=['user_id', 'video_id'],
        suffixes=('', '_caption'),
    )
    # The observed column is the only one still carrying the bare name
    .rename(columns={'watch_ratio': 'watch_ratio_val'})
)

In [9]:
# Count NaNs per column after the two left merges
print("Missing values after merging", merged_df.isna().sum(), sep="\n")

Missing values after merging
user_id                     0
video_id                    0
watch_ratio_val             0
watch_ratio_ncf        801810
watch_ratio_caption         0
dtype: int64


Since the Neural Collaborative Filtering model (`ncf_df`) only predicts for videos present in the training set, many (`user_id`, `video_id`) pairs in `val_df` have no corresponding NCF prediction: 801,810 rows in the count above.

In contrast, the Caption-based Video Filtering (`caption_df`) model predicts for all videos, so it should have no missing values in its predictions.

Given the substantial number of missing ncf predictions, it's prudent to remove rows with missing `watch_ratio_ncf` to ensure that our regression analysis only includes interactions with available ncf predictions. This approach maintains the integrity of our analysis by focusing on relevant and comparable data points.

In [10]:
# Keep only the rows that have an NCF prediction: drop every row whose
# 'watch_ratio_ncf' is NaN. Other columns are not considered by this filter.
merged_df = merged_df.dropna(subset=['watch_ratio_ncf'])

In [11]:
# Re-count NaNs per column after dropping the rows without an NCF prediction
print("Missing values after merging and removing:", merged_df.isna().sum(), sep="\n")

Missing values after merging and removing:
user_id                0
video_id               0
watch_ratio_val        0
watch_ratio_ncf        0
watch_ratio_caption    0
dtype: int64


# Prepare Data for Regression

We'll set up the independent variables (predictions from the models) and the dependent variable (`watch_ratio_val`) for the regression analysis.

In [12]:
# Dependent variable: the observed watch ratio from the validation data
y1 = merged_df['watch_ratio_val']

# Independent variables: the two models' predicted watch ratios
X1 = merged_df.loc[:, ['watch_ratio_ncf', 'watch_ratio_caption']]

# Run the Regression Analysis

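Using `statsmodels`, we'll perform an Ordinary Least Squares (OLS) regression to assess the relationship between the model predictions and the actual `watch_ratio`. Note that no constant column is added to the regressors, so the model is fitted without an intercept and the summary reports an uncentered R-squared.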

In [13]:
# Fit Regression 1: observed watch ratio on the two models' predictions.
# NOTE(review): X1 has no constant column, so no intercept is estimated and the
# summary reports uncentered R-squared -- confirm that this is intended.
model1 = sm.OLS(endog=y1, exog=X1).fit()

# Print the header line followed by the full results table
print("\nRegression: watch_ratio_ncf and watch_ratio_caption", model1.summary(), sep="\n")


Regression: watch_ratio_ncf and watch_ratio_caption
                                 OLS Regression Results                                
Dep. Variable:        watch_ratio_val   R-squared (uncentered):                   0.720
Model:                            OLS   Adj. R-squared (uncentered):              0.720
Method:                 Least Squares   F-statistic:                          7.378e+05
Date:                Wed, 13 Nov 2024   Prob (F-statistic):                        0.00
Time:                        11:13:46   Log-Likelihood:                     -4.5851e+05
No. Observations:              574489   AIC:                                  9.170e+05
Df Residuals:                  574487   BIC:                                  9.170e+05
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                          coef    std err          t      P>|t|    

In [14]:
# Access the p-values of the fitted coefficients (one entry per regressor)
model1.pvalues

watch_ratio_ncf        0.0
watch_ratio_caption    0.0
dtype: float64