In [None]:
import boto3
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
from scipy.sparse import csr_matrix, coo_matrix
from scipy.spatial.distance import cdist, cosine
from sklearn.metrics import mean_absolute_error, mean_squared_error
import sys
import statsmodels.api as sm
from statsmodels.regression.quantile_regression import QuantReg
from statsmodels.stats.outliers_influence import variance_inflation_factor
import re
import shap
import xgboost as xgb

In [None]:
# Directory holding the pipeline's JSON output. The data was pulled down locally
# since I generated it; the source of truth is stored in S3.
json_dir = Path("../data_pipeline/output/data")

# Load every `.json` file in the target directory into one flat list of records.
# Files are visited in sorted order so the resulting row order (and everything
# derived from it) is reproducible across machines and runs.
video_data = []
for json_file in sorted(json_dir.glob("*.json")):
    print(json_file.name)
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(json_file, encoding="utf-8") as f:
        # Each file is expected to hold a list of records; extend in place
        # instead of rebuilding the whole list for every file.
        video_data.extend(json.load(f))

In [None]:
# Build the raw video frame and derive the time-based columns used downstream.
videos_df = pd.DataFrame(video_data)
videos_df['publish_time'] = pd.to_datetime(videos_df['publish_time'])
videos_df['year'] = videos_df['publish_time'].dt.year
videos_df['posted_day'] = videos_df['publish_time'].dt.day_name()

# Keep only rows with a view count. The explicit copy makes the filtered frame
# independent, so the column assignments below do not write into a slice
# (which triggers SettingWithCopyWarning).
videos_df = videos_df[videos_df['view_count'].notna()].copy()
videos_df['view_count'] = videos_df['view_count'].astype(int)

# Percentile rank of each video's view count among videos published the same year.
videos_df['quantile'] = videos_df.groupby('year')['view_count'].rank(pct=True)
videos_df.head()

In [None]:
# Distribution of raw view counts across all videos, with a log-scaled count axis.
hist_fig, hist_ax = plt.subplots()
hist_ax.hist(videos_df['view_count'], log=True)
hist_ax.set_title("View Count Frequencies")
plt.show()

In [None]:
# One log-scaled view-count histogram per publication year, laid out on a grid.
N_COL = 5
# Years left out of the grid. NOTE(review): 2005 was skipped in the original
# code without explanation -- presumably too few videos; confirm.
SKIP_YEARS = {2005}

plot_years = [y for y in sorted(videos_df['year'].unique()) if y not in SKIP_YEARS]
# Size the grid from the data so more than 10 years no longer overruns a fixed
# 2x5 layout. Keeping at least two rows preserves the original layout and keeps
# `ax` two-dimensional.
N_ROW = max(2, -(-len(plot_years) // N_COL))
fig, ax = plt.subplots(N_ROW, N_COL, figsize=(10, 3 * N_ROW))

panels = ax.ravel()
for panel, y in zip(panels, plot_years):
    panel.hist(videos_df.loc[videos_df['year'] == y, 'view_count'], log=True)
    panel.set_title(y)
# Hide grid cells that have no year to show.
for panel in panels[len(plot_years):]:
    panel.set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# Summary statistics of view counts per publication year, ordered by year.
views_by_year = videos_df.groupby('year')['view_count']
views_by_year.describe().sort_index()

In [None]:
previous_columns = videos_df.columns
# DROP years with fewer than 1000 videos in the data set. This should fall away as the dataset grows.
year_counts = videos_df['year'].value_counts()
keep_years = sorted(year_counts[year_counts >= 1000].index)
model_df = videos_df[videos_df['year'].isin(keep_years)].copy()

# Create indicator (0/1) variables for labels that appear often enough.
# NOTE(review): the vocabulary is read from a machine-specific location; consider
# moving the file next to the rest of the project data.
vocabulary = pd.read_csv("~/Downloads/vocabulary.csv").set_index("Index")
label_mapping = vocabulary['Name'].to_dict()
all_labels = [label for sublist in model_df['labels'] for label in sublist]
label_counts = pd.Series(all_labels).value_counts()
frequent_labels = label_counts[label_counts >= 200].index

# Build every indicator column first and attach them in a single concat. The
# original inserted and renamed columns one at a time inside the loop, which
# re-ran the full rename on every iteration and fragmented the frame.
label_indicators = pd.DataFrame(
    {
        label: model_df['labels'].apply(lambda labels, label=label: int(label in labels))
        for label in frequent_labels
    },
    index=model_df.index,
)
# Rename the columns using the text labels instead of numeric IDs. A label that
# is missing from the vocabulary raises KeyError, as before.
label_indicators = label_indicators.rename(
    columns={label: label_mapping[label] for label in frequent_labels}
)
model_df = pd.concat([model_df, label_indicators], axis=1)

# Columns added above, in insertion order. A set difference would also work but
# its ordering changes between kernel sessions, which changes the feature order
# seen by the models.
_previous_column_set = set(previous_columns)
label_columns = [c for c in dict.fromkeys(model_df.columns) if c not in _previous_column_set]

In [None]:
# Heatmap of pairwise correlations between the label indicator columns;
# some of these features are very strongly correlated.
corr_fig, corr_ax = plt.subplots()
corr_image = corr_ax.imshow(model_df[label_columns].corr())
corr_fig.colorbar(corr_image, ax=corr_ax)
plt.show()

In [None]:
# Number of rows in the full (unfiltered-by-year) video frame.
videos_df.shape[0]

In [None]:
# Variance inflation factors for the label indicators: used to spot highly
# correlated features so they can be removed to address multicollinearity.
X_vif = model_df[[col for col in label_columns if col != 'year_categorical']]
design_matrix = X_vif.values

vif_data = pd.DataFrame({
    "Feature": X_vif.columns,
    "VIF": [
        variance_inflation_factor(design_matrix, col_idx)
        for col_idx in range(X_vif.shape[1])
    ],
})

# Show the ten features with the largest VIF.
vif_data.sort_values(by='VIF', ascending=False).head(10)

In [None]:
# Based on VIF, I am removing the following columns:
# Cycling (correlated with Bicycle), Smartphone (correlated with Mobilephone),
# Model aircraft (correlated with Radio-controlled aircraft),
# Pet (correlated with specific types of pets).
# 'nan' covers the column produced by a vocabulary entry that has no name.
VIF_DROP_LABELS = ['Cycling', 'Smartphone', 'Model aircraft', 'Pet', 'nan']

_original_columns = set(previous_columns)
# Label columns are those added on top of the raw frame, taken in column order
# so the feature order is stable across kernel sessions (a set difference is not).
# The drop list is compared against str(lc): a missing name is a float NaN,
# which never compares equal to the string 'nan'.
# The kept names are then reduced to letters and underscores.
label_columns = [
    re.sub(r'[^a-zA-Z_]', '', str(lc))
    for lc in dict.fromkeys(model_df.columns)
    if lc not in _original_columns and str(lc) not in VIF_DROP_LABELS
]

In [None]:
# Number of label feature names currently held in `label_columns`.
len(label_columns)

In [None]:
# String version of the publication year, used later as a categorical variable.
model_df['year_categorical'] = model_df['year'].map(str)

# Reduce every column name to letters and underscores only.
sanitized_column_names = model_df.columns.str.replace(r'[^a-zA-Z_]', '', regex=True)
model_df.columns = sanitized_column_names

# SHAP values and XGBoost

In [None]:
# Distinct year strings present in the modelling frame.
model_df['year_categorical'].unique()

In [None]:
# Column names of the raw video frame, shown for reference.
videos_df.columns

# TRAIN PREDICTIVE MODELS TO EVALUATE MODEL PERFORMANCE

In [None]:
# Assemble the modelling frame: target ("quantile") first, then the label
# indicators, the posted-day dummies and the year dummies.
posted_day_dummies = pd.get_dummies(model_df["posted_day"], prefix="posted_day")
year_dummies = pd.get_dummies(model_df["year_categorical"], prefix="year")
NON_FEATURE_COLUMNS = ('year_categorical', 'nan', 'youtubem_id')
feature_columns = [lc for lc in label_columns if lc not in NON_FEATURE_COLUMNS]
df_model_full = pd.concat(
    [model_df[["quantile"] + feature_columns], posted_day_dummies, year_dummies],
    axis=1,
)

# Removing numeric characters leads to duplicate names for a few variables.
# Collapse each set of duplicates into one column: 1 if any duplicate is set.
NUMERIC_SUFFIX_VARS = ['CallofDutyModernWarfare', 'Xbox', 'PlayStation']
for v in NUMERIC_SUFFIX_VARS:
    is_v = df_model_full.columns == v
    if is_v.sum() < 2:
        # Absent or already unique: nothing to collapse. (Assigning here would
        # append an all-NaN column for a name that is not in the frame.)
        continue
    # If there's anything present in any duplicate of that column, use that.
    merged = df_model_full.loc[:, is_v].max(axis=1)
    # Keep only the first occurrence, selected by POSITION. Dropping by label,
    # as the original did, removes every column with that name -- the first one
    # included -- so the feature silently vanished from the model.
    keep = ~is_v
    keep[int(np.flatnonzero(is_v)[0])] = True
    df_model_full = df_model_full.loc[:, keep].copy()
    df_model_full[v] = merged

# Train on each year and test on the following year.
# mae_results[yr] = (training MAE, testing MAE, number of test rows).
SEED = 1885
np.random.seed(SEED)
mae_results = dict()
for yr in keep_years:
    if yr + 1 not in keep_years:
        # Can't test on the following year. Skip only this year rather than
        # stopping, so a gap in `keep_years` does not discard later years.
        continue
    train_rows = df_model_full[df_model_full[f"year_{yr}"] == 1]
    test_rows = df_model_full[df_model_full[f"year_{yr + 1}"] == 1]
    X_train = train_rows.drop(columns="quantile")
    y_train = train_rows["quantile"]
    X_test = test_rows.drop(columns="quantile")
    y_test = test_rows["quantile"]
    # Train an XGBoost model for the year. The seed is passed explicitly:
    # XGBoost does not read numpy's global random state.
    model = xgb.XGBRegressor(max_depth=10, n_estimators=1000, learning_rate=.02, random_state=SEED)
    model.fit(X_train, y_train)
    mae_results[yr] = (
        mean_absolute_error(y_train, model.predict(X_train)),
        mean_absolute_error(y_test, model.predict(X_test)),
        len(y_test),
    )

In [None]:
# Bar charts of training and testing MAE per training year.
# Each mae_results value is read by position: index 0 feeds the "Training MAE"
# panel and index 1 the "Testing MAE" panel.
short_keep_years = [str(y)[2:] for y in mae_results.keys()]

# Save to the current user's Desktop instead of a hard-coded absolute path that
# only exists on one machine.
MAE_FIGURE_PATH = Path.home() / "Desktop" / "mae.png"

fig, ax = plt.subplots(2, 1, figsize=(4, 8))
for panel, tuple_pos, title in zip(ax, (0, 1), ("Training MAE", "Testing MAE")):
    panel.bar(short_keep_years, [v[tuple_pos] for v in mae_results.values()])
    panel.set_xticks(short_keep_years)
    panel.set_ylim([0, 1])
    panel.set_xlabel("Year (2000's)")
    panel.set_ylabel("Mean Absolute Error")
    panel.set_title(title)
plt.tight_layout()

MAE_FIGURE_PATH.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(MAE_FIGURE_PATH)
plt.show()

In [None]:
# Per-year results, keyed by training year: (training MAE, testing MAE, number of test rows).
mae_results

In [None]:
# Overall test MAE: per-year test MAE weighted by the number of test rows.
weighted_abs_error = sum(
    [test_mae * n_test for (_train_mae, test_mae, n_test) in mae_results.values()]
)
total_test_rows = sum([n_test for (_train_mae, _test_mae, n_test) in mae_results.values()])
weighted_abs_error / total_test_rows

# BUILD FULL FEATURE IMPORTANCES FOR VISUALIZATION

In [None]:
# Assemble the modelling frame: target ("quantile") first, then the label
# indicators, the posted-day dummies and the year dummies.
posted_day_dummies = pd.get_dummies(model_df["posted_day"], prefix="posted_day")
year_dummies = pd.get_dummies(model_df["year_categorical"], prefix="year")
NON_FEATURE_COLUMNS = ('year_categorical', 'nan', 'youtubem_id')
feature_columns = [lc for lc in label_columns if lc not in NON_FEATURE_COLUMNS]
df_model_full = pd.concat(
    [model_df[["quantile"] + feature_columns], posted_day_dummies, year_dummies],
    axis=1,
)

# Removing numeric characters leads to duplicate names for a few variables.
# Collapse each set of duplicates into one column: 1 if any duplicate is set.
NUMERIC_SUFFIX_VARS = ['CallofDutyModernWarfare', 'Xbox', 'PlayStation']
for v in NUMERIC_SUFFIX_VARS:
    is_v = df_model_full.columns == v
    if is_v.sum() < 2:
        # Absent or already unique: nothing to collapse. (Assigning here would
        # append an all-NaN column for a name that is not in the frame.)
        continue
    # If there's anything present in any duplicate of that column, use that.
    merged = df_model_full.loc[:, is_v].max(axis=1)
    # Keep only the first occurrence, selected by POSITION. Dropping by label,
    # as the original did, removes every column with that name -- the first one
    # included -- so the feature silently vanished from the output.
    keep = ~is_v
    keep[int(np.flatnonzero(is_v)[0])] = True
    df_model_full = df_model_full.loc[:, keep].copy()
    df_model_full[v] = merged

# For every kept year: fit a model on that year's videos and collect, in long
# format, the SHAP value of each feature for the videos where it is present.
shap_frames = []
for yr in keep_years:
    df_model = df_model_full[df_model_full[f"year_{yr}"] == 1]
    # Train an XGBoost model for the year
    X = df_model.iloc[:, 1:]
    y = df_model["quantile"]
    model = xgb.XGBRegressor()
    model.fit(X, y)
    # Evaluate feature importance using SHAP values
    explainer = shap.Explainer(model)
    shap_values = explainer(X)

    shap_df = pd.DataFrame(shap_values.values, columns=X.columns)
    # Every feature is a 0/1 indicator, so keep a SHAP value only where the
    # feature is set. The mask is built from numeric values because
    # `.replace(0, np.nan)` leaves boolean dummy columns untouched, which lets
    # rows for absent features through as zeros.
    feature_present = np.nan_to_num(X.astype(float).to_numpy()) != 0
    filtered_shap_values = shap_df.where(feature_present)
    reshaped_df = filtered_shap_values.melt(var_name='variable', value_name='value')
    n_total = len(reshaped_df)
    reshaped_df = reshaped_df.dropna()
    print(f"{yr}: kept {len(reshaped_df)} of {n_total} SHAP entries")
    reshaped_df['year'] = yr
    shap_frames.append(reshaped_df)

# Concatenate once at the end instead of growing the frame inside the loop.
if shap_frames:
    output_df = pd.concat(shap_frames)
else:
    output_df = pd.DataFrame(columns=['variable', 'value', 'year'])

In [None]:
# Shape (rows, columns) of `X`, the feature matrix left over from the last loop iteration.
X.shape

In [None]:
# Export the SHAP values for the dashboard, leaving out the year dummy columns.
# Match on the "year_" prefix that get_dummies adds: a plain substring test on
# 'year' would also discard any label whose name merely contains "year".
is_year_dummy = output_df['variable'].str.startswith('year_')
output_df[~is_year_dummy].to_csv("../dashboard/shap_values.csv", index=False)