In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from scipy import stats
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

import os

In [None]:
# Set plot style for better visuals
word_ablations = [5, 20, 50, 100, 500]

sns.set(style='whitegrid')
data_ablations = {}
# current_dir = os.path.dirname(__file__)
data_ablations['data_'] = pd.read_csv('./new_csvs/data_with_img_embeddings.csv')
data_ablations['data_'] = data_ablations['data_'].drop(columns=['image_path'])
for ablation in word_ablations:
    file_path = f"./new_csvs/data_with_img_pca_embeddings_{ablation}.csv"
    data_ablations[f'data_{ablation}'] = pd.read_csv(file_path)
    data_ablations[f'data_{ablation}'] = data_ablations[f'data_{ablation}'].drop(columns=['image_path'])

In [None]:
for data_name, data in data_ablations.items():
    # Split data into training and test sets
    train_samples = data.sample(n=len(data) - 318, random_state=42)
    test_samples = data.drop(train_samples.index)

    # Transform training data
    scaler = StandardScaler()
    x_train = train_samples.drop(columns=['log_likes'])
    if 'follower_count_at_t' in x_train.columns:
        x_train['box_follower_count'], lambda_train = stats.boxcox(x_train['follower_count_at_t'] + 1)
        x_train = x_train.drop(columns='follower_count_at_t')
    y_train = train_samples[['log_likes']]
    # Scale the data
    x_train = scaler.fit_transform(x_train)

    # Transform test data
    x_test = test_samples.drop(columns=['log_likes'])
    if 'follower_count_at_t' in x_test.columns:
        x_test['box_follower_count'] = stats.boxcox(x_test['follower_count_at_t'] + 1, lmbda=lambda_train)
        x_test = x_test.drop(columns='follower_count_at_t')
    y_test = np.expm1(test_samples[['log_likes']])
    # Scale the data
    x_test = scaler.transform(x_test)

    # Note: y_train is still in log space; convert when needed
    # y_train = np.expm1(y_train)

    # Train a linear regression model
    lr_model = LinearRegression()
    lr_model.fit(x_train, y_train)

    # Predict on test data
    y_pred_lr = np.expm1(lr_model.predict(x_test))

    # Evaluate the performance
    mse_lr = mean_squared_error(y_test, y_pred_lr)
    mae_lr = mean_absolute_error(y_test, y_pred_lr)
    r2_lr = r2_score(y_test, y_pred_lr)

    # Train a Random Forest model
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(x_train, y_train.values.ravel())  # Flatten y_train for RandomForest

    # Predict on test data
    y_pred_rf = np.expm1(rf_model.predict(x_test))

    # Evaluate the performance
    mse_rf = mean_squared_error(y_test, y_pred_rf)
    mae_rf = mean_absolute_error(y_test, y_pred_rf)
    r2_rf = r2_score(y_test, y_pred_rf)

    # Train a Gradient Boosting model
    gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
    gb_model.fit(x_train, y_train.values.ravel())

    # Predict on test data
    y_pred_gb = np.expm1(gb_model.predict(x_test))

    # Evaluate the performance
    mse_gb = mean_squared_error(y_test, y_pred_gb)
    mae_gb = mean_absolute_error(y_test, y_pred_gb)
    r2_gb = r2_score(y_test, y_pred_gb)

    # XGBoost
    param_grid = {
        'max_depth': [3, 4, 5],
        'learning_rate': [0.01, 0.1, 0.3],
        'n_estimators': [100, 500, 1000],
        'colsample_bytree': [0.3, 0.7],
        'subsample': [0.3, 0.6, 0.8, 1.0],
        'gamma': [0, 0.1, 0.2, 0.3],
        'min_child_weight': [1, 3, 5]
    }

    # Create the XGBoost regressor
    xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

    # Set up GridSearchCV
    grid_search = GridSearchCV(estimator=xgb_reg, param_grid=param_grid, 
                            cv=5, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)

    # Fit GridSearchCV
    grid_search.fit(x_train, y_train.values.ravel())

    # Get the best parameters
    best_params = grid_search.best_params_
    print("Best parameters for XGBoost:", best_params)

    # Train the model with the best parameters
    best_xgb_model = xgb.XGBRegressor(objective='reg:squarederror', **best_params, random_state=42)
    best_xgb_model.fit(x_train, y_train.values.ravel())

    # Predict on the test set
    y_pred_xgb = np.expm1(best_xgb_model.predict(x_test))

    # Evaluate the performance
    mse_xgb = mean_squared_error(y_test, y_pred_xgb)
    mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
    r2_xgb = r2_score(y_test, y_pred_xgb)

    # Train a Support Vector Machine (SVR) model
    param_grid = {
        'C': [1, 10, 100, 1000],
        'gamma': ['scale', 'auto'],
        'epsilon': [0.1, 0.2, 0.3, 0.5, 0.8],
    }

    grid_search = GridSearchCV(SVR(kernel='rbf'), param_grid, cv=5, scoring='r2', verbose=2)
    grid_search.fit(x_train, y_train.values.ravel())

    # # Best hyperparameters from grid search
    best_params_svm = grid_search.best_params_
    # print("Best parameters for SVR:", best_params_svm)
    ###OUTPUT w comment: 1, .5, auto
    ###OUTPUT w/o comment: 'C': 1000, 'epsilon': 0.5, 'gamma': 'scale'

    svr_model = SVR(kernel='rbf', **best_params_svm)
    svr_model.fit(x_train, y_train.values.ravel())

    # Predict on test data (scaled)
    y_pred_svr = np.expm1(svr_model.predict(x_test))

    # Evaluate the performance
    mse_svr = mean_squared_error(y_test, y_pred_svr)
    mae_svr = mean_absolute_error(y_test, y_pred_svr)
    r2_svr = r2_score(y_test, y_pred_svr)
    
    print(f"ABLATION: {data_name}")
    print("Linear Regression Results:")
    print(f"MSE: {mse_lr}, MAE: {mae_lr}, R2: {r2_lr}")
    print("\nRandom Forest Results:")
    print(f"MSE: {mse_rf}, MAE: {mae_rf}, R2: {r2_rf}")
    print("\nGradient Boosting Results:")
    print(f"MSE: {mse_gb}, MAE: {mae_gb}, R2: {r2_gb}")
    print(f"\nXGBoost Results (params: {best_params}):")
    print(f"MSE: {mse_xgb}, MAE: {mae_xgb}, R2: {r2_xgb}")
    print(f"\nSVM Results (params: {best_params_svm}):")
    print(f"MSE: {mse_svr}, MAE: {mae_svr}, R2: {r2_svr}")
    plt.figure(figsize=(10,6))
    plt.scatter(y_test, y_pred_lr, label='Linear Regression', alpha=0.5)
    plt.scatter(y_test, y_pred_rf, label='Random Forest', alpha=0.5)
    plt.scatter(y_test, y_pred_gb, label='Gradient Boosting', alpha=0.5)
    plt.scatter(y_test, y_pred_xgb, label='XGBoost', alpha=0.5)
    plt.scatter(y_test, y_pred_svr, label='SVM', alpha=0.5)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linestyle='--')
    plt.xlabel('Actual Likes')
    plt.ylabel('Predicted Likes')
    plt.title(f"Actual vs Predicted Likes for Different Model; Ablation: {data_name}; With Comments")
    plt.legend()
    # plt.xlim(-0.134, -0.132)
    # plt.ylim(0,5)
    plt.show()

Fitting 5 folds for each of 2592 candidates, totalling 12960 fits
[CV] END colsample_bytree=0.3, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.3; total time=   0.5s
[CV] END colsample_bytree=0.3, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.3; total time=   0.5s
[CV] END colsample_bytree=0.3, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.3; total time=   0.6s
[CV] END colsample_bytree=0.3, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.3; total time=   0.5s
[CV] END colsample_bytree=0.3, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.6; total time=   0.6s
[CV] END colsample_bytree=0.3, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8; total time=   0.6s
[CV] END colsample_bytree=0.3, gamma=0, learning_rate=0.01, max_



[CV] END colsample_bytree=0.3, gamma=0, learning_rate=0.01, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.6; total time=   0.9s
[CV] END colsample_bytree=0.3, gamma=0, learning_rate=0.01, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.6; total time=   0.8s
[CV] END colsample_bytree=0.3, gamma=0, learning_rate=0.01, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.6; total time=   0.8s
[CV] END colsample_bytree=0.3, gamma=0, learning_rate=0.01, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.8; total time=   1.1s
[CV] END colsample_bytree=0.3, gamma=0, learning_rate=0.01, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.8; total time=   1.0s
[CV] END colsample_bytree=0.3, gamma=0, learning_rate=0.01, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.8; total time=   0.9s
[CV] END colsample_bytree=0.3, gamma=0, learning_rate=0.01, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.8; tota