Preprocessing and Data Preparation

In [None]:
import pandas as pd

# Read the merged post export (point this at the workbook's location if it differs)
data = pd.read_excel('merged_post_data.xlsx')

# Parse the timestamps once, normalizing mixed time zones to UTC;
# errors='coerce' is passed so unparseable values do not raise.
created_at = pd.to_datetime(data['Created At'], errors='coerce', utc=True)

# Swap the raw timestamp column for two calendar features
data = (
    data
    .drop(columns=['Created At'])
    .assign(
        day_of_week=created_at.dt.dayofweek,  # 0=Monday, 6=Sunday
        hour=created_at.dt.hour,              # Hour of the day
    )
)

# Quick look at the derived columns
print(data[['day_of_week', 'hour']].head())



   day_of_week  hour
0          5.0   5.0
1          0.0  20.0
2          6.0  19.0
3          0.0  16.0
4          2.0  21.0


In [None]:
!pip install --upgrade --force-reinstall textblob

Defaulting to user installation because normal site-packages is not writeable
Collecting textblob
  Using cached textblob-0.18.0.post0-py3-none-any.whl.metadata (4.5 kB)
Collecting nltk>=3.8 (from textblob)
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk>=3.8->textblob)
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting joblib (from nltk>=3.8->textblob)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting regex>=2021.8.3 (from nltk>=3.8->textblob)
  Downloading regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tqdm (from nltk>=3.8->textblob)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Using cached textblob-0.18.0.post0-py3-none-any.whl (626 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Downloading regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (792 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import sys

# Report the path of the Python binary backing this kernel
kernel_python = sys.executable
print(kernel_python)

/packages/envs/pytorch-gpu-2.3.1-cuda-12.1/bin/python3.12


In [None]:
# Run pip through the kernel's own interpreter (`sys` is imported in the cell above)
# so textblob is installed for the Python that executes this notebook.
!{sys.executable} -m pip install textblob

Defaulting to user installation because normal site-packages is not writeable


In [None]:
import os

import nltk

# Resolve the data directory from the current user's home instead of hardcoding
# one specific account's path, so the cell works for whoever runs the notebook.
NLTK_DATA_DIR = os.path.join(os.path.expanduser('~'), 'nltk_data')

nltk.download('punkt', download_dir=NLTK_DATA_DIR)
nltk.download('averaged_perceptron_tagger', download_dir=NLTK_DATA_DIR)

[nltk_data] Downloading package punkt to /home/jravi9/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jravi9/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
import os

from textblob import TextBlob
import nltk

# Look for NLTK resources under the current user's home rather than a hardcoded account path
nltk.data.path.append(os.path.join(os.path.expanduser('~'), 'nltk_data'))

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')

# Treat missing posts as empty text. Without this, str(NaN) yields the literal
# string "nan", which would be scored and counted as a real one-word post.
post_text = data['Text'].fillna('').astype(str)

# Sentiment analysis - polarity score for each post
data['sentiment_score'] = post_text.apply(lambda text: TextBlob(text).sentiment.polarity)

# Keyword count - number of word tokens TextBlob finds in each post
data['keyword_count'] = post_text.apply(lambda text: len(TextBlob(text).words))

# Hashtag count - number of '#' characters in each post (assumes hashtags start with '#')
data['hashtag_count'] = post_text.str.count('#')

# Drop the original 'Text' column if not needed in the final dataset
data = data.drop(columns=['Text'])

# Display the first few rows to verify the new text-based features
print(data[['sentiment_score', 'keyword_count', 'hashtag_count']].head())

[nltk_data] Downloading package punkt to /home/jravi9/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jravi9/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jravi9/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


   sentiment_score  keyword_count  hashtag_count
0            0.000             19              0
1            0.025             46              0
2            0.340             28              1
3            0.000             22              0
4            0.000             19              2


In [None]:
from sklearn.model_selection import train_test_split

# Predictor columns: account statistics, posting time, and the text-derived features
feature_columns = [
    'followers_count',
    'follows_count',
    'Total Posts',
    'day_of_week',
    'hour',
    'sentiment_score',
    'keyword_count',
    'hashtag_count',
]
# Engagement metrics to predict
target_columns = ['Likes', 'Reposts']

# Slice the frame into the feature matrix and the two-column target frame
X = data.loc[:, feature_columns]
y = data.loc[:, target_columns]

# Hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the resulting shapes as a sanity check
print("Feature set shape:", X.shape)
print("Target set shape:", y.shape)
print("Training set shape (X_train, y_train):", X_train.shape, y_train.shape)
print("Testing set shape (X_test, y_test):", X_test.shape, y_test.shape)

Feature set shape: (201229, 8)
Target set shape: (201229, 2)
Training set shape (X_train, y_train): (160983, 8) (160983, 2)
Testing set shape (X_test, y_test): (40246, 8) (40246, 2)


Decision Tree, Random Forest and Support Vector Models

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Fill gaps in the features with per-column means learned from the training split only,
# then apply those same means to both splits.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Regression scoring helper shared by the baseline models
def evaluate_model(y_true, y_pred):
    """Score predictions against the ground truth.

    Returns a dict with the keys 'MAE', 'RMSE' and 'R2 Score', where RMSE is the
    square root of the mean squared error.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return {
        'MAE': mean_absolute_error(y_true, y_pred),
        'RMSE': np.sqrt(squared_error),
        'R2 Score': r2_score(y_true, y_pred),
    }

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Initialize models.
# SVR is not scale invariant, and the features here span very different ranges
# (e.g. follower counts vs. sentiment scores), so it is wrapped in a pipeline
# that standardizes the features first. The scaler is fit on the training data
# only, as part of the pipeline's fit. The tree-based models are left as they were.
models = {
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'SVM': make_pipeline(StandardScaler(), SVR())
}

# Define target variables to evaluate
targets = ['Likes', 'Reposts']

# Train and evaluate each model for each target variable
for target in targets:
    print(f"\n--- Results for '{target}' ---")
    results = {}
    for model_name, model in models.items():
        # Train the model on the current target using imputed data
        # (each fit call replaces whatever the model learned for the previous target)
        model.fit(X_train_imputed, y_train[target])

        # Make predictions on the test set
        y_pred = model.predict(X_test_imputed)

        # Evaluate the model
        results[model_name] = evaluate_model(y_test[target], y_pred)

    # Display results for each model
    for model_name, metrics in results.items():
        print(f"{model_name} Results for '{target}':")
        for metric, value in metrics.items():
            print(f"  {metric}: {value}")
        print()



--- Results for 'Likes' ---
Decision Tree Results for 'Likes':
  MAE: 9.218481816901534
  RMSE: 18.29544796279807
  R2 Score: -0.11198167579830454

Random Forest Results for 'Likes':
  MAE: 8.874137158058472
  RMSE: 13.69072069032333
  R2 Score: 0.3773210906540202

SVM Results for 'Likes':
  MAE: 9.837814670645114
  RMSE: 18.782381502945185
  R2 Score: -0.17196016580454154


--- Results for 'Reposts' ---
Decision Tree Results for 'Reposts':
  MAE: 7.024520150955075
  RMSE: 16.359421215359077
  R2 Score: -0.2798868501823504

Random Forest Results for 'Reposts':
  MAE: 6.825009232613107
  RMSE: 12.089058458472294
  R2 Score: 0.3010909513075486

SVM Results for 'Reposts':
  MAE: 6.217932773179131
  RMSE: 15.553235236470051
  R2 Score: -0.15685036797467755



Optimized Random Forest Model

In [None]:
# Import necessary libraries
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd

# This cell does not load any data itself; it reuses objects created by earlier cells:
#   X_train_imputed, X_test_imputed: preprocessed (imputed) feature matrices
#   y_train, y_test: DataFrames containing the target variables ('Likes' and 'Reposts')
# Run the preprocessing, train/test split and imputation cells before this one.

# Define the parameter grid for RandomizedSearchCV.
# max_features no longer accepts 'auto' in current scikit-learn: every sampled
# candidate that used it failed parameter validation and was scored as NaN.
# 1.0 (use all features) is what 'auto' used to mean for regressors.
param_grid = {
    'n_estimators': [50, 100, 200, 300, 400],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [1.0, 'sqrt', 'log2']
}

# Base Random Forest regressor handed to the hyperparameter search below
rf = RandomForestRegressor(random_state=42)

# Define a function to train and evaluate for a specific target variable
def train_and_evaluate_target(target_name):
    """Tune a Random Forest for one target column and score it on the test split.

    Runs a 20-candidate randomized search with 3-fold cross-validation over
    ``param_grid`` on ``X_train_imputed`` / ``y_train[target_name]``, then
    evaluates the best model on ``X_test_imputed`` / ``y_test[target_name]``.

    Args:
        target_name: Column of ``y_train`` / ``y_test`` to predict
            (called with 'Likes' and 'Reposts' in this notebook).

    Returns:
        A ``(metrics, model)`` tuple: ``metrics`` is a dict with the keys
        'MAE', 'RMSE' and 'R2 Score'; ``model`` is the fitted best estimator.
    """
    print(f"\nTraining Random Forest for '{target_name}'")

    # Initialize RandomizedSearchCV
    random_search = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_grid,
        n_iter=20,  # Number of different combinations to try
        scoring='neg_mean_squared_error',  # Negated MSE: higher is better, so this minimizes MSE
        cv=3,  # 3-fold cross-validation
        random_state=42,
        n_jobs=-1  # Use all available CPU cores
    )

    # Fit RandomizedSearchCV on the training data for the target variable
    random_search.fit(X_train_imputed, y_train[target_name])

    # Best parameters from RandomizedSearchCV
    best_params = random_search.best_params_
    print(f"Best parameters found for '{target_name}': ", best_params)

    # The search refits the winning configuration on the full training data by
    # default (refit=True), so reuse that model instead of training an identical
    # forest a second time.
    optimized_rf = random_search.best_estimator_

    # Make predictions with the optimized model
    y_pred_optimized = optimized_rf.predict(X_test_imputed)

    # Evaluate the optimized model
    optimized_results = {
        'MAE': mean_absolute_error(y_test[target_name], y_pred_optimized),
        'RMSE': np.sqrt(mean_squared_error(y_test[target_name], y_pred_optimized)),
        'R2 Score': r2_score(y_test[target_name], y_pred_optimized)
    }

    print(f"Optimized Random Forest Results for '{target_name}':")
    for metric, value in optimized_results.items():
        print(f"  {metric}: {value}")

    return optimized_results, optimized_rf

# Run the search-and-evaluate routine once per engagement target
likes_results, likes_model = train_and_evaluate_target(target_name='Likes')
reposts_results, reposts_model = train_and_evaluate_target(target_name='Reposts')

# Summarize the test-set metrics for both targets
print("\nFinal Evaluation Results:")
print("Likes Results: ", likes_results)
print("Reposts Results: ", reposts_results)



Training Random Forest for 'Likes'


15 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9 fits failed with the following error:
Traceback (most recent call last):
  File "/packages/envs/pytorch-gpu-2.3.1-cuda-12.1/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/packages/envs/pytorch-gpu-2.3.1-cuda-12.1/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/packages/envs/pytorch-gpu-2.3.1-cuda-12.1/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/packages/envs/pytorch-gpu-2.3.1-cuda-12.1/lib/python3.12/site-

Best parameters found for 'Likes':  {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 40}
Optimized Random Forest Results for 'Likes':
  MAE: 9.418351021872175
  RMSE: 13.790862225951022
  R2 Score: 0.3681785368189914

Training Random Forest for 'Reposts'


15 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9 fits failed with the following error:
Traceback (most recent call last):
  File "/packages/envs/pytorch-gpu-2.3.1-cuda-12.1/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/packages/envs/pytorch-gpu-2.3.1-cuda-12.1/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/packages/envs/pytorch-gpu-2.3.1-cuda-12.1/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/packages/envs/pytorch-gpu-2.3.1-cuda-12.1/lib/python3.12/site-

Best parameters found for 'Reposts':  {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 40}
Optimized Random Forest Results for 'Reposts':
  MAE: 7.184896001069873
  RMSE: 12.12936002515979
  R2 Score: 0.2964232461378222

Final Evaluation Results:
Likes Results:  {'MAE': 9.418351021872175, 'RMSE': 13.790862225951022, 'R2 Score': 0.3681785368189914}
Reposts Results:  {'MAE': 7.184896001069873, 'RMSE': 12.12936002515979, 'R2 Score': 0.2964232461378222}


In [None]:
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable


XGBoost Model

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Baseline XGBoost: fit one model per target without hyperparameter tuning
def train_and_evaluate_xgb(target_name):
    """Fit an XGBoost regressor for one target column and score it on the test split.

    Trains on ``X_train_imputed`` / ``y_train[target_name]`` and evaluates on
    ``X_test_imputed`` / ``y_test[target_name]``.

    Returns a ``(metrics, model)`` tuple, where ``metrics`` is a dict with the
    keys 'MAE', 'RMSE' and 'R2 Score'.
    """
    print(f"\nTraining XGBoost Model for '{target_name}'")

    # Squared-error objective with a fixed seed
    xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)
    xgb_model.fit(X_train_imputed, y_train[target_name])

    # Predict the held-out rows and compare against the true values
    y_pred_xgb = xgb_model.predict(X_test_imputed)
    actual = y_test[target_name]

    xgb_results = {}
    xgb_results['MAE'] = mean_absolute_error(actual, y_pred_xgb)
    xgb_results['RMSE'] = np.sqrt(mean_squared_error(actual, y_pred_xgb))
    xgb_results['R2 Score'] = r2_score(actual, y_pred_xgb)

    print(f"XGBoost Results for '{target_name}':")
    for metric in xgb_results:
        print(f"  {metric}: {xgb_results[metric]}")

    return xgb_results, xgb_model

# Fit and score the baseline XGBoost model once per engagement target
likes_results, likes_model = train_and_evaluate_xgb(target_name='Likes')
reposts_results, reposts_model = train_and_evaluate_xgb(target_name='Reposts')

# Summarize the test-set metrics for both targets
print("\nFinal Evaluation Results:")
print("Likes Results: ", likes_results)
print("Reposts Results: ", reposts_results)


Training XGBoost Model for 'Likes'
XGBoost Results for 'Likes':
  MAE: 10.493507848696245
  RMSE: 14.838222630026472
  R2 Score: 0.2685657739639282

Training XGBoost Model for 'Reposts'
XGBoost Results for 'Reposts':
  MAE: 7.800660039702633
  RMSE: 12.875401376225458
  R2 Score: 0.2072116732597351

Final Evaluation Results:
Likes Results:  {'MAE': 10.493507848696245, 'RMSE': 14.838222630026472, 'R2 Score': 0.2685657739639282}
Reposts Results:  {'MAE': 7.800660039702633, 'RMSE': 12.875401376225458, 'R2 Score': 0.2072116732597351}


Optimized XGBoost Model

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Candidate hyperparameter values sampled during the XGBoost randomized search
param_grid = dict(
    n_estimators=[100, 200, 300, 400],
    max_depth=[3, 5, 7, 10],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    gamma=[0, 0.1, 0.2, 0.3],
)

# Define a function to train and evaluate an XGBoost model for a specific target variable
def train_and_evaluate_xgb(target_name):
    """Tune an XGBoost regressor for one target column and score it on the test split.

    Runs a 20-candidate randomized search with 3-fold cross-validation over
    ``param_grid`` on ``X_train_imputed`` / ``y_train[target_name]``, then
    evaluates the best model on ``X_test_imputed`` / ``y_test[target_name]``.

    Args:
        target_name: Column of ``y_train`` / ``y_test`` to predict
            (called with 'Likes' and 'Reposts' in this notebook).

    Returns:
        A ``(metrics, model)`` tuple: ``metrics`` is a dict with the keys
        'MAE', 'RMSE' and 'R2 Score'; ``model`` is the fitted best estimator.
    """
    print(f"\nTraining XGBoost Model for '{target_name}'")

    # Initialize the XGBoost model
    xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)

    # Initialize RandomizedSearchCV
    random_search = RandomizedSearchCV(
        estimator=xgb_model,
        param_distributions=param_grid,
        n_iter=20,  # Number of parameter combinations to try
        scoring='neg_mean_squared_error',  # Negated MSE: higher is better, so this minimizes MSE
        cv=3,  # 3-fold cross-validation
        random_state=42,
        n_jobs=-1  # Use all CPU cores
    )

    # Fit RandomizedSearchCV on the training data for the target variable
    random_search.fit(X_train_imputed, y_train[target_name])

    # Best parameters from RandomizedSearchCV
    best_params = random_search.best_params_
    print(f"Best parameters found for '{target_name}': ", best_params)

    # The search refits the winning configuration on the full training data by
    # default (refit=True), so reuse that model instead of training an identical
    # booster a second time. It keeps the objective and seed set on xgb_model.
    optimized_xgb = random_search.best_estimator_

    # Make predictions with the optimized model
    y_pred_optimized_xgb = optimized_xgb.predict(X_test_imputed)

    # Evaluate the optimized model
    optimized_xgb_results = {
        'MAE': mean_absolute_error(y_test[target_name], y_pred_optimized_xgb),
        'RMSE': np.sqrt(mean_squared_error(y_test[target_name], y_pred_optimized_xgb)),
        'R2 Score': r2_score(y_test[target_name], y_pred_optimized_xgb)
    }

    print(f"Optimized XGBoost Results for '{target_name}':")
    for metric, value in optimized_xgb_results.items():
        print(f"  {metric}: {value}")

    return optimized_xgb_results, optimized_xgb

# Tune and score the XGBoost model once per engagement target
likes_results, likes_model = train_and_evaluate_xgb(target_name='Likes')
reposts_results, reposts_model = train_and_evaluate_xgb(target_name='Reposts')

# Summarize the test-set metrics for both targets
print("\nFinal Evaluation Results:")
print("Likes Results: ", likes_results)
print("Reposts Results: ", reposts_results)



Training XGBoost Model for 'Likes'
Best parameters found for 'Likes':  {'subsample': 0.8, 'n_estimators': 400, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 0.2, 'colsample_bytree': 0.8}
Optimized XGBoost Results for 'Likes':
  MAE: 9.134765558342947
  RMSE: 13.749280693573356
  R2 Score: 0.3719828724861145

Training XGBoost Model for 'Reposts'
Best parameters found for 'Reposts':  {'subsample': 0.8, 'n_estimators': 400, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 0.2, 'colsample_bytree': 0.8}
Optimized XGBoost Results for 'Reposts':
  MAE: 7.056903942805093
  RMSE: 12.139804382120126
  R2 Score: 0.29521113634109497

Final Evaluation Results:
Likes Results:  {'MAE': 9.134765558342947, 'RMSE': 13.749280693573356, 'R2 Score': 0.3719828724861145}
Reposts Results:  {'MAE': 7.056903942805093, 'RMSE': 12.139804382120126, 'R2 Score': 0.29521113634109497}
