# Sentiment and Approval Rating: Cleaning, Training, and Prediction Pipeline

## Setup

Fetching Packages

In [0]:
# Importing packages
import pandas as pd
import numpy as np
import pyspark as spark
from pyspark.sql import SparkSession
import glob
import os
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from transformers import pipeline
import sklearn
from pyspark.ml import Pipeline
from pyspark.sql.functions import col
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

Reading in data

In [0]:

def _list_csv_files(folder):
    """Return the paths of all ``*.csv`` files located directly in ``folder``."""
    return glob.glob(os.path.join(folder, "*.csv"))


def _stack_csv_files(paths):
    """Read each CSV in ``paths`` and stack them into one DataFrame with a fresh index."""
    return pd.concat((pd.read_csv(path) for path in paths), ignore_index=True)


# Approval ratings are stored in a single CSV file
appr = pd.read_csv('/Workspace/Shared/approval/data/approval_ratings/approval.csv')

# Comments and posts are each spread over several CSV files in their own folder
comment_csv_files = _list_csv_files("/Workspace/Shared/approval/data/comments")
posts_csv_files = _list_csv_files("/Workspace/Shared/approval/data/posts")

# One combined dataframe per folder
coms = _stack_csv_files(comment_csv_files)
posts = _stack_csv_files(posts_csv_files)

In [0]:
# Row counts of the combined comments and posts dataframes
n_comments, n_posts = len(coms), len(posts)
print(n_comments, n_posts)


Merging subreddit names onto comments section by post

In [0]:
# Getting subreddits for each comment.
# `posts` is concatenated from several CSV files, so a post_id may appear more than
# once; keep one (post_id, subreddit) row per post so the left merge cannot duplicate
# comment rows. validate='many_to_one' makes pandas raise if that guarantee is broken.
post_subreddits = posts[['post_id', 'subreddit']].drop_duplicates(subset='post_id')
coms = coms.merge(post_subreddits, on='post_id', how='left', validate='many_to_one')


In [0]:
# Preview the first five comment rows (now carrying the subreddit column)
coms.head(5)

In [0]:
# Preview the first five approval-rating rows
appr.head(5)

## Sentiment Analysis

Using the TextBlob package for sentiment analysis (positive polarity > 0, negative polarity < 0)

In [0]:
# Sentiment analysis w. textblob: one polarity value per comment body
# (str() makes sure non-string bodies, e.g. missing values, can still be passed to TextBlob)
coms['sentiment'] = coms['body'].apply(lambda x: TextBlob(str(x)).sentiment.polarity) # get sentiment scores

# created_utc is read from CSV without date parsing, so it has to be converted to
# datetime before the .dt accessor can be used. Doing it here keeps this cell
# runnable on a fresh kernel; converting the column again later is harmless.
coms['created_utc'] = pd.to_datetime(coms['created_utc'])

# Standardise the comment score (z-score) within each year and subreddit
coms['year'] = coms['created_utc'].dt.year
# NOTE: groups with a single comment, or with identical scores, have an undefined
# z-score and yield NaN here.
yearly_scores = coms.groupby(['year', 'subreddit'])['score'].transform(lambda x: (x - x.mean()) / x.std())

# Weight each comment's sentiment by its standardised score
coms['sentiment_score'] = coms['sentiment'] * yearly_scores

## Cleaning

In [0]:
# --- Comments: parse timestamps and derive calendar fields ---
comment_ts = pd.to_datetime(coms['created_utc'])
coms['created_utc'] = comment_ts
coms['date'] = comment_ts.dt.date  # calendar date without the time part
# The Monday of a comment's own week is (timestamp - weekday days). Comments are
# shifted one week forward to better line up with the approvals, so they are keyed
# to the following Monday: timestamp + (7 - weekday) days.
days_to_following_monday = pd.to_timedelta(7 - comment_ts.dt.weekday, unit='d')
coms['week'] = (comment_ts + days_to_following_monday).dt.date

# --- Approval ratings: key each poll to the Monday of its own week ---
approval_ts = pd.to_datetime(appr['date'])
days_since_monday = pd.to_timedelta(approval_ts.dt.weekday, unit='d')
appr['week'] = (approval_ts - days_since_monday).dt.date
appr['date'] = approval_ts.dt.date

Restructure dataframe to:
1) Get mean sentiment scores for each subreddit
2) Group by week
3) Pivot table so that (a) each feature is a subreddit, (b) each observation is a week, and (c) each cell is the mean sentiment score in that subreddit during that week

In [0]:
# Mean weighted sentiment per (week, subreddit) pair
mean_sentiment = (
    coms
    .groupby(['week', 'subreddit'])['sentiment_score']
    .mean()
    .reset_index()
)

# Wide layout: one row per week, one 'sentiment_<subreddit>' column per subreddit
pivoted_coms = (
    mean_sentiment
    .pivot(index='week', columns='subreddit', values='sentiment_score')
    .add_prefix('sentiment_')       # label the value columns as sentiment values
    .rename_axis(columns=None)      # drop the leftover 'subreddit' name on the column index
    .reset_index()                  # turn 'week' back into a regular column
)

# Show result
pivoted_coms.head()

Merging pivoted comments and approval dataframes on week

In [0]:
# Average approval per week and party
mean_approval = appr.groupby(['week', 'party'], as_index=False)['Approving'].mean()
# Left merge: every sentiment week is kept, with or without a matching approval row
coms_appr = pivoted_coms.merge(mean_approval, on='week', how='left')
coms_appr.head()

## Model

Preprocessing

In [0]:
# Encode political party as a numeric dummy: 'D' -> 0, 'R' -> 1
party_codes = {'D': 0, 'R': 1}
coms_appr['party'] = coms_appr['party'].replace(party_codes)
# Create a column for the president based on the week value
def get_president(week):
    """Return the label of the presidential term that contains ``week``.

    Parameters
    ----------
    week : datetime.date, datetime.datetime, pandas.Timestamp or str
        Any value ``pd.Timestamp`` can parse. The ``week`` column holds plain
        ``datetime.date`` objects, which recent pandas versions refuse to order
        against ``pd.Timestamp``, so the value is coerced before comparing.

    Returns
    -------
    str
        ``'Obama'`` before 2017-01-20, ``'Trump1'`` before 2021-01-20,
        ``'Biden'`` before 2025-01-20, otherwise ``'Trump2'``.
    """
    week = pd.Timestamp(week)

    # (start of the NEXT term, label of the term that ends there), in chronological order
    term_ends = (
        ('2017-01-20', 'Obama'),
        ('2021-01-20', 'Trump1'),
        ('2025-01-20', 'Biden'),
    )
    for next_term_start, president in term_ends:
        if week < pd.Timestamp(next_term_start):
            return president
    return 'Trump2'

# Label every week with its president, then count the rows per president
president_labels = coms_appr['week'].map(get_president)
coms_appr['president'] = president_labels
coms_appr['president'].value_counts()

In [0]:
# Inspect the rows that fall in the second Trump term
is_trump2 = coms_appr['president'] == "Trump2"
coms_appr.loc[is_trump2]

Creating pipeline to train XGBoost model

*NOTE: This pipeline is programmed to work with Spark in Databricks. However, some elements are not compatible with the trial version of Databricks.*

In [0]:
# Imports are kept local to this cell so that an environment without sparkxgb
# can still run the rest of the notebook.
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from sparkxgb import XGBoostRegressor

# coms_appr is the preprocessed pandas DataFrame built in the cells above
label_col = "Approving"

# VectorAssembler only accepts numeric/boolean/vector inputs, so the date column
# ('week') and the string column ('president') cannot be features. This is the same
# feature set the plain XGBoost cell uses.
non_feature_cols = {label_col, "week", "president"}
feature_cols = [name for name in coms_appr.columns if name not in non_feature_cols]

# Spark ML stages operate on Spark DataFrames, whereas coms_appr is a pandas
# DataFrame, so it is converted first. Rows without a label cannot be trained on and
# are dropped; remaining columns are cast to float so Spark infers a double schema.
# (The session is not called `spark` because that name is bound to the pyspark
# module by `import pyspark as spark`.)
spark_session = SparkSession.builder.getOrCreate()
coms_appr_spark = spark_session.createDataFrame(
    coms_appr.dropna(subset=[label_col])[feature_cols + [label_col]].astype("float64")
)

# Assemble the feature columns into a single vector column
# NOTE(review): handleInvalid="skip" drops every row with a null/NaN feature before
# it reaches XGBoost, so `missing=nan` below has nothing left to act on; switch to
# handleInvalid="keep" if weeks with missing subreddits should be kept - confirm intent.
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
    handleInvalid="skip"
)

# XGBoost Regressor
xgb_reg = XGBoostRegressor(
    featuresCol="features",
    labelCol=label_col,
    missing=float("nan"),
    predictionCol="prediction",
    objective="reg:squarederror",
    numRound=100,
    eta=0.1,
    maxDepth=6,
    nthread=4
)

# Spark Pipeline
pipeline = Pipeline(stages=[assembler, xgb_reg])

# Fit model
model = pipeline.fit(coms_appr_spark)

# Predict
predictions = model.transform(coms_appr_spark)


XGBoost model (no pipeline)

In [0]:
# Preview the merged sentiment/approval table before modelling
coms_appr.head(5)

In [0]:
# XGboost model
# Rows without an approval value have no target to learn from
coms_appr = coms_appr.dropna(subset=["Approving"])

# Define features and target: the target itself, the date and the president label are not features
non_feature_columns = ["Approving", "week", "president"]
X = coms_appr.drop(columns=non_feature_columns)
y = coms_appr["Approving"]

# Train-test split (80/20), stratified by president
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
    stratify=coms_appr["president"],
)

# Initialize and train the model; NaN feature values are treated as missing
model = xgb.XGBRegressor(random_state=10, missing=np.nan)
model.fit(X_train, y_train)

# Predictions on the held-out rows
y_pred = model.predict(X_test)

# Evaluation
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

metric_report = (
    ("Mean Squared Error (MSE)", mse),
    ("Mean Absolute Error (MAE)", mae),
    ("RMSE", rmse),
    ("R-squared (R²)", r2),
)
for metric_label, metric_value in metric_report:
    print(f"{metric_label}: {metric_value:.4f}")
