In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
import pandas as pd
import numpy as np
from utils import one_hot_encode_columns
from joblib import load

In [10]:
anes_survey = 'anes_preprocessed_data.csv'
anes_df = pd.read_csv(anes_survey, low_memory=False)

anes_df = anes_df[['date', 'state', 'gender', 'race', 'age_group', 'edu', 'income', 'vote', 
                   'Yahoo', 'CNN', 'New York Times', 'Breitbart', 'Fox', 'Washington Post',
                   'The Guardian', 'USA Today', 'BBC', 'NPR', 'Buzzfeed']]

# perform one-hot encoding on the categorical column
categorical_columns = ['state', 'gender', 'race']
anes_df = one_hot_encode_columns(anes_df, categorical_columns)

# Separate features and target
X = anes_df.drop(['date', 'vote'], axis=1)
y = anes_df['vote']

# Step 2: Train-Test Split
# 80% train+val, 20% test
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Within train_full: 25% for validation
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=42)

In [6]:
# Step 3: Predict News Usage from Demographics (Multitarget)
news_cols = ['Yahoo', 'CNN', 'New York Times', 'Breitbart', 'Fox', 'Washington Post',
             'The Guardian', 'USA Today', 'BBC', 'NPR', 'Buzzfeed']
demo_cols = [col for col in X.columns if col not in news_cols]

X_demo_train = X_train[demo_cols]
X_demo_val = X_val[demo_cols]
X_demo_test = X_test[demo_cols]

y_news_train = X_train[news_cols]
y_news_val = X_val[news_cols]
y_news_test = X_test[news_cols]

# Use MultiOutputClassifier
base_model = RandomForestClassifier(random_state=42)
multi_clf = MultiOutputClassifier(base_model)
multi_clf.fit(X_demo_train, y_news_train)

# Predict on train, val, test
predicted_news_train = pd.DataFrame(multi_clf.predict(X_demo_train), 
                                    columns=news_cols, index=X_train.index)
predicted_news_val = pd.DataFrame(multi_clf.predict(X_demo_val), 
                                  columns=news_cols, index=X_val.index)
predicted_news_test = pd.DataFrame(multi_clf.predict(X_demo_test), 
                                   columns=news_cols, index=X_test.index)

# Optional: Evaluate per outlet
for i, outlet in enumerate(news_cols):
    print(f"News Usage Prediction - {outlet}")
    print(classification_report(y_news_val[outlet], predicted_news_val[outlet]))


News Usage Prediction - Yahoo
              precision    recall  f1-score   support

           0       0.88      0.98      0.93      1151
           1       0.08      0.01      0.02       150

    accuracy                           0.87      1301
   macro avg       0.48      0.50      0.48      1301
weighted avg       0.79      0.87      0.83      1301

News Usage Prediction - CNN
              precision    recall  f1-score   support

           0       0.82      0.95      0.88      1058
           1       0.35      0.12      0.18       243

    accuracy                           0.79      1301
   macro avg       0.58      0.53      0.53      1301
weighted avg       0.73      0.79      0.75      1301

News Usage Prediction - New York Times
              precision    recall  f1-score   support

           0       0.84      0.94      0.89      1074
           1       0.36      0.15      0.22       227

    accuracy                           0.80      1301
   macro avg       0.60      0.

In [7]:
# Step 4: Predict Vote from Demographics + Predicted News Usage
X_vote_train = pd.concat([X_demo_train, predicted_news_train], axis=1)
X_vote_val = pd.concat([X_demo_val, predicted_news_val], axis=1)
X_vote_test = pd.concat([X_demo_test, predicted_news_test], axis=1)


vote_model = load('models/news_usage_model.pkl')
vote_model.fit(X_vote_train, y_train)

# Evaluate on validation and test set
print("Validation Performance:")
y_val_pred = vote_model.predict(X_vote_val)
print(classification_report(y_val, y_val_pred))


Validation Performance:
              precision    recall  f1-score   support

           1       0.73      0.67      0.70       742
           2       0.61      0.68      0.64       559

    accuracy                           0.67      1301
   macro avg       0.67      0.67      0.67      1301
weighted avg       0.68      0.67      0.67      1301



In [15]:
print("Test Performance:")
y_test_pred = vote_model.predict(X_vote_test)
print(classification_report(y_test, y_test_pred))


Test Performance:
              precision    recall  f1-score   support

           1       0.72      0.64      0.68       731
           2       0.59      0.67      0.63       570

    accuracy                           0.66      1301
   macro avg       0.66      0.66      0.65      1301
weighted avg       0.66      0.66      0.66      1301

