- As we have to hand in one file for the data, one file for our analysis, and one file for our text, I would suggest merging the entire analysis into this workbook.
- Maybe we can keep the data generation in a separate file.

- I would also suggest putting all the code into functions, so that we can comment out individual function calls instead of having to re-run the entire code over and over again.

##### Imports & Configs:

In [None]:
# Standard library imports
from collections import Counter
from dataclasses import dataclass
import csv
import random
import re
import string
from typing import List, Set

# Third-party imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from prettytable import PrettyTable

# NLTK imports
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk.util import ngrams

# Gensim imports
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.doc2vec import TaggedDocument

# Scikit-learn imports
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import (
    accuracy_score,
    adjusted_rand_score,
    calinski_harabasz_score,
    davies_bouldin_score,
    f1_score,
    mean_squared_error,
    r2_score,
    silhouette_score
)
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.manifold import trustworthiness

# Transformers and datasets imports
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)

# Other ML/DL imports
import tensorflow_hub as hub
from umap.umap_ import UMAP
import hdbscan
from skopt import gp_minimize
from skopt.space import Real, Integer, Categorical

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

##### Import Datasets:

In [None]:
# Import Data Frames
# `generated` is later split by its `provider` column; `parliament` supplies
# the `translated_text` and `label` columns used further down.
generated = pd.read_csv('0_data/statements.csv')
parliament = pd.read_csv('0_data/final_labeled_dataset.csv')

In [None]:
# Column dtypes and non-null counts of the generated statements.
generated.info()

In [None]:
# Same overview for the parliament data.
parliament.info()

In [None]:
# Five random generated statements as a sanity check.
generated.statement.sample(5)

In [None]:
# Five random parliament texts.
parliament.translated_text.sample(5)

In [None]:
# Missing values per column.
generated.isna().sum()

In [None]:
parliament.isna().sum()

##### Preprocessing Class:

In [None]:
@dataclass
class Preprocess:
    """Text-cleaning helpers shared by the analyses.

    Holds an English stopword set and a WordNet lemmatizer and exposes small
    cleaning / tokenizing / lemmatizing steps built on them.

    Both fields may be passed to the constructor; a field left at ``None`` is
    filled with the NLTK default in ``__post_init__``.
    """
    # Words dropped by rm_stopwords / lemmatize_doc (NLTK English list by default).
    _stopwords: Set[str] = None
    # Lemmatizer used by lemmatize_doc (a fresh WordNetLemmatizer by default).
    _lemmatizer: WordNetLemmatizer = None

    def __post_init__(self):
        # Only fall back to the defaults when nothing was supplied, so that
        # caller-provided stopwords / lemmatizer are not silently overwritten.
        if self._stopwords is None:
            self._stopwords = set(nltk.corpus.stopwords.words('english'))
        else:
            self._stopwords = set(self._stopwords)
        if self._lemmatizer is None:
            self._lemmatizer = WordNetLemmatizer()

    def rm_stopwords(self, text: str) -> str:
        """Return ``text`` without stopwords.

        Splits on whitespace and compares case-sensitively, so the input is
        expected to be lower-cased already (e.g. by ``basic_clean``).
        """
        return ' '.join([word for word in text.split()
                         if word not in self._stopwords])

    def lemmatize_doc(self, tokens: List[str]) -> List[str]:
        """Lemmatize ``tokens``, keeping only useful ones.

        A token is kept when it is purely alphabetic, longer than two
        characters and not a stopword (compared in lower case).
        """
        return [self._lemmatizer.lemmatize(word) for word in tokens
                if word.isalpha() and word.lower() not in self._stopwords and len(word) > 2]

    def trigrams(self, text: str) -> List[tuple]:
        """Return all consecutive token triples of ``text`` (tokenized via ``tokenize_doc``)."""
        tokens = self.tokenize_doc(text)
        return list(ngrams(tokens, 3))

    @staticmethod
    def basic_clean(text: str) -> str:
        """Lower-case ``text`` and delete every character that is not a-z or whitespace.

        The input is passed through ``str()`` first, so non-string values are
        converted rather than rejected.
        """
        return re.sub(r'[^a-z\s]', '', str(text).lower())

    @staticmethod
    def tokenize_doc(text: str) -> List[str]:
        """Lower-case ``text`` and split it with NLTK's ``word_tokenize``."""
        return word_tokenize(text.lower())

In [None]:
# Single shared instance; the cleaning steps in the following cells call its methods.
preprocessor = Preprocess()

## Descriptive Analytics

In [None]:
generated_da = generated.copy()
parliament_da = parliament.copy()

In [None]:
parliament_da['clean_with_stopwords'] = parliament_da['translated_text'].apply(preprocessor.basic_clean)
generated_da['clean_with_stopwords'] = generated_da['statement'].apply(preprocessor.basic_clean)

In [None]:
# Length + Style Metrics (with stopwords)
parliament_da['char_count'] = parliament_da['clean_with_stopwords'].str.len()
parliament_da['word_count'] = parliament_da['clean_with_stopwords'].str.split().str.len()
parliament_da['source'] = 'Real'

generated_da['char_count'] = generated_da['clean_with_stopwords'].str.len()
generated_da['word_count'] = generated_da['clean_with_stopwords'].str.split().str.len()
generated_da['source'] = generated_da['provider'].str.capitalize()

In [None]:
parliament_da['clean_no_stopwords'] = parliament_da['clean_with_stopwords'].apply(preprocessor.rm_stopwords)
generated_da['clean_no_stopwords'] = generated_da['clean_with_stopwords'].apply(preprocessor.rm_stopwords)

In [None]:
def get_word_counts(texts):
    """Count every word-like token across all strings in ``texts``.

    Tokens are the matches of ``\\b\\w+\\b`` in each text; the result is a
    ``Counter`` mapping token -> number of occurrences over all texts.
    """
    tally = Counter()
    for document in texts:
        tally.update(re.findall(r'\b\w+\b', document))
    return tally

In [None]:
# Word frequency analysis
real_words = get_word_counts(parliament_da['clean_no_stopwords'])
llm_words = get_word_counts(generated_da['clean_no_stopwords'])

In [None]:
# Top 20
real_top20 = pd.DataFrame(real_words.most_common(20), columns=['word', 'real_count'])
llm_top20 = pd.DataFrame(llm_words.most_common(20), columns=['word', 'llm_count'])

In [None]:
# Merge top word frequencies
word_counts = pd.merge(real_top20, llm_top20, on='word', how='outer').fillna(0)

In [None]:
# Combine for analysis
df_da = pd.concat([
    parliament_da[['char_count', 'word_count', 'source']],
    generated_da[['char_count', 'word_count', 'source']]
])

In [None]:
# Histogram: Character Count
plt.figure(figsize=(12, 5))
# One step-outline histogram per source on shared axes, showing raw counts.
sns.histplot(data=df_da, x='char_count', hue='source', bins=40, element='step', stat='count', common_norm=False)
plt.title("Character Count Distribution by Source")
plt.xlabel("Character Count")
plt.ylabel("Count")
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# compute word frequencies and normalize
def top_word_freqs(texts, label, total_words=None, top_n=20):
    """Return the ``top_n`` most frequent words of ``texts`` with relative frequencies.

    Args:
        texts: iterable of documents; each one is passed through ``str()``.
        label: value written to the ``source`` column of every row.
        total_words: denominator for the percentage; defaults to the total
            number of tokens found in ``texts``.
        top_n: number of words to keep.

    Returns:
        DataFrame with columns ``word``, ``frequency`` (percent) and ``source``.
    """
    counter = Counter(
        token
        for text in texts
        for token in re.findall(r'\b\w+\b', str(text))
    )
    denominator = sum(counter.values()) if total_words is None else total_words

    table = pd.DataFrame(counter.most_common(top_n), columns=['word', 'count'])
    table['frequency'] = table['count'] / denominator * 100
    table['source'] = label
    return table[['word', 'frequency', 'source']]

In [None]:
# Split LLM data
df_chatgpt = generated_da[generated_da['source'] == 'Chatgpt']
df_deepseek = generated_da[generated_da['source'] == 'Deepseek']

In [None]:
# Generate top 20 frequency tables
real_freqs = top_word_freqs(parliament_da['clean_no_stopwords'], 'Real')
chatgpt_freqs = top_word_freqs(df_chatgpt['clean_no_stopwords'], 'ChatGPT')
deepseek_freqs = top_word_freqs(df_deepseek['clean_no_stopwords'], 'DeepSeek')

In [None]:
# Combine all
df_words_long = pd.concat([real_freqs, chatgpt_freqs, deepseek_freqs], ignore_index=True)

In [None]:
# Plot grouped bar plot
plt.figure(figsize=(14, 6))
# One bar per (word, source) pair; a word missing from a source's top list has no bar there.
sns.barplot(data=df_words_long, x='word', y='frequency', hue='source')
plt.title("Top Shared Words by Relative Frequency (%) — Grouped Bar Plot")
plt.ylabel("Frequency (%)")
plt.xlabel("Word")
# Rotate the word labels so they do not overlap.
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Boxplot: Type-Token Ratio
# df_da only carries char_count / word_count / source, so the type-token
# ratio has to be derived here before it can be plotted.
def type_token_ratio(text):
    """Return unique tokens / total tokens of whitespace-split ``text`` (NaN when empty)."""
    tokens = str(text).split()
    return len(set(tokens)) / len(tokens) if tokens else float('nan')

df_ttr = pd.concat([
    parliament_da[['clean_with_stopwords', 'source']],
    generated_da[['clean_with_stopwords', 'source']]
], ignore_index=True)
df_ttr['ttr'] = df_ttr['clean_with_stopwords'].apply(type_token_ratio)

plt.figure(figsize=(12, 5))
sns.boxplot(data=df_ttr, x='source', y='ttr')
plt.title("Type-Token Ratio (TTR) by Source")
plt.ylabel("TTR")
plt.grid(True)
plt.tight_layout()
plt.show()

## Analysis

### Topic modeling

In [None]:
# Shared random seed: passed as random_state to UMAP and gp_minimize in the tuning helpers.
SEED = 42

In [None]:
def tune_umap_parameters(embeddings, n_calls: int=50, verbose: bool=False):
    """Search UMAP hyper-parameters with Bayesian optimisation (``gp_minimize``).

    Every candidate (``n_neighbors`` in 10..50, ``min_dist`` in 0.0..0.3,
    euclidean metric) projects ``embeddings`` to 2-D; the projection is scored
    with sklearn's ``trustworthiness`` and the negated score is minimised.

    Args:
        embeddings: samples to project; must support ``len()`` and be
            accepted by ``UMAP.fit_transform``.
        n_calls: number of objective evaluations for ``gp_minimize``.
        verbose: forwarded to ``gp_minimize``.

    Returns:
        Tuple ``(best_params, best_reducer)``: the best parameter dict and a
        new, not yet fitted 2-component ``UMAP`` configured with it.
    """
    space = [
        Integer(10, 50, name='n_neighbors'),
        Real(0.0, 0.3, name='min_dist'),
        Categorical(['euclidean'], name='metric')
    ]

    # trustworthiness() requires n_neighbors < n_samples / 2 and raises
    # otherwise, so cap the neighbourhood accordingly for small inputs.
    trust_neighbors = max(1, min(20, (len(embeddings) - 1) // 2))

    def objective(params, n_components=2):
        n_neighbors, min_dist, metric = params

        reducer = UMAP(
            n_neighbors=n_neighbors,
            min_dist=min_dist,
            metric=metric,
            n_components=n_components,
            random_state=SEED
        )

        embedding = reducer.fit_transform(embeddings)

        trust_score = trustworthiness(
            embeddings,
            embedding,
            n_neighbors=trust_neighbors
        )

        # gp_minimize minimises, so a higher trustworthiness must map to a lower value.
        return -trust_score

    result = gp_minimize(
        objective,
        space,
        n_calls=n_calls,
        random_state=SEED,
        verbose=verbose
    )

    best_params = {
        'n_neighbors': result.x[0],
        'min_dist': result.x[1],
        'metric': result.x[2]
    }

    print("\nBest parameters:")
    for param, value in best_params.items():
        print(f"{param}: {value}")

    # Undo the sign flip applied in the objective.
    print(f"\nBest score: {-result.fun:.4f}")

    best_reducer = UMAP(
        **best_params,
        n_components=2,
        random_state=SEED
    )

    return best_params, best_reducer

In [None]:
def tune_hdbscan_parameters(embeddings,  n_calls: int=50, verbose: bool=False):
    """Search HDBSCAN hyper-parameters with ``gp_minimize``.

    A candidate is clustered ``n_runs`` times. Its score combines the mean
    pairwise adjusted Rand index between the runs (weight 0.4), the mean
    silhouette score of the non-noise points (weight 0.4) and the noise
    ratio of the last run (weight -0.2); the negated score is minimised.

    Args:
        embeddings: numpy array of samples to cluster (indexed with boolean masks).
        n_calls: number of objective evaluations for ``gp_minimize``.
        verbose: forwarded to ``gp_minimize``.

    Returns:
        Tuple ``(best_params, clusterer)``: the best parameter dict and an
        ``HDBSCAN`` instance built from it that has been fitted on ``embeddings``.
    """
    param_names = ('min_cluster_size', 'min_samples',
                   'cluster_selection_epsilon', 'metric')
    search_space = [
        Integer(3, 15, name=param_names[0]),
        Integer(3, 10, name=param_names[1]),
        Real(0.0, 0.5, name=param_names[2]),
        Categorical(['euclidean'], name=param_names[3])
    ]

    def count_clusters(labels):
        # The noise label (-1) does not count as a cluster.
        return len(set(labels)) - (1 if -1 in labels else 0)

    def objective(params, n_runs=5):
        candidate = dict(zip(param_names, params))

        runs = []
        silhouettes = []
        for _ in range(n_runs):
            run_labels = hdbscan.HDBSCAN(**candidate).fit_predict(embeddings)
            runs.append(run_labels)

            # Silhouette is only defined for at least two clusters.
            clustered = run_labels != -1
            if count_clusters(run_labels) > 1 and np.sum(clustered) > 1:
                silhouettes.append(
                    silhouette_score(embeddings[clustered], run_labels[clustered])
                )

        # Agreement between every pair of runs.
        pair_agreement = [
            adjusted_rand_score(runs[a], runs[b])
            for a in range(len(runs))
            for b in range(a + 1, len(runs))
        ]

        mean_stability = np.mean(pair_agreement) if pair_agreement else 0
        mean_silhouette = np.mean(silhouettes) if silhouettes else 0

        last_run = runs[-1]
        noise_ratio = np.sum(last_run == -1) / len(last_run)

        # Negated because gp_minimize minimises.
        return -(0.4 * mean_stability +
                 0.4 * mean_silhouette -
                 0.2 * noise_ratio)

    result = gp_minimize(
        objective,
        search_space,
        n_calls=n_calls,
        random_state=SEED,
        verbose=verbose
    )

    best_params = dict(zip(param_names, result.x))

    print("\nBest HDBSCAN parameters:")
    for param, value in best_params.items():
        print(f"{param}: {value}")

    print(f"\nBest score: {-result.fun:.4f}")

    # Refit once with the winning parameters to report the resulting clustering.
    clusterer = hdbscan.HDBSCAN(**best_params)
    labels = clusterer.fit_predict(embeddings)
    n_clusters = count_clusters(labels)
    noise_points = int(np.sum(labels == -1))

    print(f"\nNumber of clusters: {n_clusters}")
    print(f"Number of noise points: {noise_points} ({noise_points/len(labels):.2%})")

    return best_params, clusterer

In [None]:
def get_closest_words(topic_vector, word_vectors, n=10):
    """Return the indices of the ``n`` rows of ``word_vectors`` closest to ``topic_vector``.

    Closeness is cosine similarity; indices are ordered from most to least
    similar. ``n <= 0`` yields an empty index array.
    """
    if n <= 0:
        # A ``[-0:]`` slice would select every index instead of none.
        return np.array([], dtype=np.intp)
    similarities = cosine_similarity([topic_vector], word_vectors)[0]
    return np.argsort(similarities)[::-1][:n]

In [None]:
def tm_cleaning(doc):
    """Turn a raw document into lemmatized tokens for topic modeling.

    Runs the shared ``preprocessor`` steps in order: ``basic_clean``,
    ``tokenize_doc``, then ``lemmatize_doc``.
    """
    cleaned = preprocessor.basic_clean(doc)
    return preprocessor.lemmatize_doc(preprocessor.tokenize_doc(cleaned))

In [None]:
df_chatgpt = generated[generated.provider == 'chatgpt']
df_deepseek = generated[generated.provider == 'deepseek']

In [None]:
docs_chatgpt = [TaggedDocument(doc, [i]) for i, doc in enumerate(df_chatgpt.statement.apply(tm_cleaning).tolist())]
docs_deepseek = [TaggedDocument(doc, [i]) for i, doc in enumerate(df_deepseek.statement.apply(tm_cleaning).tolist())]

In [None]:
texts_chatgpt = [' '.join(doc.words) if hasattr(doc, 'words') else doc for doc in docs_chatgpt]
texts_deepseek = [' '.join(doc.words) if hasattr(doc, 'words') else doc for doc in docs_deepseek]

In [None]:
embed = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
scaler = StandardScaler()
embeddings_chatgpt = scaler.fit_transform(embed(texts_chatgpt).numpy())
embeddings_deepseek = scaler.fit_transform(embed(texts_deepseek).numpy())

In [None]:
best_umap_params_chatgpt, reducer_chatgpt = tune_umap_parameters(embeddings_chatgpt)

In [None]:
best_umap_params_deepseek, reducer_deepseek = tune_umap_parameters(embeddings_deepseek)

In [None]:
best_hdbscan_params_chatgpt, clusterer_chatgpt = tune_hdbscan_parameters(embeddings_chatgpt)

In [None]:
best_hdbscan_params_deepseek, clusterer_deepseek = tune_hdbscan_parameters(embeddings_deepseek)

In [None]:
umap_embeddings_chatgpt = reducer_chatgpt.fit_transform(embeddings_chatgpt)
umap_embeddings_deepseek = reducer_deepseek.fit_transform(embeddings_deepseek)

In [None]:
cluster_labels_chatgpt = clusterer_chatgpt.fit_predict(umap_embeddings_chatgpt)
cluster_labels_deepseek = clusterer_deepseek.fit_predict(umap_embeddings_deepseek)

In [None]:
# Per provider: (2-D UMAP coordinates, HDBSCAN cluster labels).
umap_hdbscan_results = {
    'ChatGPT': (umap_embeddings_chatgpt, cluster_labels_chatgpt),
    'DeepSeek': (umap_embeddings_deepseek, cluster_labels_deepseek)
}

In [None]:
# Side-by-side scatter plots, one panel per provider.
fig, axes = plt.subplots(1, 2, figsize=(20, 8))
fig.suptitle('Document Clusters Comparison', fontsize=16, y=1.05)

for idx, (name, res) in enumerate(umap_hdbscan_results.items()):
    # res[0] holds the coordinates, res[1] the labels used for colouring.
    scatter = axes[idx].scatter(res[0][:, 0],
                               res[0][:, 1],
                               c=res[1],
                               cmap='Spectral',
                               alpha=0.6)
    fig.colorbar(scatter, ax=axes[idx])
    axes[idx].set_title(f'{name}')
    axes[idx].set_xlabel('UMAP 1')
    axes[idx].set_ylabel('UMAP 2')

plt.tight_layout()
plt.show()

In [None]:
n_clusters_chatgpt = len(np.unique(cluster_labels_chatgpt[cluster_labels_chatgpt != -1]))

topic_vectors = []

for i in range(n_clusters_chatgpt):
    cluster_docs = embeddings_chatgpt[cluster_labels_chatgpt == i]
    centroid = np.mean(cluster_docs, axis=0)
    topic_vectors.append(centroid)

topic_vectors_chatgpt = np.array(topic_vectors)

In [None]:
n_clusters_deepseek = len(np.unique(cluster_labels_deepseek[cluster_labels_deepseek != -1]))

topic_vectors = []

for i in range(n_clusters_deepseek):
    cluster_docs = embeddings_deepseek[cluster_labels_deepseek == i]
    centroid = np.mean(cluster_docs, axis=0)
    topic_vectors.append(centroid)

topic_vectors_deepseek = np.array(topic_vectors)

##### Results:

In [None]:
def top_words(cluster_labels,
              texts,
              topic_vectors,
              embeddings,
              n_words=10):
    """Print and return the most characteristic words of every cluster.

    Words are ranked with a class-based TF-IDF: the term frequency inside a
    cluster (normalised by the cluster's word count) is multiplied by
    ``log(1 + A / f_t)``, where ``A`` is the average number of words per
    cluster and ``f_t`` the total count of the term over all clusters. The
    per-term weight is what lets a word that is frequent everywhere rank
    below a word concentrated in one cluster; a single per-cluster scalar
    would leave the ranking identical to raw term frequency.

    Args:
        cluster_labels: cluster id per document, aligned with ``texts``;
            ``-1`` marks noise and is ignored. Ids are expected to be
            ``0..k-1``.
        texts: whitespace-joined documents.
        topic_vectors: unused; kept so existing calls keep working.
        embeddings: unused; kept so existing calls keep working.
        n_words: number of words reported per cluster.

    Returns:
        Flat list with the top words of all clusters, cluster after cluster.
        Empty when there is no cluster or ``n_words <= 0``.
    """
    cluster_labels = np.asarray(cluster_labels)
    n_clusters = len(np.unique(cluster_labels[cluster_labels != -1]))
    if n_clusters == 0 or n_words <= 0:
        # n_words == 0 would otherwise hit the ``[-0:]`` slice and return the whole vocabulary.
        return []

    count_vectorizer = CountVectorizer()
    doc_term_matrix = count_vectorizer.fit_transform(texts)
    vocabulary = count_vectorizer.get_feature_names_out()

    # Raw term counts per cluster.
    term_counts = np.zeros((n_clusters, len(vocabulary)))
    for cluster_id in range(n_clusters):
        cluster_docs = doc_term_matrix[cluster_labels == cluster_id]
        if cluster_docs.shape[0] == 0:
            continue
        term_counts[cluster_id] = np.asarray(cluster_docs.sum(axis=0)).ravel()

    words_per_cluster = term_counts.sum(axis=1, keepdims=True)
    # np.maximum guards against division by zero for empty clusters / unseen terms.
    term_frequency = term_counts / np.maximum(words_per_cluster, 1)
    term_totals = term_counts.sum(axis=0)
    inverse_class_frequency = np.log1p(
        words_per_cluster.mean() / np.maximum(term_totals, 1)
    )
    c_tf_idf_matrix = term_frequency * inverse_class_frequency

    all_top_words = []

    for topic_idx, scores in enumerate(c_tf_idf_matrix):
        ranked_idx = scores.argsort()[::-1][:n_words]
        topic_top_words = [vocabulary[idx] for idx in ranked_idx]
        all_top_words.append(topic_top_words)

        print(f"\nTopic {topic_idx + 1} Top Words:")
        print(", ".join(topic_top_words))

    return [item for sublist in all_top_words for item in sublist]

In [None]:
top_words_chatgpt = top_words(cluster_labels_chatgpt,
                              texts_chatgpt,
                              topic_vectors_chatgpt,
                              embeddings_chatgpt,
                              n_words=10)

In [None]:
top_words_deepseek = top_words(cluster_labels_deepseek,
                               texts_deepseek,
                               topic_vectors_deepseek,
                               embeddings_deepseek,
                               n_words=10)

In [None]:
unique_words_chatgpt = set(top_words_chatgpt) - set(top_words_deepseek)
print(f'Unique words in ChatGPT: {unique_words_chatgpt}')

unique_words_deepseek = set(top_words_deepseek) - set(top_words_chatgpt)
print(f'Unique words deepseek: {unique_words_deepseek}')

common_words = set(top_words_deepseek) & set(top_words_chatgpt)
print(f'Common words: {common_words}')

In [None]:
def evaluate_topic_modeling(cluster_labels, embeddings, texts, topic_vectors, model_results=None):
    """Compute and print quality metrics for a clustering-based topic model.

    Args:
        cluster_labels: cluster id per document (``-1`` = noise), aligned
            with ``embeddings`` and ``texts``.
        embeddings: numpy array of document embeddings.
        texts: whitespace-joined documents.
        topic_vectors: one vector per topic; topic ``i`` corresponds to
            cluster id ``i``.
        model_results: unused; kept so existing calls keep working.

    Returns:
        Dict with four sub-dicts: ``clustering_metrics`` (silhouette,
        Calinski-Harabasz, Davies-Bouldin on non-noise points; empty when
        fewer than two clusters exist), ``coherence`` (c_v coherence of each
        topic's ten most frequent words), ``distinctiveness`` (mean / max
        cosine similarity between different topic vectors) and
        ``size_metrics`` (std / range of cluster sizes, noise ratio).
        Metrics that cannot be computed are reported as NaN.
    """
    nan = float('nan')
    cluster_labels = np.asarray(cluster_labels)
    valid_mask = cluster_labels != -1
    valid_labels = cluster_labels[valid_mask]
    n_found = len(np.unique(valid_labels))

    clustering_metrics = {}
    # The sklearn scores need 2 <= n_clusters <= n_samples - 1 and raise otherwise.
    if 2 <= n_found < len(valid_labels):
        valid_embeddings = embeddings[valid_mask]
        clustering_metrics = {
            'silhouette_score': silhouette_score(valid_embeddings, valid_labels),
            'calinski_harabasz_score': calinski_harabasz_score(valid_embeddings, valid_labels),
            'davies_bouldin_score': davies_bouldin_score(valid_embeddings, valid_labels)
        }

    tokenized_texts = [text.split() for text in texts]
    dictionary = Dictionary(tokenized_texts)

    # Ten most frequent words of every topic's documents.
    topic_words = []
    for i in range(len(topic_vectors)):
        word_freq = Counter(
            word
            for text, label in zip(texts, cluster_labels) if label == i
            for word in text.split()
        )
        topic_words.append([word for word, _ in word_freq.most_common(10)])

    # Topics without any word cannot be scored, so leave them out.
    scored_topics = [words for words in topic_words if words]
    if scored_topics:
        coherence_model = CoherenceModel(
            topics=scored_topics,
            texts=tokenized_texts,
            dictionary=dictionary,
            coherence='c_v'
        )
        coherence_value = coherence_model.get_coherence()
    else:
        coherence_value = nan

    coherence_metrics = {
        'c_v_coherence': coherence_value
    }

    n_topics = len(topic_vectors)
    if n_topics >= 2:
        topic_similarities = cosine_similarity(topic_vectors)
        # Only compare different topics: the diagonal (self-similarity) must
        # not enter the mean or cap the maximum.
        off_diagonal = topic_similarities[~np.eye(n_topics, dtype=bool)]
        distinctiveness_metrics = {
            'mean_similarity': np.mean(off_diagonal),
            'max_similarity': np.max(off_diagonal)
        }
    else:
        distinctiveness_metrics = {
            'mean_similarity': nan,
            'max_similarity': nan
        }

    topic_sizes = np.bincount(valid_labels)
    has_topics = topic_sizes.size > 0
    size_metrics = {
        'size_std': np.std(topic_sizes) if has_topics else nan,
        'size_range': np.ptp(topic_sizes) if has_topics else nan,
        'noise_ratio': (np.sum(~valid_mask) / len(cluster_labels)
                        if len(cluster_labels) else nan)
    }

    evaluation_results = {
        'clustering_metrics': clustering_metrics,
        'coherence': coherence_metrics,
        'distinctiveness': distinctiveness_metrics,
        'size_metrics': size_metrics
    }

    print("\nEvaluation Results")
    print("-" * 50)

    for category, metrics in evaluation_results.items():
        print(f"\n{category.replace('_', ' ').title()}:")
        for metric, value in metrics.items():
            print(f"{metric.replace('_', ' ').title()}: {value:.4f}")

    return evaluation_results

In [None]:
# Evaluate the ChatGPT topics. `model_results` is accepted by
# evaluate_topic_modeling but never read inside it.
eval_res_chatgpt = evaluate_topic_modeling(cluster_labels_chatgpt,
                                            embeddings_chatgpt,
                                            texts_chatgpt,
                                            topic_vectors_chatgpt,
                                            model_results=umap_hdbscan_results)

In [None]:
# Same evaluation for the DeepSeek topics.
eval_res_deepseek = evaluate_topic_modeling(cluster_labels_deepseek,
                                            embeddings_deepseek,
                                            texts_deepseek,
                                            topic_vectors_deepseek,
                                            model_results=umap_hdbscan_results)

### Sentiment Analysis

In [None]:
nltk.download('vader_lexicon')

In [None]:
# Sentiment analysis
sia = SentimentIntensityAnalyzer()
def get_sentiment(text):
    sentiment = sia.polarity_scores(text)
    return sentiment['compound']

In [None]:
parliament['sentiment'] = parliament['translated_text'].apply(get_sentiment)
# Every other cell reads the generated text from 'statement'; fall back to it
# when the data has no 'completion' column instead of failing with a KeyError.
generated_text_col = 'completion' if 'completion' in generated.columns else 'statement'
generated['sentiment'] = generated[generated_text_col].apply(get_sentiment)

##### Results:

In [None]:
def get_descriptive_printout(sentiment):
    """Return ``[mean, std, min, max, count]`` of the ``sentiment`` series."""
    return [sentiment.mean(), sentiment.std(), sentiment.min(), sentiment.max(), sentiment.count()]

sentiment_original = get_descriptive_printout(parliament['sentiment'])
# The rest of the notebook identifies the LLM via 'provider'; fall back to it
# when the data has no 'client' column instead of failing with a KeyError.
provider_col = 'client' if 'client' in generated.columns else 'provider'
is_openai = generated[provider_col] == 'chatgpt'
is_deepseek = generated[provider_col] == 'deepseek'
sentiment_openai_deepseek = get_descriptive_printout(generated.loc[is_deepseek, 'sentiment'])
sentiment_openai_chatgpt = get_descriptive_printout(generated.loc[is_openai, 'sentiment'])

In [None]:
# Create table
sentiment_table = PrettyTable()
sentiment_table.field_names = ['Source', 'Mean', 'Std Dev', 'Min', 'Max', 'Count']
sentiment_table.add_row(['Original', *sentiment_original])
sentiment_table.add_row(['ChatGPT', *sentiment_openai_chatgpt])
sentiment_table.add_row(['DeepSeek', *sentiment_openai_deepseek])

In [None]:
# Print table
print('Sentiment Analysis (Vader) Results:')
print('> uncleaned data')
print(sentiment_table)

### Extremity Regression

##### Ridge Regression:

In [None]:
def reg_preprocess_text(dataset):
    """Normalise every text in ``dataset`` for the regression features.

    Each element goes through the shared ``preprocessor``: ``basic_clean``,
    ``rm_stopwords``, ``tokenize_doc`` and ``lemmatize_doc``; the resulting
    lemmas are joined back into one space-separated string.

    Args:
        dataset: object with a ``map`` method (used here with a pandas Series).

    Returns:
        The result of ``dataset.map`` with one normalised string per element.
    """
    def _normalise(raw_text):
        without_stopwords = preprocessor.rm_stopwords(preprocessor.basic_clean(raw_text))
        lemmas = preprocessor.lemmatize_doc(preprocessor.tokenize_doc(without_stopwords))
        return ' '.join(lemmas)

    return dataset.map(_normalise)

X = parliament['translated_text']
X = reg_preprocess_text(X)

In [None]:
y = parliament['label']

In [None]:
# Split into train/test
# 80/20 split with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# TF-IDF vectorization
# Unigrams and bigrams, capped at 5000 features; the vocabulary is fitted on
# the training texts only and reused for the test texts.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [None]:
# Train Ridge regression model
# NOTE: the name `model` is rebound to the RoBERTa model in a later cell.
model = Ridge()
model.fit(X_train_vec, y_train)

In [None]:
# Predict
y_pred = model.predict(X_test_vec)

In [None]:
# Evaluate
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

In [None]:
# Plot predictions
plt.figure(figsize=(8, 5))
sns.scatterplot(x=y_test, y=y_pred)
# Dashed diagonal from (-1, -1) to (1, 1): points on it are perfect predictions.
plt.plot([-1, 1], [-1, 1], '--', color='gray')
plt.title("Predicted vs. Actual Extremity")
plt.xlabel("Actual Extremity")
plt.ylabel("Predicted Extremity")
plt.tight_layout()
plt.show()

##### RoBERTa:

In [None]:
import os
# presumably keeps the Hugging Face Trainer from logging to Weights & Biases — TODO confirm
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Load dataset
# Keep only rows that have both a text and a label.
df = parliament[parliament["translated_text"].notna() & parliament["label"].notna()]

In [None]:
# Split data
# 80/20 split with a fixed random_state.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
# Prepare datasets
# The target column is renamed to "labels" before the frames are converted;
# presumably that is the column name the Trainer expects — TODO confirm.
train_df = train_df.rename(columns={"label": "labels"})
test_df = test_df.rename(columns={"label": "labels"})
train_dataset = Dataset.from_pandas(train_df[["translated_text", "labels"]])
test_dataset = Dataset.from_pandas(test_df[["translated_text", "labels"]])

In [None]:
# Use RoBERTa for regression
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [None]:
def tokenize(example):
    """Tokenize the ``translated_text`` field with max-length padding and truncation."""
    return tokenizer(example["translated_text"], padding="max_length", truncation=True)

In [None]:
# Add the tokenizer outputs to both datasets; `tokenize` is applied in batches.
train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

In [None]:
# Load model for regression
# The dropout rate is passed to from_pretrained so that it is part of the
# config *before* the network is built; assigning
# model.config.hidden_dropout_prob afterwards leaves the already constructed
# dropout layers unchanged.
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=1,
    problem_type="regression",
    hidden_dropout_prob=0.3  # reduce overfitting
)

In [None]:
# Define evaluation metrics
def compute_metrics(eval_pred):
    """Return MSE and R^2 for a ``(predictions, labels)`` pair.

    Singleton dimensions are squeezed out of the predictions before they are
    compared with the labels.
    """
    predictions, targets = eval_pred
    flat_predictions = predictions.squeeze()
    mse = mean_squared_error(targets, flat_predictions)
    r2 = r2_score(targets, flat_predictions)
    return {"mse": mse, "r2": r2}

In [None]:
# Training arguments
# NOTE(review): no evaluation / checkpoint-selection options are set here, so
# nothing in these arguments picks a "best" epoch — confirm this is intended.
training_args = TrainingArguments(
    output_dir="./spectrum_bert_results",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=12,
    weight_decay=0.01,
    logging_dir="./spectrum_logs"
)

In [None]:
# Trainer
# The held-out test split doubles as the evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

In [None]:
# train
trainer.train()

In [None]:
# evaluate
trainer.evaluate()

In [None]:
# Persist the model as it is after trainer.train() together with its tokenizer;
# despite the directory name, no best-checkpoint selection happens in these cells.
model.save_pretrained("roberta_best")
tokenizer.save_pretrained("roberta_best")

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Reload the fine-tuned model and tokenizer from disk.
model_path = "roberta_best"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

model.eval()

statements = generated["statement"].tolist()

# Predict
# One forward pass per statement; gradients are not needed for inference.
extremity_scores = []
with torch.no_grad():
    for text in statements:
        encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        # The regression head emits a single logit, read out as a Python float.
        extremity_scores.append(model(**encoded).logits.item())

generated["extremity_score"] = extremity_scores

In [None]:
# Density of the predicted scores, one curve per provider (each normalised on its own).
plt.figure(figsize=(8, 5))
sns.kdeplot(data=generated, x="extremity_score", hue="provider", fill=True, common_norm=False, alpha=0.5)
# Dashed reference line at a score of 0.
plt.axvline(0, linestyle="--", color="gray")
plt.title("Distribution of Predicted Extremity Scores by Provider")
plt.xlabel("Extremity Score (-1 = Left, +1 = Right)")
plt.ylabel("Density")
plt.tight_layout()
plt.show()

In [None]:
# Same scores as one box per provider.
sns.boxplot(data=generated, x="provider", y="extremity_score")
plt.title("Extremity Score Distribution per LLM")
plt.axhline(0, linestyle="--", color="gray")
plt.ylabel("Predicted Extremity")
plt.show()

In [None]:
# Summary statistics of the predicted score per provider.
print(generated.groupby("provider")["extremity_score"].agg(["mean", "std", "min", "max", "median"]))

In [None]:
# Share of ChatGPT statements with a negative predicted score.
chatgpt_df = generated[generated["provider"].str.lower() == "chatgpt"]
num_left_leaning = (chatgpt_df["extremity_score"] < 0).sum()
total = len(chatgpt_df)
if total:
    print(f"ChatGPT statements leaning left (< 0): {num_left_leaning} out of {total} ({(num_left_leaning/total)*100:.2f}%)")
else:
    # Avoid a ZeroDivisionError when the filter matches no rows.
    print("No ChatGPT statements found; cannot compute the left-leaning share.")