- As we have to hand in one file for the data, one file for our analysis and one file for our text, I would suggest merging the entire analysis into this workbook.
- Maybe we can keep the data generation in a separate file.

- I would also suggest putting all the code into functions, so that we can comment out individual function calls instead of having to re-run the entire notebook over and over again.

In [None]:
# Import Packages
import pandas as pd
from prettytable import PrettyTable

### Importing Data Sets
# Let pandas open the file itself: read_csv decodes a path as UTF-8 by default,
# whereas a handle from open(..., 'r') uses the platform-dependent locale encoding.
# NOTE(review): df_llm is later read from a different statements.csv path —
# confirm whether both refer to the same file.
# headers = ['prompt', 'client', 'opt1', 'opt2', 'opt3', 'completion']
df_generated = pd.read_csv('../0_data/statements.csv', index_col=False)

df_generated.head()

## EDA

In [None]:
import numpy
import pandas as pd
import csv
import matplotlib.pyplot as plt
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords
# Download the 'punkt' resource at cell run time.
# NOTE(review): sent_tokenize is not used in any cell visible here — confirm it is still needed.
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import seaborn as sns
# The stopword corpus is fetched before stopwords.words() reads it on the next line.
nltk.download('stopwords')
# English stopwords as a set, used for membership tests in remove_stopwords.
stop_words = set(stopwords.words('english'))

In [None]:
# Load the translated EU debate transcripts (labelled 'Real' further down).
# NOTE(review): absolute path that looks like a Google Colab Drive mount — it only
# resolves when Drive is mounted at /content/drive; consider a configurable data directory.
df_real = pd.read_csv('/content/drive/MyDrive/NLP/eu_debate_transcripts_translated.csv')

In [None]:
# Load the LLM-generated statements (the 'statement' and 'provider' columns are used below).
# NOTE(review): absolute path that looks like a Google Colab Drive mount — it only
# resolves when Drive is mounted at /content/drive; consider a configurable data directory.
df_llm = pd.read_csv('/content/drive/MyDrive/NLP/statements.csv')

In [None]:
# Load the merged extremity dataset (labelled 'Train' further down).
# NOTE(review): absolute path that looks like a Google Colab Drive mount — it only
# resolves when Drive is mounted at /content/drive; consider a configurable data directory.
df_train = pd.read_csv('/content/drive/MyDrive/NLP/extremity_merged_dataset.csv')

In [None]:
# Cleaning with stopwords
def clean_text_basic(text):
    """Lower-case ``text`` and keep only the letters a-z and whitespace.

    Parameters
    ----------
    text : object
        Raw text. Non-string values are converted with ``str()`` first.
        Missing values (``None``, ``NaN``, ``pd.NA``) are treated as empty text.

    Returns
    -------
    str
        The cleaned text. Removed characters are dropped, not replaced by a
        space, so whitespace around them is left untouched.
    """
    # Without this guard str(None) / str(nan) would yield the literal words
    # "none" / "nan", which would then be counted as real tokens downstream.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ''
    return re.sub(r'[^a-z\s]', '', str(text).lower())

# Add a lower-cased, letters-and-whitespace-only copy of each corpus' text column.
for frame, text_column in (
    (df_real, 'translated_text'),
    (df_llm, 'statement'),
    (df_train, 'translated_text'),
):
    frame['clean_with_stopwords'] = frame[text_column].apply(clean_text_basic)

In [None]:
# Length + Style Metrics (with stopwords)
def type_token_ratio(tokens):
    """Return unique tokens / total tokens for a token list, or NaN if it is empty."""
    if not isinstance(tokens, list) or not tokens:
        return float('nan')
    return len(set(tokens)) / len(tokens)


def add_length_style_metrics(df, source):
    """Add ``char_count``, ``word_count``, ``ttr`` and ``source`` columns to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that already has a ``clean_with_stopwords`` text column.
    source : str or pandas.Series
        Label stored in the ``source`` column: one value for every row, or a
        Series aligned with ``df``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenience.
    """
    cleaned = df['clean_with_stopwords']
    tokens = cleaned.str.split()
    df['char_count'] = cleaned.str.len()
    df['word_count'] = tokens.str.len()
    # The EDA cells below select and plot a 'ttr' column that no other cell creates;
    # compute it here unless the loaded data already provides one.
    if 'ttr' not in df.columns:
        df['ttr'] = tokens.apply(type_token_ratio)
    df['source'] = source
    return df


add_length_style_metrics(df_real, 'Real')
# LLM rows are labelled by provider, e.g. 'chatgpt' -> 'Chatgpt'.
add_length_style_metrics(df_llm, df_llm['provider'].str.capitalize())
add_length_style_metrics(df_train, 'Train')

In [None]:
# Cleaning without stopwords
def remove_stopwords(text):
    """Drop every whitespace-separated token of ``text`` that is in ``stop_words``.

    The surviving tokens are re-joined with single spaces, so runs of
    whitespace in the input collapse.
    """
    kept_tokens = (token for token in text.split() if token not in stop_words)
    return ' '.join(kept_tokens)

# Derive the stopword-free text from the already cleaned column of every corpus.
for frame in (df_real, df_llm, df_train):
    frame['clean_no_stopwords'] = frame['clean_with_stopwords'].apply(remove_stopwords)

In [None]:
# Combine for analysis
# Every frame must already carry all four columns (including 'ttr'); otherwise the
# column selection raises KeyError.
eda_columns = ['char_count', 'word_count', 'ttr', 'source']
# ignore_index=True gives the stacked frame a fresh 0..n-1 index, as the
# df_words_long concat further down does. Without it the three source indexes
# repeat, and duplicate labels can trip label-aligned operations (and hue-based
# plots in some seaborn versions).
df_eda = pd.concat(
    [frame[eda_columns] for frame in (df_real, df_llm, df_train)],
    ignore_index=True,
)

In [None]:
# Word frequency analysis
def get_word_counts(texts):
    """Count how often each word-like token occurs across all ``texts``.

    ``texts`` is an iterable of strings. Tokens are the matches of
    ``\\b\\w+\\b`` in each string. Returns a ``collections.Counter`` mapping
    token -> occurrences.
    """
    return Counter(
        token
        for text in texts
        for token in re.findall(r'\b\w+\b', text)
    )

# One token counter per corpus, built from the stopword-free text.
real_words, llm_words, train_words = (
    get_word_counts(frame['clean_no_stopwords'])
    for frame in (df_real, df_llm, df_train)
)

In [None]:
# Top 20
# One two-column frame (word, <source>_count) per corpus, holding its 20 most common words.
real_top20, llm_top20, train_top20 = (
    pd.DataFrame(counts.most_common(20), columns=['word', count_column])
    for counts, count_column in (
        (real_words, 'real_count'),
        (llm_words, 'llm_count'),
        (train_words, 'train_count'),
    )
)

In [None]:
# Merge top word frequencies
# Outer joins keep every word that appears in any of the three top-20 lists.
# A 0 here only means the word missed that source's top 20 (it comes from fillna),
# not that the word never occurs in that source.
word_counts = (
    real_top20
    .merge(llm_top20, on='word', how='outer')
    .fillna(0)
    .merge(train_top20, on='word', how='outer')
    .fillna(0)
)

In [None]:
# Histogram: Character Count
# Step histograms of raw counts per source, drawn on one explicit Axes.
fig, ax = plt.subplots(figsize=(12, 5))
sns.histplot(
    data=df_eda,
    x='char_count',
    hue='source',
    bins=40,
    element='step',
    stat='count',
    common_norm=False,
    ax=ax,
)
ax.set_title("Character Count Distribution by Source")
ax.set_xlabel("Character Count")
ax.set_ylabel("Count")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Boxplot: Type-Token Ratio
# One box per source, read from the 'ttr' column of df_eda.
fig, ax = plt.subplots(figsize=(12, 5))
sns.boxplot(data=df_eda, x='source', y='ttr', ax=ax)
ax.set_title("Type-Token Ratio (TTR) by Source")
ax.set_ylabel("TTR")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Split LLM data
# 'source' holds the provider name after str.capitalize(), which lower-cases
# everything after the first letter — hence 'Chatgpt' / 'Deepseek'.
is_chatgpt = df_llm['source'].eq('Chatgpt')
is_deepseek = df_llm['source'].eq('Deepseek')
df_chatgpt = df_llm.loc[is_chatgpt]
df_deepseek = df_llm.loc[is_deepseek]

In [None]:
# Helper function to compute word frequencies and normalize
def top_word_freqs(texts, label, total_words=None, top_n=20):
    """Build a table of the ``top_n`` most common tokens with their share in percent.

    Parameters
    ----------
    texts : iterable
        Text items; each is passed through ``str()`` before tokenising with
        ``\\b\\w+\\b``.
    label : str
        Value written to the ``source`` column of every row.
    total_words : int, optional
        Denominator for the percentage. Defaults to the total number of
        tokens found in ``texts``.
    top_n : int, default 20
        Number of most common tokens to keep.

    Returns
    -------
    pandas.DataFrame
        Columns ``word``, ``frequency`` (count / total_words * 100) and
        ``source``, ordered from most to least common.
    """
    token_counts = Counter(
        token
        for text in texts
        for token in re.findall(r'\b\w+\b', str(text))
    )
    denominator = sum(token_counts.values()) if total_words is None else total_words
    table = pd.DataFrame(token_counts.most_common(top_n), columns=['word', 'count'])
    table = table.assign(
        frequency=table['count'] / denominator * 100,
        source=label,
    )
    return table[['word', 'frequency', 'source']]

In [None]:
# Generate top 20 frequency tables
# One (word, frequency, source) table per corpus, each normalised by its own token total.
real_freqs, train_freqs, chatgpt_freqs, deepseek_freqs = (
    top_word_freqs(frame['clean_no_stopwords'], label)
    for frame, label in (
        (df_real, 'Real'),
        (df_train, 'Train'),
        (df_chatgpt, 'ChatGPT'),
        (df_deepseek, 'DeepSeek'),
    )
)

In [None]:
# Combine all
# Stack the per-source tables into one long frame with a fresh 0..n-1 index.
frequency_tables = [real_freqs, chatgpt_freqs, deepseek_freqs, train_freqs]
df_words_long = pd.concat(frequency_tables, ignore_index=True)

In [None]:
# Plot grouped bar plot
# NOTE(review): df_words_long stacks each source's own top-20 list, so a word only
# gets a bar for the sources where it made the top 20.
fig, ax = plt.subplots(figsize=(14, 6))
sns.barplot(data=df_words_long, x='word', y='frequency', hue='source', ax=ax)
ax.set_title("Top Shared Words by Relative Frequency (%) — Grouped Bar Plot")
ax.set_ylabel("Frequency (%)")
ax.set_xlabel("Word")
ax.tick_params(axis='x', labelrotation=45)
ax.grid(True)
fig.tight_layout()
plt.show()

## Data Preparation

In [None]:
# Inner-join the data frames into one

### Defining preprocessing pipelines

## Analysis

In [None]:
# Topic modeling

In [None]:
# Sentiment Analysis

In [None]:
# Extremity Regression

In [None]:
def main():
    """Entry point reserved for running the notebook's analysis functions; currently a no-op."""
    # run all functions in here:
    return None


if __name__ == "__main__":
    main()