In [1]:
import sys
import os
#sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # for scripts
project_root = '/Users/emilykruger/Documents/GitHub/CSH-Internship'
functions_dir = os.path.join(project_root, 'src/functions')
daegc_dir = os.path.join(project_root, 'src/DAEGC')

sys.path.append(project_root) #for local notebook
sys.path.append(functions_dir) #for local notebook


import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import regex as re
from src.functions.linguistic_features import remove_emojis, remove_tags, count_emojis, preprocess_text, count_pos_tags
from textstat import flesch_reading_ease
import subprocess
from tqdm import tqdm
from transformers import AutoTokenizer, pipeline
import multiprocessing as mp

# to get cluster labels
import torch
from src.DAEGC.DAEGC import DAEGC
from functions.daegc_helpers import *

  from .autonotebook import tqdm as notebook_tqdm
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



# Building Code for Linguistic Features

In this notebook, code will be written to extract linguistic features from the dataset. It will be done on a small subsample. Afterwards code will be transferred to a script to run on the full dataset.

## Loading Data

### Initially

In [None]:
groups = pd.read_csv('../data/selected_groups_with_transcriptions.csv.gzip', compression='gzip')
channels = pd.read_csv('../data/channel_subsample.csv.gzip', compression='gzip')

In [None]:
groups = groups.drop(columns=['Unnamed: 0'], axis=1)
groups['group_or_channel'] = 'group'
groups.head(5)

In [None]:
channels = channels.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
channels['group_or_channel'] = 'channel'
channels.head(5)

In [None]:
#take random sample of 100k rows of both df where either message or fwd_message contains data and combine
sample_groups = groups[groups['message'].notnull() | groups['fwd_message'].notnull()].sample(n=1000, random_state=42)
sample_channels = channels = channels[channels['message'].notnull() | channels['fwd_message'].notnull()].sample(n=1000, random_state=42)
combined = pd.concat([sample_groups, sample_channels], ignore_index=True, axis=0)
combined.head(5)

In [None]:
#keep only UID and message
messages = combined[['UID_key', 'message', 'fwd_message', 'group_or_channel']]

#remove emojis
cleaned_messages = []
for message in messages['message'].astype(str):
    cleaned_messages.append(remove_emojis(message))

cleaned_fwd_messages = []
for message in messages['fwd_message'].astype(str):
    cleaned_fwd_messages.append(remove_emojis(message))

messages['message_string'] = cleaned_messages
messages['fwd_message_string'] = cleaned_fwd_messages
messages['message_string'] = messages['message_string'].astype(str)
messages['fwd_message_string'] = messages['fwd_message_string'].astype(str)

#if message, take message else take fwd_message
messages['final_message'] = messages['message'].where(messages['message'].notnull(), messages['fwd_message'])
messages['final_message_string'] = messages['message_string'].where(messages['message_string'] != 'nan', messages['fwd_message_string'])

In [None]:
messages['preprocessed_message'] = messages['final_message_string'].apply(preprocess_text)

#delete uneccessary columns
messages = messages.drop(columns=['message', 'fwd_message', 'message_string', 'fwd_message_string'], axis=1)

In [None]:
messages.to_csv('../data/messages_sample.csv.gzip', compression='gzip')

### For Re-Running Below Code

In [None]:
#for re-running
messages = pd.read_csv('../data/samples/messages_sample_2000.csv.gzip', compression='gzip').drop('Unnamed: 0', axis=1)

## Count-Based Features & POS-Tagging

In [None]:
#num sentences
messages['sent_count'] = messages['final_message_string'].apply(lambda x: len(re.split(r'[.!?]+', x)) if x else 0)
#num words
messages['word_count'] = messages['final_message_string'].apply(lambda x: len(re.findall(r'\w+', x)) if x else 0)
#avg sentence length (words per sentence)
messages['avg_sent_length'] = messages.apply(lambda row: row['word_count'] / row['sent_count'] if row['sent_count'] > 0 else 0, axis=1)
#avg word length (characters per word)
messages['avg_word_length'] = messages.apply(lambda row: len(row['final_message_string'].replace(' ', '')) / row['word_count'] if row['word_count'] > 0 else 0, axis=1)
#num exclamations (multiple ! coutn as one exclamation)
messages['exclamation_count'] = messages['final_message_string'].apply(lambda x: len(re.findall(r'!+', x)) if x else 0)
#num questions (multiple ? count as one question)
messages['question_count'] = messages['final_message_string'].apply(lambda x: len(re.findall(r'\?+', x)) if x else 0)
#num emojis 
messages['emoji_count'] = messages['final_message'].apply(lambda x: count_emojis(x) if x else 0)

In [None]:
#use count_pos_tags func to count nouns, verbs and adj
messages['noun_count'] = messages['final_message_string'].apply(lambda x: count_pos_tags(x)[0])
messages['verb_count'] = messages['final_message_string'].apply(lambda x: count_pos_tags(x)[1])
messages['adj_count'] = messages['final_message_string'].apply(lambda x: count_pos_tags(x)[2])

## Flesch Reading Ease

In [None]:
#use TextStat to compute Flesch Reading Ease score on final_message_string
messages['flesch_reading_ease'] = messages['final_message_string'].apply(flesch_reading_ease)

In [None]:
messages.head(5)
messages.to_csv('../data/messages_with_features.csv.gzip', compression='gzip')

## HuggingFace Complexity Classifier Model Exploration

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline, DistilBertForSequenceClassification
import torch

In [None]:
tokenizer = AutoTokenizer.from_pretrained('MiriUll/distilbert-german-text-complexity')
model = AutoModelForSequenceClassification.from_pretrained('MiriUll/distilbert-german-text-complexity')

In [None]:
inputs = tokenizer("Mit solchen Drohungen kommt sie nie mehr zurück ", return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

predicted_class_id = logits.argmax().item()
predicted_class_id

In [None]:
pipe = pipeline("text-classification", model="MiriUll/distilbert-german-text-complexity")
print(pipe('Das ist ein einfacher Satz.'))
print(pipe('Obwohl der junge Wissenschaftler sich intensiv auf seine Forschungsarbeit konzentrierte, war er oft von den unvorhersehbaren und lauten Bauarbeiten im Nachbargebäude abgelenkt, die seine produktivsten Stunden regelmäßig störten.'))

## Kaggle Emoji Sentiment Dataset

In [None]:
emojis = pd.read_csv('../data/archive/Emoji_Sentiment_Data_v1.0.csv')
#emoji sentiment column based on max value of positive neutral or negative
emojis['sentiment'] = emojis[['Positive', 'Neutral', 'Negative']].idxmax(axis=1)

In [None]:
emojis

# Re-formatting Liwc Dict

In [None]:
file_path = '../data/LIWC2007_German.dic'
skiprows = 70  # Specify the number of rows to skip

data = []

with open(file_path, 'r', encoding='latin1') as file:
    # Step 1: Skip the specified number of rows
    for _ in range(skiprows):
        next(file)
    
    # Read the file line-by-line
    for line in file:
        split_line = line.strip().split('\t')
        word = split_line[0]
        categories = split_line[1:]
        data.append([word, categories])

# Step 2: Create DataFrame with flexible columns
# Define headers
headers = ['word', 'categories']

# Step 3: Create DataFrame
df = pd.DataFrame(data, columns=headers)

# Print the DataFrame to check the result
df

In [None]:
df = df.explode('categories')
df['categories'] = df['categories'].astype(int)

In [None]:
df

In [None]:
liwc_categories = {
    1: 'Pronoun',
    2: 'I',
    3: 'We',
    4: 'Self',
    5: 'You',
    6: 'Other',
    7: 'Negate',
    8: 'Assent',
    9: 'Article',
    10: 'Preps',
    11: 'Number',
    12: 'Affect',
    13: 'Posemo',
    14: 'Posfeel',
    15: 'Optim',
    16: 'Negemo',
    17: 'Anx',
    18: 'Anger',
    19: 'Sad',
    20: 'Cogmech',
    21: 'Cause',
    22: 'Insight',
    23: 'Discrep',
    24: 'Inhib',
    25: 'Tentat',
    26: 'Certain',
    27: 'Senses',
    28: 'See',
    29: 'Hear',
    30: 'Feel',
    31: 'Social',
    32: 'Comm',
    33: 'Othref',
    34: 'Friends',
    35: 'Family',
    36: 'Humans',
    37: 'Time',
    38: 'Past',
    39: 'Present',
    40: 'Future',
    41: 'Space',
    42: 'Up',
    43: 'Down',
    44: 'Incl',
    45: 'Excl',
    46: 'Motion',
    47: 'Occup',
    48: 'School',
    49: 'Job',
    50: 'Achieve',
    51: 'Leisure',
    52: 'Home',
    53: 'Sports',
    54: 'TV',
    55: 'Music',
    56: 'Money',
    57: 'Metaph',
    58: 'Relig',
    59: 'Death',
    60: 'Physcal',
    61: 'Body',
    62: 'Sexual',
    63: 'Eating',
    64: 'Sleep',
    65: 'Groom',
    66: 'Swear',
    67: 'Nonfl',
    68: 'Fillers'
}

In [None]:
df['cat_name'] = df['categories'].map(liwc_categories)

In [None]:
df

In [None]:
#change the order of the columns so that its word, cat_name, categories
df = df[['word', 'cat_name', 'categories']]

#write df to txt file but omit index and column header
df.to_csv('../data/liwc_german_2007.txt', sep='\t', index=False, header=False)

# Making txt file for GAWK script

In [None]:
#load data
filename = 'messages_sample_10'
sample = pd.read_csv(f'../data/samples/{filename}.csv.gzip', compression='gzip').drop('Unnamed: 0', axis=1)

In [None]:
sample

In [None]:
#only keep UID_key and final_message_string and save as txt without "" around messages

sample = sample[['UID_key', 'final_message_string']]
sample.to_csv(f'../data/samples/{filename}.txt', sep='\t', index=False, header=False, quoting=3)

## Changing sampling strategy

In [None]:
sample_size = 10 #how big of a sample to take from each dataset
random_state = 42

########## LOAD AND PREPARE DATASET ##########

#load two datasets, drop unnecessary columns and add column to indicate group or channel
groups = pd.read_csv('../data/selected_groups_with_transcriptions.csv.gzip', compression='gzip').drop(columns=['Unnamed: 0'], axis=1)
channels = pd.read_csv('../data/channel_subsample.csv.gzip', compression='gzip').drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], axis=1)


groups['group_or_channel'] = 'group'
channels['group_or_channel'] = 'channel'


#take random sample of both df where either message or fwd_message (or transcribedmessage if group) contains data and combine
sample_groups = groups[groups['message'].notnull() | groups['fwd_message'].notnull() | groups['transcribed_message'].notnull()].sample(n=sample_size, random_state=random_state)
sample_channels = channels = channels[channels['message'].notnull() | channels['fwd_message'].notnull()].sample(n=sample_size, random_state=random_state)

In [None]:
combined = pd.concat([sample_groups, sample_channels], ignore_index=True, axis=0)

#keep only necessary columns
messages = combined[['UID_key', 'message', 'fwd_message', 'transcribed_message', 'group_or_channel']]

#remove emojis and links
cleaned_messages = []
for message in messages['message'].astype(str):
    message = remove_tags(message)
    cleaned_messages.append(remove_emojis(message))

cleaned_fwd_messages = []
for message in messages['fwd_message'].astype(str):
    message = remove_tags(message)
    cleaned_fwd_messages.append(remove_emojis(message))

messages['message_string'] = cleaned_messages
messages['fwd_message_string'] = cleaned_fwd_messages
messages['message_string'] = messages['message_string'].astype(str)
messages['fwd_message_string'] = messages['fwd_message_string'].astype(str)

#if message, take message else take fwd_message else take transcribed message
messages['final_message'] = np.where(messages['message'].notnull(), messages['message'],
                                    np.where(messages['fwd_message'].notnull(), messages['fwd_message'],
                                             messages['transcribed_message'])).astype(str)
messages['final_message_string'] = np.where(messages['message_string'] != 'nan', messages['message_string'],
                                    np.where(messages['fwd_message_string'] != 'nan', messages['fwd_message_string'],
                                             messages['transcribed_message'])).astype(str)

In [None]:
messages

In [None]:
messages['final_message_string'] = messages['final_message_string'].apply(lambda x: ' '.join(x.split()))

# Subprocessing

In [None]:
! gawk -f ../src/analysis/liwc_category_ratios.awk ../data/liwc_german_2007.txt ../data/samples/messages_sample_200.txt > ../results/liwc_ratios.csv

In [None]:
# Load the output file but remove last column
liwc_ratios = pd.read_csv('../results/liwc_ratios.csv', sep=',')
liwc_ratios = liwc_ratios.iloc[:, :-1]
liwc_ratios

In [None]:
ling_features = pd.read_csv('../results/messages_with_features_200.csv.gzip', compression='gzip').drop('Unnamed: 0', axis=1)

In [None]:
ling_features

In [None]:
#concat liwc_ratios and ling_features based on UID_key
merged = pd.merge(ling_features, liwc_ratios, on='UID_key', how='inner')

In [None]:
merged

# Sentiment Bert

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

sentiment_model = pipeline(model="aari1995/German_Sentiment")

In [None]:
sentence = ["Ich liebe die Bahn. Pünktlich wie immer ... -.-","Krasser Service"]
result = sentiment_model(sentence)
print(result)
#Output:
#[{'label': 'negative', 'score': 0.4935680031776428},{'label': 'positive', 'score': 0.5790663957595825}]

In [None]:
from transformers import AutoTokenizer

# Assuming 'sentiment_model' is already loaded
# Load the tokenizer corresponding to your sentiment model
tokenizer = AutoTokenizer.from_pretrained('aari1995/German_Sentiment')  # Replace 'model_name' with the actual model name

sentiment_aari = []

for message in messages['final_message_string']:
    # Encode the message, truncate to max length of the model, and only keep the input_ids
    inputs = tokenizer.encode(message, return_tensors='pt', max_length=512, truncation=True)
    # Decode back to text string, to feed into the sentiment model as expected
    truncated_message = tokenizer.decode(inputs[0], skip_special_tokens=True)
    result = sentiment_model(truncated_message)
    sentiment_aari.append(result[0]['label'])

messages['sentiment_aari'] = sentiment_aari

In [None]:
import numpy as np
from tqdm import tqdm

In [None]:
pos_sent = []
neg_sent = []
neutral_sent = []

for message in tqdm(messages['final_message_string'], desc = 'Extracting Sentiment'):
    # if message is empty, don't calculate sentiment
    if message == '' or message == 'nan':
        pos_sent.append(np.nan)
        neg_sent.append(np.nan)
        neutral_sent.append(np.nan)
    else:
        # encode & decode message and truncate to max length that model can handle
        result = sentiment_model(message[:512])
        sent = (result[0]['label'])
        if sent == 'positive':
            pos_sent.append(1)
            neg_sent.append(0)
            neutral_sent.append(0)
        elif sent == 'negative':
            pos_sent.append(0)
            neg_sent.append(1)
            neutral_sent.append(0)
        elif sent == 'neutral':
            pos_sent.append(0)
            neg_sent.append(0)
            neutral_sent.append(1)
        else:
            pos_sent.append(np.nan)
            neg_sent.append(np.nan)
            neutral_sent.append(np.nan)

messages['positive_sentiment'] = pos_sent
messages['negative_sentiment'] = neg_sent
messages['neutral_sentiment'] = neutral_sent

In [None]:
from tqdm import tqdm
import numpy as np

# Initialize sentiment lists
pos_sent = [np.nan] * len(messages['final_message_string'])
neg_sent = [np.nan] * len(messages['final_message_string'])
neutral_sent = [np.nan] * len(messages['final_message_string'])

# Map sentiment labels to list indices
sentiment_map = {
    'positive': (1, 0, 0),
    'negative': (0, 1, 0),
    'neutral': (0, 0, 1)
}

# Process messages
for idx, message in tqdm(enumerate(messages['final_message_string']), desc='Extracting Sentiment', total=len(messages['final_message_string'])):
    # Skip empty messages
    if message in ('', 'nan'):
        continue

    # Run sentiment analysis
    result = sentiment_model(message[:512])  # Use the pipeline directly with the message text
    sent = result[0]['label']

    # Update sentiment lists
    if sent in sentiment_map:
        pos_sent[idx], neg_sent[idx], neutral_sent[idx] = sentiment_map[sent]

# Assign results back to DataFrame
messages['positive_sentiment'] = pos_sent
messages['negative_sentiment'] = neg_sent
messages['neutral_sentiment'] = neutral_sent

In [None]:
for message in tqdm(messages['final_message_string']):
    sentiment = sentiment_model(message[:512])

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
text = "Erneuter Streik in der S-Bahn"
model = AutoModelForSequenceClassification.from_pretrained('ssary/XLM-RoBERTa-German-sentiment')
tokenizer = AutoTokenizer.from_pretrained('ssary/XLM-RoBERTa-German-sentiment')
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
with torch.no_grad():
    outputs = model(**inputs)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
sentiment_classes = ['negative', 'neutral', 'positive']
print(sentiment_classes[predictions.argmax()]) # for the class with highest probability
print(predictions) # for each class probability

In [None]:
import pandas as pd

In [None]:
messages = pd.read_csv('../data/samples/messages_sample_200.csv.gzip', compression='gzip').drop('Unnamed: 0', axis=1)
messages['final_message_string'] = messages['final_message_string'].astype(str)

In [None]:
# predict sentiment on all messages
sentiment = []
neg_prob = []
neu_prob = []
pos_prob = []

for message in messages['final_message_string']:
    inputs = tokenizer(message, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    sentiment_classes = ['negative', 'neutral', 'positive']
    sentiment.append(sentiment_classes[predictions.argmax()])
    neg_prob.append(predictions[0][0].item())
    neu_prob.append(predictions[0][1].item())
    pos_prob.append(predictions[0][2].item())

messages['sentiment'] = sentiment
messages['neg_prob'] = neg_prob
messages['neu_prob'] = neu_prob
messages['pos_prob'] = pos_prob

In [None]:
#print all messages with their sentiment
for i, row in messages.iterrows():
    print(f'{row["final_message_string"]} - {row["sentiment"]}\n')

In [None]:
messages[['final_message_string', 'sentiment', 'sentiment_aari']]

In [None]:
#print all messages with their sentiment
for i, row in messages.iterrows():
    print(f'{row["final_message_string"]}\nRoberta: {row["sentiment"]}\nAari: {row["sentiment_aari"]}\n', '-'*50)

In [None]:
test = pd.read_csv('../data/samples/messages_sample_200.csv.gzip', compression = 'gzip')

In [None]:
test

In [None]:
channels = pd.read_csv('../data/channel_subsample.csv.gzip', compression = 'gzip')

In [None]:
channels.columns

# Google Perspective API

In [None]:
import pandas as pd
import numpy as np
from time import sleep
from tqdm import tqdm
import random

from googleapiclient import discovery
import json
from config import API_KEY

In [None]:
client = discovery.build(
"commentanalyzer",
"v1alpha1",
developerKey=API_KEY,
discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
static_discovery=False,
)

In [None]:
df = pd.read_csv('../results/post-aggregation/author_200.csv.gzip', compression = 'gzip')
df['final_message_string'] = df['final_message_string'].astype(str)
df['toxicity'] = 0

In [None]:
def toxicity_detection(sentences, client):
    toxic = []
    for sent in sentences:
        analyze_request = {
            'comment': { 'text': f"{sent}" },
            'languages' : ["de"],
            'requestedAttributes': {'TOXICITY': {}},
        }

        response = client.comments().analyze(body=analyze_request).execute()
        j = json.dumps(response, indent=2)
        #print(json.loads(j)['attributeScores']['TOXICITY']['summaryScore']['value'])
        toxic.append(json.loads(j)['attributeScores']['TOXICITY']['summaryScore']['value'])
    avg = sum(toxic)/len(toxic)
    print(avg)
    return avg

In [None]:
# def toxicity_detection(sentences):
#     toxic = []
#     for sent in sentences:
#         analyze_request = {
#             'comment': { 'text': f"{sent}" },
#             'languages' : ["de"],
#             'requestedAttributes': {'TOXICITY': {}},
#         }

#         response = client.comments().analyze(body=analyze_request).execute()
#         j = json.dumps(response, indent=2)
#         #print(json.loads(j)['attributeScores']['TOXICITY']['summaryScore']['value'])
#         toxic.append(json.loads(j)['attributeScores']['TOXICITY']['summaryScore']['value'])
#     return sum(toxic)/len(toxic)


# # n= 10000
# # list_df = [sample[i:i+n] for i in range(0,len(sample),n)]


# #final_toxic_list = []
# # for df in list_df:
# for i in tqdm(range(len(sample_df))):
#     row = sample_df.iloc[i]
#     #toxic = []
#     if row['toxicity'] == 0: 

#         tmp = [sent.strip() for sent in re.split(r'[.!?]', row.final_message_string) if len(sent.split()) > 5]

#         if (len(tmp) > 100):
#             tmp = random.sample(tmp, 100)
#         if (len(tmp) > 1):
#             row['toxicity'] = toxicity_detection(tmp)

#     sample_df.at[i, 'toxicity'] = row['toxicity']

#     #df.at[i, 'toxicity'] = toxic
#     #final_toxic_list.append(df)

# # con = pd.concat(final_toxic_list)
# # con.to_csv('fa_toxic.csv')

In [None]:
#split df into chunks
n= 20
list_df = [df[i:i+n] for i in range(0,len(df),n)]

#iterate over chunks and rows to extract toxicity score
final_toxic_list = []
for df in list_df:
    for i in tqdm(range(len(df))):
        row = df.iloc[i]
        if row['toxicity'] == 0: 
            #split message into list of sentences to pass to toxicity detection function
            tmp = [sent.strip() for sent in re.split(r'[.!?]', row['final_message_string']) if len(sent.split()) > 5]

            if (len(tmp) > 100):
                tmp = random.sample(tmp, 100)
            #print(tmp)
            if (len(tmp) > 1):
                row['toxicity'] = toxicity_detection(tmp, client)
            else:
                print('no sentence')
        df.at[df.index[i], 'toxicity'] = row['toxicity']
        print('df.at...', df.at[df.index[i], 'toxicity'])
    final_toxic_list.append(df)

#concat chunks
df_after = pd.concat(final_toxic_list)

In [None]:
df[df['final_message_string'] == 'nan']

In [None]:
final_toxic_list

In [None]:
len(df_after[df_after['toxicity'] != 0])

In [None]:
len(df_after)

In [None]:
results = pd.read_csv('../results/post-aggregation/author_200.csv.gzip', compression = 'gzip')

In [None]:
results[results['toxicity'] != 0]

# Forwarded Messages

In [None]:
sample_df.describe()

In [None]:
sample_df[sample_df['forwarded_message'] == 1]

In [None]:
new_sample = pd.read_csv('../data/samples/messages_sample_200.csv.gzip', compression = 'gzip')

In [None]:
new_sample

In [None]:
new_sample[new_sample['forwarded_message'] == 1]

In [None]:
new_sample['final_message_string'] = new_sample['final_message_string'].astype(str)
new_sample['final_message'] = new_sample['final_message'].astype(str)

In [None]:
new_sample['sent_count'] = new_sample['final_message_string'].apply(lambda x: len(re.split(r'[.!?]+', x)) if x != '' and x != 'nan' else 0)

In [None]:
new_sample['question_count'] = new_sample['final_message_string'].apply(lambda x: len(re.findall(r'\?+', x)) if x else 0)
#num emojis 
new_sample['emoji_count'] = new_sample['final_message'].apply(lambda x: count_emojis(x) if x else 0)

In [None]:
from tqdm import tqdm
from transformers import AutoTokenizer, pipeline

In [None]:
nouns = []
verbs = []
adjectives = []


for message in tqdm(new_sample['final_message_string'], desc = 'Extracting POS Tag counts'):
        noun, verb, adj = count_pos_tags(message)
        nouns.append(noun)
        verbs.append(verb)
        adjectives.append(adj)
                        
new_sample['noun_count'] = nouns
new_sample['verb_count'] = verbs
new_sample['adj_count'] = adjectives

In [None]:
new_sample

# Aggregation

In [None]:
sample_size = 200
pre_agg = pd.read_csv(f'../results/pre-aggregation/liwcANDfeatures_results_{sample_size}.csv.gzip', compression='gzip')

In [None]:
pre_agg = pd.get_dummies(pre_agg, columns=['group_or_channel', 'flesch_reading_ease_class'])

In [None]:
pre_agg.columns

In [None]:
# Aggregation dictionary
agg_dict = {
    # COUNT
    'UID_key': 'count',

    # SUM
    'own_message': 'sum',
    'forwarded_message': 'sum',
    'noun_count': 'sum',
    'verb_count': 'sum',
    'adj_count': 'sum',
    'positive_sentiment': 'sum',
    'negative_sentiment': 'sum',
    'neutral_sentiment': 'sum',
    'group_or_channel_channel': 'sum',
    'group_or_channel_group': 'sum',
    'flesch_reading_ease_class_difficult': 'sum',
    'flesch_reading_ease_class_easy': 'sum',
    'flesch_reading_ease_class_fairly difficult': 'sum',
    'flesch_reading_ease_class_fairly easy': 'sum',
    'flesch_reading_ease_class_standard': 'sum',
    'flesch_reading_ease_class_unclassified': 'sum',
    'flesch_reading_ease_class_very confusing': 'sum',
    'flesch_reading_ease_class_very easy': 'sum',

    # AVG
    'sent_count': 'mean',
    'word_count': 'mean',
    'avg_sent_length': 'mean',
    'avg_word_length': 'mean',
    'exclamation_count': 'mean',
    'question_count': 'mean',
    'emoji_count': 'mean',
    'flesch_reading_ease': 'mean',
    'liwc_I': 'mean',
    'liwc_We': 'mean',
    'liwc_You': 'mean',
    'liwc_Other': 'mean',
    'liwc_Affect': 'mean',
    
    # ' '.JOIN
    'fwd_message': lambda x: ' '.join(x.dropna().astype(str)),
    'fwd_message_string': lambda x: ' '.join(x.dropna().astype(str)),
    'final_message': lambda x: ' '.join(x.dropna().astype(str)),
    'final_message_string': lambda x: ' '.join(x.dropna().astype(str)),
}

In [None]:
#dict for aggregatopn
agg = pre_agg.groupby(['author', 'date']).agg(agg_dict)

In [None]:
agg

# Parallelization

In [None]:
def analysis(df):    
    df['sent_count'] = df['final_message_string'].apply(lambda x: len(re.split(r'[.!?]+', x)) if x != '' and x != 'nan' else 0)
    #num words
    df['word_count'] = df['final_message_string'].apply(lambda x: len(re.findall(r'\w+', x)) if x != '' and x != 'nan' else 0)
    #avg sentence length (words per sentence)
    df['avg_sent_length'] = df.apply(lambda row: row['word_count'] / row['sent_count'] if row['sent_count'] > 0 else 0, axis=1)
    #avg word length (characters per word)
    df['avg_word_length'] = df.apply(lambda row: len(row['final_message_string'].replace(' ', '')) / row['word_count'] if row['word_count'] > 0 else 0, axis=1)
    #num exclamations (multiple ! coutn as one exclamation)
    df['exclamation_count'] = df['final_message_string'].apply(lambda x: len(re.findall(r'!+', x)) if x else 0)
    #num questions (multiple ? count as one question)
    df['question_count'] = df['final_message_string'].apply(lambda x: len(re.findall(r'\?+', x)) if x else 0)
    #num emojis 
    df['emoji_count'] = df['final_message'].apply(lambda x: count_emojis(x) if x else 0)

    print('Simple count based features extracted.')

    ########## COUNT OF SELECTED POS TAGS ##########

    #count nouns, verbs and adj
    nouns = []
    verbs = []
    adjectives = []

    for message in tqdm(df['final_message_string'], desc = 'Extracting POS Tag counts'):
        noun, verb, adj = count_pos_tags(message)
        nouns.append(noun)
        verbs.append(verb)
        adjectives.append(adj)
                        
    df['noun_count'] = nouns
    df['verb_count'] = verbs
    df['adj_count'] = adjectives

    ########## FLESCH READING EASE SCORE ##########

    textstat.set_lang('de')
    #compute Flesch Reading Ease score on non-empty df
    df['flesch_reading_ease'] = df['final_message_string'].apply(lambda x: textstat.flesch_reading_ease(x) if x.strip() != '' and x != 'nan' else np.nan)

    #classify scores based on: https://pypi.org/project/textstat/
    flesch_classes = []
    for score in df['flesch_reading_ease']:
        if score >= 0 and score < 30:
            flesch_classes.append('very confusing')
        elif score >= 30 and score < 50:
            flesch_classes.append('difficult')
        elif score >= 50 and score < 60:
            flesch_classes.append('fairly difficult')
        elif score >=60 and score < 70:
            flesch_classes.append('standard')
        elif score >=70 and score < 80:
            flesch_classes.append('fairly easy')
        elif score >=80 and score < 90:
            flesch_classes.append('easy')
        elif score >=90 and score < 101:
            flesch_classes.append('very easy')
        else:
            flesch_classes.append('unclassified')
        
    df['flesch_reading_ease_class'] = flesch_classes

    print('Flesch Reading Ease score extracted.')

    ########## SENTIMENT ANALYSIS ##########

    #load tokenizer and sentiment model
    print('Loading sentiment model...')
    sentiment_model = pipeline(model='aari1995/German_Sentiment')
    tokenizer = AutoTokenizer.from_pretrained('aari1995/German_Sentiment')  

    pos_sent = []
    neg_sent = []
    neutral_sent = []

    for message in tqdm(df['final_message_string'], desc = 'Extracting Sentiment'):
        #if message is empty, don't calculate sentiment
        if message == '' or message == 'nan':
            pos_sent.append(np.nan)
            neg_sent.append(np.nan)
            neutral_sent.append(np.nan)
        else:
            #truncate message to max length model can handle
            result = sentiment_model(message[:512])
            sent = (result[0]['label'])
            if sent == 'positive':
                pos_sent.append(1)
                neg_sent.append(0)
                neutral_sent.append(0)
            elif sent == 'negative':
                pos_sent.append(0)
                neg_sent.append(1)
                neutral_sent.append(0)
            elif sent == 'neutral':
                pos_sent.append(0)
                neg_sent.append(0)
                neutral_sent.append(1)
            else:
                pos_sent.append(np.nan)
                neg_sent.append(np.nan)
                neutral_sent.append(np.nan)

    df['positive_sentiment'] = pos_sent
    df['negative_sentiment'] = neg_sent
    df['neutral_sentiment'] = neutral_sent
    print('Sentiment extracted.')

    return df

In [None]:
def pool_cluster_metrics(n_cores, network_dict_list):
    rep_list = []

    pool = Pool(n_cores)

    for result in tqdm(
        pool.imap_unordered(func=calculate_cluster_results, iterable=network_dict_list),
        total=len(network_dict_list)
        ):
            rep_list.append(result)

    pool.close()
    return rep_list

In [None]:
#split the dataframe into n_cores parts and return list of dicts
def split_df(n_cores, df):
    df_list = np.array_split(df, n_cores)
    return [df_part.to_dict('records') for df_part in df_list]

In [None]:
n_cores=4
df_list = split_df(4, new_sample)

In [None]:
final = pool_cluster_metrics(n_cores, df_list)

# Post-Aggregation Features

In [None]:
df = pd.read_csv(f'../data/aggregated/author_date_{sample_size}.csv.gzip', compression='gzip')

In [None]:
df= df[df['message_count'] > 1] 

In [None]:
df.head(5)

In [None]:
# List of count columns to convert to percentages
count_columns = [
    'own_message',
    'forwarded_message',
    'positive_sentiment',
    'negative_sentiment',
    'neutral_sentiment',
    'group_or_channel_channel',
    'group_or_channel_group',
    'flesch_reading_ease_class_difficult',
    'flesch_reading_ease_class_easy',
    'flesch_reading_ease_class_fairly difficult',
    'flesch_reading_ease_class_fairly easy',
    'flesch_reading_ease_class_standard',
    'flesch_reading_ease_class_unclassified',
    'flesch_reading_ease_class_very confusing',
    'flesch_reading_ease_class_very easy'
]

# Convert counts to percentages row by row
for index, row in df.iterrows():
    for col in count_columns:
        df.at[index, col] = row[col] / row['message_count']

In [None]:
df.head()

In [None]:
test = pd.read_csv('../results/post-aggregation/author_date_200.csv.gzip', compression='gzip')

In [None]:
test[test['message_count'] > 1]

# Parallelization - Again

In [None]:
df = pd.read_csv('../data/samples/messages_sample_200.csv.gzip', compression='gzip')
df_non = df.copy()

In [None]:
mp.cpu_count()

In [None]:
def analysis(df):    
    ########## FEATURE EXTRACTION ##########

    #num sentences
    df['sent_count'] = df['final_message_string'].apply(lambda x: len(re.split(r'[.!?]+', x)) if x != '' and x != 'nan' else 0)
    #num words
    df['word_count'] = df['final_message_string'].apply(lambda x: len(re.findall(r'\w+', x)) if x != '' and x != 'nan' else 0)
    #avg sentence length (words per sentence)
    df['avg_sent_length'] = df.apply(lambda row: row['word_count'] / row['sent_count'] if row['sent_count'] > 0 else 0, axis=1)
    #avg word length (characters per word)
    df['avg_word_length'] = df.apply(lambda row: len(row['final_message_string'].replace(' ', '')) / row['word_count'] if row['word_count'] > 0 else 0, axis=1)
    #num exclamations (multiple ! coutn as one exclamation)
    df['exclamation_count'] = df['final_message_string'].apply(lambda x: len(re.findall(r'!+', x)) if x else 0)
    #num questions (multiple ? count as one question)
    df['question_count'] = df['final_message_string'].apply(lambda x: len(re.findall(r'\?+', x)) if x else 0)
    #num emojis 
    df['emoji_count'] = df['final_message'].apply(lambda x: count_emojis(x) if x else 0)

    print('Simple count based features extracted.')

    ########## COUNT OF SELECTED POS TAGS ##########

    #count nouns, verbs and adj
    nouns = []
    verbs = []
    adjectives = []

    for message in tqdm(df['final_message_string'], desc = 'Extracting POS Tag counts'):
        noun, verb, adj = count_pos_tags(message)
        nouns.append(noun)
        verbs.append(verb)
        adjectives.append(adj)
                        
    df['noun_count'] = nouns
    df['verb_count'] = verbs
    df['adj_count'] = adjectives

    ########## FLESCH READING EASE SCORE ##########

    textstat.set_lang('de')
    #compute Flesch Reading Ease score on non-empty df
    df['flesch_reading_ease'] = df['final_message_string'].apply(lambda x: textstat.flesch_reading_ease(x) if x.strip() != '' and x != 'nan' else np.nan)

    #classify scores based on: https://pypi.org/project/textstat/
    flesch_classes = []
    for score in df['flesch_reading_ease']:
        if score >= 0 and score < 30:
            flesch_classes.append('very confusing')
        elif score >= 30 and score < 50:
            flesch_classes.append('difficult')
        elif score >= 50 and score < 60:
            flesch_classes.append('fairly difficult')
        elif score >=60 and score < 70:
            flesch_classes.append('standard')
        elif score >=70 and score < 80:
            flesch_classes.append('fairly easy')
        elif score >=80 and score < 90:
            flesch_classes.append('easy')
        elif score >=90 and score < 101:
            flesch_classes.append('very easy')
        else:
            flesch_classes.append('unclassified')
        
    df['flesch_reading_ease_class'] = flesch_classes

    print('Flesch Reading Ease score extracted.')

    ########## SENTIMENT ANALYSIS ##########

    #load tokenizer and sentiment model
    print('Loading sentiment model...')
    sentiment_model = pipeline(model='aari1995/German_Sentiment')
    tokenizer = AutoTokenizer.from_pretrained('aari1995/German_Sentiment')  

    pos_sent = []
    neg_sent = []
    neutral_sent = []

    for message in tqdm(df['final_message_string'], desc = 'Extracting Sentiment'):
        #if message is empty, don't calculate sentiment
        if message == '' or message == 'nan':
            pos_sent.append(np.nan)
            neg_sent.append(np.nan)
            neutral_sent.append(np.nan)
        else:
            #truncate message to max length model can handle
            result = sentiment_model(message[:512])
            sent = (result[0]['label'])
            if sent == 'positive':
                pos_sent.append(1)
                neg_sent.append(0)
                neutral_sent.append(0)
            elif sent == 'negative':
                pos_sent.append(0)
                neg_sent.append(1)
                neutral_sent.append(0)
            elif sent == 'neutral':
                pos_sent.append(0)
                neg_sent.append(0)
                neutral_sent.append(1)
            else:
                pos_sent.append(np.nan)
                neg_sent.append(np.nan)
                neutral_sent.append(np.nan)

    df['positive_sentiment'] = pos_sent
    df['negative_sentiment'] = neg_sent
    df['neutral_sentiment'] = neutral_sent
    print('Sentiment extracted.')

In [None]:
def parallelize_dataframe(df, func, num_partitions):
    # Split the dataframe into smaller chunks
    df_split = np.array_split(df, num_partitions)
    # Create a pool of workers
    with mp.Pool(num_partitions) as pool:
        # Apply the function to each chunk
        for df in pool.map(func, df_split):
    return df

In [None]:
df_split = np.array_split(df, 4)

In [None]:
len(df_split)

In [None]:
time_start_non_parallel = time.time()
########## FEATURE EXTRACTION ##########

#num sentences
df_non['sent_count'] = df_non['final_message_string'].apply(lambda x: len(re.split(r'[.!?]+', x)) if x != '' and x != 'nan' else 0)
#num words
df_non['word_count'] = df_non['final_message_string'].apply(lambda x: len(re.findall(r'\w+', x)) if x != '' and x != 'nan' else 0)
#avg sentence length (words per sentence)
df_non['avg_sent_length'] = df_non.apply(lambda row: row['word_count'] / row['sent_count'] if row['sent_count'] > 0 else 0, axis=1)
#avg word length (characters per word)
df_non['avg_word_length'] = df_non.apply(lambda row: len(row['final_message_string'].replace(' ', '')) / row['word_count'] if row['word_count'] > 0 else 0, axis=1)
#num exclamations (multiple ! coutn as one exclamation)
df_non['exclamation_count'] = df_non['final_message_string'].apply(lambda x: len(re.findall(r'!+', x)) if x else 0)
#num questions (multiple ? count as one question)
df_non['question_count'] = df_non['final_message_string'].apply(lambda x: len(re.findall(r'\?+', x)) if x else 0)
#num emojis 
df_non['emoji_count'] = df_non['final_message'].apply(lambda x: count_emojis(x) if x else 0)

print('Simple count based features extracted.')

########## COUNT OF SELECTED POS TAGS ##########

#count nouns, verbs and adj
nouns = []
verbs = []
adjectives = []

for message in tqdm(df_non['final_message_string'], desc = 'Extracting POS Tag counts'):
    noun, verb, adj = count_pos_tags(message)
    nouns.append(noun)
    verbs.append(verb)
    adjectives.append(adj)
                    
df_non['noun_count'] = nouns
df_non['verb_count'] = verbs
df_non['adj_count'] = adjectives

########## FLESCH READING EASE SCORE ##########

textstat.set_lang('de')
#compute Flesch Reading Ease score on non-empty df_non
df_non['flesch_reading_ease'] = df_non['final_message_string'].apply(lambda x: textstat.flesch_reading_ease(x) if x.strip() != '' and x != 'nan' else np.nan)

#classify scores based on: https://pypi.org/project/textstat/
flesch_classes = []
for score in df_non['flesch_reading_ease']:
    if score >= 0 and score < 30:
        flesch_classes.append('very confusing')
    elif score >= 30 and score < 50:
        flesch_classes.append('difficult')
    elif score >= 50 and score < 60:
        flesch_classes.append('fairly difficult')
    elif score >=60 and score < 70:
        flesch_classes.append('standard')
    elif score >=70 and score < 80:
        flesch_classes.append('fairly easy')
    elif score >=80 and score < 90:
        flesch_classes.append('easy')
    elif score >=90 and score < 101:
        flesch_classes.append('very easy')
    else:
        flesch_classes.append('unclassified')
    
df_non['flesch_reading_ease_class'] = flesch_classes

print('Flesch Reading Ease score extracted.')

########## SENTIMENT ANALYSIS ##########

#load tokenizer and sentiment model
print('Loading sentiment model...')
sentiment_model = pipeline(model='aari1995/German_Sentiment')
tokenizer = AutoTokenizer.from_pretrained('aari1995/German_Sentiment')  

pos_sent = []
neg_sent = []
neutral_sent = []

for message in tqdm(df_non['final_message_string'], desc = 'Extracting Sentiment'):
    #if message is empty, don't calculate sentiment
    if message == '' or message == 'nan':
        pos_sent.append(np.nan)
        neg_sent.append(np.nan)
        neutral_sent.append(np.nan)
    else:
        #truncate message to max length model can handle
        result = sentiment_model(message[:512])
        sent = (result[0]['label'])
        if sent == 'positive':
            pos_sent.append(1)
            neg_sent.append(0)
            neutral_sent.append(0)
        elif sent == 'negative':
            pos_sent.append(0)
            neg_sent.append(1)
            neutral_sent.append(0)
        elif sent == 'neutral':
            pos_sent.append(0)
            neg_sent.append(0)
            neutral_sent.append(1)
        else:
            pos_sent.append(np.nan)
            neg_sent.append(np.nan)
            neutral_sent.append(np.nan)

df_non['positive_sentiment'] = pos_sent
df_non['negative_sentiment'] = neg_sent
df_non['neutral_sentiment'] = neutral_sent
print('Sentiment extracted.')

time_end_non_parallel = time.time()
print(f'Non-parallel execution time: {time_end_non_parallel - time_start_non_parallel} seconds')

# Threading

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import numpy as np
import pandas as pd
from transformers import pipeline, AutoTokenizer

In [None]:
messages = pd.read_csv(f'../data/samples/messages_sample_200.csv.gzip', compression='gzip').drop(columns=['Unnamed: 0'], axis=1)
messages['final_message_string'] = messages['final_message_string'].astype(str)

In [None]:
# Load tokenizer and sentiment model
print('Loading sentiment model...')
sentiment_model = pipeline(model='aari1995/German_Sentiment')
tokenizer = AutoTokenizer.from_pretrained('aari1995/German_Sentiment')

# Define a function to process a single message
def analyze_sentiment(message):
    if message == '' or message == 'nan':
        return np.nan, np.nan, np.nan
    else:
        # Truncate message to max length model can handle
        result = sentiment_model(message[:512])
        sent = result[0]['label']
        if sent == 'positive':
            return 1, 0, 0
        elif sent == 'negative':
            return 0, 1, 0
        elif sent == 'neutral':
            return 0, 0, 1
        else:
            return np.nan, np.nan, np.nan

# Initialize lists to store sentiment results
pos_sent = []
neg_sent = []
neutral_sent = []

In [None]:
# Use ThreadPoolExecutor to parallelize sentiment analysis
print('Starting sentiment extraction...')
with ThreadPoolExecutor(max_workers=4) as executor:
    # Submit tasks and store futures
    futures = [executor.submit(analyze_sentiment, msg) for msg in messages['final_message_string']]
    # Process results as they become available
    for future in tqdm(as_completed(futures), total=len(futures), desc='Extracting Sentiment'):
        pos, neg, neutral = future.result()
        pos_sent.append(pos)
        neg_sent.append(neg)
        neutral_sent.append(neutral)

# Add sentiment results to the DataFrame
messages['positive_sentiment'] = pos_sent
messages['negative_sentiment'] = neg_sent
messages['neutral_sentiment'] = neutral_sent
print('Sentiment extraction done.')

# was_forwarded

In [None]:
sample_size = 200
pre_agg = pd.read_csv(f'../results/pre-aggregation/liwcANDfeatures_results_{sample_size}.csv.gzip', compression='gzip')
print('Dataset loaded.')

In [None]:
pre_agg = pd.get_dummies(pre_agg, columns=['group_or_channel'])
print('Dummies for categorial variables created.')
messages = pre_agg[['author', 'own_message', 'forwarded_message', 'fwd_author', 'UID_key', 'group_name', 'date']]
pre_agg = pre_agg[pre_agg['own_message'] == 1]

In [None]:
agg_dict = {
    # SUM
    'noun_count': 'sum',
    'verb_count': 'sum',
    'adj_count': 'sum',
    'positive_sentiment': 'sum',
    'negative_sentiment': 'sum',
    'neutral_sentiment': 'sum',
    'group_or_channel_channel': 'sum',
    'group_or_channel_group': 'sum',

    # AVG
    'sent_count': 'mean',
    'word_count': 'mean',
    'avg_sent_length': 'mean',
    'avg_word_length': 'mean',
    'exclamation_count': 'mean',
    'question_count': 'mean',
    'emoji_count': 'mean',
    'flesch_reading_ease': 'mean',
    'liwc_I': 'mean',
    'liwc_We': 'mean',
    'liwc_You': 'mean',
    'liwc_Other': 'mean',
    'liwc_Affect': 'mean',
    
    # ' '.JOIN
    'final_message': lambda x: ' '.join(x.dropna().astype(str)),
    'final_message_string': lambda x: ' '.join(x.dropna().astype(str)),
}

# Aggregation dictionary for message ratios
agg_dict_messages = {
    'own_message': 'sum',
    'forwarded_message': 'sum',
    'UID_key': 'count'
}

########## RENAMING COLUMNS ##########

rename_dict = {'group_or_channel_channel': 'channel_messages', 'group_or_channel_group': 'group_messages', 'UID_key': 'total_message_count'}


print('Aggregating per author and group...')
#aggregate linguistic features
agg_author_group = pre_agg.groupby(['author', 'group_name']).agg(agg_dict)
agg_author_group = agg_author_group.rename(columns=rename_dict)
#aggregate message ratios
agg_author_group_messages = messages.groupby(['author', 'group_name']).agg(agg_dict_messages)
agg_author_group_messages = agg_author_group_messages.rename(columns=rename_dict)
#concat based on author and group columns
agg_author_group = pd.merge(
    left = agg_author_group,
    right = agg_author_group_messages,
    how = 'outer',
    left_on = ['author', 'group_name'],
    right_on = ['author', 'group_name']
)

In [None]:
agg_author_group.index[0]

In [None]:
agg_author_group.head()

# Edgelist

In [None]:
import pandas as pd
from itertools import combinations
from collections import Counter
sample_size = 200

In [None]:
df = pd.read_csv(f'../data/samples/messages_sample_{sample_size}.csv.gzip', compression='gzip')

In [None]:
# get list of authors in each group
grouped_authors = df.groupby('group_name')['author'].apply(list)

# get combinations of two authors in each group
edges = []
for authors in grouped_authors:
    if len(authors) > 1:
        edges += combinations(sorted(set(authors)), 2)

# count occurences of combo to determine edge weight
edge_weights = Counter(edges)

# save as df
edgelist = pd.DataFrame(edge_weights.items(), columns=['edge', 'weight'])
edgelist[['author_1', 'author_2']] = pd.DataFrame(edgelist['edge'].tolist(), index=edgelist.index)
edgelist = edgelist.drop(columns='edge')
edgelist.to_csv(f'../data/edgelists/author_{sample_size}_edgelist.csv', index=False)

# Final weighted edgelist with columns 'author_1', 'author_2', and 'weight'
print(edgelist)

# FINNALY FIXED Toxicity Code

In [None]:
def toxicity_detection(message, client):
    analyze_request = {
        'comment': { 'text': f"{message}" },
        'languages' : ["de"],
        'requestedAttributes': {'TOXICITY': {}},
    }
    response = client.comments().analyze(body=analyze_request).execute()
    toxic =response['attributeScores']['TOXICITY']['summaryScore']['value']
    return toxic

In [None]:
import random
#initialize column
toxicity = []

for i in tqdm(range(len(results))):
    row = results.iloc[i]
    message = row['final_message_string']
    if row['own_message'] == 1:
        tox = toxicity_detection(message, client)
        toxicity.append(tox)
    else:
        toxicity.append(np.nan)

results['toxicity'] = toxicity

# Get Cluster Assignments

In [2]:
features = pd.read_csv('../results/post-aggregation/author_full.csv.gzip', compression='gzip')
# load "raw" dataset to analyse adjacency matrix
data = pd.read_csv('../data/samples/messages_sample_full.csv.gzip', compression='gzip')

  features = pd.read_csv('../results/post-aggregation/author_full.csv.gzip', compression='gzip')


In [3]:
model_3 = DAEGC(30, 128, 9, 6)

In [4]:
model_4 = DAEGC(30, 128, 9, 6)

In [5]:
model_5 = DAEGC(30, 128, 9, 6)

In [6]:
model_3.load_state_dict(torch.load('../model/DAEGC_3Clusters.pkl'))
model_3.eval()

DAEGC(
  (gat): GAT(
    (conv1): GATLayer (30 -> 128)
    (conv2): GATLayer (128 -> 9)
  )
  (gat_layer1): GATLayer (9 -> 9)
)

In [7]:
model_4.load_state_dict(torch.load('../model/DAEGC_4Clusters.pkl'))
model_4.eval()

DAEGC(
  (gat): GAT(
    (conv1): GATLayer (30 -> 128)
    (conv2): GATLayer (128 -> 9)
  )
  (gat_layer1): GATLayer (9 -> 9)
)

In [8]:
# load model weights
model_5.load_state_dict(torch.load('../model/DAEGC_5Clusters.pkl'))
model_5.eval()

DAEGC(
  (gat): GAT(
    (conv1): GATLayer (30 -> 128)
    (conv2): GATLayer (128 -> 9)
  )
  (gat_layer1): GATLayer (9 -> 9)
)

In [9]:
adj, adj_norm = create_adj_matrix(data)

Authors: 16885
Adjacency tensor shape: torch.Size([16885, 16885])
Normalized adjacency tensor shape: torch.Size([16885, 16885])


In [10]:
x = create_feature_matrix(features)

Feature matrix created.
Feature tensor shape: torch.Size([16885, 30])


In [11]:
M = get_M(adj_norm)

Transition matrix shape: (16885, 16885)


In [12]:
# get soft embeddings & cluster assignments
with torch.no_grad():
    _, z_3, q_3 = model_3(x, adj_norm, M)
    _, z_4, q_4 = model_4(x, adj_norm, M)
    _, z_5, q_5 = model_5(x, adj_norm, M)

In [25]:
# get cluster assignments
q3_labels = torch.argmax(q_3, dim=1)
# count number of nodes in each cluster
q3_cluster_count = Counter(q3_labels.numpy())
print('Cluster Assignment for 3 Clusters:', q3_cluster_count)

Cluster Assignment for 3 Clusters: Counter({3: 11542, 0: 3307, 5: 2036})


In [18]:
# get cluster assignments
q4_labels = torch.argmax(q_4, dim=1)
# count number of nodes in each cluster
q4_cluster_count = Counter(q4_labels.numpy())
print('Cluster Assignment for 4 Clusters:', q4_cluster_count)

Cluster Assignment for 4 Clusters: Counter({3: 11542, 0: 3284, 5: 2034, 1: 25})


In [20]:
# get cluster assignments
q5_labels = torch.argmax(q_5, dim=1)
# count number of nodes in each cluster
q5_cluster_count = Counter(q5_labels.numpy())
print('Cluster Assignment for 5 Clusters:', q5_cluster_count)

Cluster Assignment for 5 Clusters: Counter({3: 11264, 4: 3620, 5: 976, 1: 961, 0: 64})


In [21]:
# add cluster assignments to features
features['cluster_3'] = q3_labels.numpy()
features['cluster_4'] = q4_labels.numpy()
features['cluster_5'] = q5_labels.numpy()

In [22]:
features

Unnamed: 0,author,noun_count,verb_count,adj_count,positive_sentiment,negative_sentiment,neutral_sentiment,channel_messages,group_messages,sent_count,word_count,avg_sent_length,avg_word_length,exclamation_count,question_count,emoji_count,flesch_reading_ease,liwc_I,liwc_We,liwc_You,liwc_Other,liwc_Affect,final_message,final_message_string,own_message,forwarded_message,total_message_count,was_forwarded,own_message_count,forwarded_message_count,action_quotient,sentiment_quotient,avg_flesch_reading_ease_class,toxicity,cluster_3,cluster_4,cluster_5
0,!!pv--roland--vp!!,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,1.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,0.000000,0.000000,0.000000,0.000000,0.000000,"<a href=""https://youtu.be/i8SOiIPx-KQ"">https:/...",,0.571429,0.0,7,0,4,0,,,unclassified,,3,3,3
1,!bex,1.0,0.0,0.0,0.000000,1.000000,0.000000,0.0,1.0,1.000000,2.000000,2.000000,5.000000,0.000000,0.000000,0.000000,31.750000,0.000000,0.000000,0.000000,0.000000,0.000000,ANTIFA MOFU,ANTIFA MOFU,1.000000,0.0,1,0,1,0,,0.000000,difficult,0.249241,3,3,3
2,!pv---l. k. ---vp!,1.0,2.0,1.0,0.000000,1.000000,0.000000,0.0,1.0,2.000000,14.000000,7.000000,4.857143,0.000000,0.000000,0.000000,84.100000,0.000000,0.000000,0.071429,0.000000,0.071429,"Wohin bist Du denn geflüchtet, ist doch fast ...","Wohin bist Du denn geflüchtet, ist doch fast ü...",1.000000,0.0,1,0,1,0,2.000000,0.000000,easy,0.426917,3,3,3
3,!pv---lotti scarlotta ---vp!,26.0,12.0,10.0,0.500000,0.000000,0.500000,0.0,1.0,7.500000,76.500000,8.500000,6.595588,0.000000,0.500000,2.500000,47.725000,0.044972,0.007463,0.000000,0.011194,0.056167,"Ich wünsche gute ""Verdaulichkeit"". 😉 Blaues Ge...","Ich wünsche gute ""Verdaulichkeit"". Blaues Gemü...",1.000000,0.0,2,0,2,0,1.200000,,difficult,0.105663,0,0,4
4,!pv--tom--pv!,95.0,58.0,19.0,0.222222,0.361111,0.388889,0.0,1.0,3.027778,16.222222,4.616204,5.016480,0.555556,0.138889,0.055556,82.231143,0.001736,0.004934,0.003968,0.032971,0.072214,"Na ja, andererseits sagte Trump und sein Gefol...","Na ja, andererseits sagte Trump und sein Gefol...",0.947368,0.0,38,0,36,0,3.052632,0.615385,easy,,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16880,😎arki 😎,,,,,,,,,,,,,,,,,,,,,,,,0.000000,1.0,1,1,0,1,,,unclassified,,3,3,3
16881,🙏🏻❤️tinu❤️🙏🏻,,,,,,,,,,,,,,,,,,,,,,,,0.000000,1.0,1,0,0,1,,,unclassified,,3,3,3
16882,🚨vwievendetta & 🐕,,,,,,,,,,,,,,,,,,,,,,,,0.000000,1.0,2,0,0,2,,,unclassified,,3,3,3
16883,🤓 müller,,,,,,,,,,,,,,,,,,,,,,,,0.000000,1.0,1,0,0,1,,,unclassified,,3,3,3


In [23]:
features.to_csv('../results/author_full_clusters.csv.gzip', compression='gzip', index=False)

In [24]:
# save node embeddings
np.save('../results/author_full_z_3.npy', z_3.numpy())
np.save('../results/author_full_z_4.npy', z_4.numpy())
np.save('../results/author_full_z_5.npy', z_5.numpy())

In [29]:
test = pd.read_csv('../results/author_full_features_and_clusters.csv')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc3 in position 88888: invalid continuation byte

In [30]:
test = pd.read_csv('../results/post-aggregation/author_full.csv.gzip', compression='gzip')

  test = pd.read_csv('../results/post-aggregation/author_full.csv.gzip', compression='gzip')


In [31]:
test.dtypes

author                            object
noun_count                       float64
verb_count                       float64
adj_count                        float64
positive_sentiment               float64
negative_sentiment               float64
neutral_sentiment                float64
channel_messages                 float64
group_messages                   float64
sent_count                       float64
word_count                       float64
avg_sent_length                  float64
avg_word_length                  float64
exclamation_count                float64
question_count                   float64
emoji_count                      float64
flesch_reading_ease              float64
liwc_I                           float64
liwc_We                          float64
liwc_You                         float64
liwc_Other                       float64
liwc_Affect                      float64
final_message                     object
final_message_string              object
own_message     

In [36]:
import random

In [37]:
# get list of unique ids of len(df) to anonymize authors
ids = list(range(len(test)))
random.shuffle(ids)

# create dict to map authors to ids
author_to_id = dict(zip(test['author'], ids))


In [42]:
agg_data = test.copy()

In [54]:
data = pd.read_csv(f'../data/samples/messages_sample_full.csv.gzip', compression='gzip')

In [44]:
# anonymize auhtor according to dict in both agg_data and data
agg_data['author_id'] = agg_data['author'].map(author_to_id)

In [55]:
data['author_id'] = data['author'].map(author_to_id)

In [56]:
data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,UID_key,author,fwd_message,fwd_author,date,group_or_channel,own_message,forwarded_message,group_name,fwd_message_string,final_message,final_message_string,author_id
0,0,0,1930547378214939428038991,lord hol,,,2021-01-01,group,1,0,Alles Ausser Mainstream Chat,,Du wirst hier keine Meinungsfreiheit erfahren....,Du wirst hier keine Meinungsfreiheit erfahren....,6056
1,1,1,3440874592547339069585367,luis martinez,,,2021-01-01,group,1,0,1Research7Intelligence Room,,"This character is presumed to be John Podesta,...","This character is presumed to be John Podesta,...",7132
2,2,2,3440904592547339069585367,andrew,,,2021-01-01,group,1,0,1Research7Intelligence Room,,die feuerwehrfahrzeuge sehen aber nicht wie di...,die feuerwehrfahrzeuge sehen aber nicht wie di...,11501
3,3,3,3440914592547339069585367,d. zerone,Berlin-Reinickendorf 🧨 Neukölln 01.01.2021 4k ...,News ❤️,2021-01-01,group,0,1,1Research7Intelligence Room,Berlin-Reinickendorf Neukölln 01.01.2021 4k _...,,,2266
4,4,4,3440924592547339069585367,luis martinez,,,2021-01-01,group,1,0,1Research7Intelligence Room,,"weil ich für die Gruppe nicht beleidigen kann,...","weil ich für die Gruppe nicht beleidigen kann,...",7132


In [50]:
# rename author_id to author and drop old auhtor column
agg_data = agg_data.drop(columns='author', axis=1)
agg_data = agg_data.rename(columns={'author_id': 'author'})

data = data.drop(columns='author', axis=1)
data = data.rename(columns={'author_id': 'author'})

In [57]:
data = data.drop(columns='author', axis=1)
data = data.rename(columns={'author_id': 'author'})

In [51]:
agg_data.head()

Unnamed: 0,noun_count,verb_count,adj_count,positive_sentiment,negative_sentiment,neutral_sentiment,channel_messages,group_messages,sent_count,word_count,avg_sent_length,avg_word_length,exclamation_count,question_count,emoji_count,flesch_reading_ease,liwc_I,liwc_We,liwc_You,liwc_Other,liwc_Affect,final_message,final_message_string,own_message,forwarded_message,total_message_count,was_forwarded,own_message_count,forwarded_message_count,action_quotient,sentiment_quotient,avg_flesch_reading_ease_class,toxicity,author
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,"<a href=""https://youtu.be/i8SOiIPx-KQ"">https:/...",,0.571429,0.0,7,0,4,0,,,unclassified,,9885
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,2.0,2.0,5.0,0.0,0.0,0.0,31.75,0.0,0.0,0.0,0.0,0.0,ANTIFA MOFU,ANTIFA MOFU,1.0,0.0,1,0,1,0,,0.0,difficult,0.249241,9920
2,1.0,2.0,1.0,0.0,1.0,0.0,0.0,1.0,2.0,14.0,7.0,4.857143,0.0,0.0,0.0,84.1,0.0,0.0,0.071429,0.0,0.071429,"Wohin bist Du denn geflüchtet, ist doch fast ...","Wohin bist Du denn geflüchtet, ist doch fast ü...",1.0,0.0,1,0,1,0,2.0,0.0,easy,0.426917,8703
3,26.0,12.0,10.0,0.5,0.0,0.5,0.0,1.0,7.5,76.5,8.5,6.595588,0.0,0.5,2.5,47.725,0.044972,0.007463,0.0,0.011194,0.056167,"Ich wünsche gute ""Verdaulichkeit"". 😉 Blaues Ge...","Ich wünsche gute ""Verdaulichkeit"". Blaues Gemü...",1.0,0.0,2,0,2,0,1.2,,difficult,0.105663,13832
4,95.0,58.0,19.0,0.222222,0.361111,0.388889,0.0,1.0,3.027778,16.222222,4.616204,5.01648,0.555556,0.138889,0.055556,82.231143,0.001736,0.004934,0.003968,0.032971,0.072214,"Na ja, andererseits sagte Trump und sein Gefol...","Na ja, andererseits sagte Trump und sein Gefol...",0.947368,0.0,38,0,36,0,3.052632,0.615385,easy,,14964


In [52]:
agg_data.to_csv('../results/post-aggregation/author_full.csv.gzip', compression='gzip', index=False)


In [58]:
data.to_csv('../data/samples/messages_sample_full.csv.gzip', compression='gzip', index=False)

In [59]:
# save dict
import json

with open('../data/author_to_id.json', 'w') as f:
    json.dump(author_to_id, f)

In [60]:
test_data = pd.read_csv('../data/samples/messages_sample_full.csv.gzip', compression='gzip')

In [61]:
test_data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,UID_key,fwd_message,fwd_author,date,group_or_channel,own_message,forwarded_message,group_name,fwd_message_string,final_message,final_message_string,author
0,0,0,1930547378214939428038991,,,2021-01-01,group,1,0,Alles Ausser Mainstream Chat,,Du wirst hier keine Meinungsfreiheit erfahren....,Du wirst hier keine Meinungsfreiheit erfahren....,6056
1,1,1,3440874592547339069585367,,,2021-01-01,group,1,0,1Research7Intelligence Room,,"This character is presumed to be John Podesta,...","This character is presumed to be John Podesta,...",7132
2,2,2,3440904592547339069585367,,,2021-01-01,group,1,0,1Research7Intelligence Room,,die feuerwehrfahrzeuge sehen aber nicht wie di...,die feuerwehrfahrzeuge sehen aber nicht wie di...,11501
3,3,3,3440914592547339069585367,Berlin-Reinickendorf 🧨 Neukölln 01.01.2021 4k ...,News ❤️,2021-01-01,group,0,1,1Research7Intelligence Room,Berlin-Reinickendorf Neukölln 01.01.2021 4k _...,,,2266
4,4,4,3440924592547339069585367,,,2021-01-01,group,1,0,1Research7Intelligence Room,,"weil ich für die Gruppe nicht beleidigen kann,...","weil ich für die Gruppe nicht beleidigen kann,...",7132


In [62]:
test_agg_data = pd.read_csv('../results/post-aggregation/author_full.csv.gzip', compression='gzip')

  test_agg_data = pd.read_csv('../results/post-aggregation/author_full.csv.gzip', compression='gzip')


In [63]:
test_agg_data.head()

Unnamed: 0,noun_count,verb_count,adj_count,positive_sentiment,negative_sentiment,neutral_sentiment,channel_messages,group_messages,sent_count,word_count,avg_sent_length,avg_word_length,exclamation_count,question_count,emoji_count,flesch_reading_ease,liwc_I,liwc_We,liwc_You,liwc_Other,liwc_Affect,final_message,final_message_string,own_message,forwarded_message,total_message_count,was_forwarded,own_message_count,forwarded_message_count,action_quotient,sentiment_quotient,avg_flesch_reading_ease_class,toxicity,author
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,"<a href=""https://youtu.be/i8SOiIPx-KQ"">https:/...",,0.571429,0.0,7,0,4,0,,,unclassified,,9885
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,2.0,2.0,5.0,0.0,0.0,0.0,31.75,0.0,0.0,0.0,0.0,0.0,ANTIFA MOFU,ANTIFA MOFU,1.0,0.0,1,0,1,0,,0.0,difficult,0.249241,9920
2,1.0,2.0,1.0,0.0,1.0,0.0,0.0,1.0,2.0,14.0,7.0,4.857143,0.0,0.0,0.0,84.1,0.0,0.0,0.071429,0.0,0.071429,"Wohin bist Du denn geflüchtet, ist doch fast ...","Wohin bist Du denn geflüchtet, ist doch fast ü...",1.0,0.0,1,0,1,0,2.0,0.0,easy,0.426917,8703
3,26.0,12.0,10.0,0.5,0.0,0.5,0.0,1.0,7.5,76.5,8.5,6.595588,0.0,0.5,2.5,47.725,0.044972,0.007463,0.0,0.011194,0.056167,"Ich wünsche gute ""Verdaulichkeit"". 😉 Blaues Ge...","Ich wünsche gute ""Verdaulichkeit"". Blaues Gemü...",1.0,0.0,2,0,2,0,1.2,,difficult,0.105663,13832
4,95.0,58.0,19.0,0.222222,0.361111,0.388889,0.0,1.0,3.027778,16.222222,4.616204,5.01648,0.555556,0.138889,0.055556,82.231143,0.001736,0.004934,0.003968,0.032971,0.072214,"Na ja, andererseits sagte Trump und sein Gefol...","Na ja, andererseits sagte Trump und sein Gefol...",0.947368,0.0,38,0,36,0,3.052632,0.615385,easy,,14964


In [64]:
# get author ids that are in agg_data but not in data
missing_authors = set(test_agg_data['author']) - set(test_data['author'])

In [65]:
missing_authors

set()

In [66]:
grouped_authors = test_data.groupby('group_name')['author'].apply(set)

# get unique authors and map them to indices
authors = sorted(set(str(author) for author in test_data['author']))
author_idx_map = {author: idx for idx, author in enumerate(authors)}

# Check for missing authors
missing_authors = [author for author in authors if author not in author_idx_map]
if missing_authors:
    print(f"Missing authors in author_idx_map: {missing_authors}")

In [67]:
missing_authors

[]

In [68]:
# get combinations of two authors in each group
edges = []
for authors_in_group in grouped_authors:
    if len(authors_in_group) > 1:
        edges += combinations(authors_in_group, 2)

# count occurrences of each combination to determine edge weight
edge_weights = Counter(edges)

In [69]:
# Create empty lists for COO sparse matrix format (row, col, data)
row_indices = []
col_indices = []
data = []

In [70]:
for (author_1, author_2), weight in edge_weights.items():
    idx_1 = author_idx_map[author_1]
    idx_2 = author_idx_map[author_2]

    # Add both directions since the matrix is symmetric
    row_indices.append(idx_1)
    col_indices.append(idx_2)
    data.append(weight)
    
    row_indices.append(idx_2)
    col_indices.append(idx_1)
    data.append(weight)

KeyError: 16384

In [72]:
idx_test = author_idx_map['16384']

In [74]:
create_feature_matrix(agg_data)

Feature matrix created.
Feature tensor shape: torch.Size([16885, 29])


tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.2492],
        [2.0000, 1.0000, 0.0000,  ..., 2.0000, 0.0000, 0.4269],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]])

In [77]:
dataset = test_agg_data.fillna(0)
# Create empty lists for COO sparse matrix format (row, col, data)
row_indices = []
col_indices = []
data = []
feature_columns = dataset.columns
feature_columns = [feat for feat in feature_columns if (feat != 'final_message_string') & (feat != 'final_message') & (feat != 'author') & (feat != 'avg_flesch_reading_ease_class')]