In [94]:
import sys
import os
#sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # for scripts
project_root = '/Users/emilykruger/Documents/GitHub/CSH-Internship'
functions_dir = os.path.join(project_root, 'src/functions')
sys.path.append(project_root) #for local notebook
sys.path.append(functions_dir) #for local notebook


import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import regex as re
from src.functions.linguistic_features import remove_emojis, remove_tags, count_emojis, preprocess_text, count_pos_tags
from textstat import flesch_reading_ease
import subprocess
from tqdm import tqdm
from transformers import AutoTokenizer, pipeline
import multiprocessing as mp

# Building Code for Linguistic Features

In this notebook, code will be written to extract linguistic features from the dataset. It will be done on a small subsample. Afterwards code will be transferred to a script to run on the full dataset.

## Loading Data

### Initially

In [2]:
groups = pd.read_csv('../data/selected_groups_with_transcriptions.csv.gzip', compression='gzip')
channels = pd.read_csv('../data/channel_subsample.csv.gzip', compression='gzip')

  groups = pd.read_csv('../data/selected_groups_with_transcriptions.csv.gzip', compression='gzip')
  channels = pd.read_csv('../data/channel_subsample.csv.gzip', compression='gzip')


In [3]:
groups = groups.drop(columns=['Unnamed: 0'], axis=1)
groups['group_or_channel'] = 'group'
groups.head(5)

Unnamed: 0,UID_key,initial_ID,mid_message,mid_file,group_name,posting_date,message,fwd_message,fwd_posting_date_message,posting_date_file,link_url,media_file,media_file_type,fwd_posting_date_file,fwd_link_url,fwd_media_file,fwd_media_file_type,author,fwd_author,day,week,weekday,message_hash,fwd_message_hash,website,replied_to,year,month,day_of_year,duration,filepath,filename,filename_if_joined,transcribed_message,newsguard_domain,newsguard_score,group_or_channel
0,209982-6408240263544091615,0,209982.0,209982.0,Helfende Q Hände 👐👏,2021-11-09 22:50:15+00:00,Oki 😃,,,2021-11-09 22:50:15+00:00,messages199.html#go_to_message209980,,,,,,,9.267455e+17,,2021-11-09 12:00:00+00:00,45.0,2.0,-3.596942e+18,,,209980-6408240263544091615,2021.0,11.0,313.0,,,,,,,,group
1,209983-6408240263544091615,1,209983.0,,Helfende Q Hände 👐👏,2021-11-09 22:50:18+00:00,Von weiblich bis männlich,,,,,,,,,,,-9.029508e+18,,2021-11-09 12:00:00+00:00,45.0,2.0,-7.438051e+18,,,,2021.0,11.0,313.0,,,,,,,,group
2,209984-6408240263544091615,2,209984.0,,Helfende Q Hände 👐👏,2021-11-09 22:50:23+00:00,Von direkt zu zurückhaltend,,,,,,,,,,,-9.029508e+18,,2021-11-09 12:00:00+00:00,45.0,2.0,-5.141813e+18,,,,2021.0,11.0,313.0,,,,,,,,group
3,209985-6408240263544091615,3,209985.0,,Helfende Q Hände 👐👏,2021-11-09 22:50:28+00:00,Wie bei WOW,,,,,,,,,,,-9.029508e+18,,2021-11-09 12:00:00+00:00,45.0,2.0,4.179569e+18,,,,2021.0,11.0,313.0,,,,,,,,group
4,209986-6408240263544091615,4,209986.0,,Helfende Q Hände 👐👏,2021-11-09 22:50:47+00:00,Neuer Charakter und XP sammeln,,,,,,,,,,,-9.029508e+18,,2021-11-09 12:00:00+00:00,45.0,2.0,3.772441e+18,,,,2021.0,11.0,313.0,,,,,,,,group


In [4]:
channels = channels.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
channels['group_or_channel'] = 'channel'
channels.head(5)

Unnamed: 0,UID_key,mid_message,mid_file,group_name,posting_date,message,fwd_message,fwd_posting_date_message,posting_date_file,link_url,media_file,media_file_type,fwd_posting_date_file,fwd_link_url,fwd_media_file,fwd_media_file_type,author,fwd_author,day,week,weekday,message_hash,fwd_message_hash,website,replied_to,year,month,day_of_year,fwd_urls,urls,newsguard_scores,fwd_regex,fwd_cryptocurrency,fwd_cryptolabel,regex,cryptocurrency,cryptolabel,group_or_channel
0,16262-1145969155139279504,16262.0,,Tagesereignisse der Offenbarung,2021-06-22 12:01:01+00:00,,EIN FILMISCHES DENKMAL = DENK MAL WIEDER DARAN...,22.06.2021 07:54:10,,,,,,,,,-1.145969e+18,-8.581437e+18,2021-06-22 12:00:00+00:00,25.0,2.0,,-1.816379e+18,,,2021.0,6.0,173.0,,,,,,,,,,channel
1,16263-1145969155139279504,16263.0,,Tagesereignisse der Offenbarung,2021-06-22 12:02:26+00:00,,Ein weiteres sehr wichtiges „DENK MAL“ liefert...,22.06.2021 08:25:19,,,,,,,,,-1.145969e+18,-8.581437e+18,2021-06-22 12:00:00+00:00,25.0,2.0,,8.99959e+18,,,2021.0,6.0,173.0,,,,,,,,,,channel
2,16265-1145969155139279504,16265.0,,Tagesereignisse der Offenbarung,2021-06-22 12:02:27+00:00,,☝🏻Das ist eine der über 700 Klagen die Trump b...,22.06.2021 11:46:00,,,,,,,,,-1.145969e+18,-3.777232e+18,2021-06-22 12:00:00+00:00,25.0,2.0,,-6.77513e+18,,,2021.0,6.0,173.0,t.me,,,,,,,,,channel
3,16266-1145969155139279504,16266.0,,Tagesereignisse der Offenbarung,2021-06-22 12:02:59+00:00,,🎥 <u>General Flynn: In wenigen Wochen platzt d...,21.06.2021 21:24:59,,,,,,,,,-1.145969e+18,9.086751e+18,2021-06-22 12:00:00+00:00,25.0,2.0,,-4.939329e+18,,,2021.0,6.0,173.0,"['paypal.com', 't.me', 't.me']",,"[nan, nan, nan]",,,,,,,channel
4,16267-1145969155139279504,16267.0,,Tagesereignisse der Offenbarung,2021-06-22 12:06:56+00:00,,<em>Schon gelesen? Die Bürger der Gemeinde Zwö...,16.06.2021 21:20:01,,,,,,,,,-1.145969e+18,4.36247e+17,2021-06-22 12:00:00+00:00,25.0,2.0,,4.063949e+18,,,2021.0,6.0,173.0,t.me,,,,,,,,,channel


In [5]:
#take random sample of 100k rows of both df where either message or fwd_message contains data and combine
sample_groups = groups[groups['message'].notnull() | groups['fwd_message'].notnull()].sample(n=1000, random_state=42)
sample_channels = channels = channels[channels['message'].notnull() | channels['fwd_message'].notnull()].sample(n=1000, random_state=42)
combined = pd.concat([sample_groups, sample_channels], ignore_index=True, axis=0)
combined.head(5)

Unnamed: 0,UID_key,initial_ID,mid_message,mid_file,group_name,posting_date,message,fwd_message,fwd_posting_date_message,posting_date_file,link_url,media_file,media_file_type,fwd_posting_date_file,fwd_link_url,fwd_media_file,fwd_media_file_type,author,fwd_author,day,week,weekday,message_hash,fwd_message_hash,website,replied_to,year,month,day_of_year,duration,filepath,filename,filename_if_joined,transcribed_message,newsguard_domain,newsguard_score,group_or_channel,fwd_urls,urls,newsguard_scores,fwd_regex,fwd_cryptocurrency,fwd_cryptolabel,regex,cryptocurrency,cryptolabel
0,5216248197782804714237638,691095.0,521624.0,521624.0,1Research7Intelligence Room,2021-03-25 15:20:06+00:00,👍🏼,,,2021-03-25 15:20:06+00:00,#go_to_message521136,,,,,,,-3.452303e+17,,2021-03-25 12:00:00+00:00,12.0,4.0,-6.067032e+18,,,5211368197782804714237638,2021.0,3.0,84.0,,,,,,,,group,,,,,,,,,
1,3325378587905794225980510,3512702.0,332537.0,,Klartext reden über Deutschland,2021-12-26 01:05:25+00:00,Die Reptiloiden leiten immer wieder Warmphasen...,,,,,,,,,,,-6.639767e+18,,2021-12-26 12:00:00+00:00,51.0,7.0,-6.739998e+18,,,,2021.0,12.0,360.0,,,,,,,,group,,,,,,,,,
2,2255608197782804714237638,474291.0,225560.0,225560.0,1Research7Intelligence Room,2020-11-24 21:20:05+00:00,"Oh mein Gott... ERSTENS :""Sidney Powell WAR ei...",,,2020-11-24 21:20:05+00:00,#go_to_message225281,,,,,,,-7.899042e+18,,2020-11-24 12:00:00+00:00,48.0,2.0,-8.687167e+18,,,2252818197782804714237638,2020.0,11.0,329.0,,,,,,,,group,,,,,,,,,
3,457113756755399766928245,3811126.0,45711.0,45711.0,Bismarcks Erben,2020-02-24 18:28:57+00:00,Teilweise. Biebel Zitate aus dem Zusammenhang ...,,,2020-02-24 18:28:57+00:00,#go_to_message45700,,,,,,,4.079522e+18,,2020-02-24 12:00:00+00:00,9.0,1.0,8.099019e+18,,,457003756755399766928245,2020.0,2.0,55.0,,,,,,,,group,,,,,,,,,
4,6739-6408240263544091615,120338.0,6739.0,6739.0,Helfende Q Hände 👐👏,2021-03-12 09:04:45+00:00,wie bitte ? ich beantworte das problem dieser ...,,,2021-03-12 09:04:45+00:00,#go_to_message6629,,,,,,,6.018852e+18,,2021-03-12 12:00:00+00:00,10.0,5.0,7.781442e+18,,,6629-6408240263544091615,2021.0,3.0,71.0,,,,,,,,group,,,,,,,,,


In [6]:
#keep only UID and message
messages = combined[['UID_key', 'message', 'fwd_message', 'group_or_channel']]

#remove emojis
cleaned_messages = []
for message in messages['message'].astype(str):
    cleaned_messages.append(remove_emojis(message))

cleaned_fwd_messages = []
for message in messages['fwd_message'].astype(str):
    cleaned_fwd_messages.append(remove_emojis(message))

messages['message_string'] = cleaned_messages
messages['fwd_message_string'] = cleaned_fwd_messages
messages['message_string'] = messages['message_string'].astype(str)
messages['fwd_message_string'] = messages['fwd_message_string'].astype(str)

#if message, take message else take fwd_message
messages['final_message'] = messages['message'].where(messages['message'].notnull(), messages['fwd_message'])
messages['final_message_string'] = messages['message_string'].where(messages['message_string'] != 'nan', messages['fwd_message_string'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  messages['message_string'] = cleaned_messages
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  messages['fwd_message_string'] = cleaned_fwd_messages
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  messages['message_string'] = messages['message_string'].astype(str)
A value is trying to be set on a copy

In [7]:
messages['preprocessed_message'] = messages['final_message_string'].apply(preprocess_text)

#delete uneccessary columns
messages = messages.drop(columns=['message', 'fwd_message', 'message_string', 'fwd_message_string'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  messages['preprocessed_message'] = messages['final_message_string'].apply(preprocess_text)


In [9]:
messages.to_csv('../data/messages_sample.csv.gzip', compression='gzip')

### For Re-Running Below Code

In [45]:
#for re-running
messages = pd.read_csv('../data/samples/messages_sample_2000.csv.gzip', compression='gzip').drop('Unnamed: 0', axis=1)

## Count-Based Features & POS-Tagging

In [10]:
#num sentences
messages['sent_count'] = messages['final_message_string'].apply(lambda x: len(re.split(r'[.!?]+', x)) if x else 0)
#num words
messages['word_count'] = messages['final_message_string'].apply(lambda x: len(re.findall(r'\w+', x)) if x else 0)
#avg sentence length (words per sentence)
messages['avg_sent_length'] = messages.apply(lambda row: row['word_count'] / row['sent_count'] if row['sent_count'] > 0 else 0, axis=1)
#avg word length (characters per word)
messages['avg_word_length'] = messages.apply(lambda row: len(row['final_message_string'].replace(' ', '')) / row['word_count'] if row['word_count'] > 0 else 0, axis=1)
#num exclamations (multiple ! coutn as one exclamation)
messages['exclamation_count'] = messages['final_message_string'].apply(lambda x: len(re.findall(r'!+', x)) if x else 0)
#num questions (multiple ? count as one question)
messages['question_count'] = messages['final_message_string'].apply(lambda x: len(re.findall(r'\?+', x)) if x else 0)
#num emojis 
messages['emoji_count'] = messages['final_message'].apply(lambda x: count_emojis(x) if x else 0)

In [14]:
#use count_pos_tags func to count nouns, verbs and adj
messages['noun_count'] = messages['final_message_string'].apply(lambda x: count_pos_tags(x)[0])
messages['verb_count'] = messages['final_message_string'].apply(lambda x: count_pos_tags(x)[1])
messages['adj_count'] = messages['final_message_string'].apply(lambda x: count_pos_tags(x)[2])

## Flesch Reading Ease

In [16]:
#use TextStat to compute Flesch Reading Ease score on final_message_string
messages['flesch_reading_ease'] = messages['final_message_string'].apply(flesch_reading_ease)

In [18]:
messages.head(5)
messages.to_csv('../data/messages_with_features.csv.gzip', compression='gzip')

## HuggingFace Complexity Classifier Model Exploration

In [34]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline, DistilBertForSequenceClassification
import torch

In [4]:
tokenizer = AutoTokenizer.from_pretrained('MiriUll/distilbert-german-text-complexity')
model = AutoModelForSequenceClassification.from_pretrained('MiriUll/distilbert-german-text-complexity')

In [30]:
inputs = tokenizer("Mit solchen Drohungen kommt sie nie mehr zurück ", return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

predicted_class_id = logits.argmax().item()
predicted_class_id

0

In [42]:
pipe = pipeline("text-classification", model="MiriUll/distilbert-german-text-complexity")
print(pipe('Das ist ein einfacher Satz.'))
print(pipe('Obwohl der junge Wissenschaftler sich intensiv auf seine Forschungsarbeit konzentrierte, war er oft von den unvorhersehbaren und lauten Bauarbeiten im Nachbargebäude abgelenkt, die seine produktivsten Stunden regelmäßig störten.'))

[{'label': 'LABEL_0', 'score': 0.8107044100761414}]
[{'label': 'LABEL_0', 'score': 0.9880794882774353}]


## Kaggle Emoji Sentiment Dataset

In [52]:
emojis = pd.read_csv('../data/archive/Emoji_Sentiment_Data_v1.0.csv')
#emoji sentiment column based on max value of positive neutral or negative
emojis['sentiment'] = emojis[['Positive', 'Neutral', 'Negative']].idxmax(axis=1)

In [51]:
emojis

Unnamed: 0,Emoji,Unicode codepoint,Occurrences,Position,Negative,Neutral,Positive,Unicode name,Unicode block,sentiment
0,😂,0x1f602,14622,0.805101,3614,4163,6845,FACE WITH TEARS OF JOY,Emoticons,Positive
1,❤,0x2764,8050,0.746943,355,1334,6361,HEAVY BLACK HEART,Dingbats,Positive
2,♥,0x2665,7144,0.753806,252,1942,4950,BLACK HEART SUIT,Miscellaneous Symbols,Positive
3,😍,0x1f60d,6359,0.765292,329,1390,4640,SMILING FACE WITH HEART-SHAPED EYES,Emoticons,Positive
4,😭,0x1f62d,5526,0.803352,2412,1218,1896,LOUDLY CRYING FACE,Emoticons,Negative
...,...,...,...,...,...,...,...,...,...,...
964,➛,0x279b,1,0.011628,0,1,0,DRAFTING POINT RIGHTWARDS ARROW,Dingbats,Neutral
965,♝,0x265d,1,0.280000,0,1,0,BLACK CHESS BISHOP,Miscellaneous Symbols,Neutral
966,❋,0x274b,1,0.888889,0,1,0,HEAVY EIGHT TEARDROP-SPOKED PROPELLER ASTERISK,Dingbats,Neutral
967,✆,0x2706,1,0.557252,0,1,0,TELEPHONE LOCATION SIGN,Dingbats,Neutral


# Re-formatting Liwc Dict

In [32]:
file_path = '../data/LIWC2007_German.dic'
skiprows = 70  # Specify the number of rows to skip

data = []

with open(file_path, 'r', encoding='latin1') as file:
    # Step 1: Skip the specified number of rows
    for _ in range(skiprows):
        next(file)
    
    # Read the file line-by-line
    for line in file:
        split_line = line.strip().split('\t')
        word = split_line[0]
        categories = split_line[1:]
        data.append([word, categories])

# Step 2: Create DataFrame with flexible columns
# Define headers
headers = ['word', 'categories']

# Step 3: Create DataFrame
df = pd.DataFrame(data, columns=headers)

# Print the DataFrame to check the result
df

Unnamed: 0,word,categories
0,ab,"[10, 37, 41]"
1,abbrach*,[38]
2,abbreche,[39]
3,abbrich*,[39]
4,abend*,[37]
...,...,...
7593,zwischenzeit*,[37]
7594,zwoelf*,[11]
7595,zwölf*,[11]
7596,zynisch*,"[12, 16, 18]"


In [33]:
df = df.explode('categories')
df['categories'] = df['categories'].astype(int)

In [34]:
df

Unnamed: 0,word,categories
0,ab,10
0,ab,37
0,ab,41
1,abbrach*,38
2,abbreche,39
...,...,...
7596,zynisch*,12
7596,zynisch*,16
7596,zynisch*,18
7597,zyste*,60


In [35]:
liwc_categories = {
    1: 'Pronoun',
    2: 'I',
    3: 'We',
    4: 'Self',
    5: 'You',
    6: 'Other',
    7: 'Negate',
    8: 'Assent',
    9: 'Article',
    10: 'Preps',
    11: 'Number',
    12: 'Affect',
    13: 'Posemo',
    14: 'Posfeel',
    15: 'Optim',
    16: 'Negemo',
    17: 'Anx',
    18: 'Anger',
    19: 'Sad',
    20: 'Cogmech',
    21: 'Cause',
    22: 'Insight',
    23: 'Discrep',
    24: 'Inhib',
    25: 'Tentat',
    26: 'Certain',
    27: 'Senses',
    28: 'See',
    29: 'Hear',
    30: 'Feel',
    31: 'Social',
    32: 'Comm',
    33: 'Othref',
    34: 'Friends',
    35: 'Family',
    36: 'Humans',
    37: 'Time',
    38: 'Past',
    39: 'Present',
    40: 'Future',
    41: 'Space',
    42: 'Up',
    43: 'Down',
    44: 'Incl',
    45: 'Excl',
    46: 'Motion',
    47: 'Occup',
    48: 'School',
    49: 'Job',
    50: 'Achieve',
    51: 'Leisure',
    52: 'Home',
    53: 'Sports',
    54: 'TV',
    55: 'Music',
    56: 'Money',
    57: 'Metaph',
    58: 'Relig',
    59: 'Death',
    60: 'Physcal',
    61: 'Body',
    62: 'Sexual',
    63: 'Eating',
    64: 'Sleep',
    65: 'Groom',
    66: 'Swear',
    67: 'Nonfl',
    68: 'Fillers'
}

In [36]:
df['cat_name'] = df['categories'].map(liwc_categories)

In [37]:
df

Unnamed: 0,word,categories,cat_name
0,ab,10,Preps
0,ab,37,Time
0,ab,41,Space
1,abbrach*,38,Past
2,abbreche,39,Present
...,...,...,...
7596,zynisch*,12,Affect
7596,zynisch*,16,Negemo
7596,zynisch*,18,Anger
7597,zyste*,60,Physcal


In [38]:
#change the order of the columns so that its word, cat_name, categories
df = df[['word', 'cat_name', 'categories']]

#write df to txt file but omit index and column header
df.to_csv('../data/liwc_german_2007.txt', sep='\t', index=False, header=False)

# Making txt file for GAWK script

In [6]:
#load data
filename = 'messages_sample_10'
sample = pd.read_csv(f'../data/samples/{filename}.csv.gzip', compression='gzip').drop('Unnamed: 0', axis=1)

In [7]:
sample

Unnamed: 0,UID_key,group_or_channel,final_message,final_message_string,preprocessed_message
0,5216248197782804714237638,group,👍🏼,,
1,3325378587905794225980510,group,Die Reptiloiden leiten immer wieder Warmphasen...,Die Reptiloiden leiten immer wieder Warmphasen...,reptiloide leiten warmphasen beschießen Sonne ...
2,2255608197782804714237638,group,"Oh mein Gott... ERSTENS :""Sidney Powell WAR ei...","Oh mein Gott... ERSTENS :""Sidney Powell WAR ei...",oh Gott erstens Sidney Powell einst Bundesanwä...
3,457113756755399766928245,group,Teilweise. Biebel Zitate aus dem Zusammenhang ...,Teilweise. Biebel Zitate aus dem Zusammenhang ...,teilweise Biebel Zitat Zusammenhang Denkanstöß...
4,6739-6408240263544091615,group,wie bitte ? ich beantworte das problem dieser ...,wie bitte ? ich beantworte das problem dieser ...,bitte beantworten Problem sterblichen 3 Wörter...
5,44000-6525030231877560063,channel,Cybercrime: Erpresser geben Irland Entschlüsse...,Cybercrime: Erpresser geben Irland Entschlüsse...,cybercrime erpresser geben Irland Entschlüssel...
6,71065-6525030231877560063,channel,Die Delta-Panikmache hält weiter an. Österreic...,Die Delta-Panikmache hält weiter an. Österreic...,delta-panikmach halten Österreich Regierung Be...
7,50235-2062537961599298253,channel,"Hallo liebe Arwen 🥰, heute wieder eine TV Info...","Hallo liebe Arwen , heute wieder eine TV Info....",Hallo Liebe arwen tv info kurzfristig Programm...
8,26268-2062537961599298253,channel,<strong>Was WIRKLICH auf Epstein Island oder „...,Was WIRKLICH auf Epstein Island oder „Orgy Isl...,Epstein Island orgy Island stattfinden Leute g...
9,36435-6525030231877560063,channel,Nach uns die Sintflut <strong>STAATSHAUS...,Nach uns die Sintflut STAATSHAUSHALT UND...,Sintflut Staatshaushalt Sozialversicher...


In [9]:
#only keep UID_key and final_message_string and save as txt without "" around messages

sample = sample[['UID_key', 'final_message_string']]
sample.to_csv(f'../data/samples/{filename}.txt', sep='\t', index=False, header=False, quoting=3)

## Changing sampling strategy

In [18]:
sample_size = 10 #how big of a sample to take from each dataset
random_state = 42

########## LOAD AND PREPARE DATASET ##########

#load two datasets, drop unnecessary columns and add column to indicate group or channel
groups = pd.read_csv('../data/selected_groups_with_transcriptions.csv.gzip', compression='gzip').drop(columns=['Unnamed: 0'], axis=1)
channels = pd.read_csv('../data/channel_subsample.csv.gzip', compression='gzip').drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], axis=1)


groups['group_or_channel'] = 'group'
channels['group_or_channel'] = 'channel'


#take random sample of both df where either message or fwd_message (or transcribedmessage if group) contains data and combine
sample_groups = groups[groups['message'].notnull() | groups['fwd_message'].notnull() | groups['transcribed_message'].notnull()].sample(n=sample_size, random_state=random_state)
sample_channels = channels = channels[channels['message'].notnull() | channels['fwd_message'].notnull()].sample(n=sample_size, random_state=random_state)

  groups = pd.read_csv('../data/selected_groups_with_transcriptions.csv.gzip', compression='gzip').drop(columns=['Unnamed: 0'], axis=1)
  channels = pd.read_csv('../data/channel_subsample.csv.gzip', compression='gzip').drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], axis=1)


In [20]:
combined = pd.concat([sample_groups, sample_channels], ignore_index=True, axis=0)

#keep only necessary columns
messages = combined[['UID_key', 'message', 'fwd_message', 'transcribed_message', 'group_or_channel']]

#remove emojis and links
cleaned_messages = []
for message in messages['message'].astype(str):
    message = remove_tags(message)
    cleaned_messages.append(remove_emojis(message))

cleaned_fwd_messages = []
for message in messages['fwd_message'].astype(str):
    message = remove_tags(message)
    cleaned_fwd_messages.append(remove_emojis(message))

messages['message_string'] = cleaned_messages
messages['fwd_message_string'] = cleaned_fwd_messages
messages['message_string'] = messages['message_string'].astype(str)
messages['fwd_message_string'] = messages['fwd_message_string'].astype(str)

#if message, take message else take fwd_message else take transcribed message
messages['final_message'] = np.where(messages['message'].notnull(), messages['message'],
                                    np.where(messages['fwd_message'].notnull(), messages['fwd_message'],
                                             messages['transcribed_message'])).astype(str)
messages['final_message_string'] = np.where(messages['message_string'] != 'nan', messages['message_string'],
                                    np.where(messages['fwd_message_string'] != 'nan', messages['fwd_message_string'],
                                             messages['transcribed_message'])).astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  messages['message_string'] = cleaned_messages
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  messages['fwd_message_string'] = cleaned_fwd_messages
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  messages['message_string'] = messages['message_string'].astype(str)
A value is trying to be set on a copy

In [21]:
messages

Unnamed: 0,UID_key,message,fwd_message,transcribed_message,group_or_channel,message_string,fwd_message_string,final_message,final_message_string
0,101673-8768690052625427881,Das Völkerrecht hat eine Juristische Grundlage...,,,group,Das Völkerrecht hat eine Juristische Grundlage...,,Das Völkerrecht hat eine Juristische Grundlage...,Das Völkerrecht hat eine Juristische Grundlage...
1,280125-2216795262304420685,"Danke, das macht Mut!🙏🏾",,,group,"Danke, das macht Mut!",,"Danke, das macht Mut!🙏🏾","Danke, das macht Mut!"
2,321731-2216795262304420685,damit er dich besser sperren kann ....,,,group,damit er dich besser sperren kann ....,,damit er dich besser sperren kann ....,damit er dich besser sperren kann ....
3,34147718200335715773759193,"Darf man Fragen, wie sich 82 Millionen Mensche...",,,group,"Darf man Fragen, wie sich 82 Millionen Mensche...",,"Darf man Fragen, wie sich 82 Millionen Mensche...","Darf man Fragen, wie sich 82 Millionen Mensche..."
4,31042658200335715773759193,Dann ist ihm leider nicht zu helfen! Geht mir...,,,group,Dann ist ihm leider nicht zu helfen! Geht mir...,,Dann ist ihm leider nicht zu helfen! Geht mir...,Dann ist ihm leider nicht zu helfen! Geht mir...
5,31559668200335715773759193,weiß man bis wann ca?,,,group,weiß man bis wann ca?,,weiß man bis wann ca?,weiß man bis wann ca?
6,602533731161322356821694,hha du bist eben aam hetzen da zeigst du gleic...,,,group,hha du bist eben aam hetzen da zeigst du gleic...,,hha du bist eben aam hetzen da zeigst du gleic...,hha du bist eben aam hetzen da zeigst du gleic...
7,639216-2216795262304420685,Gelenkspfanne?,,,group,Gelenkspfanne?,,Gelenkspfanne?,Gelenkspfanne?
8,54333756755399766928245,👍👍👍,,,group,,,👍👍👍,
9,2332188587905794225980510,Was ist denn ein G rattler Dialekt,,,group,Was ist denn ein G rattler Dialekt,,Was ist denn ein G rattler Dialekt,Was ist denn ein G rattler Dialekt


In [22]:
messages['final_message_string'] = messages['final_message_string'].apply(lambda x: ' '.join(x.split()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  messages['final_message_string'] = messages['final_message_string'].apply(lambda x: ' '.join(x.split()))


# Subprocessing

In [47]:
! gawk -f ../src/analysis/liwc_category_ratios.awk ../data/liwc_german_2007.txt ../data/samples/messages_sample_200.txt > ../results/liwc_ratios.csv

In [48]:
# Load the output file but remove last column
liwc_ratios = pd.read_csv('../results/liwc_ratios.csv', sep=',')
liwc_ratios = liwc_ratios.iloc[:, :-1]
liwc_ratios

Unnamed: 0,UID_key,liwc_Present,liwc_Motion
0,101673-8768690052625427881,0.068965,0.000000
1,280125-2216795262304420685,0.250000,0.000000
2,321731-2216795262304420685,0.142857,0.000000
3,34147718200335715773759193,0.111111,0.000000
4,31042658200335715773759193,0.142857,0.057143
...,...,...,...
195,45195-6525030231877560063,0.000000,0.000000
196,17635-2062537961599298253,0.027451,0.007843
197,37613-2062537961599298253,0.000000,0.000000
198,13867-666778652316300543,0.019231,0.019231


In [41]:
ling_features = pd.read_csv('../results/messages_with_features_200.csv.gzip', compression='gzip').drop('Unnamed: 0', axis=1)

In [42]:
ling_features

Unnamed: 0,UID_key,group_or_channel,final_message,final_message_string,preprocessed_message,sent_count,word_count,avg_sent_length,avg_word_length,exclamation_count,question_count,emoji_count,noun_count,verb_count,adj_count,flesch_reading_ease,flesch_reading_ease_class
0,101673-8768690052625427881,group,Das Völkerrecht hat eine Juristische Grundlage...,Das Völkerrecht hat eine Juristische Grundlage...,völkerrechen juristisch Grundlage fast Mensch ...,3,29,9.666667,5.379310,0,0,0,7,3,2,71.90,fairly easy
1,280125-2216795262304420685,group,"Danke, das macht Mut!🙏🏾","Danke, das macht Mut!",danken Mut,2,4,2.000000,4.500000,1,0,1,2,1,0,99.95,very easy
2,321731-2216795262304420685,group,damit er dich besser sperren kann ....,damit er dich besser sperren kann ....,sperren,2,6,3.000000,5.333333,0,0,0,0,1,0,86.25,easy
3,34147718200335715773759193,group,"Darf man Fragen, wie sich 82 Millionen Mensche...","Darf man Fragen, wie sich 82 Millionen Mensche...",fragen 82 Million Mensch verwalten bitte konkr...,3,18,6.000000,6.611111,1,1,0,5,2,1,54.00,fairly difficult
4,31042658200335715773759193,group,Dann ist ihm leider nicht zu helfen! Geht mir...,Dann ist ihm leider nicht zu helfen! Geht mir ...,helfen Mann versuchen Aufklärung betreiben fru...,8,35,4.375000,5.057143,1,0,0,3,8,0,85.25,easy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,45195-6525030231877560063,channel,"<a href=""https://www.epochtimes.de/wirtschaft/...",,,1,1,1.000000,3.000000,0,0,0,0,0,0,,unclassified
196,17635-2062537961599298253,channel,Was zur islamistischen Mordtat von Würzburg zu...,Was zur islamistischen Mordtat von Würzburg zu...,islamistisch Mordtat Würzburg sagen 25. Juni 2...,22,260,11.818182,6.380769,0,0,0,71,23,21,44.55,difficult
197,37613-2062537961599298253,channel,"<a href=""https://www.youtube.com/watch?v=PSp1r...",,,1,1,1.000000,3.000000,0,0,0,0,0,0,,unclassified
198,13867-666778652316300543,channel,Um die Infizierten so rasch wie möglich zu fin...,Um die Infizierten so rasch wie möglich zu fin...,infizierten rasch finden lassen Behörde täglic...,3,57,19.000000,6.649123,0,0,0,15,7,0,19.45,very confusing


In [49]:
#concat liwc_ratios and ling_features based on UID_key
merged = pd.merge(ling_features, liwc_ratios, on='UID_key', how='inner')

In [50]:
merged

Unnamed: 0,UID_key,liwc_Present,liwc_Motion,group_or_channel,final_message,final_message_string,preprocessed_message,sent_count,word_count,avg_sent_length,avg_word_length,exclamation_count,question_count,emoji_count,noun_count,verb_count,adj_count,flesch_reading_ease,flesch_reading_ease_class
0,101673-8768690052625427881,0.068965,0.000000,group,Das Völkerrecht hat eine Juristische Grundlage...,Das Völkerrecht hat eine Juristische Grundlage...,völkerrechen juristisch Grundlage fast Mensch ...,3,29,9.666667,5.379310,0,0,0,7,3,2,71.90,fairly easy
1,280125-2216795262304420685,0.250000,0.000000,group,"Danke, das macht Mut!🙏🏾","Danke, das macht Mut!",danken Mut,2,4,2.000000,4.500000,1,0,1,2,1,0,99.95,very easy
2,321731-2216795262304420685,0.142857,0.000000,group,damit er dich besser sperren kann ....,damit er dich besser sperren kann ....,sperren,2,6,3.000000,5.333333,0,0,0,0,1,0,86.25,easy
3,34147718200335715773759193,0.111111,0.000000,group,"Darf man Fragen, wie sich 82 Millionen Mensche...","Darf man Fragen, wie sich 82 Millionen Mensche...",fragen 82 Million Mensch verwalten bitte konkr...,3,18,6.000000,6.611111,1,1,0,5,2,1,54.00,fairly difficult
4,31042658200335715773759193,0.142857,0.057143,group,Dann ist ihm leider nicht zu helfen! Geht mir...,Dann ist ihm leider nicht zu helfen! Geht mir ...,helfen Mann versuchen Aufklärung betreiben fru...,8,35,4.375000,5.057143,1,0,0,3,8,0,85.25,easy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,45195-6525030231877560063,0.000000,0.000000,channel,"<a href=""https://www.epochtimes.de/wirtschaft/...",,,1,1,1.000000,3.000000,0,0,0,0,0,0,,unclassified
196,17635-2062537961599298253,0.027451,0.007843,channel,Was zur islamistischen Mordtat von Würzburg zu...,Was zur islamistischen Mordtat von Würzburg zu...,islamistisch Mordtat Würzburg sagen 25. Juni 2...,22,260,11.818182,6.380769,0,0,0,71,23,21,44.55,difficult
197,37613-2062537961599298253,0.000000,0.000000,channel,"<a href=""https://www.youtube.com/watch?v=PSp1r...",,,1,1,1.000000,3.000000,0,0,0,0,0,0,,unclassified
198,13867-666778652316300543,0.019231,0.019231,channel,Um die Infizierten so rasch wie möglich zu fin...,Um die Infizierten so rasch wie möglich zu fin...,infizierten rasch finden lassen Behörde täglic...,3,57,19.000000,6.649123,0,0,0,15,7,0,19.45,very confusing


# Sentiment Bert

In [11]:
# Use a pipeline as a high-level helper
from transformers import pipeline

sentiment_model = pipeline(model="aari1995/German_Sentiment")

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [2]:
sentence = ["Ich liebe die Bahn. Pünktlich wie immer ... -.-","Krasser Service"]
result = sentiment_model(sentence)
print(result)
#Output:
#[{'label': 'negative', 'score': 0.4935680031776428},{'label': 'positive', 'score': 0.5790663957595825}]

[{'label': 'negative', 'score': 0.4935676157474518}, {'label': 'positive', 'score': 0.45009028911590576}]


In [13]:
from transformers import AutoTokenizer

# Assuming 'sentiment_model' is already loaded
# Load the tokenizer corresponding to your sentiment model
tokenizer = AutoTokenizer.from_pretrained('aari1995/German_Sentiment')  # Replace 'model_name' with the actual model name

sentiment_aari = []

for message in messages['final_message_string']:
    # Encode the message, truncate to max length of the model, and only keep the input_ids
    inputs = tokenizer.encode(message, return_tensors='pt', max_length=512, truncation=True)
    # Decode back to text string, to feed into the sentiment model as expected
    truncated_message = tokenizer.decode(inputs[0], skip_special_tokens=True)
    result = sentiment_model(truncated_message)
    sentiment_aari.append(result[0]['label'])

messages['sentiment_aari'] = sentiment_aari

In [20]:
import numpy as np
from tqdm import tqdm

In [28]:
pos_sent = []
neg_sent = []
neutral_sent = []

for message in tqdm(messages['final_message_string'], desc = 'Extracting Sentiment'):
    # if message is empty, don't calculate sentiment
    if message == '' or message == 'nan':
        pos_sent.append(np.nan)
        neg_sent.append(np.nan)
        neutral_sent.append(np.nan)
    else:
        # encode & decode message and truncate to max length that model can handle
        result = sentiment_model(message[:512])
        sent = (result[0]['label'])
        if sent == 'positive':
            pos_sent.append(1)
            neg_sent.append(0)
            neutral_sent.append(0)
        elif sent == 'negative':
            pos_sent.append(0)
            neg_sent.append(1)
            neutral_sent.append(0)
        elif sent == 'neutral':
            pos_sent.append(0)
            neg_sent.append(0)
            neutral_sent.append(1)
        else:
            pos_sent.append(np.nan)
            neg_sent.append(np.nan)
            neutral_sent.append(np.nan)

messages['positive_sentiment'] = pos_sent
messages['negative_sentiment'] = neg_sent
messages['neutral_sentiment'] = neutral_sent

Extracting Sentiment: 100%|██████████| 200/200 [02:39<00:00,  1.25it/s]


In [24]:
from tqdm import tqdm
import numpy as np

# Initialize sentiment lists
pos_sent = [np.nan] * len(messages['final_message_string'])
neg_sent = [np.nan] * len(messages['final_message_string'])
neutral_sent = [np.nan] * len(messages['final_message_string'])

# Map sentiment labels to list indices
sentiment_map = {
    'positive': (1, 0, 0),
    'negative': (0, 1, 0),
    'neutral': (0, 0, 1)
}

# Process messages
for idx, message in tqdm(enumerate(messages['final_message_string']), desc='Extracting Sentiment', total=len(messages['final_message_string'])):
    # Skip empty messages
    if message in ('', 'nan'):
        continue

    # Run sentiment analysis
    result = sentiment_model(message[:512])  # Use the pipeline directly with the message text
    sent = result[0]['label']

    # Update sentiment lists
    if sent in sentiment_map:
        pos_sent[idx], neg_sent[idx], neutral_sent[idx] = sentiment_map[sent]

# Assign results back to DataFrame
messages['positive_sentiment'] = pos_sent
messages['negative_sentiment'] = neg_sent
messages['neutral_sentiment'] = neutral_sent

Extracting Sentiment: 100%|██████████| 200/200 [02:47<00:00,  1.19it/s]


In [25]:
for message in tqdm(messages['final_message_string']):
    sentiment = sentiment_model(message[:512])

 53%|█████▎    | 106/200 [02:00<01:46,  1.13s/it]


KeyboardInterrupt: 

In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
text = "Erneuter Streik in der S-Bahn"
model = AutoModelForSequenceClassification.from_pretrained('ssary/XLM-RoBERTa-German-sentiment')
tokenizer = AutoTokenizer.from_pretrained('ssary/XLM-RoBERTa-German-sentiment')
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
with torch.no_grad():
    outputs = model(**inputs)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
sentiment_classes = ['negative', 'neutral', 'positive']
print(sentiment_classes[predictions.argmax()]) # for the class with highest probability
print(predictions) # for each class probability

  from .autonotebook import tqdm as notebook_tqdm


negative
tensor([[0.9619, 0.0131, 0.0250]])


In [3]:
import pandas as pd

In [6]:
messages = pd.read_csv('../data/samples/messages_sample_200.csv.gzip', compression='gzip').drop('Unnamed: 0', axis=1)
messages['final_message_string'] = messages['final_message_string'].astype(str)

In [7]:
# predict sentiment on all messages
sentiment = []
neg_prob = []
neu_prob = []
pos_prob = []

for message in messages['final_message_string']:
    inputs = tokenizer(message, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    sentiment_classes = ['negative', 'neutral', 'positive']
    sentiment.append(sentiment_classes[predictions.argmax()])
    neg_prob.append(predictions[0][0].item())
    neu_prob.append(predictions[0][1].item())
    pos_prob.append(predictions[0][2].item())

messages['sentiment'] = sentiment
messages['neg_prob'] = neg_prob
messages['neu_prob'] = neu_prob
messages['pos_prob'] = pos_prob

In [9]:
#print all messages with their sentiment
for i, row in messages.iterrows():
    print(f'{row["final_message_string"]} - {row["sentiment"]}\n')

Das Völkerrecht hat eine Juristische Grundlage, wie fast alles, was die Menschen so verzapft haben und das hat nichts mit "Rasse" zu tun, sondern mit Recht. Menschen gemachtes Recht...  - neutral

Danke, das macht Mut!  - neutral

damit er dich besser sperren kann ....  - neutral

Darf man Fragen, wie sich 82 Millionen Menschen selbst verwaltet sollen? Bitte konkrete Beschreibung, interessiert mich wirklich!! Danke  - neutral

Dann ist ihm leider nicht zu helfen! Geht mir mit meinem Mann auch so. Habe auch alles versucht, Aufklärung zu betreiben. Fruchtet nicht. Dann werde ich mich nicht mehr bemüht. Habe Monate geredet. Nichts hilft.  - negative

weiß man bis wann ca?  - neutral

hha du bist eben aam hetzen da zeigst du gleich wie verdreht du im kopf bist! HETZTE WIRD GEGEN UNS GEMACHT und wwir zeigen dies alles ds ist nicht hetze aussere in einem totl verdrehten kopf! Bist du so verwirrt ds du nicht mal diesen unterschied erkennen kannst! Aber der fluch deiner mnipulations lüge wirss

In [14]:
messages[['final_message_string', 'sentiment', 'sentiment_aari']]

Unnamed: 0,final_message_string,sentiment,sentiment_aari
0,Das Völkerrecht hat eine Juristische Grundlage...,neutral,negative
1,"Danke, das macht Mut!",neutral,positive
2,damit er dich besser sperren kann ....,neutral,negative
3,"Darf man Fragen, wie sich 82 Millionen Mensche...",neutral,neutral
4,Dann ist ihm leider nicht zu helfen! Geht mir ...,negative,negative
...,...,...,...
195,,neutral,neutral
196,Was zur islamistischen Mordtat von Würzburg zu...,neutral,neutral
197,,neutral,neutral
198,Um die Infizierten so rasch wie möglich zu fin...,neutral,neutral


In [17]:
#print all messages with their sentiment
for i, row in messages.iterrows():
    print(f'{row["final_message_string"]}\nRoberta: {row["sentiment"]}\nAari: {row["sentiment_aari"]}\n', '-'*50)

Das Völkerrecht hat eine Juristische Grundlage, wie fast alles, was die Menschen so verzapft haben und das hat nichts mit "Rasse" zu tun, sondern mit Recht. Menschen gemachtes Recht... 
Roberta: neutral
Aari: negative
 --------------------------------------------------
Danke, das macht Mut! 
Roberta: neutral
Aari: positive
 --------------------------------------------------
damit er dich besser sperren kann .... 
Roberta: neutral
Aari: negative
 --------------------------------------------------
Darf man Fragen, wie sich 82 Millionen Menschen selbst verwaltet sollen? Bitte konkrete Beschreibung, interessiert mich wirklich!! Danke 
Roberta: neutral
Aari: neutral
 --------------------------------------------------
Dann ist ihm leider nicht zu helfen! Geht mir mit meinem Mann auch so. Habe auch alles versucht, Aufklärung zu betreiben. Fruchtet nicht. Dann werde ich mich nicht mehr bemüht. Habe Monate geredet. Nichts hilft. 
Roberta: negative
Aari: negative
 -------------------------------

In [29]:
test = pd.read_csv('../data/samples/messages_sample_200.csv.gzip', compression = 'gzip')

In [30]:
test

Unnamed: 0.1,Unnamed: 0,UID_key,author,group_or_channel,final_message,final_message_string
0,0,101673-8768690052625427881,6.151278e+18,group,Das Völkerrecht hat eine Juristische Grundlage...,Das Völkerrecht hat eine Juristische Grundlage...
1,1,280125-2216795262304420685,-3.199383e+18,group,"Danke, das macht Mut!🙏🏾","Danke, das macht Mut!"
2,2,321731-2216795262304420685,-5.728373e+18,group,damit er dich besser sperren kann ....,damit er dich besser sperren kann ....
3,3,34147718200335715773759193,-3.182208e+18,group,"Darf man Fragen, wie sich 82 Millionen Mensche...","Darf man Fragen, wie sich 82 Millionen Mensche..."
4,4,31042658200335715773759193,-7.008655e+18,group,Dann ist ihm leider nicht zu helfen! Geht mir...,Dann ist ihm leider nicht zu helfen! Geht mir ...
...,...,...,...,...,...,...
195,195,45195-6525030231877560063,-6.525030e+18,channel,"<a href=""https://www.epochtimes.de/wirtschaft/...",
196,196,17635-2062537961599298253,-2.062538e+18,channel,Was zur islamistischen Mordtat von Würzburg zu...,Was zur islamistischen Mordtat von Würzburg zu...
197,197,37613-2062537961599298253,-2.062538e+18,channel,"<a href=""https://www.youtube.com/watch?v=PSp1r...",
198,198,13867-666778652316300543,-6.667787e+17,channel,Um die Infizierten so rasch wie möglich zu fin...,Um die Infizierten so rasch wie möglich zu fin...


In [31]:
channels = pd.read_csv('../data/channel_subsample.csv.gzip', compression = 'gzip')

  channels = pd.read_csv('../data/channel_subsample.csv.gzip', compression = 'gzip')


In [33]:
channels.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'UID_key', 'mid_message', 'mid_file',
       'group_name', 'posting_date', 'message', 'fwd_message',
       'fwd_posting_date_message', 'posting_date_file', 'link_url',
       'media_file', 'media_file_type', 'fwd_posting_date_file',
       'fwd_link_url', 'fwd_media_file', 'fwd_media_file_type', 'author',
       'fwd_author', 'day', 'week', 'weekday', 'message_hash',
       'fwd_message_hash', 'website', 'replied_to', 'year', 'month',
       'day_of_year', 'fwd_urls', 'urls', 'newsguard_scores', 'fwd_regex',
       'fwd_cryptocurrency', 'fwd_cryptolabel', 'regex', 'cryptocurrency',
       'cryptolabel'],
      dtype='object')

# Google Perspective API

In [5]:
import pandas as pd
import numpy as np
from time import sleep
from tqdm import tqdm
import random

from googleapiclient import discovery
import json
from config import API_KEY

In [6]:
client = discovery.build(
"commentanalyzer",
"v1alpha1",
developerKey=API_KEY,
discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
static_discovery=False,
)

In [76]:
df = pd.read_csv('../results/post-aggregation/author_200.csv.gzip', compression = 'gzip')
df['final_message_string'] = df['final_message_string'].astype(str)
df['toxicity'] = 0

In [77]:
def toxicity_detection(sentences, client):
    toxic = []
    for sent in sentences:
        analyze_request = {
            'comment': { 'text': f"{sent}" },
            'languages' : ["de"],
            'requestedAttributes': {'TOXICITY': {}},
        }

        response = client.comments().analyze(body=analyze_request).execute()
        j = json.dumps(response, indent=2)
        #print(json.loads(j)['attributeScores']['TOXICITY']['summaryScore']['value'])
        toxic.append(json.loads(j)['attributeScores']['TOXICITY']['summaryScore']['value'])
    avg = sum(toxic)/len(toxic)
    print(avg)
    return avg

In [78]:
# def toxicity_detection(sentences):
#     toxic = []
#     for sent in sentences:
#         analyze_request = {
#             'comment': { 'text': f"{sent}" },
#             'languages' : ["de"],
#             'requestedAttributes': {'TOXICITY': {}},
#         }

#         response = client.comments().analyze(body=analyze_request).execute()
#         j = json.dumps(response, indent=2)
#         #print(json.loads(j)['attributeScores']['TOXICITY']['summaryScore']['value'])
#         toxic.append(json.loads(j)['attributeScores']['TOXICITY']['summaryScore']['value'])
#     return sum(toxic)/len(toxic)


# # n= 10000
# # list_df = [sample[i:i+n] for i in range(0,len(sample),n)]


# #final_toxic_list = []
# # for df in list_df:
# for i in tqdm(range(len(sample_df))):
#     row = sample_df.iloc[i]
#     #toxic = []
#     if row['toxicity'] == 0: 

#         tmp = [sent.strip() for sent in re.split(r'[.!?]', row.final_message_string) if len(sent.split()) > 5]

#         if (len(tmp) > 100):
#             tmp = random.sample(tmp, 100)
#         if (len(tmp) > 1):
#             row['toxicity'] = toxicity_detection(tmp)

#     sample_df.at[i, 'toxicity'] = row['toxicity']

#     #df.at[i, 'toxicity'] = toxic
#     #final_toxic_list.append(df)

# # con = pd.concat(final_toxic_list)
# # con.to_csv('fa_toxic.csv')

In [80]:
#split df into chunks
n= 20
list_df = [df[i:i+n] for i in range(0,len(df),n)]

#iterate over chunks and rows to extract toxicity score
final_toxic_list = []
for df in list_df:
    for i in tqdm(range(len(df))):
        row = df.iloc[i]
        if row['toxicity'] == 0: 
            #split message into list of sentences to pass to toxicity detection function
            tmp = [sent.strip() for sent in re.split(r'[.!?]', row['final_message_string']) if len(sent.split()) > 5]

            if (len(tmp) > 100):
                tmp = random.sample(tmp, 100)
            #print(tmp)
            if (len(tmp) > 1):
                row['toxicity'] = toxicity_detection(tmp, client)
            else:
                print('no sentence')
        df.at[df.index[i], 'toxicity'] = row['toxicity']
        print('df.at...', df.at[df.index[i], 'toxicity'])
    final_toxic_list.append(df)

#concat chunks
df_after = pd.concat(final_toxic_list)

100%|██████████| 17/17 [00:00<00:00, 1017.24it/s]

df.at... 0.0318543825
no sentence
df.at... 0.0
no sentence
df.at... 0.0
no sentence
df.at... 0.0
no sentence
df.at... 0.0
df.at... 0.045378546
no sentence
df.at... 0.0
df.at... 0.0532950425
df.at... 0.017903083
no sentence
df.at... 0.0
no sentence
df.at... 0.0
no sentence
df.at... 0.0
no sentence
df.at... 0.0
no sentence
df.at... 0.0
no sentence
df.at... 0.0
no sentence
df.at... 0.0
no sentence
df.at... 0.0





In [89]:
df[df['final_message_string'] == 'nan']

Unnamed: 0,author,noun_count,verb_count,adj_count,positive_sentiment,negative_sentiment,neutral_sentiment,channel_messages,group_messages,sent_count,word_count,avg_sent_length,avg_word_length,exclamation_count,question_count,emoji_count,flesch_reading_ease,liwc_I,liwc_We,liwc_You,liwc_Other,liwc_Affect,final_message,final_message_string,own_message,forwarded_message,total_message_count,own_message_count,forwarded_message_count,action_quotient,sentiment_quotient,avg_flesch_reading_ease_class,toxicity,count
91,-8.502653e+18,,,,,,,,,,,,,,,,,,,,,,,,0,0,1,0,0,,,unclassified,0.0,4
92,-6.52503e+18,,,,,,,,,,,,,,,,,,,,,,,,0,1,48,0,48,,,unclassified,0.0,4
93,-2.062538e+18,,,,,,,,,,,,,,,,,,,,,,,,0,1,27,0,27,,,unclassified,0.0,4
94,-1.145969e+18,,,,,,,,,,,,,,,,,,,,,,,,0,1,14,0,14,,,unclassified,0.0,4
95,-6.667787e+17,,,,,,,,,,,,,,,,,,,,,,,,0,1,9,0,9,,,unclassified,0.0,4
96,5.795324e+18,,,,,,,,,,,,,,,,,,,,,,,,0,1,2,0,2,,,unclassified,0.0,4


In [58]:
final_toxic_list

[          author  noun_count  verb_count  adj_count  positive_sentiment  \
 0  -9.077298e+18         0.0         1.0        0.0                 1.0   
 1  -8.990367e+18         3.0         1.0        1.0                 0.0   
 2  -8.489960e+18         6.0         0.0        0.0                 0.0   
 3  -8.398202e+18         1.0         2.0        0.0                 0.0   
 4  -8.315076e+18         1.0         1.0        0.0                 0.0   
 5  -8.227480e+18         0.0         0.0        0.0                 0.0   
 6  -7.645288e+18         0.0         0.0        0.0                 0.0   
 7  -7.599929e+18         1.0         0.0        0.0                 0.0   
 8  -7.575311e+18         9.0         9.0        6.0                 0.0   
 9  -7.364414e+18         0.0         2.0        0.0                 1.0   
 10 -7.348419e+18        84.0        60.0       14.0                 0.0   
 11 -7.340301e+18         1.0         1.0        1.0                 1.0   
 12 -7.15067

In [61]:
len(df_after[df_after['toxicity'] != 0])

20

In [62]:
len(df_after)

97

In [92]:
results = pd.read_csv('../results/post-aggregation/author_200.csv.gzip', compression = 'gzip')

In [93]:
results[results['toxicity'] != 0]

Unnamed: 0,author,noun_count,verb_count,adj_count,positive_sentiment,negative_sentiment,neutral_sentiment,channel_messages,group_messages,sent_count,word_count,avg_sent_length,avg_word_length,exclamation_count,question_count,emoji_count,flesch_reading_ease,liwc_I,liwc_We,liwc_You,liwc_Other,liwc_Affect,final_message,final_message_string,own_message,forwarded_message,total_message_count,own_message_count,forwarded_message_count,action_quotient,sentiment_quotient,avg_flesch_reading_ease_class,toxicity


# Forwarded Messages

In [67]:
sample_df.describe()

Unnamed: 0.1,Unnamed: 0,author,own_message,forwarded_message,toxicity
count,200.0,200.0,200.0,200.0,200.0
mean,99.5,2.965873e+18,0.995,0.0,0.022351
std,57.879185,5.250734e+18,0.070711,0.0,0.068442
min,0.0,-9.077298e+18,0.0,0.0,0.0
25%,49.75,-6.599206e+17,1.0,0.0,0.0
50%,99.5,6.712571e+18,1.0,0.0,0.0
75%,149.25,6.712571e+18,1.0,0.0,0.0
max,199.0,8.791341e+18,1.0,0.0,0.469067


In [69]:
sample_df[sample_df['forwarded_message'] == 1]

Unnamed: 0.1,Unnamed: 0,UID_key,author,date,group_or_channel,own_message,forwarded_message,final_message,final_message_string,toxicity


In [7]:
new_sample = pd.read_csv('../data/samples/messages_sample_200.csv.gzip', compression = 'gzip')

In [8]:
new_sample

Unnamed: 0.1,Unnamed: 0,UID_key,author,fwd_message,date,group_or_channel,own_message,forwarded_message,fwd_message_string,final_message,final_message_string
0,0,101673-8768690052625427881,6.151278e+18,,2020.0-7.0,group,1,0,,Das Völkerrecht hat eine Juristische Grundlage...,Das Völkerrecht hat eine Juristische Grundlage...
1,1,280125-2216795262304420685,-3.199383e+18,,2021.0-2.0,group,1,0,,"Danke, das macht Mut!🙏🏾","Danke, das macht Mut!"
2,2,321731-2216795262304420685,-5.728373e+18,,2021.0-3.0,group,1,0,,damit er dich besser sperren kann ....,damit er dich besser sperren kann ....
3,3,34147718200335715773759193,-3.182208e+18,,2021.0-7.0,group,1,0,,"Darf man Fragen, wie sich 82 Millionen Mensche...","Darf man Fragen, wie sich 82 Millionen Mensche..."
4,4,31042658200335715773759193,-7.008655e+18,,2021.0-6.0,group,1,0,,Dann ist ihm leider nicht zu helfen! Geht mir...,Dann ist ihm leider nicht zu helfen! Geht mir ...
...,...,...,...,...,...,...,...,...,...,...,...
195,195,45195-6525030231877560063,-6.525030e+18,"<a href=""https://www.epochtimes.de/wirtschaft/...",2021.0-5.0,channel,0,1,,,
196,196,17635-2062537961599298253,-2.062538e+18,Was zur islamistischen Mordtat von Würzburg zu...,2021.0-6.0,channel,0,1,Was zur islamistischen Mordtat von Würzburg zu...,,
197,197,37613-2062537961599298253,-2.062538e+18,"<a href=""https://www.youtube.com/watch?v=PSp1r...",2021.0-10.0,channel,0,1,,,
198,198,13867-666778652316300543,-6.667787e+17,Um die Infizierten so rasch wie möglich zu fin...,2020.0-6.0,channel,0,1,Um die Infizierten so rasch wie möglich zu fin...,,


In [9]:
new_sample[new_sample['forwarded_message'] == 1]

Unnamed: 0.1,Unnamed: 0,UID_key,author,fwd_message,date,group_or_channel,own_message,forwarded_message,fwd_message_string,final_message,final_message_string
100,100,44000-6525030231877560063,-6.525030e+18,Cybercrime: Erpresser geben Irland Entschlüsse...,2021.0-5.0,channel,0,1,Cybercrime: Erpresser geben Irland Entschlüsse...,,
101,101,71065-6525030231877560063,-6.525030e+18,Die Delta-Panikmache hält weiter an. Österreic...,2021.0-7.0,channel,0,1,Die Delta-Panikmache hält weiter an. Österreic...,,
102,102,50235-2062537961599298253,-2.062538e+18,"Hallo liebe Arwen 🥰, heute wieder eine TV Info...",2021.0-12.0,channel,0,1,"Hallo liebe Arwen , heute wieder eine TV Info....",,
103,103,26268-2062537961599298253,-2.062538e+18,<strong>Was WIRKLICH auf Epstein Island oder „...,2021.0-8.0,channel,0,1,Was WIRKLICH auf Epstein Island oder „Orgy Isl...,,
104,104,36435-6525030231877560063,-6.525030e+18,Nach uns die Sintflut <strong>STAATSHAUS...,2021.0-5.0,channel,0,1,Nach uns die Sintflut STAATSHAUSHALT UND...,,
...,...,...,...,...,...,...,...,...,...,...,...
195,195,45195-6525030231877560063,-6.525030e+18,"<a href=""https://www.epochtimes.de/wirtschaft/...",2021.0-5.0,channel,0,1,,,
196,196,17635-2062537961599298253,-2.062538e+18,Was zur islamistischen Mordtat von Würzburg zu...,2021.0-6.0,channel,0,1,Was zur islamistischen Mordtat von Würzburg zu...,,
197,197,37613-2062537961599298253,-2.062538e+18,"<a href=""https://www.youtube.com/watch?v=PSp1r...",2021.0-10.0,channel,0,1,,,
198,198,13867-666778652316300543,-6.667787e+17,Um die Infizierten so rasch wie möglich zu fin...,2020.0-6.0,channel,0,1,Um die Infizierten so rasch wie möglich zu fin...,,


In [19]:
new_sample['final_message_string'] = new_sample['final_message_string'].astype(str)
new_sample['final_message'] = new_sample['final_message'].astype(str)

In [15]:
new_sample['sent_count'] = new_sample['final_message_string'].apply(lambda x: len(re.split(r'[.!?]+', x)) if x != '' and x != 'nan' else 0)

In [20]:
new_sample['question_count'] = new_sample['final_message_string'].apply(lambda x: len(re.findall(r'\?+', x)) if x else 0)
#num emojis 
new_sample['emoji_count'] = new_sample['final_message'].apply(lambda x: count_emojis(x) if x else 0)

In [27]:
from tqdm import tqdm
from transformers import AutoTokenizer, pipeline

  from .autonotebook import tqdm as notebook_tqdm
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [24]:
nouns = []
verbs = []
adjectives = []


for message in tqdm(new_sample['final_message_string'], desc = 'Extracting POS Tag counts'):
        noun, verb, adj = count_pos_tags(message)
        nouns.append(noun)
        verbs.append(verb)
        adjectives.append(adj)
                        
new_sample['noun_count'] = nouns
new_sample['verb_count'] = verbs
new_sample['adj_count'] = adjectives

Extracting POS Tag counts: 100%|██████████| 200/200 [00:01<00:00, 148.59it/s]


In [25]:
new_sample

Unnamed: 0.1,Unnamed: 0,UID_key,author,fwd_message,date,group_or_channel,own_message,forwarded_message,fwd_message_string,final_message,final_message_string,sent_count,question_count,emoji_count,noun_count,verb_count,adj_count
0,0,101673-8768690052625427881,6.151278e+18,,2020.0-7.0,group,1,0,,Das Völkerrecht hat eine Juristische Grundlage...,Das Völkerrecht hat eine Juristische Grundlage...,3,0,0,7,3,2
1,1,280125-2216795262304420685,-3.199383e+18,,2021.0-2.0,group,1,0,,"Danke, das macht Mut!🙏🏾","Danke, das macht Mut!",2,0,1,2,1,0
2,2,321731-2216795262304420685,-5.728373e+18,,2021.0-3.0,group,1,0,,damit er dich besser sperren kann ....,damit er dich besser sperren kann ....,2,0,0,0,1,0
3,3,34147718200335715773759193,-3.182208e+18,,2021.0-7.0,group,1,0,,"Darf man Fragen, wie sich 82 Millionen Mensche...","Darf man Fragen, wie sich 82 Millionen Mensche...",3,1,0,5,2,1
4,4,31042658200335715773759193,-7.008655e+18,,2021.0-6.0,group,1,0,,Dann ist ihm leider nicht zu helfen! Geht mir...,Dann ist ihm leider nicht zu helfen! Geht mir ...,8,0,0,3,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,195,45195-6525030231877560063,-6.525030e+18,"<a href=""https://www.epochtimes.de/wirtschaft/...",2021.0-5.0,channel,0,1,,,,0,0,0,0,0,0
196,196,17635-2062537961599298253,-2.062538e+18,Was zur islamistischen Mordtat von Würzburg zu...,2021.0-6.0,channel,0,1,Was zur islamistischen Mordtat von Würzburg zu...,,,0,0,0,0,0,0
197,197,37613-2062537961599298253,-2.062538e+18,"<a href=""https://www.youtube.com/watch?v=PSp1r...",2021.0-10.0,channel,0,1,,,,0,0,0,0,0,0
198,198,13867-666778652316300543,-6.667787e+17,Um die Infizierten so rasch wie möglich zu fin...,2020.0-6.0,channel,0,1,Um die Infizierten so rasch wie möglich zu fin...,,,0,0,0,0,0,0


# Aggregation

In [42]:
sample_size = 200
pre_agg = pd.read_csv(f'../results/pre-aggregation/liwcANDfeatures_results_{sample_size}.csv.gzip', compression='gzip')

In [44]:
pre_agg = pd.get_dummies(pre_agg, columns=['group_or_channel', 'flesch_reading_ease_class'])

In [45]:
pre_agg.columns

Index(['UID_key', 'author', 'fwd_message', 'date', 'own_message',
       'forwarded_message', 'group_name', 'fwd_message_string',
       'final_message', 'final_message_string', 'sent_count', 'word_count',
       'avg_sent_length', 'avg_word_length', 'exclamation_count',
       'question_count', 'emoji_count', 'noun_count', 'verb_count',
       'adj_count', 'flesch_reading_ease', 'positive_sentiment',
       'negative_sentiment', 'neutral_sentiment', 'liwc_I', 'liwc_We',
       'liwc_You', 'liwc_Other', 'liwc_Affect', 'group_or_channel_channel',
       'group_or_channel_group', 'flesch_reading_ease_class_difficult',
       'flesch_reading_ease_class_easy',
       'flesch_reading_ease_class_fairly difficult',
       'flesch_reading_ease_class_fairly easy',
       'flesch_reading_ease_class_standard',
       'flesch_reading_ease_class_unclassified',
       'flesch_reading_ease_class_very confusing',
       'flesch_reading_ease_class_very easy'],
      dtype='object')

In [52]:
# Aggregation dictionary
agg_dict = {
    # COUNT
    'UID_key': 'count',

    # SUM
    'own_message': 'sum',
    'forwarded_message': 'sum',
    'noun_count': 'sum',
    'verb_count': 'sum',
    'adj_count': 'sum',
    'positive_sentiment': 'sum',
    'negative_sentiment': 'sum',
    'neutral_sentiment': 'sum',
    'group_or_channel_channel': 'sum',
    'group_or_channel_group': 'sum',
    'flesch_reading_ease_class_difficult': 'sum',
    'flesch_reading_ease_class_easy': 'sum',
    'flesch_reading_ease_class_fairly difficult': 'sum',
    'flesch_reading_ease_class_fairly easy': 'sum',
    'flesch_reading_ease_class_standard': 'sum',
    'flesch_reading_ease_class_unclassified': 'sum',
    'flesch_reading_ease_class_very confusing': 'sum',
    'flesch_reading_ease_class_very easy': 'sum',

    # AVG
    'sent_count': 'mean',
    'word_count': 'mean',
    'avg_sent_length': 'mean',
    'avg_word_length': 'mean',
    'exclamation_count': 'mean',
    'question_count': 'mean',
    'emoji_count': 'mean',
    'flesch_reading_ease': 'mean',
    'liwc_I': 'mean',
    'liwc_We': 'mean',
    'liwc_You': 'mean',
    'liwc_Other': 'mean',
    'liwc_Affect': 'mean',
    
    # ' '.JOIN
    'fwd_message': lambda x: ' '.join(x.dropna().astype(str)),
    'fwd_message_string': lambda x: ' '.join(x.dropna().astype(str)),
    'final_message': lambda x: ' '.join(x.dropna().astype(str)),
    'final_message_string': lambda x: ' '.join(x.dropna().astype(str)),
}

In [53]:
#dict for aggregatopn
agg = pre_agg.groupby(['author', 'date']).agg(agg_dict)

In [57]:
agg

Unnamed: 0_level_0,Unnamed: 1_level_0,UID_key,own_message,forwarded_message,noun_count,verb_count,adj_count,positive_sentiment,negative_sentiment,neutral_sentiment,group_or_channel_channel,group_or_channel_group,flesch_reading_ease_class_difficult,flesch_reading_ease_class_easy,flesch_reading_ease_class_fairly difficult,flesch_reading_ease_class_fairly easy,flesch_reading_ease_class_standard,flesch_reading_ease_class_unclassified,flesch_reading_ease_class_very confusing,flesch_reading_ease_class_very easy,sent_count,word_count,avg_sent_length,avg_word_length,exclamation_count,question_count,emoji_count,flesch_reading_ease,liwc_I,liwc_We,liwc_You,liwc_Other,liwc_Affect,fwd_message,fwd_message_string,final_message,final_message_string
author,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
-9.077298e+18,2021.0-2.0,1,1,0,0,1,0,1.0,0.0,0.0,0,1,0,0,0,0,0,0,0,1,1.0,5.0,5.000000,4.200000,0.0,0.0,1.0,93.10,0.200000,0.000000,0.000000,0.000000,0.200000,,,Würde ich auch gut finden 👍🏻,Würde ich auch gut finden
-8.990367e+18,2021.0-3.0,1,1,0,3,1,1,0.0,0.0,1.0,0,1,0,0,0,0,0,1,0,0,2.0,9.0,4.500000,11.666667,0.0,0.0,0.0,-27.91,0.000000,0.000000,0.000000,0.000000,0.000000,,,<strong>VOLLSTRECKUNGSBEAMTER WIRD AUFGEKLÄRT<...,VOLLSTRECKUNGSBEAMTER WIRD AUFGEKLÄRT Ein soge...
-8.502653e+18,2020.0-4.0,1,0,0,20,12,5,0.0,0.0,1.0,0,1,0,1,0,0,0,0,0,0,21.0,122.0,5.809524,4.868852,1.0,0.0,0.0,85.85,0.040984,0.016393,0.008197,0.016393,0.040984,,,Der Papst ist Jesuit. Wenn man das mal anscha...,Der Papst ist Jesuit. Wenn man das mal anscha...
-8.489960e+18,2021.0-8.0,1,1,0,6,0,0,0.0,0.0,1.0,0,1,0,1,0,0,0,0,0,0,2.0,17.0,8.500000,4.294118,0.0,1.0,0.0,83.75,0.058824,0.000000,0.000000,0.000000,0.058824,,,"Ich hab mal eine Frage, wo ist Morgen das Tref...","Ich hab mal eine Frage, wo ist Morgen das Tref..."
-8.398202e+18,2020.0-11.0,1,1,0,1,2,0,0.0,0.0,1.0,0,1,0,0,0,0,1,0,0,0,1.0,11.0,11.000000,5.181818,0.0,0.0,0.0,69.55,0.000000,0.090909,0.000000,0.000000,0.181818,,,"Egal wie, alle zusammen schaffen wir das, dies...","Egal wie, alle zusammen schaffen wir das, dies..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8.223635e+18,2021.0-5.0,1,1,0,0,0,0,1.0,0.0,0.0,0,1,0,0,0,0,1,0,0,0,1.0,1.0,1.000000,5.000000,0.0,0.0,0.0,62.00,0.000000,0.000000,0.000000,0.000000,1.000000,,,Genau,Genau
8.337123e+18,2020.0-4.0,1,1,0,2,4,0,0.0,1.0,0.0,0,1,0,1,0,0,0,0,0,0,4.0,24.0,6.000000,4.666667,0.0,0.0,0.0,84.25,0.083333,0.000000,0.041667,0.041667,0.083333,,,Wenn du das sagst... Ich lese in meinen Zeilen...,Wenn du das sagst... Ich lese in meinen Zeilen...
8.366284e+18,2021.0-3.0,1,1,0,1,1,0,0.0,1.0,0.0,0,1,0,0,0,0,0,0,0,1,3.0,12.0,4.000000,3.750000,0.0,0.0,0.0,97.95,0.083333,0.000000,0.000000,0.166667,0.000000,,,Ja meine Frau wird es kennen. Es ist auch in W...,Ja meine Frau wird es kennen. Es ist auch in W...
8.681116e+18,2020.0-11.0,1,1,0,3,6,1,0.0,1.0,0.0,0,1,1,0,0,0,0,0,0,0,1.0,38.0,38.000000,5.157895,0.0,0.0,1.0,36.70,0.052632,0.000000,0.000000,0.000000,0.052632,,,Gestern hab ich erklär Bär gespielt keiner wol...,Gestern hab ich erklär Bär gespielt keiner wol...


# Parallelization

In [19]:
def analysis(df):    
    df['sent_count'] = df['final_message_string'].apply(lambda x: len(re.split(r'[.!?]+', x)) if x != '' and x != 'nan' else 0)
    #num words
    df['word_count'] = df['final_message_string'].apply(lambda x: len(re.findall(r'\w+', x)) if x != '' and x != 'nan' else 0)
    #avg sentence length (words per sentence)
    df['avg_sent_length'] = df.apply(lambda row: row['word_count'] / row['sent_count'] if row['sent_count'] > 0 else 0, axis=1)
    #avg word length (characters per word)
    df['avg_word_length'] = df.apply(lambda row: len(row['final_message_string'].replace(' ', '')) / row['word_count'] if row['word_count'] > 0 else 0, axis=1)
    #num exclamations (multiple ! coutn as one exclamation)
    df['exclamation_count'] = df['final_message_string'].apply(lambda x: len(re.findall(r'!+', x)) if x else 0)
    #num questions (multiple ? count as one question)
    df['question_count'] = df['final_message_string'].apply(lambda x: len(re.findall(r'\?+', x)) if x else 0)
    #num emojis 
    df['emoji_count'] = df['final_message'].apply(lambda x: count_emojis(x) if x else 0)

    print('Simple count based features extracted.')

    ########## COUNT OF SELECTED POS TAGS ##########

    #count nouns, verbs and adj
    nouns = []
    verbs = []
    adjectives = []

    for message in tqdm(df['final_message_string'], desc = 'Extracting POS Tag counts'):
        noun, verb, adj = count_pos_tags(message)
        nouns.append(noun)
        verbs.append(verb)
        adjectives.append(adj)
                        
    df['noun_count'] = nouns
    df['verb_count'] = verbs
    df['adj_count'] = adjectives

    ########## FLESCH READING EASE SCORE ##########

    textstat.set_lang('de')
    #compute Flesch Reading Ease score on non-empty df
    df['flesch_reading_ease'] = df['final_message_string'].apply(lambda x: textstat.flesch_reading_ease(x) if x.strip() != '' and x != 'nan' else np.nan)

    #classify scores based on: https://pypi.org/project/textstat/
    flesch_classes = []
    for score in df['flesch_reading_ease']:
        if score >= 0 and score < 30:
            flesch_classes.append('very confusing')
        elif score >= 30 and score < 50:
            flesch_classes.append('difficult')
        elif score >= 50 and score < 60:
            flesch_classes.append('fairly difficult')
        elif score >=60 and score < 70:
            flesch_classes.append('standard')
        elif score >=70 and score < 80:
            flesch_classes.append('fairly easy')
        elif score >=80 and score < 90:
            flesch_classes.append('easy')
        elif score >=90 and score < 101:
            flesch_classes.append('very easy')
        else:
            flesch_classes.append('unclassified')
        
    df['flesch_reading_ease_class'] = flesch_classes

    print('Flesch Reading Ease score extracted.')

    ########## SENTIMENT ANALYSIS ##########

    #load tokenizer and sentiment model
    print('Loading sentiment model...')
    sentiment_model = pipeline(model='aari1995/German_Sentiment')
    tokenizer = AutoTokenizer.from_pretrained('aari1995/German_Sentiment')  

    pos_sent = []
    neg_sent = []
    neutral_sent = []

    for message in tqdm(df['final_message_string'], desc = 'Extracting Sentiment'):
        #if message is empty, don't calculate sentiment
        if message == '' or message == 'nan':
            pos_sent.append(np.nan)
            neg_sent.append(np.nan)
            neutral_sent.append(np.nan)
        else:
            #truncate message to max length model can handle
            result = sentiment_model(message[:512])
            sent = (result[0]['label'])
            if sent == 'positive':
                pos_sent.append(1)
                neg_sent.append(0)
                neutral_sent.append(0)
            elif sent == 'negative':
                pos_sent.append(0)
                neg_sent.append(1)
                neutral_sent.append(0)
            elif sent == 'neutral':
                pos_sent.append(0)
                neg_sent.append(0)
                neutral_sent.append(1)
            else:
                pos_sent.append(np.nan)
                neg_sent.append(np.nan)
                neutral_sent.append(np.nan)

    df['positive_sentiment'] = pos_sent
    df['negative_sentiment'] = neg_sent
    df['neutral_sentiment'] = neutral_sent
    print('Sentiment extracted.')

    return df

In [20]:
def pool_cluster_metrics(n_cores, network_dict_list):
    rep_list = []

    pool = Pool(n_cores)

    for result in tqdm(
        pool.imap_unordered(func=calculate_cluster_results, iterable=network_dict_list),
        total=len(network_dict_list)
        ):
            rep_list.append(result)

    pool.close()
    return rep_list

In [21]:
#split the dataframe into n_cores parts and return list of dicts
def split_df(n_cores, df):
    df_list = np.array_split(df, n_cores)
    return [df_part.to_dict('records') for df_part in df_list]

In [22]:
n_cores=4
df_list = split_df(4, new_sample)

NameError: name 'new_sample' is not defined

In [None]:
final = pool_cluster_metrics(n_cores, df_list)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

KeyboardInterrupt: 

# Post-Aggregation Features

In [None]:
df = pd.read_csv(f'../data/aggregated/author_date_{sample_size}.csv.gzip', compression='gzip')

In [None]:
df= df[df['message_count'] > 1] 

In [None]:
df.head(5)

Unnamed: 0,author,date,message_count,own_message,forwarded_message,noun_count,verb_count,adj_count,positive_sentiment,negative_sentiment,neutral_sentiment,group_or_channel_channel,group_or_channel_group,flesch_reading_ease_class_difficult,flesch_reading_ease_class_easy,flesch_reading_ease_class_fairly difficult,flesch_reading_ease_class_fairly easy,flesch_reading_ease_class_standard,flesch_reading_ease_class_unclassified,flesch_reading_ease_class_very confusing,flesch_reading_ease_class_very easy,sent_count,word_count,avg_sent_length,avg_word_length,exclamation_count,question_count,emoji_count,flesch_reading_ease,liwc_I,liwc_We,liwc_You,liwc_Other,liwc_Affect,fwd_message,fwd_message_string,final_message,final_message_string
20,-6.52503e+18,2021.0-2.0,4,0,4,0,0,0,0.0,0.0,0.0,4,0,0,0,0,0,0,4,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,"<a href=""https://www.epochtimes.de/politik/aus...","‼ Erfurt, Thüringen ‼ Netzfund Krasser Be...",,
21,-6.52503e+18,2021.0-3.0,3,0,3,0,0,0,0.0,0.0,0.0,3,0,0,0,0,0,0,3,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,Prepare. Kek. I feel like I was born for this....,Prepare. Kek. I feel like I was born for this....,,
22,-6.52503e+18,2021.0-4.0,4,0,4,0,0,0,0.0,0.0,0.0,4,0,0,0,0,0,0,4,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,<strong>Ehemaliger Bundeswehrsoldat packt aus!...,Ehemaliger Bundeswehrsoldat packt aus! Stateme...,,
23,-6.52503e+18,2021.0-5.0,10,0,10,0,0,0,0.0,0.0,0.0,10,0,0,0,0,0,0,10,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,Cybercrime: Erpresser geben Irland Entschlüsse...,Cybercrime: Erpresser geben Irland Entschlüsse...,,
24,-6.52503e+18,2021.0-6.0,10,0,10,0,0,0,0.0,0.0,0.0,10,0,0,0,0,0,0,10,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,<strong>Fragen wie diese sind so langweilig. A...,Fragen wie diese sind so langweilig. Antwort: ...,,


In [None]:
# List of count columns to convert to percentages
count_columns = [
    'own_message',
    'forwarded_message',
    'positive_sentiment',
    'negative_sentiment',
    'neutral_sentiment',
    'group_or_channel_channel',
    'group_or_channel_group',
    'flesch_reading_ease_class_difficult',
    'flesch_reading_ease_class_easy',
    'flesch_reading_ease_class_fairly difficult',
    'flesch_reading_ease_class_fairly easy',
    'flesch_reading_ease_class_standard',
    'flesch_reading_ease_class_unclassified',
    'flesch_reading_ease_class_very confusing',
    'flesch_reading_ease_class_very easy'
]

# Convert counts to percentages row by row
for index, row in df.iterrows():
    for col in count_columns:
        df.at[index, col] = row[col] / row['message_count']

In [None]:
df.head()

Unnamed: 0,author,date,message_count,own_message,forwarded_message,noun_count,verb_count,adj_count,positive_sentiment,negative_sentiment,neutral_sentiment,group_or_channel_channel,group_or_channel_group,flesch_reading_ease_class_difficult,flesch_reading_ease_class_easy,flesch_reading_ease_class_fairly difficult,flesch_reading_ease_class_fairly easy,flesch_reading_ease_class_standard,flesch_reading_ease_class_unclassified,flesch_reading_ease_class_very confusing,flesch_reading_ease_class_very easy,sent_count,word_count,avg_sent_length,avg_word_length,exclamation_count,question_count,emoji_count,flesch_reading_ease,liwc_I,liwc_We,liwc_You,liwc_Other,liwc_Affect,fwd_message,fwd_message_string,final_message,final_message_string
20,-6.52503e+18,2021.0-2.0,4,0,1,0,0,0,0.0,0.0,0.0,1,0,0,0,0,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,"<a href=""https://www.epochtimes.de/politik/aus...","‼ Erfurt, Thüringen ‼ Netzfund Krasser Be...",,
21,-6.52503e+18,2021.0-3.0,3,0,1,0,0,0,0.0,0.0,0.0,1,0,0,0,0,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,Prepare. Kek. I feel like I was born for this....,Prepare. Kek. I feel like I was born for this....,,
22,-6.52503e+18,2021.0-4.0,4,0,1,0,0,0,0.0,0.0,0.0,1,0,0,0,0,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,<strong>Ehemaliger Bundeswehrsoldat packt aus!...,Ehemaliger Bundeswehrsoldat packt aus! Stateme...,,
23,-6.52503e+18,2021.0-5.0,10,0,1,0,0,0,0.0,0.0,0.0,1,0,0,0,0,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,Cybercrime: Erpresser geben Irland Entschlüsse...,Cybercrime: Erpresser geben Irland Entschlüsse...,,
24,-6.52503e+18,2021.0-6.0,10,0,1,0,0,0,0.0,0.0,0.0,1,0,0,0,0,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,<strong>Fragen wie diese sind so langweilig. A...,Fragen wie diese sind so langweilig. Antwort: ...,,


In [None]:
test = pd.read_csv('../results/post-aggregation/author_date_200.csv.gzip', compression='gzip')

In [None]:
test[test['message_count'] > 1]

Unnamed: 0,author,date,message_count,own_message,forwarded_message,noun_count,verb_count,adj_count,positive_sentiment,negative_sentiment,neutral_sentiment,channel_messages,group_messages,flesch_reading_ease_class_difficult,flesch_reading_ease_class_easy,flesch_reading_ease_class_fairly difficult,flesch_reading_ease_class_fairly easy,flesch_reading_ease_class_standard,flesch_reading_ease_class_unclassified,flesch_reading_ease_class_very confusing,flesch_reading_ease_class_very easy,sent_count,word_count,avg_sent_length,avg_word_length,exclamation_count,question_count,emoji_count,flesch_reading_ease,liwc_I,liwc_We,liwc_You,liwc_Other,liwc_Affect,fwd_message,fwd_message_string,final_message,final_message_string,action_quotient,toxicity
20,-6.52503e+18,2021.0-2.0,4,0,1,0,0,0,0.0,0.0,0.0,1,0,0,0,0,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,"<a href=""https://www.epochtimes.de/politik/aus...","‼ Erfurt, Thüringen ‼ Netzfund Krasser Be...",,,,0
21,-6.52503e+18,2021.0-3.0,3,0,1,0,0,0,0.0,0.0,0.0,1,0,0,0,0,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,Prepare. Kek. I feel like I was born for this....,Prepare. Kek. I feel like I was born for this....,,,,0
22,-6.52503e+18,2021.0-4.0,4,0,1,0,0,0,0.0,0.0,0.0,1,0,0,0,0,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,<strong>Ehemaliger Bundeswehrsoldat packt aus!...,Ehemaliger Bundeswehrsoldat packt aus! Stateme...,,,,0
23,-6.52503e+18,2021.0-5.0,10,0,1,0,0,0,0.0,0.0,0.0,1,0,0,0,0,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,Cybercrime: Erpresser geben Irland Entschlüsse...,Cybercrime: Erpresser geben Irland Entschlüsse...,,,,0
24,-6.52503e+18,2021.0-6.0,10,0,1,0,0,0,0.0,0.0,0.0,1,0,0,0,0,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,<strong>Fragen wie diese sind so langweilig. A...,Fragen wie diese sind so langweilig. Antwort: ...,,,,0
25,-6.52503e+18,2021.0-7.0,12,0,1,0,0,0,0.0,0.0,0.0,1,0,0,0,0,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,Die Delta-Panikmache hält weiter an. Österreic...,Die Delta-Panikmache hält weiter an. Österreic...,,,,0
26,-6.52503e+18,2021.0-8.0,4,0,1,0,0,0,0.0,0.0,0.0,1,0,0,0,0,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,"Joe Biden - der erste in der Geschichte, der m...","Joe Biden - der erste in der Geschichte, der m...",,,,0
50,-2.062538e+18,2021.0-10.0,6,0,1,0,0,0,0.0,0.0,0.0,1,0,0,0,0,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,<strong>Türkei droht Rauswurf aus dem Europara...,Türkei droht Rauswurf aus dem Europarat - 27.1...,,,,0
51,-2.062538e+18,2021.0-11.0,4,0,1,0,0,0,0.0,0.0,0.0,1,0,0,0,0,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,"<a href=""t.me/fufmedia"">t.me/fufmedia</a> <a h...",Will Ungekürzte Version Das ist eine BOMBE -...,,,,0
52,-2.062538e+18,2021.0-12.0,5,0,1,0,0,0,0.0,0.0,0.0,1,0,0,0,0,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,"Hallo liebe Arwen 🥰, heute wieder eine TV Info...","Hallo liebe Arwen , heute wieder eine TV Info....",,,,0


# Parallelization - Again

In [23]:
df = pd.read_csv('../data/samples/messages_sample_200.csv.gzip', compression='gzip')
df_non = df.copy()

In [24]:
mp.cpu_count()

8

In [25]:
def analysis(df):    
    ########## FEATURE EXTRACTION ##########

    #num sentences
    df['sent_count'] = df['final_message_string'].apply(lambda x: len(re.split(r'[.!?]+', x)) if x != '' and x != 'nan' else 0)
    #num words
    df['word_count'] = df['final_message_string'].apply(lambda x: len(re.findall(r'\w+', x)) if x != '' and x != 'nan' else 0)
    #avg sentence length (words per sentence)
    df['avg_sent_length'] = df.apply(lambda row: row['word_count'] / row['sent_count'] if row['sent_count'] > 0 else 0, axis=1)
    #avg word length (characters per word)
    df['avg_word_length'] = df.apply(lambda row: len(row['final_message_string'].replace(' ', '')) / row['word_count'] if row['word_count'] > 0 else 0, axis=1)
    #num exclamations (multiple ! coutn as one exclamation)
    df['exclamation_count'] = df['final_message_string'].apply(lambda x: len(re.findall(r'!+', x)) if x else 0)
    #num questions (multiple ? count as one question)
    df['question_count'] = df['final_message_string'].apply(lambda x: len(re.findall(r'\?+', x)) if x else 0)
    #num emojis 
    df['emoji_count'] = df['final_message'].apply(lambda x: count_emojis(x) if x else 0)

    print('Simple count based features extracted.')

    ########## COUNT OF SELECTED POS TAGS ##########

    #count nouns, verbs and adj
    nouns = []
    verbs = []
    adjectives = []

    for message in tqdm(df['final_message_string'], desc = 'Extracting POS Tag counts'):
        noun, verb, adj = count_pos_tags(message)
        nouns.append(noun)
        verbs.append(verb)
        adjectives.append(adj)
                        
    df['noun_count'] = nouns
    df['verb_count'] = verbs
    df['adj_count'] = adjectives

    ########## FLESCH READING EASE SCORE ##########

    textstat.set_lang('de')
    #compute Flesch Reading Ease score on non-empty df
    df['flesch_reading_ease'] = df['final_message_string'].apply(lambda x: textstat.flesch_reading_ease(x) if x.strip() != '' and x != 'nan' else np.nan)

    #classify scores based on: https://pypi.org/project/textstat/
    flesch_classes = []
    for score in df['flesch_reading_ease']:
        if score >= 0 and score < 30:
            flesch_classes.append('very confusing')
        elif score >= 30 and score < 50:
            flesch_classes.append('difficult')
        elif score >= 50 and score < 60:
            flesch_classes.append('fairly difficult')
        elif score >=60 and score < 70:
            flesch_classes.append('standard')
        elif score >=70 and score < 80:
            flesch_classes.append('fairly easy')
        elif score >=80 and score < 90:
            flesch_classes.append('easy')
        elif score >=90 and score < 101:
            flesch_classes.append('very easy')
        else:
            flesch_classes.append('unclassified')
        
    df['flesch_reading_ease_class'] = flesch_classes

    print('Flesch Reading Ease score extracted.')

    ########## SENTIMENT ANALYSIS ##########

    #load tokenizer and sentiment model
    print('Loading sentiment model...')
    sentiment_model = pipeline(model='aari1995/German_Sentiment')
    tokenizer = AutoTokenizer.from_pretrained('aari1995/German_Sentiment')  

    pos_sent = []
    neg_sent = []
    neutral_sent = []

    for message in tqdm(df['final_message_string'], desc = 'Extracting Sentiment'):
        #if message is empty, don't calculate sentiment
        if message == '' or message == 'nan':
            pos_sent.append(np.nan)
            neg_sent.append(np.nan)
            neutral_sent.append(np.nan)
        else:
            #truncate message to max length model can handle
            result = sentiment_model(message[:512])
            sent = (result[0]['label'])
            if sent == 'positive':
                pos_sent.append(1)
                neg_sent.append(0)
                neutral_sent.append(0)
            elif sent == 'negative':
                pos_sent.append(0)
                neg_sent.append(1)
                neutral_sent.append(0)
            elif sent == 'neutral':
                pos_sent.append(0)
                neg_sent.append(0)
                neutral_sent.append(1)
            else:
                pos_sent.append(np.nan)
                neg_sent.append(np.nan)
                neutral_sent.append(np.nan)

    df['positive_sentiment'] = pos_sent
    df['negative_sentiment'] = neg_sent
    df['neutral_sentiment'] = neutral_sent
    print('Sentiment extracted.')

In [26]:
def parallelize_dataframe(df, func, num_partitions):
    # Split the dataframe into smaller chunks
    df_split = np.array_split(df, num_partitions)
    # Create a pool of workers
    with mp.Pool(num_partitions) as pool:
        # Apply the function to each chunk
        for df in pool.map(func, df_split):
    return df

In [29]:
df_split = np.array_split(df, 4)

  return bound(*args, **kwds)


In [31]:
len(df_split)

4

In [28]:
time_start_non_parallel = time.time()
########## FEATURE EXTRACTION ##########

#num sentences
df_non['sent_count'] = df_non['final_message_string'].apply(lambda x: len(re.split(r'[.!?]+', x)) if x != '' and x != 'nan' else 0)
#num words
df_non['word_count'] = df_non['final_message_string'].apply(lambda x: len(re.findall(r'\w+', x)) if x != '' and x != 'nan' else 0)
#avg sentence length (words per sentence)
df_non['avg_sent_length'] = df_non.apply(lambda row: row['word_count'] / row['sent_count'] if row['sent_count'] > 0 else 0, axis=1)
#avg word length (characters per word)
df_non['avg_word_length'] = df_non.apply(lambda row: len(row['final_message_string'].replace(' ', '')) / row['word_count'] if row['word_count'] > 0 else 0, axis=1)
#num exclamations (multiple ! coutn as one exclamation)
df_non['exclamation_count'] = df_non['final_message_string'].apply(lambda x: len(re.findall(r'!+', x)) if x else 0)
#num questions (multiple ? count as one question)
df_non['question_count'] = df_non['final_message_string'].apply(lambda x: len(re.findall(r'\?+', x)) if x else 0)
#num emojis 
df_non['emoji_count'] = df_non['final_message'].apply(lambda x: count_emojis(x) if x else 0)

print('Simple count based features extracted.')

########## COUNT OF SELECTED POS TAGS ##########

#count nouns, verbs and adj
nouns = []
verbs = []
adjectives = []

for message in tqdm(df_non['final_message_string'], desc = 'Extracting POS Tag counts'):
    noun, verb, adj = count_pos_tags(message)
    nouns.append(noun)
    verbs.append(verb)
    adjectives.append(adj)
                    
df_non['noun_count'] = nouns
df_non['verb_count'] = verbs
df_non['adj_count'] = adjectives

########## FLESCH READING EASE SCORE ##########

textstat.set_lang('de')
#compute Flesch Reading Ease score on non-empty df_non
df_non['flesch_reading_ease'] = df_non['final_message_string'].apply(lambda x: textstat.flesch_reading_ease(x) if x.strip() != '' and x != 'nan' else np.nan)

#classify scores based on: https://pypi.org/project/textstat/
flesch_classes = []
for score in df_non['flesch_reading_ease']:
    if score >= 0 and score < 30:
        flesch_classes.append('very confusing')
    elif score >= 30 and score < 50:
        flesch_classes.append('difficult')
    elif score >= 50 and score < 60:
        flesch_classes.append('fairly difficult')
    elif score >=60 and score < 70:
        flesch_classes.append('standard')
    elif score >=70 and score < 80:
        flesch_classes.append('fairly easy')
    elif score >=80 and score < 90:
        flesch_classes.append('easy')
    elif score >=90 and score < 101:
        flesch_classes.append('very easy')
    else:
        flesch_classes.append('unclassified')
    
df_non['flesch_reading_ease_class'] = flesch_classes

print('Flesch Reading Ease score extracted.')

########## SENTIMENT ANALYSIS ##########

#load tokenizer and sentiment model
print('Loading sentiment model...')
sentiment_model = pipeline(model='aari1995/German_Sentiment')
tokenizer = AutoTokenizer.from_pretrained('aari1995/German_Sentiment')  

pos_sent = []
neg_sent = []
neutral_sent = []

for message in tqdm(df_non['final_message_string'], desc = 'Extracting Sentiment'):
    #if message is empty, don't calculate sentiment
    if message == '' or message == 'nan':
        pos_sent.append(np.nan)
        neg_sent.append(np.nan)
        neutral_sent.append(np.nan)
    else:
        #truncate message to max length model can handle
        result = sentiment_model(message[:512])
        sent = (result[0]['label'])
        if sent == 'positive':
            pos_sent.append(1)
            neg_sent.append(0)
            neutral_sent.append(0)
        elif sent == 'negative':
            pos_sent.append(0)
            neg_sent.append(1)
            neutral_sent.append(0)
        elif sent == 'neutral':
            pos_sent.append(0)
            neg_sent.append(0)
            neutral_sent.append(1)
        else:
            pos_sent.append(np.nan)
            neg_sent.append(np.nan)
            neutral_sent.append(np.nan)

df_non['positive_sentiment'] = pos_sent
df_non['negative_sentiment'] = neg_sent
df_non['neutral_sentiment'] = neutral_sent
print('Sentiment extracted.')

time_end_non_parallel = time.time()
print(f'Non-parallel execution time: {time_end_non_parallel - time_start_non_parallel} seconds')

TypeError: expected string or buffer

# Threading

In [1]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import numpy as np
import pandas as pd
from transformers import pipeline, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [2]:
messages = pd.read_csv(f'../data/samples/messages_sample_200.csv.gzip', compression='gzip').drop(columns=['Unnamed: 0'], axis=1)
messages['final_message_string'] = messages['final_message_string'].astype(str)

In [3]:
# Load tokenizer and sentiment model
print('Loading sentiment model...')
sentiment_model = pipeline(model='aari1995/German_Sentiment')
tokenizer = AutoTokenizer.from_pretrained('aari1995/German_Sentiment')

# Define a function to process a single message
def analyze_sentiment(message):
    if message == '' or message == 'nan':
        return np.nan, np.nan, np.nan
    else:
        # Truncate message to max length model can handle
        result = sentiment_model(message[:512])
        sent = result[0]['label']
        if sent == 'positive':
            return 1, 0, 0
        elif sent == 'negative':
            return 0, 1, 0
        elif sent == 'neutral':
            return 0, 0, 1
        else:
            return np.nan, np.nan, np.nan

# Initialize lists to store sentiment results
pos_sent = []
neg_sent = []
neutral_sent = []

Loading sentiment model...


In [4]:
# Use ThreadPoolExecutor to parallelize sentiment analysis
print('Starting sentiment extraction...')
with ThreadPoolExecutor(max_workers=4) as executor:
    # Submit tasks and store futures
    futures = [executor.submit(analyze_sentiment, msg) for msg in messages['final_message_string']]
    # Process results as they become available
    for future in tqdm(as_completed(futures), total=len(futures), desc='Extracting Sentiment'):
        pos, neg, neutral = future.result()
        pos_sent.append(pos)
        neg_sent.append(neg)
        neutral_sent.append(neutral)

# Add sentiment results to the DataFrame
messages['positive_sentiment'] = pos_sent
messages['negative_sentiment'] = neg_sent
messages['neutral_sentiment'] = neutral_sent
print('Sentiment extraction done.')

Starting sentiment extraction...


Extracting Sentiment:   0%|          | 0/200 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Extracting Sentiment: 100%|██████████| 200/200 [02:21<00:00,  1.42it/s]


Sentiment extraction done.


# was_forwarded

In [148]:
sample_size = 200
pre_agg = pd.read_csv(f'../results/pre-aggregation/liwcANDfeatures_results_{sample_size}.csv.gzip', compression='gzip')
print('Dataset loaded.')

Dataset loaded.


In [149]:
pre_agg = pd.get_dummies(pre_agg, columns=['group_or_channel'])
print('Dummies for categorial variables created.')
messages = pre_agg[['author', 'own_message', 'forwarded_message', 'fwd_author', 'UID_key', 'group_name', 'date']]
pre_agg = pre_agg[pre_agg['own_message'] == 1]

Dummies for categorial variables created.


In [150]:
agg_dict = {
    # SUM
    'noun_count': 'sum',
    'verb_count': 'sum',
    'adj_count': 'sum',
    'positive_sentiment': 'sum',
    'negative_sentiment': 'sum',
    'neutral_sentiment': 'sum',
    'group_or_channel_channel': 'sum',
    'group_or_channel_group': 'sum',

    # AVG
    'sent_count': 'mean',
    'word_count': 'mean',
    'avg_sent_length': 'mean',
    'avg_word_length': 'mean',
    'exclamation_count': 'mean',
    'question_count': 'mean',
    'emoji_count': 'mean',
    'flesch_reading_ease': 'mean',
    'liwc_I': 'mean',
    'liwc_We': 'mean',
    'liwc_You': 'mean',
    'liwc_Other': 'mean',
    'liwc_Affect': 'mean',
    
    # ' '.JOIN
    'final_message': lambda x: ' '.join(x.dropna().astype(str)),
    'final_message_string': lambda x: ' '.join(x.dropna().astype(str)),
}

# Aggregation dictionary for message ratios
agg_dict_messages = {
    'own_message': 'sum',
    'forwarded_message': 'sum',
    'UID_key': 'count'
}

########## RENAMING COLUMNS ##########

rename_dict = {'group_or_channel_channel': 'channel_messages', 'group_or_channel_group': 'group_messages', 'UID_key': 'total_message_count'}


print('Aggregating per author and group...')
#aggregate linguistic features
agg_author_group = pre_agg.groupby(['author', 'group_name']).agg(agg_dict)
agg_author_group = agg_author_group.rename(columns=rename_dict)
#aggregate message ratios
agg_author_group_messages = messages.groupby(['author', 'group_name']).agg(agg_dict_messages)
agg_author_group_messages = agg_author_group_messages.rename(columns=rename_dict)
#concat based on author and group columns
agg_author_group = pd.merge(
    left = agg_author_group,
    right = agg_author_group_messages,
    how = 'outer',
    left_on = ['author', 'group_name'],
    right_on = ['author', 'group_name']
)

Aggregating per author and group...


In [151]:
agg_author_group.index[0]

(-9.182988317250062e+18,
 'GEMEINSAMER ANTI CORONALÜGEN - SIEG CHAT! ZUM WIDERSTANDS QUALITÄTSPOSTEN, DISKUSSION UND GEMEINSAMEN GEDANKENAUSTAUSCH')

In [152]:
agg_author_group.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,noun_count,verb_count,adj_count,positive_sentiment,negative_sentiment,neutral_sentiment,channel_messages,group_messages,sent_count,word_count,avg_sent_length,avg_word_length,exclamation_count,question_count,emoji_count,flesch_reading_ease,liwc_I,liwc_We,liwc_You,liwc_Other,liwc_Affect,final_message,final_message_string,own_message,forwarded_message,total_message_count
author,group_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
-9.182988e+18,"GEMEINSAMER ANTI CORONALÜGEN - SIEG CHAT! ZUM WIDERSTANDS QUALITÄTSPOSTEN, DISKUSSION UND GEMEINSAMEN GEDANKENAUSTAUSCH",4.0,0.0,3.0,1.0,0.0,1.0,0.0,2.0,1.0,10.5,10.5,4.025,0.0,0.0,0.5,93.45,0.0,0.0,0.0,0.025,0.0,"RIP 😢😢😢 Damit dürfte klar sein, dass ei...","RIP Damit dürfte klar sein, dass ein versamme...",2,0,3
-9.146555e+18,"BEFREIUNG DEUTSCHLAND, ÖSTERREICH, SCHWEIZ💃🕺💃 DONALD J. TRUMP 🐇 Q-FAMILY-NETZWERK 🇩🇪🇦🇹🇨🇭",1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,4.0,4.0,5.75,0.0,0.0,1.0,70.7,0.0,0.0,0.0,0.0,0.0,"hahahha fasten, knall hart 😀🤣🤣🤣🤣🤣","hahahha fasten, knall hart",1,0,1
-8.92447e+18,1Research7Intelligence Room,2.0,2.0,0.0,1.0,0.0,0.0,0.0,1.0,3.0,14.0,4.666667,4.428571,0.0,0.0,0.0,91.1,0.0,0.0,0.0,0.1875,0.0625,die Nahcbarn sind Koreaner und er guckt zu was...,die Nahcbarn sind Koreaner und er guckt zu was...,1,0,1
-8.57958e+18,Alles Ausser Mainstream Chat,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,2.0,2.0,5.5,0.0,0.0,0.0,90.25,0.0,0.0,0.0,0.0,0.0,PN geschickt,PN geschickt,1,0,1
-8.545029e+18,FRIEDENSVERTRAG für DEUTSCHLAND - KAISERTREU,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0,4.0,2.0,5.25,0.0,0.0,0.0,70.7,0.0,0.0,0.0,0.0,0.5,Zu hören bitte wichtig..,Zu hören bitte wichtig..,1,0,1


# Edgelist

In [157]:
import pandas as pd
import networkx as nx
from itertools import combinations

In [158]:
df = pd.read_csv(f'../data/samples/messages_sample{sample_size}.csv.gzip', compression='gzip')

FileNotFoundError: [Errno 2] No such file or directory: '../data/samples/messages_sample200.csv.gzip'

In [None]:
# assuming df is your dataframe with 'message', 'author', and 'group' columns

# initialize an empty graph
G = nx.Graph()

# group the dataframe by 'group'
grouped = df.groupby('group_name')

# iterate over each group
for group, data in grouped:
    # get unique authors within the group
    authors = data['author'].unique()
    
    # create all possible pairs of authors in the group
    for author1, author2 in combinations(authors, 2):
        # add edge between the two authors
        if G.has_edge(author1, author2):
            G[author1][author2]['weight'] += 1  # if edge exists, increment the weight
        else:
            G.add_edge(author1, author2, weight=1)  # otherwise, create a new edge with weight 1

# get the edgelist
edgelist = nx.to_pandas_edgelist(G)

# FINNALY FIXED Toxicity Code

In [None]:
def toxicity_detection(message, client):
    analyze_request = {
        'comment': { 'text': f"{message}" },
        'languages' : ["de"],
        'requestedAttributes': {'TOXICITY': {}},
    }
    response = client.comments().analyze(body=analyze_request).execute()
    toxic =response['attributeScores']['TOXICITY']['summaryScore']['value']
    return toxic

In [None]:
import random
#initialize column
toxicity = []

for i in tqdm(range(len(results))):
    row = results.iloc[i]
    message = row['final_message_string']
    if row['own_message'] == 1:
        tox = toxicity_detection(message, client)
        toxicity.append(tox)
    else:
        toxicity.append(np.nan)

results['toxicity'] = toxicity