# Building Code for Linguistic Features

In this notebook, code will be written to extract linguistic features from the dataset. It will be done on a small subsample. Afterwards code will be transferred to a script to run on the full dataset.

In [44]:
import sys
import os
#sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # for scripts
sys.path.append('/Users/emilykruger/Documents/GitHub/CSH-Internship') #for local notebook
sys.path.append('../utils')
import pandas as pd
pd.set_option('display.max_columns', None)
import regex as re
from utils.linguistic_features import remove_emojis, count_emojis, preprocess_text, count_pos_tags
from textstat import flesch_reading_ease

## Loading Data

### Initially

In [2]:
groups = pd.read_csv('../data/selected_groups_with_transcriptions.csv.gzip', compression='gzip')
channels = pd.read_csv('../data/channel_subsample.csv.gzip', compression='gzip')

  groups = pd.read_csv('../data/selected_groups_with_transcriptions.csv.gzip', compression='gzip')
  channels = pd.read_csv('../data/channel_subsample.csv.gzip', compression='gzip')


In [3]:
groups = groups.drop(columns=['Unnamed: 0'], axis=1)
groups['group_or_channel'] = 'group'
groups.head(5)

Unnamed: 0,UID_key,initial_ID,mid_message,mid_file,group_name,posting_date,message,fwd_message,fwd_posting_date_message,posting_date_file,link_url,media_file,media_file_type,fwd_posting_date_file,fwd_link_url,fwd_media_file,fwd_media_file_type,author,fwd_author,day,week,weekday,message_hash,fwd_message_hash,website,replied_to,year,month,day_of_year,duration,filepath,filename,filename_if_joined,transcribed_message,newsguard_domain,newsguard_score,group_or_channel
0,209982-6408240263544091615,0,209982.0,209982.0,Helfende Q Hände 👐👏,2021-11-09 22:50:15+00:00,Oki 😃,,,2021-11-09 22:50:15+00:00,messages199.html#go_to_message209980,,,,,,,9.267455e+17,,2021-11-09 12:00:00+00:00,45.0,2.0,-3.596942e+18,,,209980-6408240263544091615,2021.0,11.0,313.0,,,,,,,,group
1,209983-6408240263544091615,1,209983.0,,Helfende Q Hände 👐👏,2021-11-09 22:50:18+00:00,Von weiblich bis männlich,,,,,,,,,,,-9.029508e+18,,2021-11-09 12:00:00+00:00,45.0,2.0,-7.438051e+18,,,,2021.0,11.0,313.0,,,,,,,,group
2,209984-6408240263544091615,2,209984.0,,Helfende Q Hände 👐👏,2021-11-09 22:50:23+00:00,Von direkt zu zurückhaltend,,,,,,,,,,,-9.029508e+18,,2021-11-09 12:00:00+00:00,45.0,2.0,-5.141813e+18,,,,2021.0,11.0,313.0,,,,,,,,group
3,209985-6408240263544091615,3,209985.0,,Helfende Q Hände 👐👏,2021-11-09 22:50:28+00:00,Wie bei WOW,,,,,,,,,,,-9.029508e+18,,2021-11-09 12:00:00+00:00,45.0,2.0,4.179569e+18,,,,2021.0,11.0,313.0,,,,,,,,group
4,209986-6408240263544091615,4,209986.0,,Helfende Q Hände 👐👏,2021-11-09 22:50:47+00:00,Neuer Charakter und XP sammeln,,,,,,,,,,,-9.029508e+18,,2021-11-09 12:00:00+00:00,45.0,2.0,3.772441e+18,,,,2021.0,11.0,313.0,,,,,,,,group


In [4]:
channels = channels.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
channels['group_or_channel'] = 'channel'
channels.head(5)

Unnamed: 0,UID_key,mid_message,mid_file,group_name,posting_date,message,fwd_message,fwd_posting_date_message,posting_date_file,link_url,media_file,media_file_type,fwd_posting_date_file,fwd_link_url,fwd_media_file,fwd_media_file_type,author,fwd_author,day,week,weekday,message_hash,fwd_message_hash,website,replied_to,year,month,day_of_year,fwd_urls,urls,newsguard_scores,fwd_regex,fwd_cryptocurrency,fwd_cryptolabel,regex,cryptocurrency,cryptolabel,group_or_channel
0,16262-1145969155139279504,16262.0,,Tagesereignisse der Offenbarung,2021-06-22 12:01:01+00:00,,EIN FILMISCHES DENKMAL = DENK MAL WIEDER DARAN...,22.06.2021 07:54:10,,,,,,,,,-1.145969e+18,-8.581437e+18,2021-06-22 12:00:00+00:00,25.0,2.0,,-1.816379e+18,,,2021.0,6.0,173.0,,,,,,,,,,channel
1,16263-1145969155139279504,16263.0,,Tagesereignisse der Offenbarung,2021-06-22 12:02:26+00:00,,Ein weiteres sehr wichtiges „DENK MAL“ liefert...,22.06.2021 08:25:19,,,,,,,,,-1.145969e+18,-8.581437e+18,2021-06-22 12:00:00+00:00,25.0,2.0,,8.99959e+18,,,2021.0,6.0,173.0,,,,,,,,,,channel
2,16265-1145969155139279504,16265.0,,Tagesereignisse der Offenbarung,2021-06-22 12:02:27+00:00,,☝🏻Das ist eine der über 700 Klagen die Trump b...,22.06.2021 11:46:00,,,,,,,,,-1.145969e+18,-3.777232e+18,2021-06-22 12:00:00+00:00,25.0,2.0,,-6.77513e+18,,,2021.0,6.0,173.0,t.me,,,,,,,,,channel
3,16266-1145969155139279504,16266.0,,Tagesereignisse der Offenbarung,2021-06-22 12:02:59+00:00,,🎥 <u>General Flynn: In wenigen Wochen platzt d...,21.06.2021 21:24:59,,,,,,,,,-1.145969e+18,9.086751e+18,2021-06-22 12:00:00+00:00,25.0,2.0,,-4.939329e+18,,,2021.0,6.0,173.0,"['paypal.com', 't.me', 't.me']",,"[nan, nan, nan]",,,,,,,channel
4,16267-1145969155139279504,16267.0,,Tagesereignisse der Offenbarung,2021-06-22 12:06:56+00:00,,<em>Schon gelesen? Die Bürger der Gemeinde Zwö...,16.06.2021 21:20:01,,,,,,,,,-1.145969e+18,4.36247e+17,2021-06-22 12:00:00+00:00,25.0,2.0,,4.063949e+18,,,2021.0,6.0,173.0,t.me,,,,,,,,,channel


In [5]:
#take random sample of 100k rows of both df where either message or fwd_message contains data and combine
sample_groups = groups[groups['message'].notnull() | groups['fwd_message'].notnull()].sample(n=1000, random_state=42)
sample_channels = channels = channels[channels['message'].notnull() | channels['fwd_message'].notnull()].sample(n=1000, random_state=42)
combined = pd.concat([sample_groups, sample_channels], ignore_index=True, axis=0)
combined.head(5)

Unnamed: 0,UID_key,initial_ID,mid_message,mid_file,group_name,posting_date,message,fwd_message,fwd_posting_date_message,posting_date_file,link_url,media_file,media_file_type,fwd_posting_date_file,fwd_link_url,fwd_media_file,fwd_media_file_type,author,fwd_author,day,week,weekday,message_hash,fwd_message_hash,website,replied_to,year,month,day_of_year,duration,filepath,filename,filename_if_joined,transcribed_message,newsguard_domain,newsguard_score,group_or_channel,fwd_urls,urls,newsguard_scores,fwd_regex,fwd_cryptocurrency,fwd_cryptolabel,regex,cryptocurrency,cryptolabel
0,5216248197782804714237638,691095.0,521624.0,521624.0,1Research7Intelligence Room,2021-03-25 15:20:06+00:00,👍🏼,,,2021-03-25 15:20:06+00:00,#go_to_message521136,,,,,,,-3.452303e+17,,2021-03-25 12:00:00+00:00,12.0,4.0,-6.067032e+18,,,5211368197782804714237638,2021.0,3.0,84.0,,,,,,,,group,,,,,,,,,
1,3325378587905794225980510,3512702.0,332537.0,,Klartext reden über Deutschland,2021-12-26 01:05:25+00:00,Die Reptiloiden leiten immer wieder Warmphasen...,,,,,,,,,,,-6.639767e+18,,2021-12-26 12:00:00+00:00,51.0,7.0,-6.739998e+18,,,,2021.0,12.0,360.0,,,,,,,,group,,,,,,,,,
2,2255608197782804714237638,474291.0,225560.0,225560.0,1Research7Intelligence Room,2020-11-24 21:20:05+00:00,"Oh mein Gott... ERSTENS :""Sidney Powell WAR ei...",,,2020-11-24 21:20:05+00:00,#go_to_message225281,,,,,,,-7.899042e+18,,2020-11-24 12:00:00+00:00,48.0,2.0,-8.687167e+18,,,2252818197782804714237638,2020.0,11.0,329.0,,,,,,,,group,,,,,,,,,
3,457113756755399766928245,3811126.0,45711.0,45711.0,Bismarcks Erben,2020-02-24 18:28:57+00:00,Teilweise. Biebel Zitate aus dem Zusammenhang ...,,,2020-02-24 18:28:57+00:00,#go_to_message45700,,,,,,,4.079522e+18,,2020-02-24 12:00:00+00:00,9.0,1.0,8.099019e+18,,,457003756755399766928245,2020.0,2.0,55.0,,,,,,,,group,,,,,,,,,
4,6739-6408240263544091615,120338.0,6739.0,6739.0,Helfende Q Hände 👐👏,2021-03-12 09:04:45+00:00,wie bitte ? ich beantworte das problem dieser ...,,,2021-03-12 09:04:45+00:00,#go_to_message6629,,,,,,,6.018852e+18,,2021-03-12 12:00:00+00:00,10.0,5.0,7.781442e+18,,,6629-6408240263544091615,2021.0,3.0,71.0,,,,,,,,group,,,,,,,,,


In [6]:
#keep only UID and message
messages = combined[['UID_key', 'message', 'fwd_message', 'group_or_channel']]

#remove emojis
cleaned_messages = []
for message in messages['message'].astype(str):
    cleaned_messages.append(remove_emojis(message))

cleaned_fwd_messages = []
for message in messages['fwd_message'].astype(str):
    cleaned_fwd_messages.append(remove_emojis(message))

messages['message_string'] = cleaned_messages
messages['fwd_message_string'] = cleaned_fwd_messages
messages['message_string'] = messages['message_string'].astype(str)
messages['fwd_message_string'] = messages['fwd_message_string'].astype(str)

#if message, take message else take fwd_message
messages['final_message'] = messages['message'].where(messages['message'].notnull(), messages['fwd_message'])
messages['final_message_string'] = messages['message_string'].where(messages['message_string'] != 'nan', messages['fwd_message_string'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  messages['message_string'] = cleaned_messages
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  messages['fwd_message_string'] = cleaned_fwd_messages
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  messages['message_string'] = messages['message_string'].astype(str)
A value is trying to be set on a copy

In [7]:
messages['preprocessed_message'] = messages['final_message_string'].apply(preprocess_text)

#delete uneccessary columns
messages = messages.drop(columns=['message', 'fwd_message', 'message_string', 'fwd_message_string'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  messages['preprocessed_message'] = messages['final_message_string'].apply(preprocess_text)


In [9]:
messages.to_csv('../data/messages_sample.csv.gzip', compression='gzip')

### For Re-Running Below Code

In [45]:
#for re-running
messages = pd.read_csv('../data/samples/messages_sample_2000.csv.gzip', compression='gzip').drop('Unnamed: 0', axis=1)

## Count-Based Features & POS-Tagging

In [10]:
#num sentences
messages['sent_count'] = messages['final_message_string'].apply(lambda x: len(re.split(r'[.!?]+', x)) if x else 0)
#num words
messages['word_count'] = messages['final_message_string'].apply(lambda x: len(re.findall(r'\w+', x)) if x else 0)
#avg sentence length (words per sentence)
messages['avg_sent_length'] = messages.apply(lambda row: row['word_count'] / row['sent_count'] if row['sent_count'] > 0 else 0, axis=1)
#avg word length (characters per word)
messages['avg_word_length'] = messages.apply(lambda row: len(row['final_message_string'].replace(' ', '')) / row['word_count'] if row['word_count'] > 0 else 0, axis=1)
#num exclamations (multiple ! coutn as one exclamation)
messages['exclamation_count'] = messages['final_message_string'].apply(lambda x: len(re.findall(r'!+', x)) if x else 0)
#num questions (multiple ? count as one question)
messages['question_count'] = messages['final_message_string'].apply(lambda x: len(re.findall(r'\?+', x)) if x else 0)
#num emojis 
messages['emoji_count'] = messages['final_message'].apply(lambda x: count_emojis(x) if x else 0)

In [14]:
#use count_pos_tags func to count nouns, verbs and adj
messages['noun_count'] = messages['final_message_string'].apply(lambda x: count_pos_tags(x)[0])
messages['verb_count'] = messages['final_message_string'].apply(lambda x: count_pos_tags(x)[1])
messages['adj_count'] = messages['final_message_string'].apply(lambda x: count_pos_tags(x)[2])

## Flesch Reading Ease

In [16]:
#use TextStat to compute Flesch Reading Ease score on final_message_string
messages['flesch_reading_ease'] = messages['final_message_string'].apply(flesch_reading_ease)

In [18]:
messages.head(5)
messages.to_csv('../data/messages_with_features.csv.gzip', compression='gzip')

## HuggingFace Complexity Classifier Model Exploration

In [34]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline, DistilBertForSequenceClassification
import torch

In [4]:
tokenizer = AutoTokenizer.from_pretrained('MiriUll/distilbert-german-text-complexity')
model = AutoModelForSequenceClassification.from_pretrained('MiriUll/distilbert-german-text-complexity')

In [30]:
inputs = tokenizer("Mit solchen Drohungen kommt sie nie mehr zurück ", return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

predicted_class_id = logits.argmax().item()
predicted_class_id

0

In [42]:
pipe = pipeline("text-classification", model="MiriUll/distilbert-german-text-complexity")
print(pipe('Das ist ein einfacher Satz.'))
print(pipe('Obwohl der junge Wissenschaftler sich intensiv auf seine Forschungsarbeit konzentrierte, war er oft von den unvorhersehbaren und lauten Bauarbeiten im Nachbargebäude abgelenkt, die seine produktivsten Stunden regelmäßig störten.'))

[{'label': 'LABEL_0', 'score': 0.8107044100761414}]
[{'label': 'LABEL_0', 'score': 0.9880794882774353}]


## Kaggle Emoji Sentiment Dataset

In [52]:
emojis = pd.read_csv('../data/archive/Emoji_Sentiment_Data_v1.0.csv')
#emoji sentiment column based on max value of positive neutral or negative
emojis['sentiment'] = emojis[['Positive', 'Neutral', 'Negative']].idxmax(axis=1)

In [51]:
emojis

Unnamed: 0,Emoji,Unicode codepoint,Occurrences,Position,Negative,Neutral,Positive,Unicode name,Unicode block,sentiment
0,😂,0x1f602,14622,0.805101,3614,4163,6845,FACE WITH TEARS OF JOY,Emoticons,Positive
1,❤,0x2764,8050,0.746943,355,1334,6361,HEAVY BLACK HEART,Dingbats,Positive
2,♥,0x2665,7144,0.753806,252,1942,4950,BLACK HEART SUIT,Miscellaneous Symbols,Positive
3,😍,0x1f60d,6359,0.765292,329,1390,4640,SMILING FACE WITH HEART-SHAPED EYES,Emoticons,Positive
4,😭,0x1f62d,5526,0.803352,2412,1218,1896,LOUDLY CRYING FACE,Emoticons,Negative
...,...,...,...,...,...,...,...,...,...,...
964,➛,0x279b,1,0.011628,0,1,0,DRAFTING POINT RIGHTWARDS ARROW,Dingbats,Neutral
965,♝,0x265d,1,0.280000,0,1,0,BLACK CHESS BISHOP,Miscellaneous Symbols,Neutral
966,❋,0x274b,1,0.888889,0,1,0,HEAVY EIGHT TEARDROP-SPOKED PROPELLER ASTERISK,Dingbats,Neutral
967,✆,0x2706,1,0.557252,0,1,0,TELEPHONE LOCATION SIGN,Dingbats,Neutral
