# Labeling Functions


In [1]:
import re
import glob
import emoji
import fasttext
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from nltk.tokenize import word_tokenize
from nltk import ngrams
from nltk.corpus import stopwords
from sentence_transformers import util

from snorkel.preprocess import preprocessor
from snorkel.labeling import labeling_function
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis
from snorkel.labeling.model import LabelModel
from sklearn.metrics import classification_report

In [2]:
# Paths of the input and output data files used by this notebook.
prop_data_annotated = "../data/processed/propaganda_annotated.pkl"
gen_data_annotated = "../data/processed/genuine_annotated.pkl"
wanlp_prop_data_path = "../data/raw/task1_train.json"
weakly_labeled_data_path = "../data/processed/train.json"
lfs_analysis_path = "../data/processed/lfs_analysis.json"
lfs_dev_labels_path = "../data/processed/lfs_dev_labels.npy"
lfs_train_labels_path = "../data/processed/lfs_train_labels.npy"

# The LF development set is needed to report the performance of the LFs;
# it is then removed, together with the test set, from the unlabeled data.
lf_dev_data_path = "../data/processed/lf_dev.json"
test_data_path = "../data/processed/test_data.json"

# Paths of the lexicon files.
proppy_lex_paths = "../data/raw/proppy_lexicons/*.csv"
loaded_lex_path = "../data/raw/loaded-language-lexicons.csv"

# Path of the pre-trained fastText model (loaded later in the notebook).
fasttext_model_path = "../models/cc.ar.300.bin"

In [3]:
# defining the needed fields by the working labeling functions
# in case of doing the analysis from scratch, comment the variable

# fields = [
#     "tweetid",
#     "user_profile_description",
#     "tweet_text",
#     "is_retweet",
#     "quote_count",
#     "reply_count",
#     "like_count",
#     "retweet_count",
#     "hashtags",
#     "urls",
#     "user_mentions",
#     "text",
#     "emojis",
#     "word_count",
#     "docs",
#     "is_irony",
#     "is_hate",
#     "xlmroberta_label",
# ]

In [4]:
# Load the annotated propaganda (df1) and genuine (df2) tweet frames.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
df1 = pd.read_pickle(prop_data_annotated)  # [fields]
df2 = pd.read_pickle(gen_data_annotated)  # [fields]

In [5]:
# The following lines convert the account creation dates to datetimes.
# NOTE(review): the two sources are parsed with different epoch units
# ("ms" vs "ns") -- presumably they were serialized differently; confirm.

df1["account_creation_date"] = pd.to_datetime(df1.account_creation_date, unit="ms")
df2["account_creation_date"] = pd.to_datetime(df2.account_creation_date, unit="ns")

In [6]:
# Combine the propaganda and genuine users data, then shuffle the rows.
# The shuffle is seeded so that "Restart Kernel -> Run All" always produces
# the same row order (the original unseeded sample() changed on every run).

unlabeled_data = pd.concat([df1, df2], ignore_index=True)
unlabeled_data = unlabeled_data.sample(frac=1.0, random_state=42).reset_index(
    drop=True
)

In [7]:
unlabeled_data.head(2)

Unnamed: 0,tweetid,userid,user_display_name,user_screen_name,user_reported_location,user_profile_description,follower_count,following_count,account_creation_date,tweet_text,...,hashtags,urls,user_mentions,text,emojis,word_count,docs,sarcasm,hate,xlmroberta_label
0,1.092746391963779e+18,2321495687.0,إمارة المنطقة الشرقية,emara_sharqia,,الحساب الرسمي لـ #إمارة_المنطقة_الشرقية - المم...,475837.0,7.0,2014-02-03 08:49:37+00:00,#الأمير_سعود_بن_نايف لأهالي #العوامية\n#مشروع_...,...,3,4,0,# الأمير_سعود_بن_نايف لأهالي# العوامية# مشروع_...,0,44,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","{'label': 'Non-Sarcasm', 'score': 0.9979982972...","{'label': 'not offensive', 'score': 0.99977093...",{'sequence': '# الأمير_سعود_بن_نايف لأهالي# ال...
1,1.364675289222103e+18,312576919.0,عبدالله السبع,7Alsabe,الرياض,المحرر التقني في صحيفة اندبندنت العربية alsabe...,1379854.0,497.0,2011-06-07 10:29:38+00:00,اللهم لك الحمد على سلامة سيدي #ولي_العهد \n\nل...,...,2,1,0,اللهم لك الحمد على سلامة سيدي# ولي_العهد لاباس...,0,15,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","{'label': 'Non-Sarcasm', 'score': 0.9974174499...","{'label': 'not offensive', 'score': 0.99885535...",{'sequence': 'اللهم لك الحمد على سلامة سيدي# و...


In [8]:
unlabeled_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196596 entries, 0 to 196595
Data columns (total 25 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   tweetid                   196594 non-null  object 
 1   userid                    196595 non-null  object 
 2   user_display_name         187635 non-null  object 
 3   user_screen_name          187635 non-null  object 
 4   user_reported_location    146198 non-null  object 
 5   user_profile_description  183069 non-null  object 
 6   follower_count            187635 non-null  float64
 7   following_count           187635 non-null  float64
 8   account_creation_date     187635 non-null  object 
 9   tweet_text                196596 non-null  object 
 10  is_retweet                196596 non-null  bool   
 11  quote_count               196591 non-null  float64
 12  reply_count               196592 non-null  float64
 13  like_count                196592 non-null  o

In [9]:
# Load the manually labeled LF development set and preview it.
labeled = pd.read_json(lf_dev_data_path)
labeled.head(2)

Unnamed: 0,tweetid,tweet_text,text,tech,label
0,924924839902793728,RT @Amal_onzi: 🕊💕هُو جنْةبعِيني.,RT : هو جنةبعيني.,,0
1,1074734231887187970,ر٣ #تركيا_تجاهر_بالمعاصي,ر٣ تركيا تجاهر بالمعاصي,smears - name-calling - loaded language,1


In [10]:
labeled.info()

<class 'pandas.core.frame.DataFrame'>
Index: 500 entries, 0 to 499
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweetid     500 non-null    int64 
 1   tweet_text  500 non-null    object
 2   text        500 non-null    object
 3   tech        48 non-null     object
 4   label       500 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 23.4+ KB


In [11]:
# Load the held-out test set and preview it.
test = pd.read_json(test_data_path)
test.head(2)

Unnamed: 0,tweetid,tweet_text,text,tech,label
0,977553193814122498,شاركوا معنا .. في #ساعة_الأرض الليلة \nساعة وا...,شاركوا معنا .. في ساعة الأرض الليلة ساعة واحد ...,,0
1,1005856990436970497,RT @qtfcjohz: َاللهم طهر قلوبنا من كل ضيق \nوي...,RT : اللهم طهر قلوبنا من كل ضيق ويسر أمورنا في...,,0


In [12]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 420 entries, 0 to 419
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweetid     420 non-null    int64 
 1   tweet_text  420 non-null    object
 2   text        420 non-null    object
 3   tech        40 non-null     object
 4   label       420 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 19.7+ KB


In [13]:
# Select the rows of the full frame whose tweetid appears in the LF dev set,
# so the dev tweets carry every column the labeling functions read.
labeled_data = unlabeled_data[unlabeled_data.tweetid.isin(labeled.tweetid)]
labeled_data.head(2)

Unnamed: 0,tweetid,userid,user_display_name,user_screen_name,user_reported_location,user_profile_description,follower_count,following_count,account_creation_date,tweet_text,...,hashtags,urls,user_mentions,text,emojis,word_count,docs,sarcasm,hate,xlmroberta_label
1225,1055902884221595648,xezSoAAdXx3PCsozCxviOeMMOjwnnnNpcy1G8JO5OY=,xezSoAAdXx3PCsozCxviOeMMOjwnnnNpcy1G8JO5OY=,xezSoAAdXx3PCsozCxviOeMMOjwnnnNpcy1G8JO5OY=,,لِكل من يقرأ : قضى اللّٰه حَاجتكَ ، وجَبْر قَل...,1170.0,947.0,2017-04-18 00:00:00,RT @KhaleadBader: لكل دولة وحكومتها الحق الكام...,...,0,0,1,لكل دولة وحكومتها الحق الكامل في إتخاذ أي قرار...,0,22,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","{'label': 'Sarcasm', 'score': 0.8622291088104248}","{'label': 'not offensive', 'score': 0.99555015...",{'sequence': 'لكل دولة وحكومتها الحق الكامل في...
1254,556195492405579777,2670629875,ابن الطنايا,bbmf16bbm,المملكة العربية السعودية,‏‏‏(رب اغفر لي ولوالدي ولمن دخل بيتي مؤمناً ول...,49152.0,44174.0,2014-07-22 00:00:00,RT @Az3381: اللهم صلى وسلم على حبيبنا محمد رسو...,...,0,0,1,اللهم صلى وسلم على حبيبنا محمد رسول الله وشفيع...,0,19,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","{'label': 'Non-Sarcasm', 'score': 0.9961307048...","{'label': 'not offensive', 'score': 0.99959558...",{'sequence': 'اللهم صلى وسلم على حبيبنا محمد ر...


In [14]:
labeled_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 500 entries, 1225 to 195818
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   tweetid                   500 non-null    object 
 1   userid                    500 non-null    object 
 2   user_display_name         500 non-null    object 
 3   user_screen_name          500 non-null    object 
 4   user_reported_location    295 non-null    object 
 5   user_profile_description  462 non-null    object 
 6   follower_count            500 non-null    float64
 7   following_count           500 non-null    float64
 8   account_creation_date     500 non-null    object 
 9   tweet_text                500 non-null    object 
 10  is_retweet                500 non-null    bool   
 11  quote_count               500 non-null    float64
 12  reply_count               500 non-null    float64
 13  like_count                500 non-null    object 
 14  retweet_c

In [15]:
# Drop every tweet that belongs to the LF dev set or to the test set,
# so the unlabeled pool does not overlap with the evaluation data.

in_dev_set = unlabeled_data.tweetid.isin(labeled_data.tweetid)
in_test_set = unlabeled_data.tweetid.isin(test.tweetid)
unlabeled_data = unlabeled_data[~(in_dev_set | in_test_set)].reset_index(drop=True)

In [16]:
# Text cleaning used for the labeled data tweets (AraBERT preprocessing).

from arabert.preprocess import ArabertPreprocessor

processing_model = "aubmindlab/bert-base-arabertv2"
arabert_prep = ArabertPreprocessor(model_name=processing_model)


def process_text(text):
    """Clean a tweet with the AraBERT preprocessor and strip leftover markers.

    The text goes through ``preprocess`` then ``unpreprocess``; afterwards the
    link and user placeholders, every "RT" occurrence and every colon are
    removed, and surrounding whitespace is trimmed.
    """
    cleaned = arabert_prep.unpreprocess(arabert_prep.preprocess(text))
    # Removal order matters only for readability; each marker is deleted everywhere.
    for marker in ("[رابط]", "[مستخدم]", "RT", ":"):
        cleaned = cleaned.replace(marker, "")
    return cleaned.strip()



In [17]:
# `labeled_data` was created by boolean indexing of `unlabeled_data`; take an
# explicit, re-indexed copy before adding columns so the assignments below
# never target a view of the original frame (avoids SettingWithCopyWarning).
labeled_data = labeled_data.copy().reset_index(drop=True)
labeled_data["text"] = labeled_data.text.apply(process_text)

# Look up the gold label of each dev tweet by its tweetid (first match wins).
labels = [labeled[labeled.tweetid == i].label.values[0] for i in labeled_data.tweetid]
labeled_data["label"] = labels

In [18]:
# Drop rows that miss any of the fields listed in `subset`
# (other columns may still contain nulls).

subset = ["tweetid", "text", "quote_count", "xlmroberta_label"]
unlabeled_data = unlabeled_data.dropna(subset=subset).reset_index(drop=True)

In [19]:
unlabeled_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195671 entries, 0 to 195670
Data columns (total 25 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   tweetid                   195671 non-null  object 
 1   userid                    195671 non-null  object 
 2   user_display_name         186715 non-null  object 
 3   user_screen_name          186715 non-null  object 
 4   user_reported_location    145634 non-null  object 
 5   user_profile_description  182217 non-null  object 
 6   follower_count            186715 non-null  float64
 7   following_count           186715 non-null  float64
 8   account_creation_date     186715 non-null  object 
 9   tweet_text                195671 non-null  object 
 10  is_retweet                195671 non-null  bool   
 11  quote_count               195671 non-null  float64
 12  reply_count               195671 non-null  float64
 13  like_count                195671 non-null  o

In [20]:
labeled_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   tweetid                   500 non-null    object 
 1   userid                    500 non-null    object 
 2   user_display_name         500 non-null    object 
 3   user_screen_name          500 non-null    object 
 4   user_reported_location    295 non-null    object 
 5   user_profile_description  462 non-null    object 
 6   follower_count            500 non-null    float64
 7   following_count           500 non-null    float64
 8   account_creation_date     500 non-null    object 
 9   tweet_text                500 non-null    object 
 10  is_retweet                500 non-null    bool   
 11  quote_count               500 non-null    float64
 12  reply_count               500 non-null    float64
 13  like_count                500 non-null    object 
 14  retweet_co

### User LFs


In [21]:
# Label values returned by the labeling functions:
# propaganda, genuine (transparent) and abstain.
prop, gen, ab = 1, 0, -1

In [22]:
@labeling_function()
def missing_bio(example):
    """Vote propaganda when the tweet's author has no profile description.

    Abstains whenever a bio is present.
    """
    return prop if pd.isna(example.user_profile_description) else ab

In [23]:
@labeling_function()
def missing_loc(example):
    """Vote propaganda when the tweet's author reports no location.

    Abstains whenever a location is present.
    """
    return prop if pd.isna(example.user_reported_location) else ab

In [24]:
@labeling_function()
def created_2018_2019(example):
    """Vote propaganda when the author's account was created in 2018 or 2019."""
    creation_year = example.account_creation_date.year
    return prop if creation_year in (2018, 2019) else ab

In [25]:
@labeling_function()
def created_2011_2012(example):
    """Vote transparent when the author's account was created in 2011 or 2012."""
    creation_year = example.account_creation_date.year
    return gen if creation_year in (2011, 2012) else ab

In [26]:
@preprocessor(memoize=False)
def tokenize_bio(example):
    """Attach ``bio_tokens``: the word-tokenized bio, or None when the bio is missing."""
    bio = example.user_profile_description
    example.bio_tokens = None if pd.isna(bio) else word_tokenize(bio)
    return example

In [27]:
@labeling_function(pre=[tokenize_bio])
def bio_keywords(example):
    """Vote transparent when the bio contains at least one of the listed keywords.

    Abstains when the bio is missing or none of the keywords is among its tokens.
    """
    official_terms = ["الحساب", "الرسمي", "عضو", "رئيس", "كاتب", "إدارة"]
    bio_tokens = example.bio_tokens
    if bio_tokens is None:
        return ab
    return gen if np.in1d(official_terms, bio_tokens).any() else ab

In [28]:
@labeling_function()
def follow_ratio_prop(example):
    """Vote propaganda when 0.8 <= followers / following <= 1.2.

    The ratio is taken as 0 when the account follows nobody.
    """
    followers, following = example.follower_count, example.following_count
    ratio = 0 if following == 0 else followers / following
    return prop if 0.8 <= ratio <= 1.2 else ab

In [29]:
@labeling_function()
def follow_ratio_gen(example):
    """Vote transparent when 0 <= followers / following <= 0.2.

    The ratio is taken as 0 when the account follows nobody, so such
    accounts also receive the transparent vote.
    """
    followers, following = example.follower_count, example.following_count
    ratio = 0 if following == 0 else followers / following
    return gen if 0.0 <= ratio <= 0.2 else ab

### Tweet LFs


In [30]:
@labeling_function()
def contain_url(example):
    """Vote transparent when the tweet's URL count is positive."""
    return gen if example.urls > 0 else ab

In [31]:
@labeling_function()
def contain_mention(example):
    """Vote propaganda when the tweet's mention count is positive."""
    return prop if example.user_mentions > 0 else ab

In [32]:
@labeling_function()
def labeling_sarcasm(example):
    """Vote propaganda (name calling) for a sarcastic tweet that mentions an entity."""
    if not len(example.docs.entities):
        return ab
    return prop if example.sarcasm["label"] == "Sarcasm" else ab

In [33]:
@labeling_function()
def labeling_hate(example):
    """Vote propaganda (name calling) for an offensive tweet that mentions an entity."""
    if not len(example.docs.entities):
        return ab
    return prop if example.hate["label"] == "offensive" else ab

In [34]:
@labeling_function()
def contain_ent(example):
    """Vote propaganda when the tweet contains exactly two entities."""
    entity_count = len(example.docs.entities)
    return prop if entity_count == 2 else ab

In [35]:
@labeling_function()
def ent_free(example):
    """Vote transparent when the tweet contains no entities at all."""
    return ab if len(example.docs.entities) else gen

In [36]:
@labeling_function()
def contain_question(example):
    """Label tweet as propaganda if its first sentence contains an auxiliary word.

    The check looks for a word tagged ``AUX`` in the first parsed sentence
    (presumably a proxy for question forms -- confirm against the annotation
    guidelines). Abstains when the tweet has no parsed sentence, instead of
    raising ``IndexError``, matching the guard used by ``exaggeration``.
    """
    if not len(example.docs.sentences):
        return ab
    for w in example.docs.sentences[0].words:
        if w.upos == "AUX":
            return prop
    return ab

In [37]:
# Load the manually crafted loaded-language terms; they were extracted only
# from the 500 tweets used for labeling-function development.

loaded_lexicons = pd.read_csv(loaded_lex_path)["loaded-language"].tolist()
loaded_lexicons[:5]

['جاهر', 'جهر', 'تجاهر بالمعاصي', 'مجاهرة بالمعاصي', 'تجاهر بالمعاصى']

In [38]:
# loading the proppy lexicons introduced in https://arxiv.org/abs/1912.06810
# NOTE(review): only the first file matched by the glob is read, and glob does
# not guarantee an order -- confirm that a single CSV matches the pattern.

# proppy_lexicons = []
# for file in glob.glob(proppy_lex_paths):
#     with open(file, encoding="utf-8") as f:
#         proppy_lexicons.extend(f.readlines())

proppy_lexicons = pd.read_csv(glob.glob(proppy_lex_paths)[0])
proppy_lexicons.head(2)

Unnamed: 0,factives,hedges,implicatives,report_verbs,bias,negative_words,positive_words,negative_colloquial_words,positive_colloquial_words
0,عارف,واضح,يدير,يتهم,إسقاط,أبو وجهين,فالح,مزوره,متحمس
1,يدري,فيما يبدو,يتذكر,اتهم,إجهاض,بوجهين,شاطر,مغترب,حماس


In [39]:
# removing diacritization from proppy lexicons

# pattern = r"[\u0617-\u061A\u064B-\u0652]"
# proppy_lexicons = [re.sub(pattern, "", term.strip()) for term in proppy_lexicons]

In [40]:
# Turn the frame into {lexicon type: list of column values}.
# Note: this rebinds the name from a DataFrame to a dict, so the cell cannot be re-run as is.
proppy_lexicons = proppy_lexicons.to_dict(orient="list")

In [41]:
# Remove the missing entries from every lexicon list
# (the columns have different lengths, so shorter ones are NaN-padded).
proppy_lexicons = {
    lex_type: [term for term in terms if pd.notna(term)]
    for lex_type, terms in proppy_lexicons.items()
}

In [42]:
# Report how many terms each proppy lexicon type holds.
print("Number of words in each proppy lexicon type ...")
print(list(map(len, proppy_lexicons.values())))

Number of words in each proppy lexicon type ...
[42, 109, 43, 212, 578, 4630, 1950, 1468, 914]


In [43]:
# Combine the terms of all proppy lexicon types into one flat list.
# Iterate over the dict *values*: iterating the dict itself yields the lexicon
# names, and extending a list with a string appends its single characters,
# which previously filled `proppy` with Latin letters instead of lexicon terms.

proppy = []
for lexicon_terms in proppy_lexicons.values():
    proppy.extend(lexicon_terms)

In [44]:
# we are removing diacritization from stanza lemmas

# Arabic diacritics to strip. Compiled here so the preprocessor does not rely
# on the global `pattern`, which is only defined in a later cell.
_diacritics_engine = re.compile(r"[\u0617-\u061A\u064B-\u0652]")


@preprocessor(memoize=False)
def tokenize_tweet(example):
    """Attach the tokens and diacritics-free lemmas of the tweet's first sentence.

    Sets ``example.tweet_tokens`` and ``example.tweet_lemmas``; both are empty
    lists when the parsed document has no sentence. Only the first sentence of
    ``example.docs`` is used.
    """
    if len(example.docs.sentences):
        words = example.docs.sentences[0].words
        example.tweet_tokens = [w.text for w in words]
        example.tweet_lemmas = [_diacritics_engine.sub("", w.lemma) for w in words]
    else:
        example.tweet_tokens = []
        example.tweet_lemmas = []
    return example

In [45]:
@preprocessor(pre=[tokenize_tweet], memoize=False)
def bigram_tweet(example):
    """Attach space-joined bigrams of the tweet's tokens and of its lemmas."""

    def join_bigrams(units):
        # ngrams(..., 2) only yields 2-tuples, so every gram is non-empty.
        return [" ".join(pair) for pair in ngrams(units, 2)]

    example.bigram_tokens = join_bigrams(example.tweet_tokens)
    example.bigram_lemmas = join_bigrams(example.tweet_lemmas)
    return example

In [46]:
@labeling_function(pre=[bigram_tweet])
def loaded_language(example):
    """Vote propaganda when at least three tweet n-grams are in the loaded lexicon.

    Tokens, lemmas and their bigrams are all counted, so a word may count
    once as a token and once as a lemma.
    """
    grams = [
        *example.tweet_tokens,
        *example.tweet_lemmas,
        *example.bigram_tokens,
        *example.bigram_lemmas,
    ]
    hits = np.in1d(grams, loaded_lexicons).sum()
    return prop if hits >= 3 else ab

In [47]:
@labeling_function(pre=[bigram_tweet])  # [bigram_tweet]
def genuine_language(example):
    """Vote transparent when no tweet n-gram appears in the loaded lexicon."""
    grams = [
        *example.tweet_tokens,
        *example.tweet_lemmas,
        *example.bigram_tokens,
        *example.bigram_lemmas,
    ]
    hits = np.in1d(grams, loaded_lexicons).sum()
    return gen if hits == 0 else ab

In [48]:
@labeling_function(pre=[bigram_tweet])
def loaded_proppy(example):
    """Vote propaganda when at least one tweet n-gram is in the combined `proppy` list."""
    grams = [
        *example.tweet_tokens,
        *example.tweet_lemmas,
        *example.bigram_tokens,
        *example.bigram_lemmas,
    ]
    hits = np.in1d(grams, proppy).sum()
    return prop if hits >= 1 else ab

In [49]:
@labeling_function(pre=[bigram_tweet])
def genuine_proppy(example):
    """Vote transparent when no tweet n-gram is in the combined `proppy` list."""
    grams = [
        *example.tweet_tokens,
        *example.tweet_lemmas,
        *example.bigram_tokens,
        *example.bigram_lemmas,
    ]
    hits = np.in1d(grams, proppy).sum()
    return gen if hits == 0 else ab

The following cells generate one propaganda and one transparent labeling function for each proppy lexicon type.

In [50]:
# name_space maps each generated LF name to its labeling function;
# proppy_lfs collects the generated per-lexicon labeling functions in order.
name_space = {}
proppy_lfs = []

In [51]:
def _make_loaded_proppy_lf(lexicon_name):
    """Build the propaganda LF of one proppy lexicon type.

    A closure replaces the former source-string ``exec``: the generated LF
    keeps the name ``loaded_proppy_<lexicon_name>`` and the same logic, and
    lexicon names that are not valid Python identifiers no longer break it.
    """

    @labeling_function(name=f"loaded_proppy_{lexicon_name}", pre=[bigram_tweet])
    def loaded_proppy_lf(example):
        """Label tweet as propaganda if it contains any term of this proppy lexicon."""
        tweet_ngrams = example.tweet_tokens + example.tweet_lemmas
        tweet_ngrams += example.bigram_tokens + example.bigram_lemmas
        if sum(np.in1d(tweet_ngrams, proppy_lexicons[lexicon_name])) >= 1:
            return prop
        else:
            return ab

    return loaded_proppy_lf


for name in proppy_lexicons.keys():
    # Keep registering the LF in name_space, as the exec-based version did.
    name_space[f"loaded_proppy_{name}"] = _make_loaded_proppy_lf(name)
    proppy_lfs.append(name_space[f"loaded_proppy_{name}"])

In [52]:
def _make_genuine_proppy_lf(lexicon_name):
    """Build the transparent LF of one proppy lexicon type.

    A closure replaces the former source-string ``exec``: the generated LF
    keeps the name ``genuine_proppy_<lexicon_name>`` and the same logic, and
    lexicon names that are not valid Python identifiers no longer break it.
    """

    @labeling_function(name=f"genuine_proppy_{lexicon_name}", pre=[bigram_tweet])
    def genuine_proppy_lf(example):
        """Label tweet as transparent if it contains no term of this proppy lexicon."""
        tweet_ngrams = example.tweet_tokens + example.tweet_lemmas
        tweet_ngrams += example.bigram_tokens + example.bigram_lemmas
        if sum(np.in1d(tweet_ngrams, proppy_lexicons[lexicon_name])) == 0:
            return gen
        else:
            return ab

    return genuine_proppy_lf


for name in proppy_lexicons.keys():
    # Keep registering the LF in name_space, as the exec-based version did.
    name_space[f"genuine_proppy_{name}"] = _make_genuine_proppy_lf(name)
    proppy_lfs.append(name_space[f"genuine_proppy_{name}"])

In [53]:
len(proppy_lfs)

18

In [54]:
@labeling_function()
def loaded_hate(example):
    """Vote propaganda (loaded language) when the hate label is "offensive"."""
    return prop if example.hate["label"] == "offensive" else ab

In [55]:
@labeling_function()
def loaded_sarcasm(example):
    """Vote propaganda (loaded language) when the sarcasm label is "Sarcasm"."""
    return prop if example.sarcasm["label"] == "Sarcasm" else ab

In [56]:
@labeling_function()
def gen_hate(example):
    """Vote transparent when the hate label is "not offensive"."""
    return gen if example.hate["label"] == "not offensive" else ab

In [57]:
@labeling_function()
def gen_sarcasm(example):
    """Vote transparent when the sarcasm label is "Non-Sarcasm"."""
    return gen if example.sarcasm["label"] == "Non-Sarcasm" else ab

In [58]:
# flag_waving

# Extracts the "Number=<value>" entry from a morphological feature string.
flag_engine = re.compile(r"Number=\w+")


@labeling_function()
def flag_wave(example):
    """Label tweet as propaganda if it contains plural pronouns (flag-waving).

    Only the first parsed sentence is inspected. The function abstains,
    instead of raising, when the tweet has no sentence, when a pronoun has no
    feature string, or when its features carry no ``Number=`` entry.
    """
    if not len(example.docs.sentences):
        return ab
    for w in example.docs.sentences[0].words:
        # Skip non-pronouns and pronouns without morphological features.
        if w.upos != "PRON" or not w.feats:
            continue
        numbers = flag_engine.findall(w.feats)
        if numbers and numbers[0].split("=")[-1] == "Plur":
            return prop
    return ab

In [59]:
# Character class of the Arabic diacritics to strip.
pattern = r"[\u0617-\u061A\u064B-\u0652]"

# Arabic NLTK stop words with their diacritization removed.
arabic_stop_words = [re.sub(pattern, "", w) for w in stopwords.words("arabic")]

In [60]:
@labeling_function(pre=[tokenize_tweet])
def repetition(example):
    """Vote propaganda when some non-stop-word token occurs at least twice."""
    content_tokens = pd.Series(
        [token for token in example.tweet_tokens if token not in arabic_stop_words]
    )
    highest_count = content_tokens.value_counts().max()
    return prop if highest_count >= 2 else ab

In [61]:
# Load the WANLP propaganda dataset used for distant supervision.
# Dataset source: https://gitlab.com/araieval/propaganda-detection


wanlp_train = pd.read_json(wanlp_prop_data_path)


def label_processing(x):
    """Map a label collection to 0 when it holds "no technique", else to 1."""
    return 0 if "no technique" in x else 1


wanlp_train["labels"] = wanlp_train["labels"].apply(label_processing)

wanlp_train.head(2)

Unnamed: 0,id,text,labels
0,1358824915483435008,"#بي_بي_سي_ترندينغ: النساء ""تثرثر كثيرا"" رئيس أ...",0
1,1389927866356412416,"""ده مش معتقل ده أحسن من اللوكاندة"".. جدل وسخري...",1


In [62]:
# def clean_text(text):
#     """Process text and remove links, symbols, and diacritization."""
#     # links
#     clean_text = re.sub(r"http\S+|t\.co/\S+", "", text)
#     # mentions
#     clean_text = re.sub(r"@\w+", "", clean_text)
#     # hashtags
#     clean_text = re.sub(r"#", "", clean_text)
#     clean_text = re.sub(r"_", " ", clean_text)
#     # tashqeel - from @bakriano
#     clean_text = re.sub(r"[\u0617-\u061A\u064B-\u0652]", "", clean_text)
#     # emojis
#     clean_text = emoji.replace_emoji(clean_text, replace="")
#     # remove new lines and normalize white spaces
#     clean_text = re.sub(r"\s+", " ", clean_text)
#     return clean_text.replace("RT :", "").strip()


# Clean the WANLP tweets with the same process_text used for the labeled data.
wanlp_train["text"] = wanlp_train["text"].apply(process_text)

In [63]:
# Load the fastText model and embed every WANLP tweet as a sentence vector.
encoder = fasttext.load_model(fasttext_model_path)

wanlp_hidden_states = [
    list(encoder.get_sentence_vector(tweet))
    for tweet in tqdm(wanlp_train.text.values, total=len(wanlp_train))
]



  0%|          | 0/504 [00:00<?, ?it/s]

In [64]:
# Stack the per-tweet vectors into one array (one row per WANLP tweet).
wanlp_hidden_states = np.array(wanlp_hidden_states)
wanlp_hidden_states.shape

(504, 300)

In [65]:
@preprocessor()
def get_sim_scores(example):
    """Attach ``sim_scores``: cosine similarity of the tweet to every WANLP tweet."""
    example.sim_scores = util.cos_sim(
        encoder.get_sentence_vector(example.text), wanlp_hidden_states
    )
    return example

In [66]:
# The included scores are fine-tuned.


@labeling_function(pre=[get_sim_scores])
def distant_supervision_prop(example):
    """Vote propaganda when the nearest WANLP tweet is propaganda and similar enough.

    "Similar enough" means a cosine similarity of at least 0.80.
    """
    scores = example.sim_scores
    nearest = scores.argmax(dim=-1).item()
    if wanlp_train.labels.values[nearest] != 1:
        return ab
    return prop if scores[-1][nearest].item() >= 0.80 else ab

In [67]:
# The included scores are fine-tuned.


@labeling_function(pre=[get_sim_scores])
def distant_supervision_gen(example):
    """Vote transparent when the nearest WANLP tweet is non-propaganda and similar enough.

    "Similar enough" means a cosine similarity of at least 0.60.
    """
    scores = example.sim_scores
    nearest = scores.argmax(dim=-1).item()
    if wanlp_train.labels.values[nearest] != 0:
        return ab
    return gen if scores[-1][nearest].item() >= 0.60 else ab

In [68]:
@labeling_function()
def slogans(example):
    """Vote propaganda when the tweet text matches any of the slogan patterns."""
    slogan_patterns = (r"لا ل\w+", r"نعم ل\w+", r"لا بديل")
    has_slogan = any(re.search(p, example.text) for p in slogan_patterns)
    return prop if has_slogan else ab

In [69]:
# Manually extracted names of hated organizations and entities.
# The list is matched against tweet tokens, lemmas and bigrams by `reductio`,
# so several spelling variants of the same name are listed separately.

hitlerum = [
    "هتلر",
    "البغدادي",
    "اوردوغان",
    "قطر",
    "داعش",
    "حوثي",
    "تركيا",
    "الشيعة",
    "إيران",
    "ايران",
    "اخونجي",
    "اخوان",
    "إخوان",
    "إخوانجي",
    "أوردوغان",
    "الحوثي",
    "الحوثيين",
    "ستالين",
    "الإخوان",
    "الاخوان",
    "إرهابي",
    "الإرهابيين",
    "متطرف",
    "المتطرفين",
    "شيعي",
]

In [70]:
@labeling_function(pre=[bigram_tweet])
def reductio(example):
    """Vote propaganda (Reductio ad Hitlerum) when a tweet n-gram is in `hitlerum`."""
    grams = [
        *example.tweet_tokens,
        *example.tweet_lemmas,
        *example.bigram_tokens,
        *example.bigram_lemmas,
    ]
    return prop if np.in1d(grams, hitlerum).any() else ab

In [71]:
@labeling_function()
def exaggeration(example):
    """Vote propaganda when the first sentence has an adjective whose lemma starts with "أ".

    Abstains when the tweet has no parsed sentence.
    """
    sentences = example.docs.sentences
    if not len(sentences):
        return ab
    has_elative = any(
        w.lemma.startswith("أ") and w.upos == "ADJ" for w in sentences[0].words
    )
    return prop if has_elative else ab

In [72]:
@labeling_function()
def pronouns(example):
    """Label tweet as propaganda if has at least one pronoun.

    Only the first parsed sentence is inspected. Abstains when the tweet has
    no parsed sentence, instead of raising ``IndexError``, matching the guard
    used by ``exaggeration``.
    """
    if not len(example.docs.sentences):
        return ab
    # One pronoun is enough, so stop at the first match.
    for word in example.docs.sentences[0].words:
        if word.upos == "PRON":
            return prop
    return ab

In [73]:
# Classes of the XLM-RoBERTa zero-shot model; the xlmroberta_* labeling
# functions read index 0 as "transparent" and index -1 as "propaganda".
candidate_labels = ["transparent", "propaganda"]
candidate_labels

['transparent', 'propaganda']

In [74]:
# The included scores are fine-tuned.


@labeling_function()
def xlmroberta_prop(example):
    """Vote propaganda when the zero-shot top label is propaganda with score >= 0.90.

    Abstains when the zero-shot prediction is missing.
    """
    prediction = example.xlmroberta_label
    if pd.isna(prediction):
        return ab
    if (
        prediction["scores"][0] >= 0.90
        and prediction["labels"][0] == candidate_labels[-1]
    ):
        return prop
    return ab

In [75]:
# The included scores are fine-tuned.


@labeling_function()
def xlmroberta_gen(example):
    """Vote transparent when the zero-shot top label is transparent with score >= 0.75.

    Abstains when the zero-shot prediction is missing.
    """
    prediction = example.xlmroberta_label
    if pd.isna(prediction):
        return ab
    if (
        prediction["scores"][0] >= 0.75
        and prediction["labels"][0] == candidate_labels[0]
    ):
        return gen
    return ab

In [76]:
# registry of the labeling functions handed to the applier
# NOTE: the position of each LF in this list is its column in the label
# matrices, and later cells select LFs by that position — reorder with care
lfs = [
    missing_bio,
    missing_loc,
    created_2018_2019,
    created_2011_2012,
    bio_keywords,
    follow_ratio_prop,
    follow_ratio_gen,
    contain_url,
    contain_mention,
    labeling_sarcasm,
    labeling_hate,
    contain_ent,
    ent_free,
    contain_question,
    loaded_language,
    genuine_language,
    loaded_proppy,
    genuine_proppy,
    loaded_hate,
    loaded_sarcasm,
    gen_hate,
    gen_sarcasm,
    flag_wave,
    repetition,
    distant_supervision_prop,
    distant_supervision_gen,
    slogans,
    reductio,
    exaggeration,
    pronouns,
    xlmroberta_prop,
    xlmroberta_gen,
]

lfs += proppy_lfs  # adds the LFs of the split proppy lexicons; comment this line out if the lexicon is not split

print(f"We have {len(lfs)} LFs used.")

We have 50 LFs used.


In [77]:
# building the applier that runs every LF in `lfs` over a dataframe
applier = PandasLFApplier(lfs)

In [78]:
# applying the LFs to the labeled (dev) data to get its label matrix
L_dev = applier.apply(labeled_data)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:23<00:00, 21.60it/s]


In [79]:
# summarizing every LF on the dev set against the gold labels and storing the
# summary on disk so it can be reloaded without re-applying the LFs
results = LFAnalysis(L_dev, lfs).lf_summary(labeled_data.label.values)
results.to_json(lfs_analysis_path)
results

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
missing_bio,0,[1],0.076,0.076,0.076,3,35,0.078947
missing_loc,1,[1],0.41,0.41,0.41,20,185,0.097561
created_2018_2019,2,[1],0.084,0.084,0.084,3,39,0.071429
created_2011_2012,3,[0],0.288,0.288,0.282,127,17,0.881944
bio_keywords,4,[0],0.066,0.066,0.066,32,1,0.969697
follow_ratio_prop,5,[1],0.216,0.216,0.216,8,100,0.074074
follow_ratio_gen,6,[0],0.004,0.004,0.004,2,0,1.0
contain_url,7,[0],0.044,0.044,0.044,21,1,0.954545
contain_mention,8,[1],0.652,0.652,0.652,29,297,0.088957
labeling_sarcasm,9,[1],0.006,0.006,0.006,2,1,0.666667


In [80]:
# applying the LFs to the unlabeled data; there are no gold labels here, so
# the summary has no correctness columns
# NOTE: this is the slow step — the recorded run took about 3h20m for 195,671 rows
L_train = applier.apply(unlabeled_data)
LFAnalysis(L_train, lfs).lf_summary()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 195671/195671 [3:20:08<00:00, 16.29it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
missing_bio,0,[1],0.068758,0.068758,0.068758
missing_loc,1,[1],0.25572,0.25572,0.25572
created_2018_2019,2,[1],0.04853,0.04853,0.04853
created_2011_2012,3,[0],0.371465,0.371465,0.36637
bio_keywords,4,[0],0.326385,0.326385,0.322966
follow_ratio_prop,5,[1],0.05753,0.05753,0.05753
follow_ratio_gen,6,[0],0.002054,0.002054,0.002014
contain_url,7,[0],0.461223,0.461223,0.45463
contain_mention,8,[1],0.478221,0.478221,0.478221
labeling_sarcasm,9,[1],0.006307,0.006307,0.006307


In [81]:
# attaching the tweet ids as an extra last column, so that the stored label
# matrices can be matched back to the rows of their dataframes
L_dev_ids = np.column_stack((L_dev, labeled_data.tweetid.to_numpy()))
L_train_ids = np.column_stack((L_train, unlabeled_data.tweetid.to_numpy()))
L_dev_ids.shape, L_train_ids.shape

((500, 51), (195671, 51))

In [82]:
# persisting the label matrices together with their trailing tweet-id column
np.save(lfs_dev_labels_path, L_dev_ids)
np.save(lfs_train_labels_path, L_train_ids)

In [77]:
# loading the saved data
# NOTE: L_train / L_dev loaded here still carry the tweet id as their last
# column (51 columns); it is stripped a few cells below
# SECURITY: allow_pickle=True unpickles the file content — only load files
# produced by this notebook, never untrusted ones

results = pd.read_json(lfs_analysis_path)
L_train = np.load(lfs_train_labels_path, allow_pickle=True)
L_dev = np.load(lfs_dev_labels_path, allow_pickle=True)

In [78]:
results.head(2)

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
missing_bio,0,[1],0.076,0.076,0.076,3,35,0.078947
missing_loc,1,[1],0.41,0.41,0.41,20,185,0.097561


In [79]:
L_dev.shape, L_train.shape

((500, 51), (195671, 51))

In [80]:
L_train[:2, -1]

array(['1424769773917908996', 1055215995529428993], dtype=object)

In [81]:
unlabeled_data.head(2)

Unnamed: 0,tweetid,userid,user_display_name,user_screen_name,user_reported_location,user_profile_description,follower_count,following_count,account_creation_date,tweet_text,...,hashtags,urls,user_mentions,text,emojis,word_count,docs,sarcasm,hate,xlmroberta_label
0,1.092746391963779e+18,2321495687.0,إمارة المنطقة الشرقية,emara_sharqia,,الحساب الرسمي لـ #إمارة_المنطقة_الشرقية - المم...,475837.0,7.0,2014-02-03 08:49:37+00:00,#الأمير_سعود_بن_نايف لأهالي #العوامية\n#مشروع_...,...,3,4,0,# الأمير_سعود_بن_نايف لأهالي# العوامية# مشروع_...,0,44,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","{'label': 'Non-Sarcasm', 'score': 0.9979982972...","{'label': 'not offensive', 'score': 0.99977093...",{'sequence': '# الأمير_سعود_بن_نايف لأهالي# ال...
1,1.364675289222103e+18,312576919.0,عبدالله السبع,7Alsabe,الرياض,المحرر التقني في صحيفة اندبندنت العربية alsabe...,1379854.0,497.0,2011-06-07 10:29:38+00:00,اللهم لك الحمد على سلامة سيدي #ولي_العهد \n\nل...,...,2,1,0,اللهم لك الحمد على سلامة سيدي# ولي_العهد لاباس...,0,15,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","{'label': 'Non-Sarcasm', 'score': 0.9974174499...","{'label': 'not offensive', 'score': 0.99885535...",{'sequence': 'اللهم لك الحمد على سلامة سيدي# و...


In [82]:
# putting the unlabeled rows in the order of the loaded label matrix:
# rows are looked up by the tweet ids stored in its last column
unlabeled_data = (
    unlabeled_data.set_index("tweetid", drop=False)
    .loc[L_train[:, -1]]
    .reset_index(drop=True)
)
unlabeled_data.head(2)

Unnamed: 0,tweetid,userid,user_display_name,user_screen_name,user_reported_location,user_profile_description,follower_count,following_count,account_creation_date,tweet_text,...,hashtags,urls,user_mentions,text,emojis,word_count,docs,sarcasm,hate,xlmroberta_label
0,1424769773917908996,701177444,,,,,,,NaT,هيئة الرقابة ومكافحة الفساد تباشر عدداً من الق...,...,1,1,0,هيئة الرقابة ومكافحة الفساد تباشر عددا من القض...,0,18,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","{'label': 'Non-Sarcasm', 'score': 0.9980459213...","{'label': 'not offensive', 'score': 0.99835205...",{'sequence': 'هيئة الرقابة ومكافحة الفساد تباش...
1,1055215995529428993,2415183190,شركة تنظيف منازل بالرياض,cleanhouse_sa_0,الرياض,"شركة تنظيف المنازل بالرياض,خدمات شركتنا هي الأ...",89784.0,71165.0,2014-03-16 00:00:00,RT @7lal5: رددوو معي\n.\nسبحان الله\n والحمدال...,...,0,0,1,رددوو معي. سبحان الله والحمدالله ولااله الا ال...,1,22,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","{'label': 'Non-Sarcasm', 'score': 0.9970303773...","{'label': 'not offensive', 'score': 0.99778336...",{'sequence': 'رددوو معي. سبحان الله والحمدالله...


In [83]:
# putting the labeled rows in the order of the loaded dev label matrix:
# rows are looked up by the tweet ids stored in its last column
labeled_data = (
    labeled_data.set_index("tweetid", drop=False)
    .loc[L_dev[:, -1]]
    .reset_index(drop=True)
)
labeled_data.head(2)

Unnamed: 0,tweetid,userid,user_display_name,user_screen_name,user_reported_location,user_profile_description,follower_count,following_count,account_creation_date,tweet_text,...,urls,user_mentions,text,emojis,word_count,docs,sarcasm,hate,xlmroberta_label,label
0,1129454625315336193,991682665567973377,﮼ظلال ﮼آنثي,WP0___,,لا يوجد لدي حسابات اخر غير هذا الحساب @wp0 فقط...,26674.0,22455.0,2018-05-02 00:00:00,RT @Power_of_heart_: أولئك الذين هدى الله فبهد...,...,0,1,أولئك الذين هدى الله فبهداهم اقتده قل لاأسألكم...,0,24,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","{'label': 'Non-Sarcasm', 'score': 0.9956616759...","{'label': 'not offensive', 'score': 0.99967229...",{'sequence': 'أولئك الذين هدى الله فبهداهم اقت...,0
1,1119278791971545088,2519127244,♩. لحن الحياةة ..,Rl5l6,,البعض في عقائد الحُب يهُود يقاتلوُن دوُن هُدنه,19045.0,716.0,2014-04-30 00:00:00,أشهد أن لا إله إلا الله\n#نبيل_شعيل_في_ابوووظبي,...,0,0,أشهد أن لا إله إلا الله# نبيل_شعيل_في_ابووظبي,0,8,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","{'label': 'Non-Sarcasm', 'score': 0.9977906942...","{'label': 'not offensive', 'score': 0.99650722...",{'sequence': 'أشهد أن لا إله إلا الله# نبيل_شع...,0


In [84]:
# sanity check: each re-ordered dataframe must have as many rows as its label matrix
assert len(unlabeled_data) == len(L_train)
assert len(labeled_data) == len(L_dev)

In [85]:
# dropping the trailing tweet-id column so that only the LF votes remain
L_train = L_train[:, :-1].copy()
L_dev = L_dev[:, :-1].copy()

In [86]:
L_dev.shape, L_train.shape

((500, 50), (195671, 50))

In [87]:
# the matrices were loaded with dtype=object (see the id column shown above);
# casting the remaining vote columns to 32-bit integers
L_train = L_train.astype(np.int32)
L_dev = L_dev.astype(np.int32)

In [88]:
results

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
missing_bio,0,[1],0.076,0.076,0.076,3,35,0.078947
missing_loc,1,[1],0.41,0.41,0.41,20,185,0.097561
created_2018_2019,2,[1],0.084,0.084,0.084,3,39,0.071429
created_2011_2012,3,[0],0.288,0.288,0.282,127,17,0.881944
bio_keywords,4,[0],0.066,0.066,0.066,32,1,0.969697
follow_ratio_prop,5,[1],0.216,0.216,0.216,8,100,0.074074
follow_ratio_gen,6,[0],0.004,0.004,0.004,2,0,1.0
contain_url,7,[0],0.044,0.044,0.044,21,1,0.954545
contain_mention,8,[1],0.652,0.652,0.652,29,297,0.088957
labeling_sarcasm,9,[1],0.006,0.006,0.006,2,1,0.666667


In [303]:
# selecting the LFs that maximize the performance of the label model:
# an LF is rejected when its dev accuracy is below 0.35 or its dev coverage
# is 0.5 or more (a missing accuracy does not reject it)
low_accuracy = results["Emp. Acc."] < 0.35
high_coverage = results["Coverage"] >= 0.5
flag = ~(low_accuracy | high_coverage).to_numpy()

# remove user signals
# they are excluded by name instead of by hard-coded position (3, 4, 6, 7 in
# `results`): this keeps working if `lfs` is reordered and does not raise
# when a threshold above has already dropped one of them
user_signal_lfs = [
    "created_2011_2012",
    "bio_keywords",
    "follow_ratio_gen",
    "contain_url",
]
is_user_signal = results.index.isin(user_signal_lfs)

inds = results[flag & ~is_user_signal].j.to_list()
# append tweet signals
# inds.append(12)
# inds.append(15)
len(inds)

8

In [304]:
# displaying the names of the LFs chosen
results.iloc[inds, :].index.to_list()

['labeling_sarcasm',
 'loaded_language',
 'loaded_sarcasm',
 'distant_supervision_prop',
 'distant_supervision_gen',
 'reductio',
 'xlmroberta_prop',
 'xlmroberta_gen']

In [305]:
# keeping only the columns of the selected LFs (column order follows `inds`)
L_train_ = L_train[:, inds].copy()
L_dev_ = L_dev[:, inds].copy()

In [306]:
LFAnalysis(L_dev_, np.array(lfs)[inds]).lf_summary(labeled_data.label.values)

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
labeling_sarcasm,0,[1],0.006,0.006,0.002,2,1,0.666667
loaded_language,1,[1],0.052,0.04,0.04,17,9,0.653846
loaded_sarcasm,2,[1],0.008,0.006,0.002,3,1,0.75
distant_supervision_prop,3,[1],0.008,0.006,0.004,3,1,0.75
distant_supervision_gen,4,[0],0.24,0.128,0.018,111,9,0.925
reductio,5,[1],0.012,0.006,0.006,4,2,0.666667
xlmroberta_prop,6,[1],0.022,0.006,0.004,4,7,0.363636
xlmroberta_gen,7,[0],0.454,0.14,0.03,208,19,0.9163


In [307]:
LFAnalysis(L_train_, np.array(lfs)[inds]).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
labeling_sarcasm,0,[1],0.006307,0.006307,0.001794
loaded_language,1,[1],0.034594,0.027403,0.020013
loaded_sarcasm,2,[1],0.011049,0.009,0.003036
distant_supervision_prop,3,[1],0.112996,0.068365,0.054551
distant_supervision_gen,4,[0],0.220758,0.13561,0.015097
reductio,5,[1],0.027071,0.021046,0.013865
xlmroberta_prop,6,[1],0.028957,0.012991,0.004605
xlmroberta_gen,7,[0],0.5276,0.199171,0.078658


In [308]:
# estimating the class balance in the unlabeled data from the labeled.
# value_counts() orders by frequency, so sort_index() is needed to guarantee
# that position i of the list is the proportion of class i
# NOTE(review): the tuning cell below passes class_balance=None, so `w` is
# not used there — confirm whether that is intended

w = labeled_data.label.value_counts(normalize=True).sort_index().to_list()
w

[0.904, 0.096]

In [309]:
# tuning the L2 regularization parameter of the label model:
# one model is fitted per candidate value and the one with the highest dev
# score on `metric` (F1) is kept; on ties the later candidate wins (">=")
# the original note stressed that precision needs to surpass 50% for the
# noise-aware loss

metric = "f1"
l2_values = np.arange(0.0, 0.1, 0.01)

# training settings shared by all candidates; only `l2` varies
fit_config = dict(
    n_epochs=2000,
    lr=0.001,
    seed=42,
    class_balance=None,
    progress_bar=False,
    lr_scheduler="constant",
    optimizer="adam",
    lr_scheduler_config={"warmup_percentage": 0.05, "warmup_unit": "epochs"},
)

best_score, best_model = 0, None
for l2 in tqdm(l2_values, total=len(l2_values)):
    label_model = LabelModel(cardinality=2, verbose=False, device="cuda")
    label_model.fit(L_train=L_train_, l2=l2, **fit_config)
    score = label_model.score(L_dev_, labeled_data.label, metrics=[metric])
    if score[metric] >= best_score:
        best_score, best_model = score[metric], label_model

  0%|          | 0/10 [00:00<?, ?it/s]



In [310]:
# displaying the configs that led to the best label model
best_model.train_config

TrainConfig(n_epochs=2000, lr=0.001, l2=0.02, optimizer='adam', optimizer_config=OptimizerConfig(sgd_config=SGDOptimizerConfig(momentum=0.9), adam_config=AdamOptimizerConfig(amsgrad=False, betas=(0.9, 0.999)), adamax_config=AdamaxOptimizerConfig(betas=(0.9, 0.999), eps=1e-08)), lr_scheduler='constant', lr_scheduler_config=LRSchedulerConfig(warmup_steps=0, warmup_unit='epochs', warmup_percentage=0.05, min_lr=0.0, exponential_config=ExponentialLRSchedulerConfig(gamma=0.9), step_config=StepLRSchedulerConfig(gamma=0.9, step_size=5)), prec_init=0.7, seed=42, log_freq=10, mu_eps=None)

Reporting the performance of the label model


In [311]:
best_model.score(
    L_dev_,
    labeled_data.label,
    tie_break_policy="abstain",
    metrics=["accuracy", "precision", "recall", "f1", "f1_macro", "f1_micro"],
)



{'accuracy': 0.9041533546325878,
 'precision': 0.5777777777777777,
 'recall': 0.7027027027027027,
 'f1': 0.6341463414634145,
 'f1_macro': 0.7894996413199427,
 'f1_micro': 0.9041533546325878}

In [312]:
# predicting on the dev set and keeping only the rows where the label model
# did not abstain (-1), for both the gold labels and the predictions
y_pred = best_model.predict(L_dev_)
voted = y_pred != -1
y_true = labeled_data.label[voted]
y_pred = y_pred[voted]

In [313]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.93      0.94       276
           1       0.58      0.70      0.63        37

    accuracy                           0.90       313
   macro avg       0.77      0.82      0.79       313
weighted avg       0.91      0.90      0.91       313



Saving the weakly labeled dataset


In [314]:
# labeled_data["snorkel"] = best_model.predict(L_dev_)

In [315]:
# a single inference pass returns both the hard labels and the class
# probabilities they are derived from
weak_preds, weak_probs = best_model.predict(L_train_, return_probs=True)

In [316]:
pd.Series(weak_preds).value_counts(normalize=True)

 0    0.537796
-1    0.281687
 1    0.180517
Name: proportion, dtype: float64

In [317]:
pd.Series(weak_preds[weak_preds != -1]).value_counts(normalize=True)

0    0.748693
1    0.251307
Name: proportion, dtype: float64

In [318]:
# preparing the dataset to save: attach the weak label and the class
# probabilities to every row, then drop the rows where the label model abstained (-1)
# NOTE: this cell is not idempotent — once rows have been dropped, re-running
# it fails because `weak_preds` no longer matches the dataframe length
unlabeled_data["label"] = weak_preds
unlabeled_data["probs"] = list(weak_probs)
unlabeled_data = unlabeled_data[weak_preds != -1]

In [319]:
# writing the weakly labeled tweets (id, text, label, probabilities) to disk
# NOTE(review): tweet ids show up with mixed types (str / int / float) in the
# outputs above — verify that they survive the JSON round trip unchanged
unlabeled_data[["tweetid", "text", "label", "probs"]].to_json(weakly_labeled_data_path)