# Labeling Functions


In [1]:
import re
import glob
import emoji
import fasttext
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from nltk.tokenize import word_tokenize
from nltk import ngrams
from nltk.corpus import stopwords
from sentence_transformers import util

from snorkel.preprocess import preprocessor
from snorkel.labeling import labeling_function
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis
from snorkel.labeling.model import LabelModel
from sklearn.metrics import classification_report

In [2]:
# defining data paths
prop_data_annotated = "../data/processed/propaganda_annotated.pkl"
gen_data_annotated = "../data/processed/genuine_annotated.pkl"
wanlp_prop_data_path = "../data/raw/task1_train.json"
weakly_labeled_data_path = "../data/processed/train.json"

# we need the lf_dev data to report the performance of the LFs,
# and then remove it, together with the test data, from the unlabeled data
lf_dev_data_path = "../data/processed/lf_dev.json"
test_data_path = "../data/processed/test_gold.json"

# defining lexicons paths
proppy_lex_paths = "../data/raw/proppy_lexicons/*.csv"
loaded_lex_path = "../data/raw/loaded-language-lexicons.csv"

# path of the pre-trained fastText model (loaded later with fasttext.load_model)
fasttext_model_path = "../models/cc.ar.300.bin"

In [3]:
# defining the needed fields by the working labeling functions
# in case of doing the analysis from scratch, comment the variable

# fields = [
#     "tweetid",
#     "user_profile_description",
#     "tweet_text",
#     "is_retweet",
#     "quote_count",
#     "reply_count",
#     "like_count",
#     "retweet_count",
#     "hashtags",
#     "urls",
#     "user_mentions",
#     "text",
#     "emojis",
#     "word_count",
#     "docs",
#     "is_irony",
#     "is_hate",
#     "xlmroberta_label",
# ]

In [4]:
# df1 is read from the propaganda pickle and df2 from the genuine pickle.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
df1 = pd.read_pickle(prop_data_annotated)  # [fields]
df2 = pd.read_pickle(gen_data_annotated)  # [fields]

In [5]:
# the following lines handle date conversion
# NOTE(review): the two frames are parsed with different epoch units ("ms" vs "ns");
# presumably each source stored its timestamps differently — confirm against the data.

df1["account_creation_date"] = pd.to_datetime(df1.account_creation_date, unit="ms")
df2["account_creation_date"] = pd.to_datetime(df2.account_creation_date, unit="ns")

In [6]:
# combining the propaganda and genuine users data, then shuffling the rows.
# A fixed random_state makes the shuffle (and every result that depends on the
# row order) reproducible after a kernel restart.

unlabeled_data = pd.concat([df1, df2], ignore_index=True)
unlabeled_data = unlabeled_data.sample(frac=1.0, random_state=42).reset_index(drop=True)

In [7]:
unlabeled_data.head(2)

Unnamed: 0,tweetid,userid,user_display_name,user_screen_name,user_reported_location,user_profile_description,follower_count,following_count,account_creation_date,tweet_text,...,hashtags,urls,user_mentions,text,emojis,word_count,docs,sarcasm,hate,xlmroberta_label
0,1252315523897004032,301120393,اليوم السابع,youm7,Egypt,الحساب الرسمى لجريدة اليوم السابع علي تويتر..ا...,12894047.0,8.0,2011-05-18 22:35:56+00:00,مدحت نافع: انهيار أسعار #النفط يصب فى مصلحة ال...,...,2,1,0,مدحت نافع: انهيار أسعار النفط يصب فى مصلحة الا...,0,12,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","{'label': 'Non-Sarcasm', 'score': 0.9979304075...","{'label': 'not offensive', 'score': 0.99924039...",{'sequence': 'مدحت نافع: انهيار أسعار النفط يص...
1,1036264255950139393,844254394938015744,هيدو,hedoo1982,state of kuwait,الحساب الرسمي لصحيفة هيدو الرياضية ﮼الممثل،الق...,444316.0,660.0,2017-03-21 00:00:00,شاهد ..المدرب الإيطالي روبرتو مانشيني في جسد ر...,...,0,0,0,شاهد ..المدرب الإيطالي روبرتو مانشيني في جسد ر...,0,14,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","{'label': 'Non-Sarcasm', 'score': 0.9978727102...","{'label': 'not offensive', 'score': 0.98803830...",{'sequence': 'شاهد ..المدرب الإيطالي روبرتو ما...


In [8]:
unlabeled_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196611 entries, 0 to 196610
Data columns (total 25 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   tweetid                   196609 non-null  object 
 1   userid                    196610 non-null  object 
 2   user_display_name         187650 non-null  object 
 3   user_screen_name          187650 non-null  object 
 4   user_reported_location    146204 non-null  object 
 5   user_profile_description  183083 non-null  object 
 6   follower_count            187650 non-null  float64
 7   following_count           187650 non-null  float64
 8   account_creation_date     187650 non-null  object 
 9   tweet_text                196611 non-null  object 
 10  is_retweet                196611 non-null  bool   
 11  quote_count               196606 non-null  float64
 12  reply_count               196607 non-null  float64
 13  like_count                196607 non-null  o

In [9]:
labeled = pd.read_json(lf_dev_data_path)
labeled.head(2)

Unnamed: 0,tweetid,tweet_text,text,tech,label
0,924924839902793728,RT @Amal_onzi: 🕊💕هُو جنْةبعِيني.,RT : هو جنةبعيني.,,0
1,1074734231887187970,ر٣ #تركيا_تجاهر_بالمعاصي,ر٣ تركيا تجاهر بالمعاصي,smears - name-calling - loaded language,1


In [10]:
labeled.info()

<class 'pandas.core.frame.DataFrame'>
Index: 500 entries, 0 to 499
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweetid     500 non-null    int64 
 1   tweet_text  500 non-null    object
 2   text        500 non-null    object
 3   tech        48 non-null     object
 4   label       500 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 23.4+ KB


In [11]:
test = pd.read_json(test_data_path)
test.head(2)

Unnamed: 0,tweetid,tweet_text,text,tech,label
0,977553193814122496,شاركوا معنا .. في #ساعة_الأرض الليلة \nساعة وا...,شاركوا معنا .. في ساعة الأرض الليلة ساعة واحد ...,,0
1,1005856990436970496,RT @qtfcjohz: َاللهم طهر قلوبنا من كل ضيق \nوي...,RT : اللهم طهر قلوبنا من كل ضيق ويسر أمورنا في...,,0


In [12]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 420 entries, 0 to 419
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweetid     420 non-null    int64 
 1   tweet_text  420 non-null    object
 2   text        420 non-null    object
 3   tech        40 non-null     object
 4   label       420 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 19.7+ KB


In [13]:
# Rows of the pool whose tweet id appears in the LF-development set.
# .copy() gives an independent frame, so the column assignments made on
# labeled_data later do not target a slice of unlabeled_data
# (which triggers pandas' SettingWithCopyWarning).
labeled_data = unlabeled_data[unlabeled_data.tweetid.isin(labeled.tweetid)].copy()
labeled_data.head(2)

Unnamed: 0,tweetid,userid,user_display_name,user_screen_name,user_reported_location,user_profile_description,follower_count,following_count,account_creation_date,tweet_text,...,hashtags,urls,user_mentions,text,emojis,word_count,docs,sarcasm,hate,xlmroberta_label
476,1123736587278462976,SVHXDIHCMTkNAYiBhk7NE23xJm6DqQOIYZ23ojo0Ek=,SVHXDIHCMTkNAYiBhk7NE23xJm6DqQOIYZ23ojo0Ek=,SVHXDIHCMTkNAYiBhk7NE23xJm6DqQOIYZ23ojo0Ek=,,,801.0,333.0,2011-10-13 00:00:00,نلحق عالشعب https://t.co/eXBNBkiMPM,...,0,0,0,نلحق عالشعب,0,2,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","{'label': 'Non-Sarcasm', 'score': 0.9953168630...","{'label': 'not offensive', 'score': 0.97536575...","{'sequence': 'نلحق عالشعب', 'labels': ['transp..."
573,321172108296196096,OF0VE6pC9ESb5cs91Mzoo1VHnaYdmILpuWbqN2bnZQ=,OF0VE6pC9ESb5cs91Mzoo1VHnaYdmILpuWbqN2bnZQ=,OF0VE6pC9ESb5cs91Mzoo1VHnaYdmILpuWbqN2bnZQ=,,مزوح واعشق الصراحة,2166.0,289.0,2011-12-13 00:00:00,انت لا تكرهني انت تكرة الصورة اللتي كونتها عني...,...,0,0,0,انت لا تكرهني انت تكرة الصورة اللتي كونتها عني...,0,15,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","{'label': 'Non-Sarcasm', 'score': 0.9953943490...","{'label': 'not offensive', 'score': 0.99803966...",{'sequence': 'انت لا تكرهني انت تكرة الصورة ال...


In [14]:
labeled_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 500 entries, 476 to 196396
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   tweetid                   500 non-null    object 
 1   userid                    500 non-null    object 
 2   user_display_name         500 non-null    object 
 3   user_screen_name          500 non-null    object 
 4   user_reported_location    295 non-null    object 
 5   user_profile_description  462 non-null    object 
 6   follower_count            500 non-null    float64
 7   following_count           500 non-null    float64
 8   account_creation_date     500 non-null    object 
 9   tweet_text                500 non-null    object 
 10  is_retweet                500 non-null    bool   
 11  quote_count               500 non-null    float64
 12  reply_count               500 non-null    float64
 13  like_count                500 non-null    object 
 14  retweet_co

In [15]:
# dropping the LF-development tweets and the test tweets from the unlabeled pool

unlabeled_data = (
    unlabeled_data
    .loc[lambda df: ~df.tweetid.isin(labeled_data.tweetid)]
    .loc[lambda df: ~df.tweetid.isin(test.tweetid)]
    .reset_index(drop=True)
)

In [16]:
# reset_index returns a new frame, so doing it first means the column
# assignments below never write into a slice of unlabeled_data.
labeled_data = labeled_data.reset_index(drop=True)
labeled_data["text"] = labeled_data.text.apply(lambda x: x.replace("RT :", "").strip())

# One dict lookup per tweet instead of re-filtering `labeled` for every row.
# The pairs are inserted in reverse order so that, if a tweet id occurs more
# than once in `labeled`, its first occurrence is the one that is kept.
gold_by_id = dict(zip(labeled.tweetid.tolist()[::-1], labeled.label.tolist()[::-1]))
labels = [gold_by_id[tweet_id] for tweet_id in labeled_data.tweetid]
labeled_data["label"] = labels

In [17]:
# dropping the rows that have a null in any of the columns listed in `subset`

subset = ["tweetid", "text", "quote_count", "xlmroberta_label"]
unlabeled_data = unlabeled_data.dropna(subset=subset).reset_index(drop=True)

In [18]:
unlabeled_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195840 entries, 0 to 195839
Data columns (total 25 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   tweetid                   195840 non-null  object 
 1   userid                    195840 non-null  object 
 2   user_display_name         186884 non-null  object 
 3   user_screen_name          186884 non-null  object 
 4   user_reported_location    145738 non-null  object 
 5   user_profile_description  182374 non-null  object 
 6   follower_count            186884 non-null  float64
 7   following_count           186884 non-null  float64
 8   account_creation_date     186884 non-null  object 
 9   tweet_text                195840 non-null  object 
 10  is_retweet                195840 non-null  bool   
 11  quote_count               195840 non-null  float64
 12  reply_count               195840 non-null  float64
 13  like_count                195840 non-null  o

In [19]:
labeled_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   tweetid                   500 non-null    object 
 1   userid                    500 non-null    object 
 2   user_display_name         500 non-null    object 
 3   user_screen_name          500 non-null    object 
 4   user_reported_location    295 non-null    object 
 5   user_profile_description  462 non-null    object 
 6   follower_count            500 non-null    float64
 7   following_count           500 non-null    float64
 8   account_creation_date     500 non-null    object 
 9   tweet_text                500 non-null    object 
 10  is_retweet                500 non-null    bool   
 11  quote_count               500 non-null    float64
 12  reply_count               500 non-null    float64
 13  like_count                500 non-null    object 
 14  retweet_co

### User LFs


In [20]:
# integer votes returned by the labeling functions below
prop = 1  # propaganda
gen = 0  # genuine ("transparent")
ab = -1  # abstain (no vote)

In [21]:
@labeling_function()
def missing_bio(example):
    """Vote propaganda when the user's profile description is null; abstain otherwise."""
    has_bio = not pd.isna(example.user_profile_description)
    return ab if has_bio else prop

In [22]:
@labeling_function()
def missing_loc(example):
    """Vote propaganda when the user's reported location is null; abstain otherwise."""
    has_location = not pd.isna(example.user_reported_location)
    return ab if has_location else prop

In [23]:
@labeling_function()
def created_2018_2019(example):
    """Vote propaganda when the account creation year is 2018 or 2019; abstain otherwise."""
    creation_year = example.account_creation_date.year
    return prop if creation_year in (2018, 2019) else ab

In [24]:
@labeling_function()
def created_2011_2012(example):
    """Vote transparent when the account creation year is 2011 or 2012; abstain otherwise."""
    creation_year = example.account_creation_date.year
    return gen if creation_year in (2011, 2012) else ab

In [25]:
@preprocessor(memoize=False)
def tokenize_bio(example):
    """Set `example.bio_tokens` to the word tokens of the bio, or None when the bio is null."""
    bio = example.user_profile_description
    example.bio_tokens = None if pd.isna(bio) else word_tokenize(bio)
    return example

In [26]:
@labeling_function(pre=[tokenize_bio])
def bio_keywords(example):
    """Vote transparent when any keyword below is one of the bio tokens.

    Abstains when the bio is missing (tokens are None) or no keyword matches.
    """
    keys = ["الحساب", "الرسمي", "عضو", "رئيس", "كاتب", "إدارة"]
    bio_tokens = example.bio_tokens
    if bio_tokens is None:
        return ab
    return gen if any(np.in1d(keys, bio_tokens)) else ab

In [27]:
@labeling_function()
def follow_ratio_prop(example):
    """Vote propaganda when 0.8 <= followers / following <= 1.2; abstain otherwise.

    A user who follows nobody is given a ratio of 0.
    """
    followers = example.follower_count
    following = example.following_count
    if following != 0:
        ratio = followers / following
    else:
        ratio = 0
    return prop if 0.8 <= ratio <= 1.2 else ab

In [28]:
@labeling_function()
def follow_ratio_gen(example):
    """Vote transparent when 0 <= followers / following <= 0.2; abstain otherwise.

    A user who follows nobody is given a ratio of 0 and therefore gets this vote.
    """
    followers = example.follower_count
    following = example.following_count
    if following != 0:
        ratio = followers / following
    else:
        ratio = 0
    return gen if 0.0 <= ratio <= 0.2 else ab

### Tweet LFs


In [29]:
@labeling_function()
def contain_url(example):
    """Vote transparent when the tweet's `urls` value is positive; abstain otherwise."""
    return gen if example.urls > 0 else ab

In [30]:
@labeling_function()
def contain_mention(example):
    """Vote propaganda when the tweet's `user_mentions` value is positive; abstain otherwise."""
    return prop if example.user_mentions > 0 else ab

In [31]:
@labeling_function()
def labeling_sarcasm(example):
    """Vote propaganda (name calling) when the tweet has an entity and is tagged "Sarcasm"."""
    # the sarcasm tag is only read for tweets that contain at least one entity
    if not len(example.docs.entities):
        return ab
    return prop if example.sarcasm["label"] == "Sarcasm" else ab

In [32]:
@labeling_function()
def labeling_hate(example):
    """Vote propaganda (name calling) when the tweet has an entity and is tagged "offensive"."""
    # the hate tag is only read for tweets that contain at least one entity
    if not len(example.docs.entities):
        return ab
    return prop if example.hate["label"] == "offensive" else ab

In [33]:
@labeling_function()
def contain_ent(example):
    """Vote propaganda when the parsed tweet has at least one entity; abstain otherwise."""
    return prop if len(example.docs.entities) else ab

In [34]:
@labeling_function()
def ent_free(example):
    """Vote transparent when the parsed tweet has no entities; abstain otherwise."""
    if len(example.docs.entities) == 0:
        return gen
    return ab

In [35]:
@labeling_function()
def contain_question(example):
    """Vote propaganda when the first sentence contains a word tagged AUX.

    The AUX part-of-speech tag is the cue this function uses for a question.
    Abstains when no such word is found or when the parsed tweet has no sentences.
    """
    sentences = example.docs.sentences
    # an empty parse has no first sentence to inspect; abstain instead of raising
    if not len(sentences):
        return ab
    if any(w.upos == "AUX" for w in sentences[0].words):
        return prop
    return ab

In [36]:
# loading the manually crafted loaded-language terms,
# extracted only from the 500 tweets used for labeling-function development.

loaded_lexicons = pd.read_csv(loaded_lex_path).loc[:, "loaded-language"].tolist()
loaded_lexicons[:5]

['جاهر', 'جهر', 'تجاهر بالمعاصي', 'مجاهرة بالمعاصي', 'تجاهر بالمعاصى']

In [37]:
# loading the proppy lexicons introduced in https://arxiv.org/abs/1912.06810

# proppy_lexicons = []
# for file in glob.glob(proppy_lex_paths):
#     with open(file, encoding="utf-8") as f:
#         proppy_lexicons.extend(f.readlines())

# glob.glob returns its matches in arbitrary order, so the matches are sorted to
# make the choice of file deterministic, and an empty match fails with a clear error.
proppy_lex_files = sorted(glob.glob(proppy_lex_paths))
if not proppy_lex_files:
    raise FileNotFoundError(f"No proppy lexicon CSV matches {proppy_lex_paths!r}")
proppy_lexicons = pd.read_csv(proppy_lex_files[0])
proppy_lexicons.head(2)

Unnamed: 0,factives,hedges,implicatives,report_verbs,bias,negative_words,positive_words,negative_colloquial_words,positive_colloquial_words
0,عارف,واضح,يدير,يتهم,إسقاط,أبو وجهين,فالح,مزوره,متحمس
1,يدري,فيما يبدو,يتذكر,اتهم,إجهاض,بوجهين,شاطر,مغترب,حماس


In [38]:
# removing diacritization from proppy lexicons

# pattern = r"[\u0617-\u061A\u064B-\u0652]"
# proppy_lexicons = [re.sub(pattern, "", term.strip()) for term in proppy_lexicons]

In [39]:
proppy_lexicons = proppy_lexicons.to_dict(orient="list")

In [40]:
# dropping the null entries from every lexicon list
proppy_lexicons = {
    lex_name: [term for term in terms if pd.notna(term)]
    for lex_name, terms in proppy_lexicons.items()
}

In [41]:
print("Number of words in each proppy lexicon type ...")
print([len(v) for v in proppy_lexicons.values()])

Number of words in each proppy lexicon type ...
[42, 109, 43, 212, 578, 4630, 1950, 1468, 914]


In [42]:
# we are removing diacritization from stanza lemmas

# Compiled here so the preprocessor does not depend on the `pattern` variable,
# which is only assigned in a later cell.
_TWEET_DIACRITICS = re.compile(r"[\u0617-\u061A\u064B-\u0652]")


@preprocessor(memoize=False)
def tokenize_tweet(example):
    """Attach the tokens and diacritics-free lemmas of the tweet's first sentence.

    Sets `example.tweet_tokens` (word texts) and `example.tweet_lemmas` (word
    lemmas with the diacritic characters removed). Both are empty lists when the
    parsed tweet has no sentences. A word whose lemma is None contributes its
    surface text instead, so the two lists always have the same length.
    """
    if len(example.docs.sentences):
        words = example.docs.sentences[0].words
        example.tweet_tokens = [w.text for w in words]
        example.tweet_lemmas = [
            _TWEET_DIACRITICS.sub("", w.lemma if w.lemma is not None else w.text)
            for w in words
        ]
    else:
        example.tweet_tokens = []
        example.tweet_lemmas = []
    return example

In [43]:
@preprocessor(pre=[tokenize_tweet], memoize=False)
def bigram_tweet(example):
    """Attach space-joined bigrams of the tweet tokens and of the tweet lemmas."""

    def joined_bigrams(units):
        # every bigram becomes a single "first second" string
        return [" ".join(pair) for pair in ngrams(units, 2) if len(pair)]

    example.bigram_tokens = joined_bigrams(example.tweet_tokens)
    example.bigram_lemmas = joined_bigrams(example.tweet_lemmas)
    return example

In [44]:
@labeling_function(pre=[bigram_tweet])
def loaded_language(example):
    """Vote propaganda when a token, lemma, or bigram of the tweet is in `loaded_lexicons`."""
    candidates = [
        *example.tweet_tokens,
        *example.tweet_lemmas,
        *example.bigram_tokens,
        *example.bigram_lemmas,
    ]
    hits = np.in1d(candidates, loaded_lexicons)
    return prop if sum(hits) >= 1 else ab

In [45]:
@labeling_function(pre=[bigram_tweet])
def genuine_language(example):
    """Vote transparent when no token, lemma, or bigram of the tweet is in `loaded_lexicons`."""
    candidates = [
        *example.tweet_tokens,
        *example.tweet_lemmas,
        *example.bigram_tokens,
        *example.bigram_lemmas,
    ]
    hits = np.in1d(candidates, loaded_lexicons)
    return gen if sum(hits) == 0 else ab

In [46]:
# @labeling_function(pre=[bigram_tweet])
# def loaded_proppy(example):
#     """Label tweet as propaganda if it contains any of the proppy lexicons."""
#     tweet_ngrams = example.tweet_tokens + example.tweet_lemmas
#     tweet_ngrams += example.bigram_tokens + example.bigram_lemmas
#     if sum(np.in1d(tweet_ngrams, proppy_lexicons)) >= 1:
#         return prop
#     else:
#         return ab

In [47]:
# @labeling_function(pre=[bigram_tweet])
# def genuine_proppy(example):
#     """Label tweet as transparent if it doesn't contain any of the proppy lexicons."""
#     tweet_ngrams = example.tweet_tokens + example.tweet_lemmas
#     tweet_ngrams += example.bigram_tokens + example.bigram_lemmas
#     if sum(np.in1d(tweet_ngrams, proppy_lexicons)) == 0:
#         return gen
#     else:
#         return ab

In [48]:
name_space = {}
proppy_lfs = []

In [49]:
def _make_loaded_proppy_lf(lex_name):
    """Build the LF that votes propaganda on a match with proppy lexicon `lex_name`.

    A closure replaces the former exec() of generated source code: the lexicon
    name no longer has to be a valid Python identifier, and the LF keeps the
    same name, "loaded_proppy_<lex_name>", through the decorator's `name` argument.
    """

    @labeling_function(name=f"loaded_proppy_{lex_name}", pre=[bigram_tweet])
    def loaded_proppy(example):
        """Label tweet as propaganda if it contains any of the proppy lexicons."""
        tweet_ngrams = example.tweet_tokens + example.tweet_lemmas
        tweet_ngrams += example.bigram_tokens + example.bigram_lemmas
        # the lexicon is looked up at call time, as the generated code did
        if sum(np.in1d(tweet_ngrams, proppy_lexicons[lex_name])) >= 1:
            return prop
        else:
            return ab

    return loaded_proppy


for name in proppy_lexicons.keys():
    # name_space is still filled so the LFs remain reachable by name
    name_space[f"loaded_proppy_{name}"] = _make_loaded_proppy_lf(name)
    proppy_lfs.append(name_space[f"loaded_proppy_{name}"])

In [50]:
def _make_genuine_proppy_lf(lex_name):
    """Build the LF that votes transparent when proppy lexicon `lex_name` has no match.

    A closure replaces the former exec() of generated source code: the lexicon
    name no longer has to be a valid Python identifier, and the LF keeps the
    same name, "genuine_proppy_<lex_name>", through the decorator's `name` argument.
    """

    @labeling_function(name=f"genuine_proppy_{lex_name}", pre=[bigram_tweet])
    def genuine_proppy(example):
        """Label tweet as transparent if it doesn't contain any of the proppy lexicons."""
        tweet_ngrams = example.tweet_tokens + example.tweet_lemmas
        tweet_ngrams += example.bigram_tokens + example.bigram_lemmas
        # the lexicon is looked up at call time, as the generated code did
        if sum(np.in1d(tweet_ngrams, proppy_lexicons[lex_name])) == 0:
            return gen
        else:
            return ab

    return genuine_proppy


for name in proppy_lexicons.keys():
    # name_space is still filled so the LFs remain reachable by name
    name_space[f"genuine_proppy_{name}"] = _make_genuine_proppy_lf(name)
    proppy_lfs.append(name_space[f"genuine_proppy_{name}"])

In [51]:
len(proppy_lfs)

18

In [52]:
@labeling_function()
def loaded_hate(example):
    """Vote propaganda (loaded language) when the hate tag of the tweet is "offensive"."""
    return prop if example.hate["label"] == "offensive" else ab

In [53]:
@labeling_function()
def loaded_sarcasm(example):
    """Vote propaganda (loaded language) when the sarcasm tag of the tweet is "Sarcasm"."""
    return prop if example.sarcasm["label"] == "Sarcasm" else ab

In [54]:
@labeling_function()
def gen_hate(example):
    """Vote transparent when the hate tag of the tweet is "not offensive"; abstain otherwise."""
    return gen if example.hate["label"] == "not offensive" else ab

In [55]:
@labeling_function()
def gen_sarcasm(example):
    """Vote transparent when the sarcasm tag of the tweet is "Non-Sarcasm"; abstain otherwise."""
    return gen if example.sarcasm["label"] == "Non-Sarcasm" else ab

In [56]:
# flag_waving

# matches the "Number=<value>" entry of a morphological feature string
flag_engine = re.compile(r"Number=\w+")


@labeling_function()
def flag_wave(example):
    """Vote propaganda (flag-waving) when the first sentence has a plural pronoun.

    A word counts when its tag is PRON and the first "Number=..." entry of its
    feature string is "Plur". Abstains otherwise, including when the parsed
    tweet has no sentences.
    """
    sentences = example.docs.sentences
    if not len(sentences):
        return ab
    for w in sentences[0].words:
        if w.upos != "PRON":
            continue
        # feats may be None, and a pronoun may carry no Number feature at all;
        # neither case is a plural pronoun, so skip the word instead of raising.
        number = flag_engine.search(w.feats or "")
        if number is not None and number.group().split("=")[-1] == "Plur":
            return prop
    return ab

In [57]:
# loading the Arabic NLTK stop words with their diacritics removed;
# `pattern` is the character class of the diacritics that get stripped.
pattern = r"[\u0617-\u061A\u064B-\u0652]"
arabic_stop_words = [re.sub(pattern, "", word) for word in stopwords.words("arabic")]

In [58]:
@labeling_function(pre=[tokenize_tweet])
def repetition(example):
    """Vote propaganda when a non-stop-word token occurs at least twice in the tweet."""
    content_tokens = [tok for tok in example.tweet_tokens if tok not in arabic_stop_words]
    # some token is repeated exactly when de-duplication shortens the list
    has_repeat = len(set(content_tokens)) < len(content_tokens)
    return prop if has_repeat else ab

In [59]:
# loading the WANLP propaganda dataset for distant supervision
# dataset source: https://gitlab.com/araieval/propaganda-detection


wanlp_train = pd.read_json(wanlp_prop_data_path)


def label_processing(x):
    """Return 0 when "no technique" is contained in `x`, and 1 otherwise."""
    return 0 if "no technique" in x else 1


wanlp_train["labels"] = wanlp_train["labels"].apply(label_processing)

wanlp_train.head(2)

Unnamed: 0,id,text,labels
0,1358824915483435008,"#بي_بي_سي_ترندينغ: النساء ""تثرثر كثيرا"" رئيس أ...",0
1,1389927866356412416,"""ده مش معتقل ده أحسن من اللوكاندة"".. جدل وسخري...",1


In [60]:
def clean_text(text):
    """Strip links, mentions, hashtag marks, diacritics, and emojis from `text`.

    Underscores become spaces, runs of whitespace collapse to one space, and
    any "RT :" left in the result is removed before the final strip.
    """
    substitutions = [
        (r"http\S+|t\.co/\S+", ""),  # links
        (r"@\w+", ""),  # mentions
        (r"#", ""),  # hashtag marks
        (r"_", " "),  # underscores that join hashtag words
        (r"[\u0617-\u061A\u064B-\u0652]", ""),  # tashkeel - from @bakriano
    ]
    cleaned = text
    for regex, replacement in substitutions:
        cleaned = re.sub(regex, replacement, cleaned)
    # emojis
    cleaned = emoji.replace_emoji(cleaned, replace="")
    # new lines and repeated white space
    cleaned = re.sub(r"\s+", " ", cleaned)
    return cleaned.replace("RT :", "").strip()


wanlp_train["text"] = wanlp_train["text"].apply(clean_text)

In [61]:
encoder = fasttext.load_model(fasttext_model_path)

# one sentence vector per WANLP tweet, each stored as a plain list
wanlp_hidden_states = [
    list(encoder.get_sentence_vector(tweet))
    for tweet in tqdm(wanlp_train.text.values, total=len(wanlp_train))
]



  0%|          | 0/504 [00:00<?, ?it/s]

In [62]:
wanlp_hidden_states = np.array(wanlp_hidden_states)
wanlp_hidden_states.shape

(504, 300)

In [63]:
@preprocessor()
def get_sim_scores(example):
    """Attach the cosine similarities between the tweet vector and all WANLP vectors."""
    example.sim_scores = util.cos_sim(
        encoder.get_sentence_vector(example.text), wanlp_hidden_states
    )
    return example

In [64]:
# The included scores are fine-tuned.


@labeling_function(pre=[get_sim_scores])
def distant_supervision_prop(example):
    """Vote propaganda when the most similar WANLP tweet is labeled 1 with similarity >= 0.75."""
    scores = example.sim_scores
    nearest = scores.argmax(dim=-1).item()
    if wanlp_train.labels.values[nearest] != 1:
        return ab
    return prop if scores[-1][nearest].item() >= 0.75 else ab

In [65]:
# The included scores are fine-tuned.


@labeling_function(pre=[get_sim_scores])
def distant_supervision_gen(example):
    """Vote transparent when the most similar WANLP tweet is labeled 0 with similarity >= 0.65."""
    scores = example.sim_scores
    nearest = scores.argmax(dim=-1).item()
    if wanlp_train.labels.values[nearest] != 0:
        return ab
    return gen if scores[-1][nearest].item() >= 0.65 else ab

In [66]:
@labeling_function()
def slogans(example):
    """Vote propaganda when the tweet text matches one of the slogan patterns below."""
    slogan_patterns = (r"لا ل\w+", r"نعم ل\w+", r"لا بديل")
    found = any(re.search(regex, example.text) for regex in slogan_patterns)
    return prop if found else ab

In [67]:
# manually compiled list of entities, groups, and figures whose mention is
# treated as a Reductio ad Hitlerum cue.

hitlerum = [
    "هتلر",
    "البغدادي",
    "اوردوغان",
    "قطر",
    "داعش",
    "حوثي",
    "تركيا",
    "الشيعة",
    "إيران",
    "ايران",
    "اخونجي",
    "اخوان",
    "إخوان",
    "إخوانجي",
    "أوردوغان",
    "الحوثي",
    "الحوثيين",
    "ستالين",
    "الإخوان",
    "الاخوان",
    "إرهابي",
    "الإرهابيين",
    "متطرف",
    "المتطرفين",
    "شيعي",
]

In [68]:
@labeling_function(pre=[bigram_tweet])
def reductio(example):
    """Vote propaganda (Reductio ad Hitlerum) when a tweet n-gram is in `hitlerum`."""
    candidates = [
        *example.tweet_tokens,
        *example.tweet_lemmas,
        *example.bigram_tokens,
        *example.bigram_lemmas,
    ]
    return prop if any(np.in1d(candidates, hitlerum)) else ab

In [69]:
@labeling_function()
def exaggeration(example):
    """Vote propaganda when the first sentence has an adjective whose lemma starts with "أ".

    This is the check used for the "أفعل" preference form. Abstains when the
    parsed tweet has no sentences or no such adjective.
    """
    sentences = example.docs.sentences
    if len(sentences) == 0:
        return ab
    has_preference_form = any(
        w.lemma.startswith("أ") and w.upos == "ADJ" for w in sentences[0].words
    )
    return prop if has_preference_form else ab

In [70]:
@labeling_function()
def pronouns(example):
    """Vote propaganda when the first sentence contains at least one word tagged PRON.

    Abstains otherwise, including when the parsed tweet has no sentences.
    """
    sentences = example.docs.sentences
    # an empty parse has no first sentence to inspect; abstain instead of raising
    if not len(sentences):
        return ab
    if any(word.upos == "PRON" for word in sentences[0].words):
        return prop
    return ab

In [71]:
# defining the XLM-RoBERTa Zero-Shot model classes
candidate_labels = ["transparent", "propaganda"]
candidate_labels

['transparent', 'propaganda']

In [72]:
# The included scores are fine-tuned.


@labeling_function()
def xlmroberta_prop(example):
    """Vote propaganda when the top zero-shot label is "propaganda" with score >= 0.90."""
    prediction = example.xlmroberta_label
    if pd.isna(prediction):
        return ab
    is_confident = prediction["scores"][0] >= 0.90
    # candidate_labels[-1] is the propaganda class
    if is_confident and prediction["labels"][0] == candidate_labels[-1]:
        return prop
    return ab

In [73]:
# The included scores are fine-tuned.


@labeling_function()
def xlmroberta_gen(example):
    """Vote transparent when the top zero-shot label is "transparent" with score >= 0.90."""
    prediction = example.xlmroberta_label
    if pd.isna(prediction):
        return ab
    is_confident = prediction["scores"][0] >= 0.90
    # candidate_labels[0] is the transparent class
    if is_confident and prediction["labels"][0] == candidate_labels[0]:
        return gen
    return ab

In [74]:
lfs = [
    missing_bio,
    missing_loc,
    created_2018_2019,
    created_2011_2012,
    bio_keywords,
    follow_ratio_prop,
    follow_ratio_gen,
    contain_url,
    contain_mention,
    labeling_sarcasm,
    labeling_hate,
    contain_ent,
    ent_free,
    contain_question,
    loaded_language,
    genuine_language,
    # loaded_proppy,
    # genuine_proppy,
    loaded_hate,
    loaded_sarcasm,
    gen_hate,
    gen_sarcasm,
    flag_wave,
    repetition,
    distant_supervision_prop,
    distant_supervision_gen,
    slogans,
    reductio,
    exaggeration,
    pronouns,
    xlmroberta_prop,
    xlmroberta_gen,
]

lfs += proppy_lfs

print(f"We have {len(lfs)} LFs used.")

We have 48 LFs used.


In [75]:
applier = PandasLFApplier(lfs)

In [76]:
L_dev = applier.apply(labeled_data)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:20<00:00, 24.76it/s]


In [77]:
results = LFAnalysis(L_dev, lfs).lf_summary(labeled_data.label.values)
results

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
missing_bio,0,[1],0.076,0.076,0.076,3,35,0.078947
missing_loc,1,[1],0.41,0.41,0.41,20,185,0.097561
created_2018_2019,2,[1],0.084,0.084,0.084,3,39,0.071429
created_2011_2012,3,[0],0.288,0.288,0.288,127,17,0.881944
bio_keywords,4,[0],0.066,0.066,0.066,32,1,0.969697
follow_ratio_prop,5,[1],0.216,0.216,0.216,8,100,0.074074
follow_ratio_gen,6,[0],0.004,0.004,0.004,2,0,1.0
contain_url,7,[0],0.044,0.044,0.044,21,1,0.954545
contain_mention,8,[1],0.652,0.652,0.652,29,297,0.088957
labeling_sarcasm,9,[1],0.006,0.006,0.006,2,1,0.666667


In [None]:
L_train = applier.apply(unlabeled_data)
# LFAnalysis(L_train, lfs).lf_summary()

 87%|█████████████████████████████████████████████████████████████████████████████████████████████▌              | 169662/195840 [2:42:02<17:33, 24.85it/s]

In [83]:
LFAnalysis(L_train, lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
missing_bio,0,[1],0.06876,0.06876,0.06876
missing_loc,1,[1],0.255831,0.255831,0.255831
created_2018_2019,2,[1],0.048591,0.048591,0.048591
created_2011_2012,3,[0],0.371354,0.371354,0.369863
bio_keywords,4,[0],0.326159,0.326159,0.325414
follow_ratio_prop,5,[1],0.057664,0.057664,0.057664
follow_ratio_gen,6,[0],0.002058,0.002058,0.002048
contain_url,7,[0],0.460881,0.460881,0.459278
contain_mention,8,[1],0.478431,0.478431,0.478431
labeling_sarcasm,9,[1],0.007772,0.007772,0.007772


In [84]:
# selecting the LFs that maximize the performance of the label model
inds = results[results["Emp. Acc."] >= 0.25].j.to_list()
len(inds)

29

In [85]:
# displaying the names of the LFs chosen
results.iloc[inds, :].index.to_list()

['created_2011_2012',
 'bio_keywords',
 'follow_ratio_gen',
 'contain_url',
 'labeling_sarcasm',
 'ent_free',
 'contain_question',
 'loaded_language',
 'genuine_language',
 'loaded_sarcasm',
 'gen_hate',
 'gen_sarcasm',
 'distant_supervision_prop',
 'distant_supervision_gen',
 'slogans',
 'reductio',
 'xlmroberta_prop',
 'xlmroberta_gen',
 'loaded_proppy_factives',
 'loaded_proppy_implicatives',
 'genuine_proppy_factives',
 'genuine_proppy_hedges',
 'genuine_proppy_implicatives',
 'genuine_proppy_report_verbs',
 'genuine_proppy_bias',
 'genuine_proppy_negative_words',
 'genuine_proppy_positive_words',
 'genuine_proppy_negative_colloquial_words',
 'genuine_proppy_positive_colloquial_words']

In [86]:
L_train_ = L_train[:, inds].copy()
L_dev_ = L_dev[:, inds].copy()

In [87]:
# estimating the class balance in the unlabeled data from the labeled.

# w = labeled_data.label.value_counts(normalize=True).to_list()
# w

In [88]:
# tuning the L2 regularization parameter of the label model on the LF-dev set.
# NOTE(review): the earlier comment said precision is maximized, but the metric
# used for selection below is recall — confirm which one is intended.


def fit_label_model(l2):
    """Fit a fresh LabelModel on L_train_ with the given L2 strength and return it."""
    model = LabelModel(cardinality=2, verbose=False, device="cuda")
    model.fit(
        L_train=L_train_,
        n_epochs=2000,
        l2=l2,
        lr=0.01,
        seed=42,
        class_balance=None,
        progress_bar=False,
        lr_scheduler="linear",
        optimizer="adam",
        lr_scheduler_config={"warmup_percentage": 0.1, "warmup_unit": "epochs"},
    )
    return model


best_score = 0
best_model = None
l2_values = np.arange(0.0, 0.1, 0.01)

for l2 in tqdm(l2_values, total=len(l2_values)):
    label_model = fit_label_model(l2)
    score = label_model.score(L_dev_, labeled_data.label, metrics=["recall"])
    dev_recall = score["recall"]
    # ">=" means a later l2 value replaces an earlier one when they tie
    if dev_recall >= best_score:
        best_score, best_model = dev_recall, label_model

  0%|          | 0/10 [00:00<?, ?it/s]



In [89]:
# displaying the configs that led to the best label model
best_model.train_config

TrainConfig(n_epochs=2000, lr=0.01, l2=0.0, optimizer='adam', optimizer_config=OptimizerConfig(sgd_config=SGDOptimizerConfig(momentum=0.9), adam_config=AdamOptimizerConfig(amsgrad=False, betas=(0.9, 0.999)), adamax_config=AdamaxOptimizerConfig(betas=(0.9, 0.999), eps=1e-08)), lr_scheduler='linear', lr_scheduler_config=LRSchedulerConfig(warmup_steps=0, warmup_unit='epochs', warmup_percentage=0.1, min_lr=0.0, exponential_config=ExponentialLRSchedulerConfig(gamma=0.9), step_config=StepLRSchedulerConfig(gamma=0.9, step_size=5)), prec_init=0.7, seed=42, log_freq=10, mu_eps=None)

Reporting the performance of the label model


In [90]:
best_model.score(
    L_dev_,
    labeled_data.label,
    tie_break_policy="abstain",
    metrics=["accuracy", "precision", "recall", "f1", "f1_macro", "f1_micro"],
)



{'accuracy': 0.874,
 'precision': 0.39436619718309857,
 'recall': 0.5833333333333334,
 'f1': 0.4705882352941176,
 'f1_macro': 0.6995392935834946,
 'f1_micro': 0.874}

In [91]:
# keep only the dev examples on which the label model did not abstain (-1)
y_pred = best_model.predict(L_dev_)
voted = y_pred != -1
y_true = labeled_data.label[voted]
y_pred = y_pred[voted]

In [92]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.90      0.93       452
           1       0.39      0.58      0.47        48

    accuracy                           0.87       500
   macro avg       0.67      0.74      0.70       500
weighted avg       0.90      0.87      0.88       500



Saving the weakly labeled dataset


In [93]:
labeled_data["snorkel"] = best_model.predict(L_dev_)

In [94]:
# attaching the weak labels and keeping only the tweets that got a vote
# (the label model returns -1 when it abstains)
weak_preds = best_model.predict(L_train_)
unlabeled_data = unlabeled_data.assign(label=weak_preds)[weak_preds != -1]

In [95]:
unlabeled_data[["tweetid", "text", "label"]].to_json(weakly_labeled_data_path)