# Labeling Functions


In [1]:
import warnings
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [2]:
warnings.filterwarnings("ignore")

In [3]:
fields = [
    "tweetid",
    "user_profile_description",
    "tweet_text",
    "is_retweet",
    "quote_count",
    "reply_count",
    "like_count",
    "retweet_count",
    "hashtags",
    "urls",
    "user_mentions",
    "text",
    "emojis",
    "word_count",
    "docs",
    "is_irony",
    "is_hate",
    "xlmroberta_label",
]

In [4]:
df1 = pd.read_pickle("../data/processed/prop_data_pro_zero.pkl")[fields]
# df1['account_creation_date'] = pd.to_datetime(
#     df1.account_creation_date, unit='ms')

df2 = pd.read_pickle("../data/processed/gen_data_pro_zero.pkl")[fields]
# df2['account_creation_date'] = pd.to_datetime(
#     df2.account_creation_date, unit='ns')

In [5]:
unlabeled_data = (
    pd.concat([df1, df2], ignore_index=True).sample(frac=1.0).reset_index(drop=True)
)

In [6]:
unlabeled_data.head()

Unnamed: 0,tweetid,user_profile_description,tweet_text,is_retweet,quote_count,reply_count,like_count,retweet_count,hashtags,urls,user_mentions,text,emojis,word_count,docs,is_irony,is_hate,xlmroberta_label
0,1.1355929772278129e+18,الأخبـار العاجلة #عاجل مـن مـوقـع جريدة الرياض...,#عاجل:\nالمحكمة العليا: ثبوت رؤية هلال شوال.. ...,False,10.0,16.0,110.0,169.0,2,1,0,عاجل: المحكمة العليا: ثبوت رؤية هلال شوال.. وغ...,0,17,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","{'label': 'nonsarcasm', 'score': 0.95331501960...","{'label': 'not offensive', 'score': 0.99942123...",{'sequence': 'عاجل: المحكمة العليا: ثبوت رؤية ...
1,1.4171915642296238e+18,الحساب الرسمي لـ إمارة منطقة القصيم - إنستقرام...,حمد سموه المولى عز وجل على ماتحظى به بلادنا من...,False,0.0,3.0,30.0,39.0,4,1,0,حمد سموه المولى عز وجل على ماتحظى به بلادنا من...,0,35,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","{'label': 'nonsarcasm', 'score': 0.94064635038...","{'label': 'not offensive', 'score': 0.99974423...",{'sequence': 'حمد سموه المولى عز وجل على ماتحظ...
2,1.2637679210100572e+18,الحساب الرسمي للأمن العام السعودي,شرطة القصيم : ضبط مواطنٍ ومقيم استغلّا تصريح ا...,False,3.0,14.0,157.0,119.0,0,1,0,شرطة القصيم : ضبط مواطن ومقيم استغلا تصريح الس...,0,19,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","{'label': 'nonsarcasm', 'score': 0.85628890991...","{'label': 'not offensive', 'score': 0.99958091...",{'sequence': 'شرطة القصيم : ضبط مواطن ومقيم اس...
3,1.4606865421339976e+18,,«نزاهة»: ثبوت تورط (20) مواطناً ومقيماً بتعديل...,False,0.0,3.0,5.0,2.0,1,1,0,«نزاهة»: ثبوت تورط (20) مواطنا ومقيما بتعديل ح...,0,40,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","{'label': 'nonsarcasm', 'score': 0.74483102560...","{'label': 'not offensive', 'score': 0.99972814...",{'sequence': '«نزاهة»: ثبوت تورط (20) مواطنا و...
4,8.705719461064006e+17,‏‏‎‎#الحرية هي أن تؤمن بها لك ولغيرك\nلكن تفرض...,RT @hzen2080: ساعات رومانس\nساعه اوديمار أوتوم...,True,0.0,0.0,0.0,0.0,0,0,1,RT : ساعات رومانس ساعه اوديمار أوتوماتيك 650﷼ ...,0,22,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","{'label': 'nonsarcasm', 'score': 0.96675294637...","{'label': 'not offensive', 'score': 0.99922728...",{'sequence': 'RT : ساعات رومانس ساعه اوديمار أ...


In [7]:
unlabeled_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196612 entries, 0 to 196611
Data columns (total 18 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   tweetid                   196610 non-null  object 
 1   user_profile_description  183084 non-null  object 
 2   tweet_text                196612 non-null  object 
 3   is_retweet                196612 non-null  bool   
 4   quote_count               196607 non-null  float64
 5   reply_count               196608 non-null  float64
 6   like_count                196608 non-null  object 
 7   retweet_count             196608 non-null  float64
 8   hashtags                  196612 non-null  int64  
 9   urls                      196612 non-null  int64  
 10  user_mentions             196612 non-null  int64  
 11  text                      196612 non-null  object 
 12  emojis                    196612 non-null  int64  
 13  word_count                196612 non-null  i

In [8]:
labeled = pd.read_json("../data/processed/lf_dev.json")
labeled.head()

Unnamed: 0,tweetid,tweet_text,text,tech,label
0,924924839902793728,RT @Amal_onzi: 🕊💕هُو جنْةبعِيني.,RT : هو جنةبعيني.,,0
1,1074734231887187968,ر٣ #تركيا_تجاهر_بالمعاصي,ر٣ تركيا تجاهر بالمعاصي,smears - name-calling - loaded language,1
2,1089924333378682880,RT @al_raqi_8: 💝 طقم نسائي حصيره شكل ديور مغ...,RT : طقم نسائي حصيره شكل ديور مغناطيس ١ *ساعه ...,,0
3,752398332270563328,RT @vvvv1l: اصحو ي ناس 👊\n\n #الثوره_الخمينيه_...,RT : اصحو ي ناس الثوره الخمينيه تتمزق,loaded language,1
4,1079749279089078272,RT @rood516: اطقم نسائيه من ماركة رولكس😍✨1\nسا...,RT : اطقم نسائيه من ماركة رولكس1 ساعه نسائيه م...,,0


In [9]:
labeled.info()

<class 'pandas.core.frame.DataFrame'>
Index: 500 entries, 0 to 499
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweetid     500 non-null    int64 
 1   tweet_text  500 non-null    object
 2   text        500 non-null    object
 3   tech        48 non-null     object
 4   label       500 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 23.4+ KB


In [10]:
labeled.label.value_counts()

label
0    452
1     48
Name: count, dtype: int64

In [11]:
labeled_data = unlabeled_data[unlabeled_data.text.isin(labeled.text)]
labeled_data.head()

Unnamed: 0,tweetid,user_profile_description,tweet_text,is_retweet,quote_count,reply_count,like_count,retweet_count,hashtags,urls,user_mentions,text,emojis,word_count,docs,is_irony,is_hate,xlmroberta_label
318,841000033365446656,دام ابوي حولي ما علي خلاف.,يااخي اول شي اجابته صح ثاني شي في اجابه خطا لع...,False,0.0,0.0,0,0.0,1,0,3,يااخي اول شي اجابته صح ثاني شي في اجابه خطا لع...,0,19,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","{'label': 'nonsarcasm', 'score': 0.53657037019...","{'label': 'not offensive', 'score': 0.99910682...",{'sequence': 'يااخي اول شي اجابته صح ثاني شي ف...
373,862407267840393216,شوي من هنا وشويتين من هناك...سياسة ومجتمع وشعر...,الا غصب تشتري 😂😂 https://t.co/cbVH13bh6J,False,0.0,0.0,0,0.0,0,0,0,الا غصب تشتري,2,3,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","{'label': 'nonsarcasm', 'score': 0.68531942367...","{'label': 'not offensive', 'score': 0.99869710...","{'sequence': 'الا غصب تشتري', 'labels': ['tran..."
390,683277968156798976,‏‏‏(رب اغفر لي ولوالدي ولمن دخل بيتي مؤمناً ول...,RT @aldmgane: لم أشجع الهلال باحثاً عن رضاكم،\...,True,0.0,0.0,0,0.0,0,0,1,RT : لم أشجع الهلال باحثا عن رضاكم، ولم أمتدحه...,0,24,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","{'label': 'nonsarcasm', 'score': 0.92064410448...","{'label': 'not offensive', 'score': 0.99944049...",{'sequence': 'RT : لم أشجع الهلال باحثا عن رضا...
652,1026259146000228352,‏‏‎‎#الحرية هي أن تؤمن بها لك ولغيرك\nلكن تفرض...,RT @ksavi1p: أول مرة أشوف هذا الفديو من هذه ال...,True,0.0,0.0,0,0.0,0,0,1,RT : أول مرة أشوف هذا الفديو من هذه الزاوية,7,10,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","{'label': 'nonsarcasm', 'score': 0.75664991140...","{'label': 'not offensive', 'score': 0.99878746...",{'sequence': 'RT : أول مرة أشوف هذا الفديو من ...
1467,1069588123846356992,من قال اني لا ابوح ؟! قد أخبرت ربي كل شيء !!,#استقبال_مليوني\n اللهم اجعلنا اوفر عبادك حظا ...,False,0.0,0.0,0,0.0,1,0,0,استقبال مليوني اللهم اجعلنا اوفر عبادك حظا في ...,0,11,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","{'label': 'nonsarcasm', 'score': 0.95686054229...","{'label': 'not offensive', 'score': 0.99981170...",{'sequence': 'استقبال مليوني اللهم اجعلنا اوفر...


In [12]:
labeled_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 500 entries, 318 to 196598
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   tweetid                   500 non-null    object 
 1   user_profile_description  462 non-null    object 
 2   tweet_text                500 non-null    object 
 3   is_retweet                500 non-null    bool   
 4   quote_count               500 non-null    float64
 5   reply_count               500 non-null    float64
 6   like_count                500 non-null    object 
 7   retweet_count             500 non-null    float64
 8   hashtags                  500 non-null    int64  
 9   urls                      500 non-null    int64  
 10  user_mentions             500 non-null    int64  
 11  text                      500 non-null    object 
 12  emojis                    500 non-null    int64  
 13  word_count                500 non-null    int64  
 14  docs      

In [13]:
unlabeled_data = unlabeled_data[
    ~unlabeled_data.text.isin(labeled_data.text)
].reset_index(drop=True)
labeled_data = labeled_data.reset_index(drop=True)
labeled_data["label"] = [
    labeled[labeled.text == i].label.values[0] for i in labeled_data.text
]

In [14]:
unlabeled_data = unlabeled_data.dropna(subset=["tweetid", "quote_count", "xlmroberta_label"])
unlabeled_data = unlabeled_data.reset_index(drop=True)

In [15]:
unlabeled_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196106 entries, 0 to 196105
Data columns (total 18 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   tweetid                   196106 non-null  object 
 1   user_profile_description  182621 non-null  object 
 2   tweet_text                196106 non-null  object 
 3   is_retweet                196106 non-null  bool   
 4   quote_count               196106 non-null  float64
 5   reply_count               196106 non-null  float64
 6   like_count                196106 non-null  object 
 7   retweet_count             196106 non-null  float64
 8   hashtags                  196106 non-null  int64  
 9   urls                      196106 non-null  int64  
 10  user_mentions             196106 non-null  int64  
 11  text                      196106 non-null  object 
 12  emojis                    196106 non-null  int64  
 13  word_count                196106 non-null  i

In [16]:
labeled_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   tweetid                   500 non-null    object 
 1   user_profile_description  462 non-null    object 
 2   tweet_text                500 non-null    object 
 3   is_retweet                500 non-null    bool   
 4   quote_count               500 non-null    float64
 5   reply_count               500 non-null    float64
 6   like_count                500 non-null    object 
 7   retweet_count             500 non-null    float64
 8   hashtags                  500 non-null    int64  
 9   urls                      500 non-null    int64  
 10  user_mentions             500 non-null    int64  
 11  text                      500 non-null    object 
 12  emojis                    500 non-null    int64  
 13  word_count                500 non-null    int64  
 14  docs      

### User LFs


In [17]:
from snorkel.labeling import labeling_function

In [18]:
prop = 1
gen = 0
ab = -1

In [19]:
# @labeling_function()
# def missing_bio(example):
#     if pd.isna(example.user_profile_description):
#         return prop
#     else:
#         return ab

In [20]:
# @labeling_function()
# def zero_followers(example):
#     if example.follower_count == 0:
#         return prop
#     else:
#         return ab

In [21]:
# @labeling_function()
# def missing_loc(example):
#     if pd.isna(example.user_reported_location):
#         return prop
#     else:
#         return ab

In [22]:
# @labeling_function()
# def created_2018_2019(example):
#     if example.account_creation_date.year in [2018, 2019]:
#         return prop
#     else:
#         return ab

In [23]:
# @labeling_function()
# def created_2011_2012(example):
#     if example.account_creation_date.year in [2011, 2012]:
#         return gen
#     else:
#         return ab

In [24]:
from snorkel.preprocess import preprocessor

In [25]:
from nltk.tokenize import word_tokenize

In [26]:
@preprocessor(memoize=False)
def tokenize_bio(example):
    if not pd.isna(example.user_profile_description):
        example.bio_tokens = word_tokenize(example.user_profile_description)
    else:
        example.bio_tokens = None
    return example

In [27]:
@labeling_function(pre=[tokenize_bio])
def bio_keywords(example):
    keys = ["الحساب", "الرسمي", "عضو", "رئيس", "كاتب", "إدارة"]
    if example.bio_tokens is not None:
        if any(np.in1d(keys, example.bio_tokens)):
            return gen
        else:
            return ab
    else:
        return ab

### Tweet LFs


In [28]:
import re

In [29]:
@labeling_function()
def contain_url(example):
    if example.urls > 0:
        return gen
    else:
        return ab

In [30]:
# @labeling_function()
# def contain_mention(example):
#     if example.user_mentions > 0:
#         return prop
#     else:
#         return ab

In [31]:
@labeling_function()
def labeling_irony(example):
    if len(example.docs.entities) and example.is_irony["label"] == "sarcasm":
        return prop
    else:
        return ab

In [32]:
@labeling_function()
def labeling_hate(example):
    if len(example.docs.entities) and example.is_hate["label"] == "offensive":
        return prop
    else:
        return ab

In [33]:
# @labeling_function()
# def contain_ent(example):
#     if len(example.docs.entities):
#         return prop
#     else:
#         return ab

In [34]:
@labeling_function()
def ent_free(example):
    if len(example.docs.entities):
        return ab
    else:
        return gen

In [35]:
# @labeling_function()
# def contain_question(example):
#     for w in example.docs.sentences[0].words:
#         if w.upos == "AUX":
#             return prop
#     return ab

In [36]:
loaded_tokens = pd.read_csv("../data/raw/loaded-language-lexicons.csv")["loaded-language"].to_list()
loaded_tokens[:5]

['جاهر', 'جهر', 'تجاهر بالمعاصي', 'مجاهرة بالمعاصي', 'تجاهر بالمعاصى']

In [37]:
import glob

proppy_path = "../data/raw/proppy_lexicons/*"

proppy_lexicons = []
for file in glob.glob(proppy_path):
    with open(file, encoding="utf-8") as f:
        proppy_lexicons.extend(f.readlines())

In [38]:
pattern = r"[\u0617-\u061A\u064B-\u0652]"
proppy_lexicons = [re.sub(pattern, "", term.strip()) for term in proppy_lexicons]

In [39]:
proppy_lexicons[:5]

['إجهاض', 'إساءة', 'تعسفي', 'يقبل', 'حساب']

In [40]:
from nltk import ngrams

In [41]:
@preprocessor(memoize=False)
def tokenize_tweet(example):
    if len(example.docs.sentences):
        example.tweet_tokens = [w.text for w in example.docs.sentences[0].words]
        example.tweet_lemmas = [w.lemma for w in example.docs.sentences[0].words]
    else:
        example.tweet_tokens = []
        example.tweet_lemmas = []
    return example

In [42]:
@preprocessor(pre=[tokenize_tweet], memoize=False)
def bigram_tweet(example):
    example.bigram_tokens = [
        " ".join(gram) for gram in ngrams(example.tweet_tokens, 2) if len(gram)
    ]
    example.bigram_lemmas = [
        " ".join(gram) for gram in ngrams(example.tweet_lemmas, 2) if len(gram)
    ]
    return example

In [43]:
@labeling_function(pre=[bigram_tweet])
def loaded_language(example):
    tweet_ngrams = example.tweet_tokens + example.tweet_lemmas
    tweet_ngrams += example.bigram_tokens + example.bigram_lemmas
    if sum(np.in1d(tweet_ngrams, loaded_tokens)) >= 1:
        return prop
    else:
        return ab

In [44]:
@labeling_function(pre=[bigram_tweet])  # [bigram_tweet]
def genuine_language(example):
    tweet_ngrams = example.tweet_tokens + example.tweet_lemmas
    tweet_ngrams += example.bigram_tokens + example.bigram_lemmas
    if sum(np.in1d(tweet_ngrams, loaded_tokens)) == 0:
        return gen
    else:
        return ab

In [45]:
@labeling_function(pre=[bigram_tweet])
def loaded_proppy(example):
    tweet_ngrams = example.tweet_tokens + example.tweet_lemmas
    tweet_ngrams += example.bigram_tokens + example.bigram_lemmas
    if sum(np.in1d(tweet_ngrams, proppy_lexicons)) >= 1:
        return prop
    else:
        return ab

@labeling_function(pre=[bigram_tweet])
def genuine_proppy(example):
    tweet_ngrams = example.tweet_tokens + example.tweet_lemmas
    tweet_ngrams += example.bigram_tokens + example.bigram_lemmas
    if sum(np.in1d(tweet_ngrams, proppy_lexicons)) == 0:
        return gen
    else:
        return ab

In [46]:
@labeling_function()
def loaded_hate(example):
    if example.is_hate["label"] == "offensive":
        return prop
    else:
        return ab

In [47]:
@labeling_function()
def loaded_irony(example):
    if example.is_irony["label"] == "sarcasm":
        return prop
    else:
        return ab

In [48]:
# @labeling_function()
# def gen_hate(example):
#     if example.is_hate["label"] == "not offensive":
#         return gen
#     else:
#         return ab

In [49]:
# @labeling_function()
# def gen_irony(example):
#     if example.is_irony["label"] == "nonsarcasm":
#         return gen
#     else:
#         return ab

In [50]:
# # flag_waving

# flag_engine = re.compile(r'Number=\w+')

# @labeling_function()
# def flag_wave(example):
#     for w in example.docs.sentences[0].words:
#         if (w.upos == "PRON" and
#             flag_engine.findall(w.feats)[0].split('=')[-1] == 'Plur'):
#             return prop
#     return ab

In [51]:
# @labeling_function(pre=[bigram_tweet])
# def flag_wave(example):
#     keys = tech_terms["flag_waving"]
#     tweet_ngrams = example.tweet_tokens + example.tweet_lemmas
#     tweet_ngrams += example.bigram_tokens + example.bigram_lemmas
#     if any(np.in1d(keys, tweet_ngrams)):
#         return prop
#     else:
#         return ab

In [52]:
# from nltk.corpus import stopwords

# arabic_stop_words = stopwords.words("arabic")

# arabic_stop_words = [re.sub(pattern, "", w) for w in arabic_stop_words]

In [53]:
# @labeling_function(pre=[tokenize_tweet])
# def repetition(example):
#     tokens = [word for word in example.tweet_tokens if word not in arabic_stop_words]
#     tokens = pd.Series(tokens)
#     if tokens.value_counts().max() >= 2:
#         return prop
#     else:
#         return ab

In [54]:
from sentence_transformers import util

wanlp_train = pd.read_json("../data/raw/task1_train.json")
wanlp_train["labels"] = wanlp_train["labels"].apply(
    lambda x: 0 if "no technique" in x else 1
)

wanlp_train.head()

Unnamed: 0,id,text,labels
0,1358824915483435008,"#بي_بي_سي_ترندينغ: النساء ""تثرثر كثيرا"" رئيس أ...",0
1,1389927866356412416,"""ده مش معتقل ده أحسن من اللوكاندة"".. جدل وسخري...",1
2,1364082975428677632,الرجل الذي كان من فراغ https://t.co/2bnHiRqGRQ,0
3,1391667689656102912,RT @AJABreaking: عاجل | حركة حماس: ما يجري في ...,1
4,1389360446440972288,"انطلاق أسبوع المرور العربي تحت شعار: ""الحوادث ...",1


In [55]:
import re
import emoji


def clean_tweet_text(text):
    # links
    clean_text = re.sub(r"http\S+|t\.co/\S+", "", text)
    # mentions
    clean_text = re.sub(r"@\w+", "", clean_text)
    # hashtags
    clean_text = re.sub(r"#", "", clean_text)
    clean_text = re.sub(r"_", " ", clean_text)
    # tashqeel - from @bakriano
    clean_text = re.sub(r"[\u0617-\u061A\u064B-\u0652]", "", clean_text)
    # emojis
    clean_text = emoji.replace_emoji(clean_text, replace="")
    # remove new lines and normalize white spaces
    clean_text = re.sub(r"\s+", " ", clean_text)
    return clean_text.strip()


wanlp_train["text"] = wanlp_train["text"].apply(clean_tweet_text)

In [56]:
import fasttext
from tqdm.auto import tqdm

encoder = fasttext.load_model("../models/encoders/cc.ar.300.bin")

wanlp_hidden_states = []
for tweet in tqdm(wanlp_train.text.values):
    vec = encoder.get_sentence_vector(tweet)
    wanlp_hidden_states.append(vec)



  0%|          | 0/504 [00:00<?, ?it/s]

In [57]:
# @labeling_function()
# def distant_supervision(example):
#     tweet_vec = encoder.get_sentence_vector(example.text)
#     sim_scores = util.cos_sim(tweet_vec, wanlp_hidden_states)
#     most_sim = sim_scores.argmax(dim=-1).item()
#     if (wanlp_train.labels.values[most_sim] == 1
#         and sim_scores[-1][most_sim].item() >= 0.75):
#         return prop
#     else:
#         return ab

In [58]:
@preprocessor()
def get_sim_scores(example):
    tweet_vec = encoder.get_sentence_vector(example.text)
    sim_scores = util.cos_sim(tweet_vec, wanlp_hidden_states)
    example.sim_scores = sim_scores
    return example


@labeling_function(pre=[get_sim_scores])
def distant_supervision_prop(example):
    sim_scores = example.sim_scores
    most_sim = sim_scores.argmax(dim=-1).item()
    if (
        wanlp_train.labels.values[most_sim] == 1
        and sim_scores[-1][most_sim].item() >= 0.75
    ):
        return prop
    else:
        return ab


@labeling_function(pre=[get_sim_scores])
def distant_supervision_gen(example):
    sim_scores = example.sim_scores
    most_sim = sim_scores.argmax(dim=-1).item()
    if (
        wanlp_train.labels.values[most_sim] == 0
        and sim_scores[-1][most_sim].item() >= 0.65
    ):
        return gen
    else:
        return ab

In [59]:
@labeling_function()
def slogans(example):
    matches = re.findall(r"لا ل\w+", example.text)
    matches += re.findall(r"نعم ل\w+", example.text)
    matches += re.findall(r"لا بديل", example.text)
    if len(matches):
        return prop
    else:
        return ab

In [60]:
hitlerum = [
    "هتلر",
    "البغدادي",
    "اوردوغان",
    "قطر",
    "داعش",
    "حوثي",
    "تركيا",
    "الشيعة",
    "إيران",
    "ايران",
    "اخونجي",
    "اخوان",
    "إخوان",
    "إخوانجي",
    "أوردوغان",
    "الحوثي",
    "الحوثيين",
    "ستالين",
    "الإخوان",
    "الاخوان",
    "إرهابي",
    "الإرهابيين",
    "متطرف",
    "المتطرفين",
    "شيعي",
]

In [61]:
@labeling_function(pre=[bigram_tweet])
def reductio(example):
    tweet_ngrams = example.tweet_tokens + example.tweet_lemmas
    tweet_ngrams += example.bigram_tokens + example.bigram_lemmas
    if any(np.in1d(tweet_ngrams, hitlerum)):
        return prop
    else:
        return ab

In [62]:
@labeling_function()
def exaggeration(example):
    if not len(example.docs.sentences):
        return ab
    for w in example.docs.sentences[0].words:
        if w.lemma.startswith("أ") and w.upos == "ADJ":
            return prop
    return ab

In [63]:
# @labeling_function()
# def pronouns(example):
#     pro_nouns = []
#     for word in example.docs.sentences[0].words:
#         if word.upos == "PRON":
#             pro_nouns.append(word.text)
#     if len(pro_nouns) >= 1:
#         return prop
#     else:
#         return ab

In [64]:
candidate_labels = ["transparent", "propaganda"]

In [65]:
@labeling_function()
def xlmroberta_prop(example):
    if pd.isna(example.xlmroberta_label):
        return ab
    if example.xlmroberta_label["scores"][0] >= 0.90:
        if example.xlmroberta_label["labels"][0] == candidate_labels[-1]:
            return prop
        else:
            return ab
    else:
        return ab


@labeling_function()
def xlmroberta_gen(example):
    if pd.isna(example.xlmroberta_label):
        return ab
    if example.xlmroberta_label["scores"][0] >= 0.90:
        if example.xlmroberta_label["labels"][0] == candidate_labels[0]:
            return gen
        else:
            return ab
    else:
        return ab

In [66]:
# lfs = [bio_keywords, contain_url, contain_mention,
#        labeling_irony, labeling_hate, contain_ent, ent_free,
#        contain_question, loaded_language, genuine_language,
#        loaded_proppy, genuine_proppy, loaded_hate, loaded_irony,
#        gen_hate, gen_irony, flag_wave, repetition,
#        distant_supervision_prop, distant_supervision_gen,
#        slogans, reductio, exaggeration, pronouns,
#        xlmroberta_prop, xlmroberta_gen]

In [67]:
lfs = [bio_keywords, contain_url,
       labeling_irony, labeling_hate, ent_free,
       loaded_language, genuine_language, loaded_proppy,
       genuine_proppy, loaded_hate, loaded_irony,
       distant_supervision_prop, distant_supervision_gen,
       slogans, reductio, exaggeration, xlmroberta_prop, xlmroberta_gen]

In [68]:
from snorkel.labeling import PandasLFApplier

In [69]:
applier = PandasLFApplier(lfs)

In [70]:
L_dev = applier.apply(labeled_data)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:49<00:00, 10.11it/s]


In [71]:
from snorkel.labeling import LFAnalysis

In [72]:
LFAnalysis(L_dev, lfs).lf_summary(labeled_data.label.values)

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
bio_keywords,0,[0],0.066,0.066,0.052,32,1,0.969697
contain_url,1,[0],0.044,0.044,0.026,21,1,0.954545
labeling_irony,2,[1],0.038,0.038,0.03,7,12,0.368421
labeling_hate,3,[1],0.02,0.02,0.02,2,8,0.2
ent_free,4,[0],0.226,0.226,0.158,108,5,0.955752
loaded_language,5,[1],0.172,0.172,0.096,32,54,0.372093
genuine_language,6,[0],0.828,0.828,0.57,398,16,0.961353
loaded_proppy,7,[1],0.636,0.636,0.56,36,282,0.113208
genuine_proppy,8,[0],0.364,0.364,0.106,170,12,0.934066
loaded_hate,9,[1],0.03,0.03,0.03,3,12,0.2


In [73]:
L_train = applier.apply(unlabeled_data)
LFAnalysis(L_train, lfs).lf_summary()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 196106/196106 [3:15:22<00:00, 16.73it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
bio_keywords,0,[0],0.325798,0.325798,0.298502
contain_url,1,[0],0.460368,0.460368,0.397515
labeling_irony,2,[1],0.039678,0.039678,0.036149
labeling_hate,3,[1],0.018306,0.018306,0.016904
ent_free,4,[0],0.198796,0.198796,0.162463
loaded_language,5,[1],0.193334,0.193334,0.157155
genuine_language,6,[0],0.806666,0.806666,0.669301
loaded_proppy,7,[1],0.771537,0.771537,0.735357
genuine_proppy,8,[0],0.228463,0.228463,0.091099
loaded_hate,9,[1],0.023309,0.023309,0.021907


In [74]:
LFAnalysis(L_train, lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
bio_keywords,0,[0],0.325798,0.325798,0.298502
contain_url,1,[0],0.460368,0.460368,0.397515
labeling_irony,2,[1],0.039678,0.039678,0.036149
labeling_hate,3,[1],0.018306,0.018306,0.016904
ent_free,4,[0],0.198796,0.198796,0.162463
loaded_language,5,[1],0.193334,0.193334,0.157155
genuine_language,6,[0],0.806666,0.806666,0.669301
loaded_proppy,7,[1],0.771537,0.771537,0.735357
genuine_proppy,8,[0],0.228463,0.228463,0.091099
loaded_hate,9,[1],0.023309,0.023309,0.021907


In [75]:
from snorkel.labeling.model import LabelModel

In [76]:
# w = labeled_data.label.value_counts(normalize=True).to_list()
# w

In [77]:
results = LFAnalysis(L_dev, lfs).lf_summary(labeled_data.label.values)
results.head()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
bio_keywords,0,[0],0.066,0.066,0.052,32,1,0.969697
contain_url,1,[0],0.044,0.044,0.026,21,1,0.954545
labeling_irony,2,[1],0.038,0.038,0.03,7,12,0.368421
labeling_hate,3,[1],0.02,0.02,0.02,2,8,0.2
ent_free,4,[0],0.226,0.226,0.158,108,5,0.955752


In [97]:
# inds = list(range(len(lfs)))
inds = results[results["Emp. Acc."] >= 0.20].j.to_list()
inds

[0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 16, 17]

In [98]:
print(results.iloc[inds, :].index.to_list())

['bio_keywords', 'contain_url', 'labeling_irony', 'labeling_hate', 'ent_free', 'loaded_language', 'genuine_language', 'genuine_proppy', 'loaded_hate', 'loaded_irony', 'distant_supervision_prop', 'distant_supervision_gen', 'slogans', 'reductio', 'xlmroberta_prop', 'xlmroberta_gen']


In [99]:
L_train_ = L_train[:, inds].copy()
L_dev_ = L_dev[:, inds].copy()

In [100]:
best_score = 0
best_model = None
l2_values = np.arange(0.0, 0.1, 0.01)

for l2 in tqdm(l2_values, total=len(l2_values)):
    label_model = LabelModel(cardinality=2, verbose=False, device="cuda")
    label_model.fit(
        L_train=L_train_,
        n_epochs=2000,
        lr_scheduler="linear",
        lr_scheduler_config={"warmup_percentage": 0.1, "warmup_unit": "epochs"},
        optimizer="adam",
        l2=l2,
        class_balance=None,
        progress_bar=False,
        seed=42,
        lr=0.01,
    )
    score = label_model.score(
        L_dev_, labeled_data.label, tie_break_policy="abstain", metrics=["precision"]
    )["precision"]
    if score >= best_score:
        best_score = score
        best_model = label_model

  0%|          | 0/10 [00:00<?, ?it/s]



In [101]:
best_model.train_config

TrainConfig(n_epochs=2000, lr=0.01, l2=0.03, optimizer='adam', optimizer_config=OptimizerConfig(sgd_config=SGDOptimizerConfig(momentum=0.9), adam_config=AdamOptimizerConfig(amsgrad=False, betas=(0.9, 0.999)), adamax_config=AdamaxOptimizerConfig(betas=(0.9, 0.999), eps=1e-08)), lr_scheduler='linear', lr_scheduler_config=LRSchedulerConfig(warmup_steps=0, warmup_unit='epochs', warmup_percentage=0.1, min_lr=0.0, exponential_config=ExponentialLRSchedulerConfig(gamma=0.9), step_config=StepLRSchedulerConfig(gamma=0.9, step_size=5)), prec_init=0.7, seed=42, log_freq=10, mu_eps=None)

In [102]:
best_model.score(
    L_dev_,
    labeled_data.label,
    tie_break_policy="abstain",
    metrics=["accuracy", "precision", "recall", "f1", "f1_macro", "f1_micro"],
)



{'accuracy': 0.858,
 'precision': 0.3707865168539326,
 'recall': 0.6875,
 'f1': 0.4817518248175182,
 'f1_macro': 0.6997403388282262,
 'f1_micro': 0.858}

In [103]:
from sklearn.metrics import classification_report

In [104]:
y_true = labeled_data.label
y_pred = best_model.predict(L_dev_, tie_break_policy="abstain")
y_true = y_true[y_pred != -1]
y_pred = y_pred[y_pred != -1]

In [105]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.88      0.92       452
           1       0.37      0.69      0.48        48

    accuracy                           0.86       500
   macro avg       0.67      0.78      0.70       500
weighted avg       0.91      0.86      0.88       500



In [184]:
labeled_data["snorkel"] = best_model.predict(L_dev_, tie_break_policy="abstain")

In [185]:
analysis = labeled_data[labeled_data.label != labeled_data.snorkel][
    ["tweetid", "text", "label", "snorkel"]
]

In [186]:
analysis.info()

<class 'pandas.core.frame.DataFrame'>
Index: 72 entries, 2 to 499
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   tweetid  72 non-null     object
 1   text     72 non-null     object
 2   label    72 non-null     int64 
 3   snorkel  72 non-null     int32 
dtypes: int32(1), int64(1), object(2)
memory usage: 2.5+ KB


In [187]:
analysis.snorkel.value_counts()

snorkel
 1    43
-1    15
 0    14
Name: count, dtype: int64

In [188]:
pd.set_option("display.max_colwidth", None, "display.max_rows", None)

In [189]:
analysis.head(75)

Unnamed: 0,tweetid,text,label,snorkel
2,463398876507283456,RT : من عجيب ما أزعجني اليوم ماسمعته عبر ufm من برنامج ١١٢ للمقدمه منيره المهيزع إن ٧٠٪ من مخلفات الرياض عبارة عن أطعمة قابلة للإ…,1,0
3,610750278954958848,RT : يارب جنبني الكسل فــي عبادتك وعطني سعادة عبد ماعنده ذنوب!,0,-1
21,949045366401519616,RT : حافظوا على صديق .. لا تحتاج أن تعدل جلستك حينما تكون معه ..! ••┈❦┈┈• •❥ قروبات خالد للدهم الفهد للدعم منبع الذوق لل…,0,1
22,1130832753606500353,RT : سبحان الله وبحمده سبحان الله العظيم سبحان الله وبحمده سبحان الله العظيم سبحان الله وبحمده سبحان الله العظيم الغزالي …,0,1
27,902822934510743553,"RT : "" ياظروفي قدريني وسويبي جميل "" لا تحديني على اقصاي والدنيا سعه .......؟",0,-1
29,1100336063854239744,RT : نشاطنا الرائد في مجالي دراسة الماجستير والدكتوراه منحنا المعرفة في مجالات البحث العلمي الحديثة وغير المستهلكة. جامعة الم…,1,-1
71,852263826389168128,كريستيانو قبل التوقيع مع الريال : سوف أثبت لكم بأن القرار الذي اتخذتموه صحيحا - الان كريستيانو ختم جميع الأرقام,1,0
78,894659692970086402,تضاءلت فرص مشاركة البرازيلي ليوناردو محترف الأهلي السعودي في لقاء الاتفاق يوم الجمعة المقبل في انطلاقة دوري جميل بسبب الاصابة هيدو,0,1
79,1062280586159706115,RT : | دشن المنتخب السعودي تدريباته على ملعب مدينة الأمير سعود بن جلوي الرياضية في الخبر، حيث بدأت الحصة التدريبية التي قادها…,0,1
93,650142943672729600,RT : قال الشيخ ابن عثيمين رحمه الله: الذي يفتي بلا علم أضل من الجاهل..,0,1


In [190]:
# saving the datasets
weak_preds = best_model.predict(L_train_)
unlabeled_data["label"] = weak_preds
unlabeled_data = unlabeled_data[weak_preds != -1]

In [191]:
unlabeled_data[["tweetid", "text", "label"]].to_json("../data/processed/train.json")