In [1]:
import json
import re
import string
from collections import Counter
from functools import lru_cache
from glob import glob

import pandas as pd
from tqdm import tqdm


# Helper Functions

In [2]:
def emoji_normalise(txt: str, emoji_df: pd.DataFrame = None):
    """
    Replace emoji decimal codes in a text with the emoji they stand for.

    The text is split on whitespace. Every token that equals a value of the
    ``emoji_decimal`` column is replaced by the ``emoji`` value of the first
    row carrying that code; every other token (including tokens that already
    are emojis) is kept as it is. Tokens are joined back with single spaces,
    so runs of whitespace in the input collapse.

    Args:
    - txt: A string containing text that may include emojis.
    - emoji_df: Optional, already loaded emoji table with the columns
      ``emoji_decimal`` and ``emoji``. When omitted, the table is read from
      'pre_process/emoji_info.csv' on every call; pass it explicitly when
      calling this function in a loop to avoid re-reading the file.

    Returns:
    - Normalized text with emoji decimal codes replaced by their emojis.

    Raises:
    - FileNotFoundError: if ``emoji_df`` is not given and the CSV file is missing.
    """
    if emoji_df is None:
        try:
            emoji_df = pd.read_csv('pre_process/emoji_info.csv')
        except FileNotFoundError as err:
            # Fail right here with the hint instead of printing it and then
            # crashing on an undefined variable a few lines later.
            raise FileNotFoundError(
                "ERROR: You need emoji_info.csv if not present already exceute: get_emoj_transcriptions.ipynb"
            ) from err

    # Map each decimal code to its emoji; setdefault keeps the first row seen
    # for a code, which is the row a boolean-mask lookup would return first.
    decimal_to_emoji = {}
    for decimal_code, emoji in zip(emoji_df.emoji_decimal, emoji_df.emoji):
        decimal_to_emoji.setdefault(decimal_code, emoji)

    return ' '.join(decimal_to_emoji.get(tkn, tkn) for tkn in txt.split())

In [3]:
def emoji_remove(txt: str, emoji_df: pd.DataFrame = None):
    """
    Blank out emojis in a text.

    The text is split on whitespace. A token is replaced by an empty string
    when it equals a value of the ``emoji_decimal`` column, equals a value of
    the ``emoji`` column, or contains an HTML decimal entity of the form
    ``&#<digits>;``. Tokens are then joined with single spaces, so every
    removed token leaves an extra blank behind (the result is not re-trimmed).

    Args:
    - txt: A string containing text that may include emojis.
    - emoji_df: Optional, already loaded emoji table with the columns
      ``emoji_decimal`` and ``emoji``. When omitted, the table is read from
      'pre_process/emoji_info.csv' on every call; pass it explicitly when
      calling this function in a loop to avoid re-reading the file.

    Returns:
    - Text with emoji tokens replaced by empty strings.
    """
    if emoji_df is None:
        emoji_df = pd.read_csv('pre_process/emoji_info.csv')

    # One set for both columns: a token is dropped if it matches either.
    known_tokens = set(emoji_df.emoji_decimal.unique()) | set(emoji_df.emoji.unique())

    # Also catches decimal entities that are not listed in the table.
    emoji_pattern = re.compile(r'&#[0-9]+;[^\s,;"]*')

    new_tokens = [
        "" if (tkn in known_tokens or emoji_pattern.search(tkn)) else tkn
        for tkn in txt.split()
    ]

    return ' '.join(new_tokens)

In [4]:
def get_timestamp(sent: str):
    """
    Return the first timestamp found in a sentence.

    A timestamp is a run of the form ``<d> / <d> / <d> , <d> : <d>`` where
    each ``<d>`` is one or more digits and the separators are surrounded by
    single spaces.

    Args:
    - sent: A string containing text that may include a timestamp.

    Returns:
    - The first matching timestamp, or the string 'empty' when there is none.
    """
    stamp_regex = '([0-9]+ / [0-9]+ / [0-9]+ , [0-9]+ : [0-9]+)'

    # Only the leftmost occurrence is of interest.
    found = re.search(stamp_regex, sent)

    return found.group(1) if found else 'empty'

In [5]:
def author_message(sent: str):
    """
    Split a chat line into its author part and its message part.

    When the line contains a timestamp (``<d> / <d> / <d> , <d> : <d>``),
    only the text after the last occurrence of the first timestamp found is
    considered; otherwise the whole line is. That text is split on ':'.

    - Two or more pieces: the first piece is the author, the remaining pieces
      joined with a single space are the message (the colons between them are
      not restored).
    - One piece, timestamp present: that piece is returned as both author and
      message.
    - One piece, no timestamp, containing 'jpg': the author is 'empty' and the
      piece is the message.
    - One piece, no timestamp, no 'jpg': the piece is split on a double space;
      the first chunk is the author and the last chunk is the message.

    Surrounding whitespace is not stripped from either part.

    Args:
    - sent: A string containing text that may include author and message information.

    Returns:
    - Tuple ``(author, message)``.
    """
    stamp_regex = '([0-9]+ / [0-9]+ / [0-9]+ , [0-9]+ : [0-9]+)'
    stamps = re.findall(stamp_regex, sent)
    has_stamp = len(stamps) > 0

    # Drop everything up to and including the timestamp before splitting.
    body = sent.split(stamps[0])[-1] if has_stamp else sent
    parts = body.split(':')

    if len(parts) >= 2:
        return parts[0], ' '.join(parts[1:])

    # From here on there was no ':' at all in the considered text.
    only = parts[0]
    if has_stamp:
        return only, only

    if 'jpg' in only:
        return 'empty', only

    chunks = only.split('  ')
    return chunks[0], chunks[-1]

In [6]:
def process(df: pd.DataFrame):
    """
    Build the Italian-side and English-side pre-processed frames.

    For every row of ``df`` the original sentence (``Sentence``) and its
    translation (``Translation``) are each run through the same steps:
    author/message split, timestamp extraction, emoji normalisation and emoji
    removal (the last two on both the author and the message).

    Args:
    - df: DataFrame with the columns ``File``, ``SentenceNumber``,
      ``Sentence`` and ``Translation``.

    Returns:
    - Tuple ``(it_df, en_df)``. Both frames have one row per input row, in
      input order, with the columns File, SentenceNumber, Sentence,
      Translation, TimeStamp, author, raw, authorEmoji, textEmoji, authorNon,
      textNon. ``it_df`` is derived from ``Sentence``, ``en_df`` from
      ``Translation``.
    """
    columns = ["File", "SentenceNumber", "Sentence", "Translation",
               "TimeStamp", "author", "raw", "authorEmoji", "textEmoji",
               "authorNon", "textNon"]

    # Each emoji helper call loads the emoji table again, and the same author
    # strings come back on almost every row. Memoising per distinct string for
    # the duration of this call avoids most of those reads; the result for a
    # given string does not change while the table file stays the same.
    normalise = lru_cache(maxsize=None)(emoji_normalise)
    strip_emojis = lru_cache(maxsize=None)(emoji_remove)

    def _record(file, sn, sent, trans, text):
        # One output row for `text`, which is either `sent` or `trans`.
        auth, msg = author_message(text)
        time_stamp = get_timestamp(text)
        return (file, sn, sent, trans, time_stamp, auth, msg,
                normalise(auth), normalise(msg),
                strip_emojis(auth), strip_emojis(msg))

    it_data = []
    en_data = []

    for i in tqdm(range(len(df))):
        file = df.File.iloc[i]
        sn = df.SentenceNumber.iloc[i]
        sent = df.Sentence.iloc[i]
        trans = df.Translation.iloc[i]

        it_data.append(_record(file, sn, sent, trans, sent))
        en_data.append(_record(file, sn, sent, trans, trans))

    it_df = pd.DataFrame(it_data, columns=columns)
    en_df = pd.DataFrame(en_data, columns=columns)

    return it_df, en_df

In [7]:
def get_binary(bull: pd.DataFrame):
    """
    Expand the annotations of every sentence and attach a binary label.

    Rows are grouped by (File, SentenceNumber). For each group the sentence
    text, translation, Sarcasm and NonOffensive values are taken from the
    group's first row, and one output row is produced for every combination
    of an EntityType value and a Role value found in the group. Exact
    duplicate rows are dropped; groups and rows keep their order of first
    appearance, so the result is the same on every run.

    ``Binary`` is 0 when the entity type is "Defense" or
    "Encouragement_to_the_Harasser" and the role is 'Victim', otherwise 1.

    Rows whose File or SentenceNumber is missing are skipped (pandas leaves
    missing group keys out by default).

    Args:
    - bull: DataFrame with the columns File, SentenceNumber, Sentence,
      Translation, EntityType, Role, Sarcasm and NonOffensive.

    Returns:
    - DataFrame with the columns File, SentenceNumber, Sentence, Translation,
      EntityType, Role, Sarcasm, NonOffensive, Binary.
    """
    label_zero_entities = {"Defense", "Encouragement_to_the_Harasser"}

    new_data = []

    # One pass over the groups instead of re-filtering the whole frame for
    # every column of every sentence.
    for (f, sn), rows in bull.groupby(["File", "SentenceNumber"], sort=False):
        it_text = rows.Sentence.tolist()[0]
        en_text = rows.Translation.tolist()[0]

        ent = rows.EntityType.tolist()
        rol = rows.Role.tolist()

        sar = rows.Sarcasm.tolist()[0]
        non = rows.NonOffensive.tolist()[0]

        for e in ent:
            for r in rol:
                # The label is decided per (entity, role) pair. Testing the
                # per-sentence lists as a whole against these strings can
                # never match, which would label every row 1.
                if (e in label_zero_entities) and (r == 'Victim'):
                    lab = 0
                else:
                    lab = 1

                new_data.append((f, sn, it_text, en_text, e, r, sar, non, lab))

    # Order-preserving de-duplication (a plain set would shuffle the rows).
    new_data = list(dict.fromkeys(new_data))

    new_df = pd.DataFrame(new_data, columns=["File", "SentenceNumber", "Sentence",
                                             "Translation", "EntityType", "Role",
                                             "Sarcasm", "NonOffensive", "Binary"])

    return new_df


In [8]:
def merge_frames(bull: pd.DataFrame, data: pd.DataFrame):
    """
    Left-join the annotation frame onto the data frame.

    Every row of ``data`` is kept; columns of ``bull`` are attached where
    File, SentenceNumber, Sentence and Translation all match, and are left
    missing where no row of ``bull`` matches.

    Args:
    - bull: DataFrame holding the columns to attach.
    - data: DataFrame whose rows are preserved.

    Returns:
    - The merged DataFrame.
    """
    join_keys = ["File", "SentenceNumber", "Sentence", "Translation"]

    return pd.merge(data, bull, how="left", on=join_keys)

In [9]:
def en_merge_mt(it_df, en_df):
    """
    Attach the MT output columns of ``it_df`` to the English-side fields of ``en_df``.

    For every row of ``it_df`` (in order, by position) the row of ``en_df``
    with the same (File, SentenceNumber) is looked up; when ``en_df`` holds
    several such rows the first one is used. The lookup is done on ``en_df``
    itself, so the two frames do not need to share an index or a row order.

    Args:
    - it_df: DataFrame with the columns File, SentenceNumber, mt_w, mt_wot,
      mt_w_b and mt_wot_b.
    - en_df: DataFrame with the columns File, SentenceNumber, Sentence,
      Translation, TimeStamp, author, raw, authorEmoji, textEmoji, authorNon
      and textNon.

    Returns:
    - DataFrame with the columns File, SentenceNumber, Sentence, Translation,
      TimeStamp, Author, en_raw, en_auth, en_emo, en_auth_non, en_non, mt_w,
      mt_wot, mt_w_b, mt_wot_b, one row per row of ``it_df``.

    Raises:
    - IndexError: if a (File, SentenceNumber) pair of ``it_df`` has no row in ``en_df``.
    """
    en_fields = ["Sentence", "Translation", "TimeStamp", "author", "raw",
                 "authorEmoji", "textEmoji", "authorNon", "textNon"]

    # First en_df row per key, built in one pass; this replaces filtering the
    # whole frame once per field for every row.
    first_match = {}
    for file, snum, *fields in zip(en_df["File"], en_df["SentenceNumber"],
                                   *(en_df[col] for col in en_fields)):
        first_match.setdefault((file, snum), tuple(fields))

    merged_list = []
    # zip walks the columns by position, so a non-default index on it_df is fine.
    for file, snum, mt_w, mt_wot, mt_w_b, mt_wot_b in zip(
            it_df["File"], it_df["SentenceNumber"],
            it_df["mt_w"], it_df["mt_wot"], it_df["mt_w_b"], it_df["mt_wot_b"]):
        try:
            en_values = first_match[(file, snum)]
        except KeyError:
            raise IndexError(
                f"no row in en_df for File={file!r}, SentenceNumber={snum!r}"
            ) from None

        merged_list.append((file, snum, *en_values, mt_w, mt_wot, mt_w_b, mt_wot_b))

    merged_df = pd.DataFrame(merged_list, columns=["File", "SentenceNumber", "Sentence",
                                                  "Translation", "TimeStamp", "Author",
                                                  "en_raw", "en_auth", "en_emo",
                                                  "en_auth_non", "en_non",
                                                  "mt_w", "mt_wot", "mt_w_b", "mt_wot_b"])

    return merged_df

In [10]:
def it_merge_mt(data_w_b, data_wot_b, data_w, data_wot, df):
    """
    Combine the Italian-side frame with four position-aligned MT outputs.

    Row ``i`` of the result is row ``i`` of ``df`` (by position) followed by
    element ``i`` of each MT sequence. The number of rows is ``len(data_w)``;
    the other three sequences and ``df`` must be at least that long.

    Args:
    - data_w_b: sequence stored in the ``mt_w_b`` column.
    - data_wot_b: sequence stored in the ``mt_wot_b`` column.
    - data_w: sequence stored in the ``mt_w`` column; its length drives the loop.
    - data_wot: sequence stored in the ``mt_wot`` column.
    - df: DataFrame with the columns File, SentenceNumber, Sentence,
      Translation, TimeStamp, author, raw, authorEmoji, textEmoji, authorNon
      and textNon.

    Returns:
    - DataFrame with the columns File, SentenceNumber, Sentence, Translation,
      TimeStamp, Author, it_raw, it_auth, it_emo, it_auth_non, it_non, mt_w,
      mt_wot, mt_w_b, mt_wot_b.
    """
    df_fields = ["File", "SentenceNumber", "Sentence", "Translation", "TimeStamp",
                 "author", "raw", "authorEmoji", "textEmoji", "authorNon", "textNon"]

    out_columns = ["File", "SentenceNumber", "Sentence",
                   "Translation", "TimeStamp", "Author",
                   "it_raw", "it_auth", "it_emo",
                   "it_auth_non", "it_non",
                   "mt_w", "mt_wot", "mt_w_b", "mt_wot_b"]

    def build_row(pos):
        # MT values first, then the frame fields, both taken by position.
        mt_part = (data_w[pos], data_wot[pos], data_w_b[pos], data_wot_b[pos])
        df_part = tuple(getattr(df, name).iloc[pos] for name in df_fields)
        return df_part + mt_part

    rows = [build_row(pos) for pos in range(len(data_w))]

    return pd.DataFrame(rows, columns=out_columns)

# Prepare Data for MT experiments

In [11]:
# Load the two merged CSV exports (all data / bully data).
# NOTE(review): neither frame is referenced again in the cells visible here; the
# same two files are read again further down as raw_merg_all / raw_merg_lab.
merged_all = pd.read_csv("merged/merged_allData.csv")

merged_lab = pd.read_csv("merged/merged_bullyData.csv")

In [12]:
# Inspect what is currently present in pre_process/ (shell command, output only).
!ls pre_process/

emoji_info.csv          [31mscenarioC.csv[m[m           [31mtrain.csv[m[m
raw_english_ordered.csv [34mscenario_wise[m[m           [31mvalidation.csv[m[m
raw_italian_ordered.csv [31mtest.csv[m[m


In [13]:

# List the CSV files in merged/ whose path contains neither 'merged_' nor 'C_'.
[f for f in glob("merged/*.csv") if ('merged_' not in f) and ('C_' not in f)]


['merged/D_allData.csv',
 'merged/A_allData.csv',
 'merged/B_allData.csv',
 'merged/B_bullyData.csv',
 'merged/A_bullyData.csv',
 'merged/D_bullyData.csv']

In [14]:
# Scenario A: load the all-data export and the bully-data export.
# a_bul is not used by the cells visible here.
a_all = pd.read_csv("merged/A_allData.csv")

a_bul = pd.read_csv("merged/A_bullyData.csv")

In [15]:
# Build the Italian-side and English-side frames for scenario A.
a_it_clean, a_en_clean = process(a_all)

100%|██████████| 1077/1077 [00:47<00:00, 22.57it/s]


In [16]:
# Scenario B: load the all-data export and the bully-data export.
# b_bul is not used by the cells visible here.
b_all = pd.read_csv("merged/B_allData.csv")

b_bul = pd.read_csv("merged/B_bullyData.csv")

In [17]:
# Build the Italian-side and English-side frames for scenario B.
b_it_clean, b_en_clean = process(b_all)

100%|██████████| 574/574 [00:27<00:00, 20.82it/s]


In [18]:
# Scenario C: load the all-data export and the bully-data export.
# c_bul is not used by the cells visible here.
c_all = pd.read_csv("merged/C_allData.csv")

c_bul = pd.read_csv("merged/C_bullyData.csv")

In [19]:
# Build the Italian-side and English-side frames for scenario C.
c_it_clean, c_en_clean = process(c_all)

100%|██████████| 130/130 [00:07<00:00, 16.88it/s]


In [20]:
# Scenario D: load the all-data export and the bully-data export.
# d_bul is not used by the cells visible here.
d_all = pd.read_csv("merged/D_allData.csv")

d_bul = pd.read_csv("merged/D_bullyData.csv")

In [21]:
# Build the Italian-side and English-side frames for scenario D.
d_it_clean, d_en_clean = process(d_all)

100%|██████████| 411/411 [00:19<00:00, 20.82it/s]


In [22]:
# Load the merged exports covering all scenarios; raw_merg_all is the input of
# the process() call below. raw_merg_lab is not used by the cells visible here.
raw_merg_all = pd.read_csv("merged/merged_allData.csv")

raw_merg_lab = pd.read_csv("merged/merged_bullyData.csv")

In [23]:
# Quick look at the first rows of the merged data.
raw_merg_all.head()

Unnamed: 0,File,SentenceNumber,Sentence,Translation
0,A_1,0,Vittima : Ciao . . . volevo invitarvi al saggi...,Vittima : Hi . . . I wanted to invite you to t...
1,A_1,1,Vittima : WA0001 . jpg ( file allegato ),Vittima : WA0001 . jpg (file attached)
2,A_1,2,‪SupportoVittima1‬ : Chiedo a mia mamma e to f...,‪SupportoVittima1 : I’ll ask my mom and I'll l...
3,A_1,3,Vittima : Grazie SupportoVittima1 !,Vittima : Thanks SupportoVittima1!
4,A_1,4,‪Bullo1‬ : Ah un saggio di danza,‪Bullo1 : Ah a dance recital


In [24]:
# Build the Italian-side and English-side frames for the merged data of all scenarios.
rawMerge_it_clean, rawMerge_en_clean = process(raw_merg_all)

100%|██████████| 2192/2192 [01:41<00:00, 21.51it/s]


In [25]:
# Save the English-side frame of the merged data (row order as produced by process()).
rawMerge_en_clean.to_csv("pre_process/raw_english_ordered.csv", index=False)

In [26]:
# This file is the input handed to the MT systems.

rawMerge_it_clean.to_csv("pre_process/raw_italian_ordered.csv", index=False)

In [27]:
# Save the Italian-side frames, one file per scenario.
a_it_clean.to_csv("pre_process/scenario_wise/A_it_clean.csv", index=False)

b_it_clean.to_csv("pre_process/scenario_wise/B_it_clean.csv", index=False)

c_it_clean.to_csv("pre_process/scenario_wise/C_it_clean.csv", index=False)

d_it_clean.to_csv("pre_process/scenario_wise/D_it_clean.csv", index=False)


In [28]:
# Save the English-side frames, one file per scenario.
a_en_clean.to_csv("pre_process/scenario_wise/A_en_clean.csv", index=False)

b_en_clean.to_csv("pre_process/scenario_wise/B_en_clean.csv", index=False)

c_en_clean.to_csv("pre_process/scenario_wise/C_en_clean.csv", index=False)

d_en_clean.to_csv("pre_process/scenario_wise/D_en_clean.csv", index=False)


# MT Data

In [29]:
# raw_italian_ordered.csv was shared as the input for the MT experiments; the cells below load all MT outputs and combine them with it.

In [30]:
'''
only it-en uses emojis
Test-better system is 
'''

with open("mt_output/it-en.test_better.hyp.v2", "r") as file:
    data_w_b = file.readlines()

In [31]:
'''
only it1-en does not use any emojis as they are replaced with " " (blank space)
Test is not normal system
'''

with open("mt_output/it-en.test.hyp.v2", "r") as file:
    data_w = file.readlines()

In [32]:


with open("mt_output/it1-en.test.hyp.v2", "r") as file:
    data_wot = file.readlines()

In [33]:
with open("mt_output/it1-en.test_better.hyp.v2", "r") as file:
    data_wot_b = file.readlines()

In [34]:
# Sanity check: the four hypothesis lists are paired by position in it_merge_mt,
# so they should all have the same length.
len(data_w), len(data_wot), len(data_w_b), len(data_wot_b)

(2192, 2192, 2192, 2192)

In [35]:
# Attach the four MT outputs to the Italian-side frame, row by row.
it_merged_all = it_merge_mt(data_w_b, data_wot_b, data_w, data_wot, rawMerge_it_clean)

In [36]:
# Same MT columns, combined with the English-side fields.
en_merged_all = en_merge_mt(it_merged_all, rawMerge_en_clean)

# Split data

In [114]:
def get_split(df, test_size=0.2, val_size=0.2, random_state=42):
    """
    Split a frame into train, validation and test parts.

    The test part is split off first; the validation part is then taken from
    what remains, so ``val_size`` is a share of the non-test rows, not of the
    whole frame. With the defaults this gives roughly 64% / 16% / 20%.

    Args:
    - df: the data to split (anything ``train_test_split`` accepts).
    - test_size: share of ``df`` that goes to the test part.
    - val_size: share of the remaining rows that goes to the validation part.
    - random_state: seed used for both splits, for reproducibility.

    Returns:
    - Tuple ``(train, val, test)``.
    """
    train_val, test = train_test_split(df, test_size=test_size, random_state=random_state)
    train, val = train_test_split(train_val, test_size=val_size, random_state=random_state)

    return train, val, test

In [52]:
from sklearn.model_selection import train_test_split

In [112]:
# Keep only the (File, SentenceNumber) keys and separate the rows whose File
# contains the letter 'C' from all the others.
# NOTE(review): str.contains('C') matches a 'C' anywhere in the file name -
# presumably only scenario-C files contain one; confirm against the file names.
all_but_c = en_merged_all[['File', 'SentenceNumber']][~(en_merged_all.File.str.contains('C'))]

only_c = en_merged_all[['File', 'SentenceNumber']][(en_merged_all.File.str.contains('C'))]

In [113]:
# The two parts together should account for every row of en_merged_all.
len(all_but_c), len(only_c)

(2062, 130)

In [115]:
# Train/validation/test split over everything except the rows selected as scenario C.
train, val, test = get_split(all_but_c)

In [116]:
# Only the (File, SentenceNumber) keys are stored for each split.
train.to_csv("train_val_test/train.csv", index=False)

In [117]:
val.to_csv("train_val_test/validation.csv", index=False)

In [118]:
test.to_csv("train_val_test/test.csv", index=False)

In [119]:
# The scenario-C rows are written to their own file and are not part of the split above.
only_c.to_csv("train_val_test/scenario_c.csv", index=False)