In [None]:
!ls 

In [1]:
from glob import glob
import pandas as pd
from collections import Counter


In [2]:
trans_files = glob("translated/*")


In [3]:
orig_files = glob("raw/*.csv")

In [23]:
def merge_data(fnames: list, raw_text_df: pd.DataFrame, trans_text_df: pd.DataFrame, raw_lab_df: pd.DataFrame):
    
    """
    Merges data from multiple DataFrames based on file names, generating a combined DataFrame.

    Args:
    - fnames: A list of file names.
    - raw_text_df: DataFrame containing raw text data.
    - trans_text_df: DataFrame containing translated text data.
    - raw_lab_df: DataFrame containing raw label data.

    Returns:
    - merged_df: A Pandas DataFrame containing the merged data.
    """


    fin_data = list()
    
    # Iterate through file names
    for f in fnames:

        temp_text = raw_text_df[raw_text_df['File']==f]
        trans_text = trans_text_df[trans_text_df['File']==f]
        temp_lab = raw_lab_df[raw_lab_df['File']==f]
        temp_text_sns = list(temp_text.SentenceNumber.unique()) #https://pandas.pydata.org/docs/reference/api/pandas.unique.html

        lab_sn = temp_lab['SentenceNumber'].unique()

        for sn in temp_text_sns:
            
            it_text = temp_text.Sentence[temp_text.SentenceNumber==sn].values[0] #values returns a numpy array, values[0] returns first element
            en_text = trans_text.Translation[trans_text.SentenceNumber==sn].values[0]
            
            if sn in lab_sn:
                lab_len = len(temp_lab.EntityType[temp_lab.SentenceNumber==sn])
                
                if lab_len>=1:

                    ent_list = temp_lab.EntityType[temp_lab.SentenceNumber==sn].values.tolist() #tolist() converts numpy array to list
                    rol_list = temp_lab.Role[temp_lab.SentenceNumber==sn].values.tolist()
                    s = temp_lab.Sarcasm[temp_lab.SentenceNumber==sn].values[0]
                    n = temp_lab['Non-Offensive'][temp_lab.SentenceNumber==sn].values[0]

                    for e in ent_list:
                        for r in rol_list:
                            tup = (f, sn, it_text, en_text, e, r, s, n) #tuple with f-filename, sn-sentencenumber, e-entity, r-role, s-sarcasm, n-nonoffensive
                            fin_data.append(tup)
                        
#             else: #if label does not exist use 'None' instead of encountering NaN (missing)
#                 e = 'None'
#                 r = 'None'
#                 s = 'None'
#                 n = 'None'
                
#                 tup = (f, sn, it_text, en_text, e, r, s, n)
#                 fin_data.append(tup)
                
    
    return pd.DataFrame(fin_data, columns=["File", "SentenceNumber", "Sentence", "Translation",
                                           "EntityType", "Role", "Sarcasm", "NonOffensive"])

In [24]:
def get_data(translated_file_path: list, original_file_path: list, file_type: str):
    """
    Retrieve and process data from files to create labeled and translated text DataFrames.

    Args:
    - translated_file_path: List of file paths for translated files.
    - original_file_path: List of file paths for original files.
    - file_type: A string indicating the type of file.

    Returns:
    - label_df: A Pandas DataFrame containing merged label data.
    - text_df: A Pandas DataFrame containing processed text data.
    
    """
    
    trans_path = [f for f in translated_file_path if file_type in f]
    orig_path = [f for f in original_file_path if file_type in f]
    
    trans_p = [trans_p for trans_p in trans_path if 'all' in trans_p][0]
        
    trans_text_df = pd.read_excel(trans_p)
    
    for orig_p in orig_path:
        if 'all' in orig_p:
            raw_text_df = pd.read_csv(orig_p)
        
        else:
            raw_lab_df = pd.read_csv(orig_p)
            
    fnames = list(raw_text_df.File.unique())
    
    text_data = []
    
    for i in range(len(raw_text_df)):
        file = raw_text_df.File.iloc[i] #https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.iloc.html
        sn = raw_text_df.SentenceNumber.iloc[i]
        it_text = raw_text_df.Sentence.iloc[i]
        en_txt_list = trans_text_df.Translation[(trans_text_df.File==file)&(trans_text_df.SentenceNumber==sn)&(trans_text_df.Sentence==it_text)].values.tolist()
        if(len(en_txt_list)<1):
            '''
            When mapping translated files with original files, 
            it was found that {File: "A_1", SentenceNumber: 14} characters in italian sentence in translation file were edited/altered.
            Checked translation with native speaker. Looks okay.
            it was found that {File:"D_2", SentenceNumber:[42, 63, 68, 69]} characters in italian sentence in translation file were edited/altered.
            Checked translation with native speaker. Looks okay.
            
            Checked using below,
            print(trans_text_df[(trans_text_df.File==file)&(trans_text_df.SentenceNumber==sn)])
            print(file, sn, text)
            print(trans_text_df[(trans_text_df.File==file)&(trans_text_df.SentenceNumber==sn)])
            print("\n")
            
            So re-aligning translation text for above File and SentenceNumber
            '''
            en_text = trans_text_df[(trans_text_df.File==file)&(trans_text_df.SentenceNumber==sn)].values[0]
            tup = (file, sn, it_text, en_text)
            text_data.append(tup)
            
        else:
            
            tup = (file, sn, it_text, en_txt_list[0])
            text_data.append(tup)
            
    #text_data = list(set(text_data))
    
    text_df = pd.DataFrame(text_data, columns=["File", "SentenceNumber", "Sentence", "Translation"])    
        
    label_df = merge_data(fnames, raw_text_df, text_df, raw_lab_df)
    
    label_df = label_df.sort_values(["File", "SentenceNumber"]) #https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_values.html
    
    return label_df, text_df
    

In [40]:
def count_labels(label_list: list):
    """
    Counts the occurrences of each label in the list and returns a dictionary.

    Args:
    - label_list: The input list containing labels to be counted.

    Returns:
    - count_dict: A dictionary with elements as keys and their counts as values.
    """
    count_dict = {}  
    
    # Iterate through the elements in the list
    for elem in label_list:  
        if elem in count_dict:  # Check if element is already in the dictionary
            count_dict[elem] += 1  
        else:
            count_dict[elem] = 1  # Initialize count as 1 for a new element
    
    return count_dict  


In [81]:
def save(text_df: pd.DataFrame, label_df: pd.DataFrame, file_name: str):
    fname = 'merged/'+file_name
    text_df.to_csv(fname+"_allData.csv", index=False)
    label_df.to_csv(fname+"_bullyData.csv", index=False)
    
    print("¡¡¡¡¡\tSaved\t¡¡¡¡¡")

In [82]:
a_label_df, a_text_df = get_data(trans_files, orig_files, 'A_')

In [83]:
b_label_df, b_text_df = get_data(trans_files, orig_files, 'B_')

In [84]:
c_label_df, c_text_df = get_data(trans_files, orig_files, 'C_')

In [85]:
d_label_df, d_text_df = get_data(trans_files, orig_files, 'D_')

In [86]:
save(a_text_df, a_label_df, 'A')

¡¡¡¡¡	Saved	¡¡¡¡¡


In [87]:
save(b_text_df, b_label_df, 'B')

¡¡¡¡¡	Saved	¡¡¡¡¡


In [88]:
save(c_text_df, c_label_df, 'C')

¡¡¡¡¡	Saved	¡¡¡¡¡


In [89]:
save(d_text_df, d_label_df, 'D')

¡¡¡¡¡	Saved	¡¡¡¡¡


In [90]:
label_df = pd.concat([a_label_df, b_label_df, c_label_df, d_label_df])
text_df = pd.concat([a_text_df, b_text_df, c_text_df, d_text_df])

In [91]:
save(text_df, label_df, 'merged')

¡¡¡¡¡	Saved	¡¡¡¡¡


In [None]:
# Preserve Token-wise Labels



In [33]:
count =0 

for f in ['A_','B_','C_','D_']:
    snums = label_df.SentenceNumber[label_df.File.str.contains(f)].unique()
    for sn in snums:
        ent_label = label_df.EntityType[(label_df.File.str.contains(f))&(label_df.SentenceNumber==sn)]
        #ent_label = [lab for lab in label_df.EntityType[(label_df.File.str.contains(f))&(label_df.SentenceNumber==sn)].unique() if lab!='None']
        counter = Counter(ent_label)
        
        if len(ent_label) > 1:
            
            print(counter)
            count += 1
            
    

Counter({'Insult-Discrimination-Sexism': 4, 'Curse_or_Exclusion': 3, 'Insult-General_Insult': 3, 'Defense': 1})
Counter({'Insult-Discrimination-Sexism': 1, 'Defense': 1, 'Insult-General_Insult': 1})
Counter({'Insult-Discrimination-Sexism': 1, 'Defense': 1, 'Insult-General_Insult': 1})
Counter({'Insult-General_Insult': 3})
Counter({'Defense': 1, 'Insult-General_Insult': 1})
Counter({'Defense': 2, 'Insult-General_Insult': 1})
Counter({'Insult-Discrimination-Sexism': 2, 'Defense': 1})
Counter({'Encouragement_to_the_Harasser': 1, 'Defense': 1, 'Insult-General_Insult': 1})
Counter({'Insult-General_Insult': 2, 'Insult-Discrimination-Sexism': 1})
Counter({'Defense': 3, 'Insult-Discrimination-Sexism': 1})
Counter({'Insult-General_Insult': 1, 'Threat_or_Blackmail': 1, 'Defense': 1})
Counter({'Defense': 3, 'Encouragement_to_the_Harasser': 1})
Counter({'Insult-Discrimination-Sexism': 1, 'Curse_or_Exclusion': 1})
Counter({'Defense': 3})
Counter({'Threat_or_Blackmail': 1, 'Insult-BodyShame': 1})
Co

Counter({'Insult-General_Insult': 2, 'Threat_or_Blackmail': 2, 'Defense': 1})
Counter({'Insult-General_Insult': 5, 'Curse_or_Exclusion': 5, 'Defense': 3})
Counter({'Insult-General_Insult': 1, 'Defense': 1})
Counter({'Defense': 1, 'Curse_or_Exclusion': 1})
Counter({'Insult-General_Insult': 1, 'Threat_or_Blackmail': 1})
Counter({'Defense': 1, 'Insult-General_Insult': 1})
Counter({'Curse_or_Exclusion': 1, 'Insult-General_Insult': 1})
Counter({'Insult-General_Insult': 1, 'Threat_or_Blackmail': 1})
Counter({'Encouragement_to_the_Harasser': 1, 'Defense': 1})
Counter({'Insult-General_Insult': 1, 'Defense': 1})
Counter({'Defense': 1, 'Insult-General_Insult': 1})
Counter({'Curse_or_Exclusion': 2})
Counter({'Curse_or_Exclusion': 3, 'Insult-General_Insult': 2})
Counter({'Defense': 1, 'Insult-General_Insult': 1})
Counter({'Defense': 2, 'Curse_or_Exclusion': 2, 'Insult-General_Insult': 1})
Counter({'Insult-General_Insult': 1, 'Defense': 1})
Counter({'Insult-General_Insult': 1, 'Defense': 1})
Counte

In [32]:
count

260

In [None]:
len(label_df), len(text_df)