# Importing Project Dependencies

In [21]:
import json #https://docs.python.org/3/library/json.html
import os #https://docs.python.org/3/library/os.html
import re
from collections import Counter #https://docs.python.org/3/library/collections.html
from glob import glob #https://docs.python.org/3/library/glob.html

import bs4
import pandas as pd #https://pandas.pydata.org/docs/user_guide/10min.html
from bs4 import BeautifulSoup #https://pypi.org/project/beautifulsoup4/

# Helper Functions

In [2]:
def get_xml_data(path: str):
    '''
    Parse one annotated XML file and return its token and annotation elements.

    The file is read as raw bytes so the XML parser can honour the encoding
    declared in the document itself, instead of decoding with whatever the
    platform's default text encoding happens to be.

    Args:
    @param: path - Path to the XML file

    Returns: token_info - List of <token> elements from the XML
             bull_info - List of <BULLISM_INSTANCE> elements from the XML
    '''

    # Binary mode: leave decoding to the parser (see docstring).
    with open(path, "rb") as xml_file:
        xml_content = xml_file.read()

    # Parse XML content https://tedboy.github.io/bs4_doc/_modules/bs4.html#BeautifulSoup
    xml_data = BeautifulSoup(xml_content, "xml")

    # Every token of the document, annotated or not.
    token_info = xml_data.find_all("token")

    # The annotation elements; each one points at tokens through <token_anchor> children.
    bull_info = xml_data.find_all("BULLISM_INSTANCE")

    return token_info, bull_info


In [3]:
def sort_data(data: dict):
    """
    Return a re-ordered copy of a two-level dictionary.

    Outer keys (sentence numbers) and inner keys (token numbers) are both
    ordered by their integer value; the keys themselves are kept as they are.

    Args:
    - data (dict): Mapping of sentence number -> {token number: value}

    Returns:
    - sorted_data (dict): New dictionary with both levels in ascending numeric order
    """
    def as_number(key):
        # Keys are ordered by numeric value, so "10" comes after "2".
        return int(key)

    return {
        sentence_num: {
            token_num: data[sentence_num][token_num]
            for token_num in sorted(data[sentence_num], key=as_number)
        }
        for sentence_num in sorted(data, key=as_number)
    }

In [4]:
def xml_to_dict(orig_files: list):
    '''
    Extract per-file token text and annotated-token text from XML files.

    For every path in `orig_files` the token and BULLISM_INSTANCE elements are
    fetched with `get_xml_data`. Each token is filed under its sentence number
    and token id. Tokens whose id is referenced by a `token_anchor` of any
    BULLISM_INSTANCE are additionally filed in the label structure. Both
    structures are ordered with `sort_data` before being stored.

    Input:
        - orig_files: List of file paths to XML files

    Returns:
        - text_dict: {file name: {sentence number: {token id: token text}}}
                     covering every token of the file
        - label_dict: same layout, restricted to the annotated tokens

    Notes:
        - The file-name key is the path's base name up to its first '.'.
        - Progress messages are printed for every processed file.
    '''

    text_dict = {}
    label_dict = {}

    for file in orig_files:

        # basename() understands the platform's separators, so paths returned
        # by glob on Windows (backslashes) yield the same key as POSIX paths.
        fname = os.path.basename(file).split('.')[0]

        print("Processing\t:", fname)

        # Extract token and BULLISM_INSTANCE elements from the XML file
        token_info, bull_info = get_xml_data(file)

        print("XML to dict converted...")
        print('Now getting tokens and labels...')

        # Collect every anchored token id once per file; the set does not
        # depend on the token being inspected, so there is no reason to
        # rebuild it inside the token loop.
        anchored_ids = {
            anchor.get("t_id")
            for instance in bull_info
            for anchor in instance.find_all("token_anchor")
        }

        # sentence number -> {token id -> token text}
        text_data_dict = {}
        label_data_dict = {}

        for tok in token_info:
            t_id = tok.get("t_id")
            s_n = tok.get("sentence")
            token = tok.text

            text_data_dict.setdefault(s_n, {})[t_id] = token

            # A token counts as labelled when any annotation anchors it.
            if t_id in anchored_ids:
                label_data_dict.setdefault(s_n, {})[t_id] = token

        # Store numerically ordered text and label data for the file
        text_dict[fname] = sort_data(text_data_dict)
        label_dict[fname] = sort_data(label_data_dict)

        print("Done..")
        print("\n")

    print("Fin !")

    return text_dict, label_dict
    

In [5]:
def process_data(data: list, sorted_label_dict: dict):
    """
    Match JSON annotations to the labelled XML tokens and build a label table.

    Args:
    - data: A list of dictionaries containing 'annotation' and 'text' keys.
            Each annotation carries 'entity-type', 'role-type', 'sarcasm',
            'non-offensive' and a 'span' list of token references.
    - sorted_label_dict: {file name: {sentence number: {token id: token}}}
                         as produced for the labelled tokens.

    Returns:
    - label_count: Number of records that have at least one annotation.
    - non_count: Number of records without annotations.
    - label_df_2: DataFrame returned by `process_label_data` for the matched
                  annotations.

    Raises:
    - KeyError: if a record/annotation lacks an expected key, or a span refers
                to a file name that is not present in `sorted_label_dict`.
    """
    label_count = 0  # records with annotations
    non_count = 0  # records without annotations
    labels = []  # one tuple per (span, sentence) match

    for record in data:
        annot_list = record['annotation']
        text = record['text']

        if len(annot_list) == 0:
            non_count += 1
            continue

        label_count += 1
        for ann in annot_list:
            entity_type = ann['entity-type']
            role_type = ann['role-type']
            sarcasm = ann['sarcasm']
            non_offensive = ann['non-offensive']

            for span in ann['span']:
                # The last path component is split around '.xml': the part
                # before it is the file name, the part after it the token id.
                parts = span.split("/")[-1].split('.xml')
                fname = parts[0]
                t_id = parts[-1]

                # Record every sentence of that file containing the token id.
                for sn, tokens in sorted_label_dict[fname].items():
                    if t_id in tokens:
                        labels.append(
                            (fname, sn, t_id, ' '.join(text),
                             entity_type, role_type, sarcasm, non_offensive)
                        )

    # De-duplicate while keeping first-seen order. A plain set() would shuffle
    # the rows differently on every run (string hashing is randomised), and the
    # downstream step picks "first" values per sentence.
    labels = list(dict.fromkeys(labels))

    label_df = pd.DataFrame(
        labels,
        columns=["File", "SentenceNumber", "Token-Id", "Sentence",
                 "EntityType", "Role", "Sarcasm", "Non-Offensive"],
    )

    label_df.SentenceNumber = label_df.SentenceNumber.astype(int)

    fnames = list(label_df.File.unique())

    label_df_2 = process_label_data(label_df, fnames)

    return label_count, non_count, label_df_2


In [6]:
def process_label_data(label_df: pd.DataFrame, fnames: list):
    """
    Collapse token-level label rows into sentence-level label rows.

    For every file in `fnames` and every distinct sentence number of that
    file, one row is produced per combination of the sentence's distinct
    entity types and roles. Sarcasm, Non-Offensive and Sentence take the first
    distinct value found for that sentence.

    Args:
    - label_df: A Pandas DataFrame with columns File, SentenceNumber, Sentence,
                EntityType, Role, Sarcasm and Non-Offensive (extra columns are
                ignored).
    - fnames: A list of file names to process.

    Returns:
    - label_df: DataFrame with columns File, SentenceNumber, Sentence,
                EntityType, Role, Sarcasm, Non-Offensive, sorted by File and
                SentenceNumber with duplicate rows removed.
    """
    rows = []

    for f in fnames:
        file_rows = label_df[label_df.File == f]

        # Visit each sentence once. Iterating the raw column would repeat the
        # same work for every token row of the sentence, only to have the
        # duplicates dropped again at the end.
        for n in file_rows.SentenceNumber.unique().tolist():
            sent_rows = file_rows[file_rows.SentenceNumber == n]

            ent = sent_rows.EntityType.unique()
            rol = sent_rows.Role.unique()
            sar = sent_rows.Sarcasm.unique()
            nof = sent_rows["Non-Offensive"].unique()
            sent = sent_rows.Sentence.unique()[0]

            for e in ent:
                for r in rol:
                    rows.append((f, n, sent, e, r, sar[0], nof[0]))

    label_df = pd.DataFrame(
        rows,
        columns=["File", "SentenceNumber", "Sentence", "EntityType",
                 "Role", "Sarcasm", "Non-Offensive"],
    )

    label_df = label_df.sort_values(["File", "SentenceNumber"])

    # Guards against repeated names in `fnames`.
    label_df = label_df.drop_duplicates()

    return label_df


In [7]:
def convert_to_dataframe(nested_dict: dict):
    """
    Flatten a file -> sentence -> token dictionary into one row per sentence.

    Args:
    - nested_dict: {file name: {sentence number: {token id: token text}}}

    Returns:
    - df: A Pandas DataFrame with columns File, SentenceNumber, Sentence,
          where Sentence is the sentence's token texts joined by single spaces
          in the dictionary's iteration order. Token ids are not part of the
          result. Sentences without tokens produce no row.
    """
    records = []

    # The nested dictionary already groups tokens by file and sentence, so the
    # sentence strings can be built in a single pass.
    for file_name, sentences in nested_dict.items():
        for sent_number, tokens in sentences.items():
            if not tokens:
                continue
            records.append((file_name, sent_number, ' '.join(tokens.values())))

    return pd.DataFrame(records, columns=["File", "SentenceNumber", "Sentence"])


In [8]:
def split_frames(df: pd.DataFrame):
    """
    Split a table into four tables according to the letter in the file name.

    Args:
    - df: DataFrame with at least the columns File (string) and
          SentenceNumber (convertible to int).

    Returns:
    - a_df, b_df, c_df, d_df: the rows whose File contains 'A', 'B', 'C' and
      'D' respectively, each sorted by File and (integer) SentenceNumber.

    The input frame is left untouched; the integer conversion is applied to a
    copy so that re-running a cell or reusing `df` elsewhere is safe.
    """
    typed = df.assign(SentenceNumber=df.SentenceNumber.astype(int))

    def rows_for(letter):
        # Substring match on the file name, then order rows for output.
        selected = typed[typed.File.str.contains(letter)]
        return selected.sort_values(["File", "SentenceNumber"])

    a_df = rows_for('A')
    b_df = rows_for('B')
    c_df = rows_for('C')
    d_df = rows_for('D')

    return a_df, b_df, c_df, d_df
    
    

In [9]:
def save_csv(df, fname):
    """
    Write a DataFrame to a CSV file without the index column.

    Args:
    - df: DataFrame to write.
    - fname: Destination path (or any target accepted by `DataFrame.to_csv`).

    Returns:
    - The string "saved !" once the file has been written.

    When `fname` is a path, missing parent directories are created first so
    the notebook also runs on a fresh checkout where the output folder does
    not exist yet.
    """
    if isinstance(fname, (str, os.PathLike)):
        parent = os.path.dirname(fname)
        if parent:
            os.makedirs(parent, exist_ok=True)

    df.to_csv(fname, index=False)
    return "saved !"

# Generate Raw CSV Files from the Annotated Corpus

In [10]:
# Directory holding the annotated XML corpus. The trailing slash matters:
# the value is concatenated directly with a glob pattern.
orig_path = "WhatsApp-Dataset/Annotated_Corpus/"

# Prefix for the generated CSV files. It is concatenated directly with each
# output file name.
save_path = "raw/"

In [11]:
# glob() returns matches in arbitrary order; sort them so files are processed
# (and logged) in the same order on every machine and run.
orig_files = sorted(glob(orig_path+"*.xml"))

In [12]:
# Take RAW JSON file created by authors and shared over email.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default encoding (which is not UTF-8 everywhere).
with open("data_whatsapp.json", "r", encoding="utf-8") as file:
    data = json.load(file)

In [13]:
# Parse every XML file: the first dict holds all tokens, the second only the
# tokens anchored by an annotation, both keyed by file name.
sorted_text_dict, sorted_label_dict = xml_to_dict(orig_files)

Processing	: A_4
XML to dict converted...
Now getting tokens and labels...
Done..


Processing	: C_2
XML to dict converted...
Now getting tokens and labels...
Done..


Processing	: A_1
XML to dict converted...
Now getting tokens and labels...
Done..


Processing	: A_3
XML to dict converted...
Now getting tokens and labels...
Done..


Processing	: C_1
XML to dict converted...
Now getting tokens and labels...
Done..


Processing	: A_2
XML to dict converted...
Now getting tokens and labels...
Done..


Processing	: D_1
XML to dict converted...
Now getting tokens and labels...
Done..


Processing	: D_2
XML to dict converted...
Now getting tokens and labels...
Done..


Processing	: B_1
XML to dict converted...
Now getting tokens and labels...
Done..


Processing	: B_2
XML to dict converted...
Now getting tokens and labels...
Done..


Fin !


In [14]:
# Sanity check: number of files present in each dictionary.
len(sorted_text_dict), len(sorted_label_dict)

(10, 10)

In [15]:
# Match the JSON annotations against the labelled XML tokens; yields the
# annotated / unannotated record counts and the sentence-level label table.
label_count, non_count, label_df = process_data(data, sorted_label_dict)

In [16]:
# Records with at least one annotation vs. records with none.
label_count, non_count

(1042, 1150)

In [17]:
# One row per (file, sentence), with the sentence's tokens joined by spaces.
text_df = convert_to_dataframe(sorted_text_dict)

In [18]:
# Split the sentence table by the letter (A/B/C/D) contained in the file name.
a_df, b_df, c_df, d_df = split_frames(text_df)

In [19]:
# Same split for the label table.
a_lab_df, b_lab_df, c_lab_df, d_lab_df = split_frames(label_df)

In [20]:
# All sentences of the "A" files.
save_csv(a_df, save_path+'A_allData.csv')

'saved !'

In [21]:
# Label rows of the "A" files.
save_csv(a_lab_df, save_path+'A_bullyData.csv')

'saved !'

In [22]:
# All sentences of the "B" files.
save_csv(b_df, save_path+'B_allData.csv')

'saved !'

In [23]:
# Label rows of the "B" files.
save_csv(b_lab_df, save_path+'B_bullyData.csv')

'saved !'

In [24]:
# All sentences of the "C" files.
save_csv(c_df, save_path+'C_allData.csv')

'saved !'

In [25]:
# Label rows of the "C" files.
save_csv(c_lab_df, save_path+'C_bullyData.csv')

'saved !'

In [26]:
# All sentences of the "D" files.
save_csv(d_df, save_path+'D_allData.csv')

'saved !'

In [27]:
# Label rows of the "D" files.
save_csv(d_lab_df, save_path+'D_bullyData.csv')

'saved !'