# Description
**Functionality**: This module formats the three-way split (train/valid/test) Wikipedia Homograph Data (WHD) for BERT token classification using Huggingface tools.

**Use**: The BERT model fine-tuning functionality from Huggingface expects CoNLL03-formatted data. The output from this module will be used to fine-tune BERT to predict pronunciation labels on homographs.

### Imports

In [1]:
import os
from glob import glob
import pandas as pd
from tqdm import tqdm
from typing import List, Dict
import spacy

### Variables

In [2]:
# Paths
# Root folder of the Wikipedia Homograph Data; every other path is built from it.
WHD_DATA = "C:/Users/jseal/Dev/dissertation/Data/WikipediaHomographData/data/"
METADATA = WHD_DATA + 'WikipediaHomographData.csv'
# Source paths
TRAIN = WHD_DATA + "three_split_data/train/"
VAL = WHD_DATA + "three_split_data/valid/"
TEST = WHD_DATA + "three_split_data/test/"
# Destination paths
BERT_TRAIN = WHD_DATA + "bert_data/train/"
BERT_DEV = WHD_DATA + "bert_data/dev/"
BERT_TEST = WHD_DATA + "bert_data/test/"
# (source, destination) path pairs, in train / valid / test order
ORIGINAL_SETS = [TRAIN, VAL, TEST]
BERT_SETS = [BERT_TRAIN, BERT_DEV, BERT_TEST]
# Materialized as a list: a bare zip() is a one-shot iterator, so a second
# pass over it (e.g. re-running the script cell) would find nothing to process.
SOURCE_DEST = list(zip(ORIGINAL_SETS, BERT_SETS))

# Options
# Do not truncate DataFrame displays: show every row
pd.set_option('display.max_rows', None)

# Variables
OUTSIDE = "O"  # Label given to every token that is not a homograph

# Tools
# spaCy English pipeline, used by get_tokens to tokenize sentences
nlp = spacy.load('en_core_web_sm')

### Functions

In [3]:
def get_tokens(sentence : str) -> List[str]:
    """Split a sentence into word tokens, leaving out punctuation.

    Only the spaCy tokenizer is run (via ``nlp.make_doc``). The token text and
    the ``is_punct`` flag are both set at tokenization time, so the tagger,
    parser, NER and any other pipeline components are not needed here.

    Args:
        sentence: Raw sentence text.

    Returns:
        The text of each non-punctuation token, in sentence order.
    """
    doc = nlp.make_doc(sentence)
    return [token.text for token in doc if not token.is_punct]

def make_str(label : List) -> str:
    """Collapse a list of label strings into one space-separated string."""
    separator = ' '
    joined = separator.join(label)
    return joined

# Script

In [4]:
# Convert every source .tsv into one token-per-line file per sentence.
# Each output line is: sent_id <TAB> token <TAB> label, where label is the
# wordid for the homograph token and OUTSIDE for every other token.
for source_dir, dest_dir in SOURCE_DEST: # Do this for train, valid, test
    # DataFrame.to_csv does not create missing folders, so make sure the destination exists
    os.makedirs(dest_dir, exist_ok=True)
    for f in tqdm(glob(source_dir + '*.tsv')):
        f_stem = os.path.splitext(os.path.basename(f))[0] # File name without the .tsv extension
        # .copy() so the token column is added to an independent frame rather than a slice
        sentences = pd.read_table(f)[['homograph', 'wordid', 'sentence']].copy()
        sentences['token'] = sentences.sentence.apply(get_tokens)
        tokens = sentences.explode('token') # One row per token; the index still identifies the sentence
        # A sentence with no non-punctuation tokens explodes to a single NaN token,
        # which cannot be lowercased or labelled; such sentences are skipped.
        tokens = tokens.dropna(subset=['token'])
        for index, group in tokens.groupby(tokens.index): # Create one file per sentence
            rows = [
                {
                    'sent_id': "{}_{}".format(homograph, index),
                    'token': token,
                    # The lowercased token matching the homograph gets the wordid; everything else is 'outside'
                    'label': wordid if token.lower() == homograph else OUTSIDE,
                }
                for token, homograph, wordid in zip(group['token'], group['homograph'], group['wordid'])
            ]
            sentence_df = pd.DataFrame(rows, columns=['sent_id', 'token', 'label'])
            new_f_name = dest_dir + f_stem + "_" + str(index) + '.txt' # Name file with homograph and sentence number
            sentence_df.to_csv(new_f_name, sep="\t", header=False, index=False)
       

100%|████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:47<00:00,  3.39it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:05<00:00, 27.40it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:06<00:00, 24.90it/s]


### Make labels.txt

In [5]:
# Write the label inventory for token classification: one label per line,
# every wordid listed in the metadata followed by the 'outside' label.
metadata_df = pd.read_csv(METADATA)
# dict.fromkeys drops repeated wordids while keeping first-seen order,
# so each label is written exactly once.
wordids = list(dict.fromkeys(metadata_df.wordid.tolist()))
out_file = WHD_DATA + "bert_data/labels.txt"
# open() does not create missing folders
os.makedirs(os.path.dirname(out_file), exist_ok=True)
# Explicit encoding so the file does not depend on the platform default
with open(out_file, 'w', encoding='utf-8') as f:
    # OUTSIDE is the same label the formatting script assigns to non-homograph tokens
    f.writelines("{}\n".format(label) for label in wordids + [OUTSIDE])