In [42]:
# imports
import os
import glob
import regex as re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from camel_tools.utils.charmap import CharMapper # to transliterate Arabic

In [43]:
# FUNCTIONS

def tag_freq(df: pd.DataFrame):
    print("Number of tags: {}".format(len(df.Tag.unique())))
    frequencies = df.Tag.value_counts()
    return frequencies

def remove_extra_labels(old_new_labels: dict, df: pd.DataFrame):
    df = df.replace(old_new_labels)
    return df

def change_dtypes(df: pd.DataFrame):
    data_types_dict = {'Sentence #': str,
                       'Word': str,
                       'Tag': str}
    df = df.astype(data_types_dict)
    return df

def create_sentences_word_labels_cols(df: pd.DataFrame):
    df = df.fillna(method='ffill')
    # let's create a new column called "sentence" which groups the words by sentence 
    df['sentence'] = df[['Sentence #','Word','Tag']].groupby(['Sentence #'])['Word'].transform(lambda x: ' '.join(x))
    # let's also create a new column called "word_labels" which groups the tags by sentence 
    df['word_labels'] = df[['Sentence #','Word','Tag']].groupby(['Sentence #'])['Tag'].transform(lambda x: ','.join(x))
    # keeping only the sentence and word_labels columns
    df = df[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
    return df

def num_of_sentences(dfs_dict: dict):
    for df in dfs_dict.keys():
        assert type(df) == pd.DataFrame
    languages = [key for key in dfs_dict.keys()]
    num_of_sentences = [int(df['Sentence #'].dropna().tolist()[-1][9:]) for df in dfs_dict.values()]
    plt.bar(languages, num_of_sentences)
    plt.grid(color='black', linestyle='--', linewidth=1, axis='y', alpha=1)
    plt.show()
    

In [44]:
# datasets paths

dir_path = os.getcwd()
raw_datasets = dir_path + '/raw_datasets/'
final_datasets = dir_path + '/final_datasets/'

In [45]:
# saving files into variables (according to their format)

lang_dataframes = dict()
data_txt = glob.glob(f"{raw_datasets}*.txt", recursive=True) # containing Arabic data
data_csv = glob.glob(f"{raw_datasets}*.csv", recursive=True) # containing Maltese, Italian, Spanish, English and Dutch data (need less preprocessing)
print("Data formatted in .txt files", data_txt, "\n Data already formatted in .csv files", data_csv, sep='\n')

Data formatted in .txt files
['/domus/h1/krisfarr/thesis/raw_datasets/CamelLab_test.txt', '/domus/h1/krisfarr/thesis/raw_datasets/CamelLab_train.txt', '/domus/h1/krisfarr/thesis/raw_datasets/WikiFANE_Gold_2014_500K.txt']

 Data already formatted in .csv files
['/domus/h1/krisfarr/thesis/raw_datasets/_dutch_data.csv', '/domus/h1/krisfarr/thesis/raw_datasets/_english_data.csv', '/domus/h1/krisfarr/thesis/raw_datasets/_italian_data.csv', '/domus/h1/krisfarr/thesis/raw_datasets/_maltese_test_data.csv', '/domus/h1/krisfarr/thesis/raw_datasets/_maltese_train_data.csv', '/domus/h1/krisfarr/thesis/raw_datasets/_spanish_data.csv']


# Arabic (fine-grained ANERCorp - CAMEL Lab + coarse-grained WikiFane)  Data

https://camel.abudhabi.nyu.edu/anercorp/ + https://fsalotaibi.kau.edu.sa/Pages-Arabic-NE-Corpora.aspx

In [98]:
# Arabic text files into csv files (appending dataframes together for training)
arabic_dfs = []
sentence_number = 1

for file_txt in data_txt:
    file_name = re.findall('(?<=raw_datasets/).*$', file_txt)[0]
    with open(file_txt) as f:
        # changing tag 'PERS' to 'PER'; removing 'right to left' encoding;
        lines = [line.replace('PERS', 'PER').replace('\u200f', '') for line in f.readlines()]
        lines = [line.replace('\ufeff', '').replace('\t', ' ').replace('\u200e', '') for line in lines]
        if file_name == 'WikiFANE_Gold_2014_500K.txt':
            lines = [line.replace(' ', '') if line == ' \n' else line.strip() for line in lines]
        # initalizing sentence number and list for dataframe
        df = []

        for line in lines:

            if line == '\n':
                # if line is empty (sentence segmentation), append dummy word and tag and increment sentence number
                df.append([f'Sentence: {sentence_number}', '0', 'O'])
                sentence_number += 1
                continue
            # remove skip lines and split into list
            line = line.strip().split(' ')

            # add sentence number to first position
            line.insert(0, f'Sentence: {sentence_number}')
            # append line (type:lst) to dataframe
            df.append(line)
        # initialize column names
        column_names = ['Sentence #', 'Word', 'Tag']
        # change list to dataframe
        df = pd.DataFrame(df, columns=column_names, dtype=str)
        print(f"number of tagged words from {file_name} -> {df.shape[0]}")
        # append dataframe to list of dataframes
        arabic_dfs.append(df)
        print("END OF FILE-->", file_txt)

number of tagged words from CamelLab_test.txt -> 25933
END OF FILE--> /domus/h1/krisfarr/thesis/raw_datasets/CamelLab_test.txt
number of tagged words from CamelLab_train.txt -> 129075
END OF FILE--> /domus/h1/krisfarr/thesis/raw_datasets/CamelLab_train.txt
number of tagged words from WikiFANE_Gold_2014_500K.txt -> 505324
END OF FILE--> /domus/h1/krisfarr/thesis/raw_datasets/WikiFANE_Gold_2014_500K.txt


In [99]:
# concatenate list of dataframes together
arabic_data = pd.concat(arabic_dfs)
arabic_data = change_dtypes(arabic_data)

# number of rows, i.e. number of words and tags
arabic_data.shape

(660332, 3)

In [100]:
arabic_data.tail(3)

Unnamed: 0,Sentence #,Word,Tag
505321,Sentence: 20661,م,O
505322,Sentence: 20661,.,O
505323,Sentence: 20661,0,O


In [101]:
# tags to change - manually chosen

# tags starting with B
to_change_B = {'B-Airport': 'B-LOC', 'B-Artist': 'B-PER', 
            'B-Athlete': 'B-PER', 'B-Building-Grounds': 'B-LOC',
            'B-Businessperson': 'B-PER', 'B-Government': 'B-ORG', 'B-Continent': 'B-LOC',
            'B-Group': 'B-ORG', 'B-Land-Region-Natural': 'B-LOC',
            'B-Lawyer': 'B-PER', 'B-Nation': 'B-LOC',
            'B-Non-Governmental': 'B-ORG',
            'B-Other_PER': 'B-PER',
            'B-Police': 'B-PER',
            'B-Politician': 'B-PER',
            'B-Population-Center': 'B-LOC',
            'B-Religious_ORG': 'B-ORG',
            'B-Religious_PER': 'B-PER',
            'B-Scientist': 'B-PER',
            'B-State-or-Province': 'B-LOC',
            'B-Water-Body' : 'B-LOC'}

# tags starting with I
to_change_I = {'I-'+k[2:]:'I-'+v[2:] for k,v in to_change_B.items()}

In [102]:
# changing chosen tags to PER, LOC and ORG

arabic_data['Tag'] = arabic_data['Tag'].replace(to_change_B)
arabic_data['Tag'] = arabic_data['Tag'].replace(to_change_I)

In [103]:
final_labels = {'B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O'}

In [104]:
# substituting any remaining tags with MISC

arabic_data['Tag'] = arabic_data['Tag'].apply(lambda x: 'B-MISC' if x not in final_labels and x[0] == 'B' else x)
arabic_data['Tag'] = arabic_data['Tag'].apply(lambda x: 'I-MISC' if x not in final_labels and x[0] == 'I' else x)

In [105]:
# checking that tags have been removed

unique_tags = set(arabic_data['Tag'].tolist())
unique_tags

{'B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O'}

In [106]:
arabic_data.to_csv(f'testing_arabic_data.csv', index=False)

In [107]:
lang_dataframes['arabic'] = arabic_data

# Maltese Data

In [108]:
maltese_csv = [file for file in data_csv if 'maltese' in file]
maltese_csv

['/domus/h1/krisfarr/thesis/raw_datasets/_maltese_test_data.csv',
 '/domus/h1/krisfarr/thesis/raw_datasets/_maltese_train_data.csv']

In [109]:
# checking maltese data
pd.read_csv(maltese_csv[0]).head(2), pd.read_csv(maltese_csv[0]).shape, pd.read_csv(maltese_csv[1]).shape

(    Sentence #      Word   POS Tag
 0  Sentence: 1  kuntenta   ADJ   O
 1          NaN        li  COMP   O,
 (5894, 4),
 (11151, 4))

In [110]:
# concatenating Maltese data

maltese_dfs = []
for data in maltese_csv:
    with open(data) as f:
        df = pd.read_csv(f, usecols=['Sentence #', 'Word', 'Tag'])
        df.reset_index(drop=True, inplace=True)
        maltese_dfs.append(df)

In [111]:
maltese_data = pd.concat(maltese_dfs, ignore_index=True)
maltese_data.shape

(17045, 3)

In [112]:
sentence_number = 1
for i, row in maltese_data.iterrows():
    if pd.notnull(row['Sentence #']):
        row['Sentence #'] = f"Sentence: {sentence_number}"
        sentence_number += 1

print(f"Number of sentences: {maltese_data['Sentence #'].dropna().tolist()[-1][9:]}")

Number of sentences:  599


In [113]:
lang_dataframes['maltese'] = maltese_data

# Italian, English, Spanish and Dutch data (already partially prepocessed from R&D project) 

In [114]:
other_langs_csv = [file for file in data_csv if 'maltese' not in file]
other_langs_csv

['/domus/h1/krisfarr/thesis/raw_datasets/_dutch_data.csv',
 '/domus/h1/krisfarr/thesis/raw_datasets/_english_data.csv',
 '/domus/h1/krisfarr/thesis/raw_datasets/_italian_data.csv',
 '/domus/h1/krisfarr/thesis/raw_datasets/_spanish_data.csv']

In [115]:
for csv_file in other_langs_csv:
    file_name = re.findall('(?<=raw_datasets/).*$', csv_file)[0]
    lang = re.search(r'_(.*?)_', file_name).group(1)
    with open(csv_file) as f:
        df = pd.read_csv(f)
        if lang == 'english':
            # English CoNLL 2003 dataset was fine-grained and therefore, extra tags need to be removed
            # change to 'O' tags or 'MISC' tags?
            df['Tag'] = df['Tag'].replace({'B-TIM': 'B-MISC', 'I-TIM': 'I-MISC',
                        'B-GPE': 'B-MISC', 'I-GPE': 'I-MISC',
                        'B-ART': 'B-MISC', 'I-ART': 'I-MISC',
                        'B-EVE': 'B-MISC', 'I-EVE': 'I-MISC',
                        'B-NAT': 'B-MISC', 'I-NAT': 'I-MISC'})
        lang_dataframes[lang] = df 

## To add periods in English csv + slicing Italian data

In [7]:
# inserting full stops in English dataset
english_data = lang_dataframes['english']
cols = english_data.columns
columns_lists = {}
for col in cols:
    columns_lists[col] = english_data[col].tolist()

In [39]:
skip_next = None
list_sents, list_words = columns_lists['Sentence #'], columns_lists['Word']
list_pos, list_ner = columns_lists['POS'], columns_lists['Tag']
zipped_rows = list(zip(list_sents, list_words, list_pos, list_ner))

In [None]:
zipped_rows_ = []
for i, row in enumerate(zipped_rows):
    if i % 10000 == 0:
        print(i)
    if row == (np.nan, '0', '0', 'O'):
        zipped_rows_.append((np.nan, '.', np.nan, 'O'))
    zipped_rows_.append(row)

In [41]:
df = pd.DataFrame(zipped_rows_, columns=['Sentence #', 'Word', 'POS', 'Tag'])
df.to_csv('testing_english_data.csv', index=False)

In [95]:
with open("/home/krisfarr/thesis/raw_datasets/_italian_data.csv") as f:
    df = pd.read_csv(f)
    to_slice_index = df['Sentence #'].tolist().index('Sentence: 20001')
    df_ = df.iloc[:583127, :].copy()

In [97]:
df_.to_csv("/home/krisfarr/thesis/raw_datasets/_italian_data.csv", index=False)

# From CSVs to fully preprocessed for training and testing

In [194]:
def unique_tags(df):
    '''
    Output: set of unique tags
    '''
    # language chosen arbitrarly
    unique_tags = set(df['Tag'].tolist())
    return unique_tags

def tag2id(unique_tags):
    '''
    Output: dict of tags mapped to indices
    '''
    tag2id_ = {tag: id for id, tag in enumerate(unique_tags)}
    return tag2id_

def id2tag(tag2id_):
    '''
    Output: dict of indices mapped to tags
    '''
    id2tag_ = {id: tag for tag, id in tag2id_.items()}
    return id2tag_

UNIQUE_TAGS = unique_tags(lang_dataframes[random.choice(list(lang_dataframes.keys()))])
TAG2ID = tag2id(UNIQUE_TAGS)
ID2TAG = id2tag(TAG2ID)
FINAL_DATA = dict()

print(UNIQUE_TAGS, TAG2ID, ID2TAG, sep='\n \n')

{'B-LOC', 'B-ORG', 'I-PER', 'I-LOC', 'B-MISC', 'B-PER', 'O', 'I-ORG', 'I-MISC'}
 
{'B-LOC': 0, 'B-ORG': 1, 'I-PER': 2, 'I-LOC': 3, 'B-MISC': 4, 'B-PER': 5, 'O': 6, 'I-ORG': 7, 'I-MISC': 8}
 
{0: 'B-LOC', 1: 'B-ORG', 2: 'I-PER', 3: 'I-LOC', 4: 'B-MISC', 5: 'B-PER', 6: 'O', 7: 'I-ORG', 8: 'I-MISC'}


In [199]:
class PrepareDataset():
    def __init__(self, dataframe: pd.DataFrame, language: str):
        self._df = dataframe
        self._num_of_sents = self._df.shape[0]
        self._lang = language

        train_sentences = None
        train_labels = None
    
    def get_df(self) -> pd.DataFrame:
        return self._df
    
    def get_df_head(self):
        return self._df.head()
    
    def get_df_tail(self):
        return self._df.tail()
    
    def get_num_of_sentences(self):
        return self._num_of_sents

    def check_for_nulls(self):
        return self._df.isnull().sum()

    def assert_lengths(self):
        print("asserting lengths...")
        print(f"Lengths->  sentences: {len(self.train_sentences)} labels: {len(self.train_labels)}")
        assert len(self.train_sentences) == len(self.train_labels)

        for i, sent in enumerate(self.train_sentences):
            assert len(sent) == len(self.train_labels[i])
    

    def create_columns(self):
        '''
        Explained in comments.
        '''
        # "forward fill" function to fill missing values based on the last upper non-nan value
        self._df = self._df.fillna(method='ffill')
        # creating a  new column called "sentence" which groups the words by sentence 
        self._df['tokens'] = self._df[['Sentence #','Word','Tag']].groupby(['Sentence #'])['Word'].transform(lambda x: ' '.join(x))
        # creating a new column called "word_labels" which groups the tags by sentence 
        self._df['ner_tags'] = self._df[['Sentence #','Word','Tag']].groupby(['Sentence #'])['Tag'].transform(lambda x: ','.join(x))
        # keeping only the sentence and word_labels columns
        self._df = self._df[["Sentence #", "tokens", "ner_tags"]].drop_duplicates().reset_index(drop=True)
        self._df.columns = ['id', 'tokens', 'ner_tags']
        self._df['id'] = self._df['id'].str.slice(start=9)
        print("Created ner_tags and tokens columns with values as lists! \n")
        print(f"Number of sentences: {df_.get_df().shape[0]}")
        return self._df
    
    def values_to_lists(self, delimiter_n=1):
        '''
        Changing string values to lists in tokens and ner_tags columns
        '''

        self.train_sentences = [sentence.split(' ')[:-delimiter_n] for sentence in self._df.tokens.values]
        self.train_labels = [label.split(',')[:-delimiter_n] for label in self._df.ner_tags.values]

        print(self.train_sentences[0], self.train_labels[0])

        self.assert_lengths()
        
        # assert len(self.train_sentences[0]) == len(self.train_labels[1])

    def update_lists(self):
        '''
        Encode labels and remove sentences tagged with 'O' only.
        Update sentences in accordance with the removed labels.
        '''
    
        train_labels_, remove_sentences = list(), list()
        for i, sent_label in enumerate(self.train_labels):
            # print(self.train_sentences[i])
            if len(sent_label) < 3:
                remove_sentences.append(self.train_sentences[i])
                continue 
            # temporary list
            sent_label_ = []
            for label in sent_label:
                # encode label
                sent_label_.append(TAG2ID[label])

            # rule does not apply to Maltese data given the limited number of sentences
            if self._lang == 'maltese' or set(sent_label_) != {TAG2ID['O']}:
                train_labels_.append(sent_label_)

            # append to remvoe sentences
            elif set(sent_label_) == {TAG2ID['O']}:
                remove_sentences.append(self.train_sentences[i])
        self.train_labels = train_labels_

        for sent in remove_sentences:
            self.train_sentences.remove(sent)

        self.assert_lengths()

    def update_columns(self):
        # if self._lang == 'arabic':
        #     train_sentences_ = []
        #     for i, sent in enumerate(self.train_sentences):
        #         sent.insert(len(sent), '.')
        #         train_sentences_.append(sent)
        #         self.train_labels[i].insert(len(train_labels), tag2id['O'])
        self.final_df = self.get_df()
        temp_df = df.iloc[:len(self.train_labels), :].copy()
        temp_df['tokens'] = self.train_sentences
        temp_df['ner_tags'] = self.train_labels
        self.final_df = temp_df
        
    
    def save_csv(self):
        # df = self.get_df()
        print(self.final_df.head(100))
        self.final_df.to_csv(f'_{self._lang}_testing_data.csv', index=False)

In [200]:
# First stage of preprocessing
for lang, df in lang_dataframes.items():
    if lang=='maltese':
        print(f"LANGUAGE: {lang} \n")
        df_ = PrepareDataset(dataframe=df, language=lang)
        print(f"Number of words and tags: {df_.get_num_of_sentences()} \n")
        print(f"Checking for nulls: \n {df_.check_for_nulls()} \n")
        df_.create_columns()
        print("############################ \n")
        FINAL_DATA[lang] = df_

LANGUAGE: maltese 

Number of words and tags: 17045 

Checking for nulls: 
 Sentence #    16446
Word              0
Tag               0
dtype: int64 

Created ner_tags and tokens columns with values as lists! 

Number of sentences: 599
############################ 



In [201]:
for lang, df_ in FINAL_DATA.items():
    print(f"Language {lang}")
    df_.values_to_lists()
    df_.update_lists()
    df_.update_columns()
    df_.save_csv()

Language maltese
['kuntenta', 'li', 'erġajt', 'fuq', 'l-', 'art', ',', 'imma', 'diġà', 'għandi', 'nostalġija', 'għal', 'dawk', 'il-', 'kwiekeb', 'sbieħ', '.'] ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
asserting lengths...
Lengths->  sentences: 599 labels: 599
asserting lengths...
Lengths->  sentences: 589 labels: 589
     Sentence #       Word  POS    Tag  \
0   Sentence: 0  Melbourne   NP  B-LOC   
1           NaN          (  Fpa      O   
2           NaN  Australia   NP  B-LOC   
3           NaN          )  Fpt      O   
4           NaN          ,   Fc      O   
..          ...        ...  ...    ...   
95          NaN         de   SP      O   
96          NaN        que   CS      O   
97          NaN        las   DA      O   
98          NaN   personas   NC      O   
99          NaN        que   PR      O   

                                               tokens  \
0   [kuntenta, li, erġajt, fuq, l-, art, ,, imma, ...   
1                 

In [198]:
FINAL_DATA['maltese']

<__main__.PrepareDataset at 0x2b4085620438>

# Tests

# Other

In [13]:
for lang in LANGUAGES:
    df = pd.read_csv(f'/home/krisfarr/thesis/final_datasets/{lang}_data.csv')
    print(lang, f"original length {len(df)}", f"training should be {int(len(df) * 0.8)}", f"validation should be {int(len(df) * 0.2)} \n",  sep='\n')


italian
original length 26424
training should be 21139
validation should be 5284 

arabic
original length 4897
training should be 3917
validation should be 979 

english
original length 47959
training should be 38367
validation should be 9591 

spanish
original length 8323
training should be 6658
validation should be 1664 

dutch
original length 15806
training should be 12644
validation should be 3161 

