In [1]:
import os
import pandas as pd
import fnmatch

In [2]:
#collect the lines of every annotation file found below a directory
def tokens_from_files(directory, pattern='*.final'):
    """Return the lines of all files under ``directory`` whose name matches ``pattern``.

    The directory tree is traversed with ``os.walk``; file names are matched
    against ``pattern`` with ``fnmatch.filter`` (shell-style wildcards). The
    matching paths are then handed to ``_extract_tokens``, and its result is
    returned unchanged.
    """
    matching_paths = [
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(directory)
        for name in fnmatch.filter(names, pattern)
    ]
    return _extract_tokens(matching_paths)

#helper used by tokens_from_files: reads the lines of each file in a list
def _extract_tokens(file_list):
    """Read every file in ``file_list`` and return all of their lines as one list.

    Files are read in the order given. Each file is opened in text mode with
    ``errors='replace'``, so undecodable bytes do not raise. Lines are
    obtained with ``str.splitlines`` and therefore carry no line terminator;
    empty lines are kept as empty strings.
    """
    collected = []
    for path in file_list:
        with open(path, 'r', errors='replace') as handle:
            collected.extend(handle.read().splitlines())
    return collected

In [3]:
#Function that takes a list of "token label" lines, cleans it and outputs a dataframe
def deep_clean(str_list):
    """Turn raw "token label" lines into a cleaned two-column DataFrame.

    A line is kept only if it consists of exactly two whitespace-separated
    fields (spaces or tabs): a token and a one-character label. Labels are
    normalised as follows:

    * lower-case ``o`` / ``b`` / ``i`` become upper case;
    * the digit ``0`` is treated as a mistyped ``O``;
    * an ``I`` (or ``i``) on the very first non-blank line, or directly after
      a non-blank line tagged ``O``, is rewritten to ``B``.

    Blank lines, lines without exactly two fields, and lines whose label is
    not one of the characters above are dropped. The "previous label" used
    for the ``I`` -> ``B`` rule is the last character of the preceding
    non-blank line, whether or not that line itself was kept.

    Parameters
    ----------
    str_list : iterable of str
        Raw lines, e.g. as returned by ``str.splitlines``.

    Returns
    -------
    pandas.DataFrame
        Columns ``"Token"`` and ``"Label"``, one row per kept line in input
        order, indexed 0..n-1. Empty (but with both columns) when no line
        qualifies.
    """
    # Every accepted spelling of a label, mapped to its canonical tag.
    # An explicit table is used instead of str.upper() so that only these
    # seven characters are ever accepted.
    canonical_tag = {'O': 'O', 'o': 'O', '0': 'O',
                     'B': 'B', 'b': 'B',
                     'I': 'I', 'i': 'I'}

    # Rows are gathered in a plain list and the frame is built once at the
    # end: growing a DataFrame one row at a time copies it on every insert.
    rows = []
    previous_outside = True  # the start of the input counts as "after an O"

    for raw_line in str_list:
        fields = raw_line.split()  # split on any run of spaces/tabs
        if not fields:
            continue  # blank or whitespace-only line

        # Last visible character of the line, looked up case-insensitively.
        tag = canonical_tag.get(fields[-1][-1])

        # An 'I' cannot open a span: promote it to 'B' (lower-case 'i' too).
        if tag == 'I' and previous_outside:
            tag = 'B'

        # Keep the row only when it is exactly "<token> <single-char label>".
        if tag is not None and len(fields) == 2 and len(fields[1]) == 1:
            rows.append((fields[0], tag))

        previous_outside = (tag == 'O')

    return pd.DataFrame(rows, columns=["Token", "Label"])


In [4]:
# Note: This cell takes a long time to execute

# Directory that holds the '*.final' annotation files. Set the
# NLP_TAGGING_DATASET_DIR environment variable to point at your own copy of
# the dataset; when it is not set, the original location below is used.
path = os.environ.get(
    'NLP_TAGGING_DATASET_DIR',
    '/mnt/c/Users/noral/Documents/M2_TAL/Term/NLP_tagging/Dataset/',
)

str_list = tokens_from_files(path) #find final annotation files in directory and extract tokens
df=deep_clean(str_list) #create dataframe with tokens and labels (and remove/correct erroneous lines)

In [5]:
# Display the token/label DataFrame built in the previous cell
df

Unnamed: 0,Token,Label
0,This,O
1,article,O
2,presents,O
3,an,O
4,investigation,O
...,...,...
26660,banks,I
26661,for,O
26662,99,O
26663,languages,O
