In [1]:
# !pip install stanza
# Load the English model
import stanza
nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,ner')
import pandas as pd

2023-02-25 13:41:26 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2023-02-25 13:41:27 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| ner       | ontonotes |

2023-02-25 13:41:27 INFO: Use device: gpu
2023-02-25 13:41:27 INFO: Loading: tokenize
2023-02-25 13:41:29 INFO: Loading: pos
2023-02-25 13:41:30 INFO: Loading: lemma
2023-02-25 13:41:30 INFO: Loading: ner
2023-02-25 13:41:30 INFO: Done loading processors!


In [2]:
train_file = '../data/en_ewt-up-train.conllu'
test_file = '../data/en_ewt-up-test.conllu'

In [3]:
def get_named_entities(sentence):
    """
    Get the named entity for each token in a data frame, and recombine the named entities into a list
    of named entities for each sentence.

    Args:
        df (pandas.DataFrame): A data frame with columns 'sent_id', 'token', and 'ner', where each row contains
        a token and its named entity.

    Returns:
        dict: A dictionary where the keys are the sent_ids and the values are lists of named entities, where each named entity corresponds
        to one token in the original sentence order.

    """
    # Use spaCy to extract named entities from the sentence
    doc = nlp(sentence)
    named_entities_for_sentence = [(ent.text, ent.type) for ent in doc.ents]
    
    # Return the resulting dictionary of named entities
    return named_entities_for_sentence

def assign_ne(row, named_enities):
    named_entity = list(filter(lambda x: row.token in x[0], named_enities))
    if named_entity:
        return named_entity[0][1]
    else:
        return '_'

def add_bigrams(df):
    """
    Adds columns for token and pos bigrams to a DataFrame containing token and pos columns.

    Args:
        df (pandas.DataFrame): A DataFrame containing "token" and "pos" columns.

    Returns:
        pandas.DataFrame: A new DataFrame with additional columns for token and pos bigrams.
    """
    # Create token bigrams
    df["token_bigram"] = pd.Series(list(zip(df["token"].shift(), df["token"])))
    # Create pos bigrams
    df["pos_bigram"] = pd.Series(list(zip(df["POS"].shift(), df["POS"])))
    return df

In [13]:
def read_data(file_path, save_to_csv=False):
    """
    This function reads a CoNLL-U format file and converts it into a pandas DataFrame.
    Each row in the DataFrame corresponds to a token in the file, and columns
    correspond to different features of the token, such as the token itself, its lemma, 
    part-of-speech tag, and syntactic dependency information.
    
    Parameters:
    file_path (str): The path to the input CoNLL-U format file.
    save_to_csv (bool): A boolean flag indicating whether to save the resulting DataFrame 
                        to a CSV file. Default is False.
                        
    Returns:
    df (pandas.DataFrame): A pandas DataFrame containing the token-level information from
                           the input file.
    """
    
    # Open and read the input file
    with open(file_path, 'r', encoding='utf-8') as f:
        train_data = f.read()
    
    # Split the file into individual documents, each separated by a blank line
    data = []
    for doc_i, doc in enumerate(train_data.split('\n\n')):
        doc = doc.split('\n')
        sentences = ''
        for line in doc:
            # Skip lines starting with '#' (comment lines)
            if line and line[0] != '#':
                line = line.split('\t')
                line.insert(0, str(doc_i))
                sentences += '\t'.join(line) + '\n'
        data.append(sentences)
    
    # Create a pandas DataFrame from the token-level data
    train_df = pd.DataFrame([x.split('\t') for sent in data for x in sent.split('\n') if x])
    
    # Rename the columns of the DataFrame
    train_df = train_df.rename(columns={
        0:'sent_id', 
        1:'token_id', 
        2:'token', 
        3:'lemma', 
        4:'POS', 
        5:'uni_POS',
        6:'morph_type', 
        7:'distance_head', 
        8:'dep_label', 
        9:'dep_rel', 
        10:'space', 
        11:'probbank'
    })
    
    named_enities_dict = {}
    for sent_id, group in train_df.groupby('sent_id'):
        sentence_tokens = list(group['token'])
        sentence = ' '.join(sentence_tokens)
        ne = get_named_entities(sentence)
        named_enities_dict[sent_id] = ne
    
                
    train_df['ner'] = train_df.apply(lambda x: assign_ne(x, named_enities_dict[x['sent_id']]), axis=1)
    train_df = add_bigrams(train_df)  
    # Convert the DataFrame from wide to long format
    df = train_df.melt(
        id_vars=[i for i in train_df.columns[:12]]+['ner','token_bigram','pos_bigram'], 
        var_name="notneeded", 
        value_name="target"
    )
    
    # Drop the 'notneeded' column and any rows that contain missing values
    df.drop(['notneeded'], axis=1, inplace=True)
    df = df[df['target'].notna()]
    
    # Optionally save the resulting DataFrame to a CSV file
    if save_to_csv:
        df.to_csv('../data/train_ner.tsv', sep='\t', index=False)
        # df.to_csv('../data/test_ner.tsv', sep='\t', index=False)
    
    # Return the resulting DataFrame
    return df

In [14]:
x = read_data(train_file, save_to_csv=True)
# x = read_data(test_file, save_to_csv=True)

In [15]:
x

Unnamed: 0,sent_id,token_id,token,lemma,POS,uni_POS,morph_type,distance_head,dep_label,dep_rel,space,probbank,ner,token_bigram,pos_bigram,target
0,0,1,Al,Al,PROPN,NNP,Number=Sing,0,root,0:root,SpaceAfter=No,_,PERSON,"(None, Al)","(None, PROPN)",_
1,0,2,-,-,PUNCT,HYPH,_,1,punct,1:punct,SpaceAfter=No,_,PERSON,"(Al, -)","(PROPN, PUNCT)",_
2,0,3,Zaman,Zaman,PROPN,NNP,Number=Sing,1,flat,1:flat,_,_,PERSON,"(-, Zaman)","(PUNCT, PROPN)",_
3,0,4,:,:,PUNCT,:,_,1,punct,1:punct,_,_,_,"(Zaman, :)","(PROPN, PUNCT)",_
4,0,5,American,american,ADJ,JJ,Degree=Pos,6,amod,6:amod,_,_,NORP,"(:, American)","(PUNCT, ADJ)",_
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7077906,7506,131,graduated,graduate,VERB,VBN,Tense=Past|VerbForm=Part,123,advcl,123:advcl:since,_,graduate.01,_,"(have, graduated)","(AUX, VERB)",_
7077907,7506,132,and,and,CCONJ,CC,_,134,cc,134:cc,_,_,_,"(graduated, and)","(VERB, CCONJ)",_
7077908,7506,133,i,i,PRON,PRP,Case=Nom|Number=Sing|Person=1|PronType=Prs,134,nsubj,134:nsubj,_,_,PERSON,"(and, i)","(CCONJ, PRON)",ARG0
7077909,7506,134,hate,hate,VERB,VBP,Mood=Ind|Tense=Pres|VerbForm=Fin,4,conj,4:conj:and,_,hate.01,_,"(i, hate)","(PRON, VERB)",V


In [7]:
# pd.set_option('display.max_columns', 50)
# pd.set_option('display.min_rows', 200)
# pd.set_option('display.max_rows', 200)