In [1]:
# !python -m spacy download en_core_web_md

In [1]:
import pandas as pd
from features import extract_dependency_features, extract_features
from utils import recombine_sentences

  from .autonotebook import tqdm as notebook_tqdm
2023-02-24 21:09:56 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json: 193kB [00:00, 11.8MB/s]
2023-02-24 21:10:03 INFO: Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | combined  |
| pos          | combined  |
| lemma        | combined  |
| depparse     | combined  |
| constituency | wsj       |
| ner          | ontonotes |

2023-02-24 21:10:03 INFO: Use device: cpu
2023-02-24 21:10:03 INFO: Loading: tokenize
2023-02-24 21:10:03 INFO: Loading: pos
2023-02-24 21:10:03 INFO: Loading: lemma
2023-02-24 21:10:03 INFO: Loading: depparse
2023-02-24 21:10:03 INFO: Loading: constituency
2023-02-24 21:10:04 INFO: Loading: ner
2023-

In [2]:

train_file = '../data/en_ewt-up-train.conllu'
test_file = '../data/en_ewt-up-test.conllu'

def create_sentences(file_name):
    # Open and read the input file
    with open(file_name, 'r', encoding='utf-8') as f:
        content = f.read()

    # Split the file into individual documents, each separated by a blank line
    sentences = {}
    for doc_i, doc in enumerate(content.split('\n\n')):
        doc = doc.split('\n')
        sentence = []
        for line in doc:
            # Skip lines starting with '#' (comment lines)
            if line and line[0] != '#':
                line = line.split('\t')
                sentence.append(line[1])
        sentences[doc_i] = ' '.join(sentence)
    return sentences

In [13]:
# import stanza
# nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,ner')


def get_named_entities(sentence):
    """
    Get the named entity for each token in a data frame, and recombine the named entities into a list
    of named entities for each sentence.

    Args:
        df (pandas.DataFrame): A data frame with columns 'sent_id', 'token', and 'ner', where each row contains
        a token and its named entity.

    Returns:
        dict: A dictionary where the keys are the sent_ids and the values are lists of named entities, where each named entity corresponds
        to one token in the original sentence order.

    """
    # Use spaCy to extract named entities from the sentence
    doc = nlp(sentence)
    named_entities_for_sentence = [(ent.text, ent.type) for ent in doc.ents]

    # Return the resulting dictionary of named entities
    return named_entities_for_sentence



def extract_features(sentence, sent_id):
    """
    Extracts various linguistic features from a given sentence using the Stanza library.

    Args:
        sentence (str): The input sentence to extract features from.

    Returns:
        pandas.DataFrame: A DataFrame of features extracted from the sentence, with the following columns:
            - token (str): The original form of each token in the sentence.
            - pos (str): The part-of-speech tag for each token.
            - lemma (str): The lemma of each token.
            - ner (str): The named entity label for each token (if any).
            - stemming (str): The stemmed form of each token (using the Snowball stemmer).
            - pos_bigram (str): A string representing the part-of-speech bigram for each token and its successor.
            - token_bigram (str): A string representing the token bigram for each token and its successor.
    """
    # Load the English model
    # nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,ner')
    
    # Process the sentence
    doc = nlp(sentence)
    
    # Initialize lists to store the feature values
    tokens = []
    ner_tags = []
    pos_bigrams = []
    token_bigrams = []
    
    # Extract features for each token in the sentence
    for i, sent in enumerate(doc.sentences):
        ne = get_named_entities(sentences[sent_id])
            
        for j, word in enumerate(sent.words):
            # Add token to list
            tokens.append(word.text)
            
            # Add named entity label to list if it exists, otherwise add an empty string
            named_entity = list(filter(lambda x: x[0] == word.text, ne))
            if named_entity:
                ner_tags.append(named_entity[0][1])
            else:
                ner_tags.append('_')
            
            
            # Add part-of-speech bigram to list
            if j < len(sent.words) - 1:
                pos_bigrams.append(f"{word.upos}_{sent.words[j+1].upos}")
            else:
                pos_bigrams.append('')
                
            # Add token bigram to list
            if j < len(sent.words) - 1:
                token_bigrams.append(f"{word.text}_{sent.words[j+1].text}")
            else:
                token_bigrams.append('')
    
    return {
        'token': tokens,
        'ner': ner_tags,
        'pos_bigram': pos_bigrams,
        'token_bigram': token_bigrams
    }


In [4]:
sentences = create_sentences('../data/en_ewt-up-train.conllu')

In [15]:
result = [extract_features(sent, i) for i, sent in sentences.items()]

In [16]:
df_train = pd.DataFrame(result).explode(list(result[0].keys()))

In [18]:
train_file = '../data/train.tsv'

df_train_1 = pd.read_csv(train_file, delimiter='\t')

  df_train_1 = pd.read_csv(train_file, delimiter='\t')


In [31]:
df_train.to_csv('../data/train_ner.tsv', sep='\t')

In [29]:
# df_train_1.groupby('sent_id')

12543

In [19]:
sentences_test = create_sentences('../data/en_ewt-up-test.conllu')

In [20]:
result_test = [extract_features(sent, i) for i, sent in sentences_test.items()]

In [21]:
df_test = pd.DataFrame(result_test).explode(list(result[0].keys()))

In [22]:
test_file = '../data/test.tsv'

df_test_1 = pd.read_csv(test_file, delimiter='\t')

  df_test_1 = pd.read_csv(test_file, delimiter='\t')


In [32]:
df_test.to_csv('../data/test_ner.tsv', sep='\t')

In [115]:
df_test_1.tail(10)

Unnamed: 0,sent_id,token_id,token,lemma,POS,uni_POS,morph_type,distance_head,dep_label,dep_rel,space,probbank,target
103236,1463,56.0,something,something,PRON,NN,Number=Sing,53,obl,53:obl:to|58:nsubj|61:nsubj:xsubj,_,_,ARG1
103237,1463,57.0,that,that,PRON,WDT,PronType=Rel,58,nsubj,56:ref,_,_,R-ARG1
103238,1463,58.0,needs,need,VERB,VBZ,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,56,acl:relcl,56:acl:relcl,_,need.01,_
103239,1463,59.0,to,to,PART,TO,_,61,mark,61:mark,_,_,_
103240,1463,60.0,be,be,AUX,VB,VerbForm=Inf,61,aux:pass,61:aux:pass,_,be.03,_
103241,1463,61.0,attended,attend,VERB,VBN,Tense=Past|VerbForm=Part|Voice=Pass,58,xcomp,58:xcomp,_,attend.01,V
103242,1463,62.0,to,to,ADP,IN,_,61,obl,61:obl,_,_,C-ARG1
103243,1463,63.0,RIGHT,right,ADV,RB,_,64,advmod,64:advmod,_,_,_
103244,1463,64.0,AWAY,away,ADV,RB,_,61,advmod,61:advmod,SpaceAfter=No,_,ARGM-TMP
103245,1463,65.0,!!!,!!!,PUNCT,.,_,24,punct,24:punct,_,_,_


In [None]:
df_train.to_csv('../data/

In [95]:
for t in nlp('How does it work?'):
    print(t.ner)

TypeError: 'Document' object is not iterable

In [None]:

# all_fe = pd.concat([basic_fe, dependency_fe], axis=1)
df = pd.DataFrame([])

for i, s in sentences.items():
    basic_fe = extract_features(s)
    dependency_fe = extract_dependency_features(s)
    
    df = pd.concat([df, pd.concat([basic_fe, dependency_fe], axis=1)])
    print(basic_fe)
    break

In [None]:
0881122112 # gza health care

In [None]:
df