In [4]:
import swifter
import unidecode
import spacy
import pandas as pd
from pathlib import Path
import re

# flair is an optional dependency: it is only needed for the NER replacement
# step. Catch `Exception` rather than using a bare `except:` so that
# KeyboardInterrupt / SystemExit are not silently swallowed, while any
# import-time failure still degrades gracefully.
try:
    from flair.models import SequenceTagger
    from flair.data import Sentence
except Exception:
    print('no flair')

no flair


In [5]:
strange_quotes = ['«','‹','»','›','„','“','‟','‘','‛', '”', '’', '❛', '❜', '❝', '❞','❮', '❯', '〝','〞', '〟', '＂', '`', '´']

def clean_quotes(df_col):
    """Replace every quote-like character in a string Series with `'`.

    All characters in `strange_quotes`, plus the ASCII double quote, are
    mapped to a plain apostrophe.

    Parameters
    ----------
    df_col : pandas.Series
        Column accessed through its `.str` accessor.

    Returns
    -------
    pandas.Series
        A new Series with the quotes normalised; missing values stay missing.
    """
    # One literal, single-pass translation instead of one `str.replace` call
    # per character: no dependence on pandas' `regex` default and only one
    # pass over the column. The table is built per call so that later edits
    # to `strange_quotes` are picked up.
    table = str.maketrans({q: "'" for q in strange_quotes + ['"']})
    return df_col.str.translate(table)

In [6]:
def clean_whitespaces(df_col):
    """Collapse whitespace runs to a single space and trim both ends.

    Parameters
    ----------
    df_col : pandas.Series
        Column accessed through its `.str` accessor.

    Returns
    -------
    pandas.Series
        A new Series in which every run of whitespace characters is replaced
        by one space and leading/trailing whitespace is removed.
    """
    # `regex=True` must be explicit: without it, pandas versions whose default
    # is `regex=False` would look for the literal text "\s+" and change nothing.
    return df_col.str.replace(r'\s+', ' ', regex=True).str.strip()

In [7]:
def replace_numbers(df_col, token=" xxnumber "):
    """Replace whitespace-adjacent digit runs in a string Series with `token`.

    A run of digits is replaced, together with the whitespace character(s)
    directly next to it, when it has whitespace on at least one side. Digit
    runs with no adjacent whitespace (e.g. ``"a1b"`` or a string that is only
    ``"42"``) are left untouched.

    Parameters
    ----------
    df_col : pandas.Series
        Column accessed through its `.str` accessor.
    token : str, default " xxnumber "
        Replacement text. It is inserted literally (backslashes are not
        interpreted as regex group references).

    Returns
    -------
    pandas.Series
        A new Series with the substitutions applied.
    """
    # `regex=True` is explicit because the pattern is a regular expression and
    # pandas' default for this flag differs between versions. A callable
    # replacement is used so that `token` is never parsed as a regex template.
    return df_col.str.replace(r"\d+\s|\s\d+\s|\s\d+", lambda _match: token, regex=True)

In [8]:
def replace_ner_in_sent(sent):
    """Return the sentence text with each tagged entity replaced by a placeholder.

    `sent` must provide ``to_dict(tag_type='ner')`` returning a mapping with a
    ``'text'`` entry and, optionally, an ``'entities'`` list whose items carry
    ``'start_pos'``, ``'end_pos'`` and ``'type'``. Every entity span is
    replaced by ``' xx<type lowercased> '``.
    """
    tagged = sent.to_dict(tag_type='ner')
    result = tagged['text']
    if 'entities' not in tagged:
        return result

    # Entity positions refer to the untouched text, so keep a running total of
    # how much earlier substitutions have lengthened or shortened the string.
    shift = 0
    for entity in tagged['entities']:
        lo = entity['start_pos'] + shift
        hi = entity['end_pos'] + shift
        replaced = result[:lo] + ' xx' + entity['type'].lower() + ' ' + result[hi:]
        shift += len(replaced) - len(result)
        result = replaced
    return result

In [9]:
def replace_ner(t, nlp, tagger):
    """Replace named entities in `t` with ``xx<type>`` placeholders.

    The text is transliterated to ASCII, split into sentences with `nlp`,
    tagged with `tagger`, and each tagged sentence is rewritten by
    `replace_ner_in_sent`.

    Parameters
    ----------
    t : str
        Raw text.
    nlp : callable
        Called as ``nlp(t)``; the result must expose ``.sents``, an iterable
        of sentences that in turn iterate over tokens with ``.text`` and
        ``.is_space`` (a spaCy pipeline with sentence segmentation, as loaded
        in `preprocess_df`).
    tagger : object
        Must provide ``predict(sentences, mini_batch_size=...)`` that tags the
        given `Sentence` objects in place (a flair `SequenceTagger`, as loaded
        in `preprocess_df`).

    Returns
    -------
    str
        The processed sentences joined by single spaces; ``''`` when the text
        yields no tokens.
    """
    t = unidecode.unidecode(t) # TODO: fix for German
    sents = []
    for s in nlp(t).sents:
        # Join the sentence's tokens with single spaces so that the text handed
        # to flair is whitespace tokenized; whitespace-only tokens are dropped
        # so they cannot become empty tokens.
        words = [tok.text for tok in s if not tok.is_space]
        if words:
            # use_tokenizer=False: flair must split on the spaces inserted
            # above instead of re-tokenizing the text itself.
            sents.append(Sentence(' '.join(words), use_tokenizer=False))
    if not sents:
        # Nothing to tag; avoid calling the tagger with an empty batch.
        return ''
    tagger.predict(sents, mini_batch_size=64)
    proc_txt = ' '.join(replace_ner_in_sent(s) for s in sents)
    return proc_txt

In [14]:
def preprocess_df(df, input_col='text', output_col='text_proc', replace_NER=True):
    """Fill ``df[output_col]`` with a cleaned version of ``df[input_col]``.

    Stages, in order: quote normalisation, whitespace clean-up, optional
    named-entity replacement, number replacement, and a final whitespace
    clean-up. `df` is modified in place and is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `input_col`.
    input_col : str
        Name of the column holding the raw text.
    output_col : str
        Name of the column that receives the processed text.
    replace_NER : bool
        When true, load the spaCy and flair models and run `replace_ner` on
        every row.
    """
    def run_stage(stage):
        # Every stage reads the output column and writes its result straight back.
        df[output_col] = stage(df[output_col])

    # clean
    df[output_col] = df[input_col]
    run_stage(clean_quotes)
    run_stage(clean_whitespaces)

    if replace_NER:
        # NER: spaCy's own 'ner' component is disabled here; entity tagging is
        # done by the flair tagger passed to `replace_ner`.
        nlp = spacy.load('en_core_web_lg', disable=['ner'])
        tagger = SequenceTagger.load('ner-ontonotes')
        run_stage(lambda col: col.swifter.apply(lambda x: replace_ner(x, nlp, tagger)))

    # clean
    run_stage(replace_numbers)
    run_stage(clean_whitespaces)  # the number adds some spaces again

    return df

In [11]:
def peprocess_text(input_path, output_path, header=False, **kwargs):
    """Read a CSV, run `preprocess_df` on it, and write the result as CSV.

    Parameters
    ----------
    input_path, output_path
        Source and destination, passed to ``pd.read_csv`` / ``DataFrame.to_csv``.
    header : bool
        When true, the input is read with its own header row and the output is
        written with pandas' default header handling. Otherwise the input is
        read as header-less with a single column named ``'text'`` and the
        output is written with ``header=None``.
    **kwargs
        Forwarded unchanged to `preprocess_df`.
    """
    read_opts = {} if header else {'header': None, 'names': ['text']}
    write_opts = {} if header else {'header': None}

    processed = preprocess_df(pd.read_csv(input_path, **read_opts), **kwargs)
    processed.to_csv(output_path, **write_opts)

In [12]:
# util to print unicodes
def print_unicode(s):
    """Print each character of `s` on its own line, followed by a line with
    its code point written as ``U+`` plus at least four lowercase hex digits."""
    for ch in s:
        print(ch, 'U+%04x' % ord(ch), sep='\n')

In [84]:
# Demo: list every character of 'peter' together with its code point.
print_unicode('peter')

p
U+0070
e
U+0065
t
U+0074
e
U+0065
r
U+0072


In [20]:
# Name of the split file to process; used to build both paths below.
kind = 'val'
# NOTE(review): absolute, machine-specific paths; consider deriving them from a
# configurable data directory so the notebook runs elsewhere.
IN_PATH = Path('/mnt/data/group07/johannes/ynacc_proc/replicate/split/' + kind + '.csv')
# NOTE(review): the file name says "with_ner", but the call below passes
# replace_NER=False, so no entity replacement is applied. Confirm which is intended.
OUT_PATH = Path('/mnt/data/group07/johannes/ynacc_proc/replicate/split/' + kind + '_proc_with_ner.csv')

# header=True: the input CSV is read with its own header row; `preprocess_df`
# then uses its default column names ('text' -> 'text_proc').
peprocess_text(IN_PATH, OUT_PATH, replace_NER=False, header=True)

In [35]:
# Load the file at OUT_PATH (the output path passed to peprocess_text) for inspection.
df = pd.read_csv(OUT_PATH)

In [49]:
# Spot check: processed text at index label 500.
df['text_proc'][500]

"Exactly r, it isn't even close to the same thing. Priests are WAAaAAAaY worse."

In [50]:
# Spot check: original text at the same index label, for comparison.
df['text'][500]

"Exactly r, it isn't even close to the same thing. Priests are WAAaAAAaY worse."