In [216]:
import pandas as pd
from collections import Counter
from nltk import WordNetLemmatizer
import re
from nltk.corpus import stopwords

In [313]:
df_triples = pd.read_csv(
    '../../data/processed/reference_model_f/reference_model_f.txt', header=None, sep="\t"
)

df_triples = pd.DataFrame(
    df_triples[0].str.split("|", 4).tolist(),
    columns=["status", "article_id", "e1", "r", "e2"],
)

In [301]:
#df_triples = df_triples.groupby('article_id').head(10).reset_index(drop=True)

In [302]:
def remove_stops(word):
    if word.lower() in set(set(stopwords.words('english'))):
        return ''
    else:
        return word

In [303]:
lemmatizer = WordNetLemmatizer()
df_triples["l1"] = (
    df_triples["e1"]
    .apply(lambda x: extract_entities(x))
    .apply(lemmatizer.lemmatize)
    .apply(lambda x: x.lower().strip())
    .apply(remove_stops)
    .apply(lambda x: re.sub("[^\s'_A-Za-z]", "", x))
    .apply(lambda x: x.lstrip().rstrip())
)
df_triples["l2"] = (
    df_triples["e2"]
    .apply(lambda x: extract_entities(x))
    .apply(lemmatizer.lemmatize)
    .apply(lambda x: x.lower().strip())
    .apply(remove_stops)
    .apply(lambda x: re.sub("[^\s'_A-Za-z]", "", x))
    .apply(lambda x: x.lstrip().rstrip())
)
df_triples["rel"] = (
    df_triples["r"]
    .apply(lemmatizer.lemmatize)
    .apply(lambda x: x.lower().strip())
    .apply(remove_stops)
    .apply(lambda x: re.sub("[^\s'_A-Za-z]", "", x))
    .apply(lambda x: x.lstrip().rstrip())
)

In [304]:
total_entities = pd.concat([df_triples["l1"], df_triples["l2"]])

In [305]:
unique_entities = pd.Series(total_entities.unique())

d = {
    k: v for k, v in zip(unique_entities, [i for i in range(len(unique_entities))])
}

d_sorted = sorted(d.items(), key=lambda kv: kv[1])

total_relations = pd.Series(df_triples["rel"])
rc = Counter(total_relations)
unr = Counter(el for el in rc.elements() if rc[el] > 1)
unique_relations = pd.Series(list(unr.keys()))

r = {
    k: v
    for k, v in zip(
        unique_relations,
        [i for i in range(len(unique_relations))],
    )
}

r_sorted = sorted(r.items(), key=lambda kv: kv[1])

In [323]:
a = Counter(total_relations)

In [324]:
un = Counter(el for el in a.elements() if a[el] > 1 and el != '')

In [308]:
import nltk

In [394]:
def extract_entities(text):
    res = []
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text))):
        if hasattr(chunk, 'label') and chunk.label() == 'PERSON':
            res.append((' '.join(c[0] for c in chunk.leaves())))
        else:
            pass
    if len(res) == 0:
        return text
    else
        return res[0]


In [395]:
for i in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize('i am James Kemp'))):
    try:
        print(i.label())
        print(' '.join(c[0] for c in i.label.leaves()))
    except:
        pass

PERSON


In [396]:
extract_entities('political president')

KeyboardInterrupt: 

In [325]:
un.most_common(50)

[('is in', 1792),
 ('holds', 527),
 ('is with', 492),
 ('said on', 245),
 ('said in', 238),
 ('are having', 224),
 ('register', 164),
 ('told', 156),
 ('is victim of', 144),
 ('showed', 136),
 ('says', 131),
 ('released', 126),
 ('has dismissed as', 126),
 ('has confirmed', 126),
 ('handing democrats', 125),
 ('democrats', 125),
 ('boost of', 125),
 ('leads', 120),
 ('have launched', 118),
 ('held', 116),
 ('hand over', 115),
 ('defeated rival hillary hillary clinton according to', 112),
 ('framed', 108),
 ('took', 105),
 ('could gain', 104),
 ('threepoint lead in', 96),
 ('lead in', 96),
 ('drew', 96),
 ('announced', 94),
 ('make', 93),
 ('have shown', 84),
 ('made', 82),
 ('looking to', 82),
 ('said', 81),
 ('would', 80),
 ('led republican donald republican donald trump by', 80),
 ('leads republican donald republican donald trump by', 80),
 ('struggled according to', 80),
 ('called', 79),
 ('seeking', 76),
 ('opposes', 75),
 ('take', 74),
 ('shows', 73),
 ('supports', 73),
 ('are inv

In [245]:
'us' in list(set(stopwords.words('english')))

False

In [242]:
l

['october',
 'donald trump',
 'hillary',
 'syria',
 'kevin shipp',
 'myanmar',
 'november',
 'people',
 'clinton',
 'prison planetcom october',
 'obama',
 'dr duke',
 'yemeni army forces',
 'americans',
 'fbi',
 'russia',
 'isis']