In [26]:
import pandas as pd
import time
import random
from polyglot.text import Text
import requests
import re
import regex
import string
from bs4 import BeautifulSoup
from collections import OrderedDict
from ukr_stemmer3 import UkrainianStemmer
from perceptron_tagger import tagger
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib
from tokenize_uk import tokenize_words, tokenize_sents
from tqdm import tqdm

In [None]:
# Collect (page title, link text) pairs from the Wikipedia list page at list_url.
# Only links that sit inside a <span> of an <li> element are considered.
list_url = "https://uk.wikipedia.org/wiki/%D0%A1%D0%BF%D0%B8%D1%81%D0%BE%D0%BA_%D0%BA%D1%80%D0%B0%D1%97%D0%BD_%D1%81%D0%B2%D1%96%D1%82%D1%83"
r = requests.get(list_url)
r.raise_for_status()  # fail loudly instead of silently parsing an error page
html = BeautifulSoup(r.content, "lxml")

country_urls = []
for li in html.find_all('li'):
    span = li.find('span')
    if not span:
        continue
    for a in span.find_all('a'):
        href = a.get('href')
        label = a.get_text()
        # Anchors without a target or without visible text carry no usable entry.
        if not href or not label.strip():
            continue
        # Drop the first six characters of the href
        # (presumably the "/wiki/" prefix of internal links -- confirm).
        country_urls.append((href[6:], label))

In [2]:
def get_card(url, timeout=30):
    """Fetch a page and return the template markup found in its first <textarea>.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up. The
        original call had no timeout and could block forever.

    Returns
    -------
    list of str or None
        The textarea text from the first ``{{`` to the last ``}}`` (the match
        is greedy and spans newlines), split into lines. ``None`` when the
        page has no <textarea> or the text contains no ``{{...}}`` span.
    """
    r = requests.get(url, timeout=timeout)
    html = BeautifulSoup(r.content, 'lxml')
    textarea = html.find('textarea')
    if textarea is None:
        # Previously this case raised AttributeError on ``None.get_text()``.
        return None
    text = re.search(r'({{.*}})', textarea.get_text(), re.DOTALL)
    if text is None:
        return None
    return text.group(0).split('\n')

def clean(entry):
    """Strip wiki markup from one line of text and convert pure numbers.

    Steps, in order:
      1. piped links ``[[target|label]]`` are replaced by ``label``;
      2. ``''`` becomes ``"`` and remaining single quotes are dropped;
      3. ``<ref ...>...</ref>`` references (with or without attributes) are
         removed together with their content, then any other ``<...>`` tag;
      4. ``{{...}}`` templates, ``[[``/``]]`` brackets and ``[...]`` spans
         are removed, ``&nbsp;`` becomes a space;
      5. a single remaining ``a|b`` is rewritten as ``a (b)``;
      6. a value consisting only of digits, spaces and commas is parsed as
         a number (spaces removed, comma treated as the decimal separator).

    Parameters
    ----------
    entry : str
        Raw text of one line.

    Returns
    -------
    str, int or float
        The cleaned text, or a number when step 6 succeeds. When the numeric
        conversion fails the cleaned text is returned unchanged.
    """
    res = entry.strip()
    # The two patterns find the labels and the full piped links respectively;
    # substitution is only done when they pair up one-to-one.
    pats_to_leave = re.findall(r'.*?\[\[.*?\|([^\[\]]+)?\]\]', res)
    pats_to_replace = re.findall(r'.*?(\[\[[^\[\]]+\|.*?\]\])', res)
    if len(pats_to_leave) == len(pats_to_replace):
        for (lpat, rpat) in zip(pats_to_leave, pats_to_replace):
            res = res.replace(rpat, lpat)
    res = res.replace("''", '"').replace("'", "")
    # Also match opening tags that carry attributes (<ref name="x">), but not
    # self-closing ones (<ref name="x"/>), which have no content to remove and
    # are stripped by the generic tag pattern below.
    res = re.sub(r'<ref\b[^>]*(?<!/)>.*?</ref>', '', res)
    res = re.sub(r'<.*?>', '', res)
    res = re.sub(r'{{.*?}}', '', res)
    res = res.replace('[[', '').replace(']]', '')
    res = re.sub(r'\[.*?\]', '', res)
    res = res.replace('&nbsp;', ' ')
    if res.count('|') == 1:
        spl = res.split('|')
        res = '{w1} ({w2})'.format(w1=spl[0].strip(),
                                   w2=spl[1].strip())
    if all((c.isdigit() or c in ' ,') for c in res):
        number = res.replace(' ', '').replace(',', '.')
        for convert in (int, float):
            try:
                return convert(number)
            except ValueError:
                # Not parseable this way; try the next converter, or fall
                # through and keep the text as it was.
                continue
    return res

def parse_card(card):
    """Turn the lines of a card into an ordered ``{field: value}`` mapping.

    Only lines starting with ``|`` are considered. A line with exactly one
    ``=`` is split into a field name (stripped of spaces and ``|``) and a
    value passed through ``clean``. Any other ``|`` line is ignored, except
    that a line containing both ``lat`` and ``lon`` is stored verbatim under
    the key ``'coordinates'``.

    Parameters
    ----------
    card : iterable of str
        Lines of the card.

    Returns
    -------
    collections.OrderedDict
        Parsed fields in the order they were encountered.
    """
    res_dict = OrderedDict()
    for line in card:
        if not line.startswith('|'):
            continue
        if line.count('=') != 1:
            # Not a simple "key = value" line; keep it only if it looks like
            # the coordinates line.
            if ('lat' in line) and ('lon' in line):
                res_dict['coordinates'] = line
            continue
        cat, entry = line.split('=')
        res_dict[cat.strip(' |')] = clean(entry)
    return res_dict

In [43]:
# For every row of the second table on the page at lake_url, download the
# linked article's edit view (section 0) and keep its cleaned non-"|" lines,
# keyed by the link text.
card_dict = OrderedDict()
lake_url = "https://uk.wikipedia.org/wiki/%D0%A1%D0%BF%D0%B8%D1%81%D0%BE%D0%BA_%D0%BD%D0%B0%D0%B9%D0%B1%D1%96%D0%BB%D1%8C%D1%88%D0%B8%D1%85_%D0%BE%D0%B7%D0%B5%D1%80_%D1%81%D0%B2%D1%96%D1%82%D1%83"
lake_r = requests.get(lake_url)
lake_soup = BeautifulSoup(lake_r.content, 'lxml')
BASE_URL = 'https://uk.wikipedia.org/w/index.php?title={title}&action=edit&section=0'
table = lake_soup.find_all('table')[1]
for row in tqdm(table.find_all('tr')[1:]):
    col = row.find_all('td')
    lake = col[0].find('a') if col else None
    # Rows without a data cell or without a link cannot be followed.
    if lake is None or not lake.get('href'):
        continue
    name = lake.get_text()
    url_title = lake.get('href')[6:]
    try:
        card = get_card(BASE_URL.format(title=url_title))
        if card is None:
            # Nothing usable on the page.
            continue
        lines = []
        for line in card:
            if line and not line.startswith('|'):
                lines.append(clean(line))
        card_dict[name] = lines
    except Exception as exc:
        # Best effort: skip this row, but say why instead of hiding it.
        tqdm.write('skipped {}: {!r}'.format(name, exc))
        continue
    finally:
        # Pause after every request, including failed ones.
        time.sleep(0.1)

100%|██████████| 87/87 [01:04<00:00,  1.36it/s]


In [46]:
# Keep only text lines longer than 70 characters. clean() may return an int
# or float for purely numeric lines; those have no len() and are dropped too.
for c in card_dict:
    card_dict[c] = [line for line in card_dict[c]
                    if isinstance(line, str) and len(line) > 70]

In [47]:
# Number of entries in card_dict. The length filter above only shortens the
# per-entry lists, so entries whose list became empty are still counted.
len(card_dict)

65

In [48]:
# Wrap every retained line in a polyglot Text (language hint: Ukrainian) and
# collect the entities polyglot reports for each, in the same order.
texts = [
    Text(line, hint_language_code='uk')
    for lake_lines in card_dict.values()
    for line in lake_lines
]
entities = [parsed.entities for parsed in texts]

In [49]:
# Build, per text, a list of (word, label) pairs: words inside an entity get
# "<word> | <tag>", every other word gets "-".
ner_lists = []
for (text, ents) in zip(texts, entities):
    # Start from "no entity" everywhere, then overwrite the entity spans.
    # The earlier version padded the tail with one '-' too many; that was only
    # hidden because zip() truncates to the shorter sequence.
    ent_list = ['-'] * len(text.words)
    for ent in ents:
        # ent.start / ent.end are used as word positions in text.words, and
        # iterating ent yields its words, as in the original slicing.
        ent_list[ent.start:ent.end] = [e + ' | ' + ent.tag for e in ent]
    ner_lists.append(list(zip(text.words, ent_list)))

In [50]:
# One frame per text, concatenated once. Growing a frame with
# DataFrame.append in a loop copies it on every step, and the method was
# removed in pandas 2.0. Each piece keeps its own 0..n-1 index, as before.
ner_frames = [pd.DataFrame(ner_list) for ner_list in ner_lists]
ner_df = pd.concat(ner_frames) if ner_frames else pd.DataFrame()

In [51]:
def remove_accents(string):
    """Return ``string`` with every combining acute accent (U+0301) removed.

    The earlier version first replaced seven specific vowel + accent pairs
    with the bare vowel and then deleted any remaining accent; deleting the
    accent everywhere gives the same result in one step. The escape sequence
    keeps the otherwise invisible character explicit in the source.

    Parameters
    ----------
    string : str
        Text possibly containing stress marks. The parameter name shadows the
        imported ``string`` module inside this function only; it is kept so
        keyword callers keep working.

    Returns
    -------
    str
        The text without U+0301 characters.
    """
    return string.replace('\u0301', '')

# Column 0 holds the words (first element of each (word, label) pair);
# run every word through remove_accents.
ner_df[0] = ner_df[0].apply(remove_accents)

In [52]:
# Write the (word, label) table without the index column.
# NOTE(review): the columns are written under their default names (0 and 1),
# while the loading cell further down reads a file of this name and expects
# 'words' / 'anns' columns -- presumably the file is annotated and re-headed
# by hand in between; confirm.
ner_df.to_csv('NER_train_lakes.csv', index=False)

<hr>

In [69]:
# Load every annotated training file and stack them in this order. Each
# file's own row index is kept, exactly as a plain pd.concat of the frames.
NER_TRAIN_FILES = [
    'NER_train.csv',
    'NER_train_cities.csv',
    'NER_train_islands.csv',
    'NER_train_lakes.csv',
    'NER_train_rivers.csv',
    'NER_train_seas.csv',
]
ner_ann = pd.concat([pd.read_csv(path) for path in NER_TRAIN_FILES])

In [70]:
# Normalise the annotation table before feature extraction.
# Missing annotations mean "no entity".
ner_ann['anns'] = ner_ann['anns'].fillna('-')
# Drop the sentence boundary markers and tokens starting with "2C".
is_sentence_marker = ner_ann['words'].isin(['<S>', '</S>'])
is_2c_token = ner_ann['words'].str.contains(r'^2C.*')
# .copy() so the column assignment below writes to an independent frame
# rather than to a filtered view (avoids SettingWithCopyWarning).
ner_ann = ner_ann[~is_sentence_marker & ~is_2c_token].copy()
# The pattern is a character class, so regex matching must be requested
# explicitly: from pandas 2.0 str.replace treats patterns literally by default.
ner_ann['words'] = ner_ann['words'].str.replace(r'[•°′]', '.', regex=True)
# Renumber rows 0..n-1; the old index is kept as an 'index' column.
ner_ann = ner_ann.reset_index()

In [74]:
# Shared tagger instance used by get_ner_features below.
pos_tagger = tagger.PerceptronTagger()
def get_ner_features(word, prev_word, next_word):
    """Build the feature dictionary for one token given its two neighbours.

    Parameters
    ----------
    word, prev_word, next_word : str
        The token itself and the tokens immediately before and after it.

    Returns
    -------
    dict
        The three surface forms, their stems (via ``UkrainianStemmer``),
        title-case flags for the token and both neighbours
        (``x.title() == x``), whether the previous token is a substring of
        ``string.punctuation``, and a ``'pos'`` value taken from the tagger.
    """
    word_stem = UkrainianStemmer(word).stem_word()
    prev_stem = UkrainianStemmer(prev_word).stem_word()
    next_stem = UkrainianStemmer(next_word).stem_word()

    # The tagger is run on the three tokens joined by spaces; element [1][1]
    # of its output is used -- presumably the tag of the middle token
    # (TODO confirm against the tagger's return format).
    context = ' '.join([prev_word, word, next_word])
    pos = pos_tagger.tag(context)[1][1]

    return {
        'word': word,
        'word_stem': word_stem,
        'prev_word': prev_word,
        'next_word': next_word,
        'prev_stem': prev_stem,
        'next_stem': next_stem,
        'is_uppercase': word == word.title(),
        'is_after_punct': prev_word in string.punctuation,
        'is_after_uppercase': prev_word == prev_word.title(),
        'is_before_uppercase': next_word == next_word.title(),
        'pos': pos,
    }

In [72]:
# Number of rows (tokens) in the annotation table.
len(ner_ann)

26419

In [73]:
# Build one feature dict and one label per token. Neighbours are taken by
# position from plain lists rather than by index label via iterrows, so the
# result does not depend on the frame's index and avoids per-row overhead.
# The first and last token get '.' as their missing neighbour.
words = ner_ann['words'].tolist()
all_labels = ner_ann['anns'].tolist()
padded_words = ['.'] + words + ['.']
all_feats = [
    get_ner_features(word, prev_word, next_word)
    for prev_word, word, next_word
    in zip(padded_words, padded_words[1:], padded_words[2:])
]

In [75]:
# Vectorise the feature dicts and fit an L1-regularised logistic regression.
# The solver is named explicitly: the recorded run used liblinear, and newer
# scikit-learn versions default to lbfgs, which rejects penalty='l1'.
vec = DictVectorizer()
clf = LogisticRegression(penalty='l1', solver='liblinear')
model = Pipeline([('vec', vec), ('clf', clf)])
model.fit(all_feats, all_labels)

Pipeline(memory=None,
     steps=[('vec', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=True)), ('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [76]:
def ner_recognize(sent, model):
    """Label every token of ``sent`` with the given model.

    Parameters
    ----------
    sent : str
        Sentence to analyse; it is split with ``tokenize_words``.
    model : object
        Fitted estimator whose ``predict`` accepts a list of feature dicts
        as produced by ``get_ner_features``.

    Returns
    -------
    list of (token, label) tuples
        One pair per token, in sentence order. An input that yields no
        tokens returns an empty list without calling the model.
    """
    tokens = tokenize_words(sent)
    if not tokens:
        # Nothing to classify; do not hand an empty batch to predict().
        return []
    # '.' stands in for the missing neighbour of the first and last token.
    padded = ['.'] + list(tokens) + ['.']
    feats = [
        get_ner_features(cur, prev, nxt)
        for prev, cur, nxt in zip(padded, padded[1:], padded[2:])
    ]
    labels = model.predict(feats)
    return list(zip(tokens, labels))

In [77]:
# Read the test questions, one per line. The encoding is given explicitly so
# the non-ASCII text does not depend on the platform default
# (assumes the file is UTF-8 -- confirm).
with open('test_questions.txt', 'r', encoding='utf-8') as f:
    tq = f.read().split('\n')

In [78]:
# Print the predicted labels for every question. Blank lines are skipped
# explicitly (split('\n') leaves one after a trailing newline) instead of
# always dropping the last element, which lost the final question whenever
# the file did not end with a newline.
for q_text in tq:
    if not q_text.strip():
        continue
    print(ner_recognize(q_text, model))

[('яка', '-'), ('площа', '-'), ('Мексики', 'LOC')]
[('яка', '-'), ('площа', '-'), ('території', '-'), ('Португалії', 'LOC')]
[('яка', '-'), ('територія', '-'), ('Гвінеї', 'LOC')]
[('який', '-'), ('розмір', '-'), ('Гвінеї', 'LOC')]
[('яка', '-'), ('столиця', '-'), ('Мексики', 'LOC')]
[('яке', '-'), ('місто', '-'), ('є', '-'), ('столиця', '-'), ('Мексики', 'LOC')]
[('яка', '-'), ('офіційна', '-'), ('мова', '-'), ('Австралії', 'LOC')]
[('яка', '-'), ('мова', '-'), ('визнана', '-'), ('в', '-'), ('Мексиці', 'LOC'), ('офіційною', '-'), ('?', '-')]
[('яка', '-'), ('форма', '-'), ('правління', '-'), ('Мексики', 'LOC')]
[('хто', '-'), ('є', '-'), ('президентом', '-'), ('України', 'LOC')]
[('хто', '-'), ('польский', '-'), ('президент', '-'), ('?', '-')]
[('коли', '-'), ('відбулося', '-'), ('хрещення', '-'), ('Гвінеї', 'LOC')]
[('у', '-'), ('якому', '-'), ('році', '-'), ('відбулось', '-'), ('хрещення', '-'), ('Гвінеї', 'LOC')]
[('яка', '-'), ('чисельність', '-'), ('населення', '-'), ('Гвінеї', 'L

In [79]:
# Persist the fitted pipeline (vectoriser + classifier) to disk.
joblib.dump(model, 'NER_model.pkl')

['NER_model.pkl']