In [1]:
import random
import re
import pickle
import string
import pymorphy2
morph = pymorphy2.MorphAnalyzer(lang='uk')
from ukr_stemmer3 import UkrainianStemmer
from tokenize_uk import tokenize_words, tokenize_sents
from perceptron_tagger.tagger import PerceptronTagger # POS tagger
from qa_perceptron import AveragedPerceptron # model for parsing question
from sklearn.externals import joblib
from difflib import get_close_matches
import nltk

In [12]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion

In [2]:
# open cleaned data from saved dictionaries
with open('countries.pkl', 'rb') as f:
    country_dict = pickle.load(f)
    
with open('cities.pkl', 'rb') as cf:
    city_dict = pickle.load(cf)

In [3]:
obj_dict = dict(country_dict, **city_dict)

In [4]:
def lemmatize_phrase(phrase):
    """
    Also we can stem instead of lemmatizing...
    """
    words = fix_hyphens(tokenize_words(phrase))
    if len(words) == 1:
        wparsed = morph.parse(phrase)[0]
        if not wparsed:
            return phrase
        return wparsed.normal_form
    else:
        new_phrase = ''
        for w in words:
            new_phrase += morph.parse(w)[0].normal_form + ' '
        return new_phrase.strip()
    
def fix_hyphens(sent):
    """
    sent is tokenized with tokenize_uk
    """
    new_sent = []
    i = 0
    while i < len(sent):
        w = sent[i]
        if w == '—' or w == '-':
            new_sent.pop()
            new_word = sent[i-1]+'-'+sent[i+1]
            new_sent.append(new_word)
            i += 1
        else:
            new_sent.append(w)
        i += 1
    return new_sent

def gender_agree(w_parsed):
    """
    Inflect noun phrase with adjective the right way
    """
    gender = w_parsed.tag.gender
    if not gender:
        return w_parsed.normal_form
    w = w_parsed.inflect({gender, 'nomn'}).word
    return w
    
def get_matches(ent, all_ents):
    matches = get_close_matches(ent, all_ents)
    if not matches:
        for entry in all_ents:
            if ent.lower() in entry.lower():
                return entry
    return matches[0]

In [95]:
class QuestionParser():
    
    def __init__(self, obj_dict, model = 'perceptron'):
        allkeys = []
        for c in obj_dict:
            for k in obj_dict[c]:
                allkeys.append(k)
        allkeys = set(allkeys)
        self.pos_tagger = PerceptronTagger()
        self.classes = allkeys
        self.obj_dict = obj_dict
        with open('units.pkl', 'rb') as f:
            self.unit_dict = pickle.load(f)
        self.lem_dict = [morph.parse(ent.split()[0])[0].normal_form 
                         for ent in self.obj_dict.keys()]
        try:
            self.ner_model = joblib.load('NER_model.pkl')
        except:
            self.ner_model = {}
            print('No NER model found!')
        if model == 'perceptron':
            self.model_name = model
            self.qa_model = AveragedPerceptron()
            try:
                self.load('qa_model.pkl')
            except:
                print('Please provide qa_model.pkl file.')
        elif model == 'logistic':
            self.model_name = model
            try:
                self.qa_model = joblib.load('qa_skl_model.pkl')
            except:
                print('Please provide qa_skl_model.pkl file.')
        elif model == None:
            print('Please train the model for QA.')
            
    def load(self, loc):
        try:
            weights, classes = pickle.load(open(loc, 'rb'))
        except IOError:
            msg = ("Missing a pickle file for QA model.")
        self.qa_model.weights = weights
        self.qa_model.classes = classes
        return None
    
    def get_entity_pymorphy(self, q_text):
        """
        Look for (capitalized) entities in q_text.
        For this specific application pymorphy2 tagging is enough.
        """
        forbidden = ['ВВП', 'HDI', 'ISO', 'ООН', 'UN', 'UTC', 
                     'Utc-Поправка', 'Utc-Поправка']
        words = fix_hyphens(tokenize_words(q_text))
        phrase = []
        for i, w in enumerate(words[1:]):
            if w in forbidden:
                continue
            if w[0] == w[0].upper():
                w_parsed = morph.parse(w.strip(' ?'))[0]
                w_lemma = w_parsed.normal_form
                if w_lemma in self.lem_dict:
                    if 'ADJF' in w_parsed.tag:
                        phrase.append(gender_agree(w_parsed).title())
                        phrase.append(morph.parse
                                      (words[i+2].strip(' ?'))[0].normal_form)
                        return ' '.join(phrase).title()
                    elif 'NOUN' in w_parsed.tag:
                        return w_lemma.title()
                    elif 'UNKN' in w_parsed.tag:
                        return w_lemma.title()
                matches = get_close_matches(w_lemma.title(), list(self.obj_dict.keys()))
                if matches:
                    return matches[0]
                else:
                    continue
        return None
    
    def _get_ner_features(self, word, prev_word, next_word):
        features = {
            'word': word,
            'word_stem': UkrainianStemmer(word).stem_word(),
            'prev_word': prev_word,
            'next_word': next_word,
            'prev_stem': UkrainianStemmer(prev_word).stem_word(),
            'next_stem': UkrainianStemmer(next_word).stem_word(),
            'is_uppercase': word.title() == word,
            'is_after_punct': prev_word in string.punctuation,
            'is_after_uppercase': prev_word.title() == prev_word,
            'pos': self.pos_tagger.tag(' '.join([prev_word, word, next_word]))[1][1]
        }
        return features
    
    def ner_recognize(self, sent):
        sent = sent.strip(string.punctuation)
        tokens = tokenize_words(sent)
        feats = []
        for (i, t) in enumerate(tokens):
            if i == 0:
                prev_word = '.'
            else:
                prev_word = tokens[i-1]
            if i == len(tokens)-1:
                next_word = '.'
            else:
                next_word = tokens[i+1]
            feats.append(self._get_ner_features(t, prev_word, next_word))
        labels = self.ner_model.predict(feats)
        return list(zip(tokens, labels))
    
    def get_entity(self, q):
        all_ents = self.obj_dict.keys()
        parsed = self.ner_recognize(q)
        entities = [e[0] for e in parsed if e[1] == 'LOC']
        if not entities:
            return self.get_entity_pymorphy(q)
        match = get_matches(entities[0], all_ents)
        if not match:
            print("Не вдалось знайти географічний об'єкт!")
            print(ent)
            return None
        return match
    
    def parse_question(self, q):
        ent = self.get_entity(q)
        if not ent:
            print("Не вдалось знайти географічний об'єкт!")
            return None
        lem_sent = lemmatize_phrase(q)
        lem_ent = lemmatize_phrase(ent)
        new_sent = lem_sent.replace(lem_ent, '').replace('  ', ' ')
        new_sent = new_sent.replace('який', '')
        return ent, new_sent.strip()
    
    def get_features(self, q):
        try:
            ent, sent = self.parse_question(q)
        except:
            return None
        if self.model_name == 'perceptron':
            return self.get_features_perc(ent, sent)
        elif self.model_name == 'logistic':
            return self.get_features_sklearn(ent, sent)
    
    def get_features_perc(self, ent, sent):
        """
        Given question, get features from it.
        """
        features = {}
        words = fix_hyphens(tokenize_words(sent))
        for i, w in enumerate(words):
            features['word_{i}={w}'.format(i=i, w=w)] = 1
        features['words'] = [('w={w}'.format(w=w), 1) for w in words]
        bigrams = ['_'.join(b) for b in nltk.bigrams(words)]
        features['bigrams'] = [('bg={bg}'.format(bg=bg), 1) for bg in bigrams]
        n = 3
        char_trigrams = [sent[i:i+n] for i in range(len(sent)-n+1)]
        features['trigrams'] = [('t={t}'.format(t=t), 1) for t in char_trigrams]
        return ent, features
    
    def get_features_sklearn(self, ent, sent):
        features = dict()
        words = fix_hyphens(tokenize_words(sent))
        bigrams = ['_'.join(b) for b in nltk.bigrams(words)]
        n = 3
        char_trigrams = [sent[i:i+n] for i in range(len(sent)-n+1)]
        for w in words:
            features[w] = 1
        for b in bigrams:
            features[b] = 1
        for c in char_trigrams:
            features[c] = 1
        return ent, features
    
    def train(self, train_df, n_iter=5):
        if self.model_name == 'perceptron':
            self.train_perc(train_df, n_iter)
        elif self.model_name == 'logistic':
            self.train_sklearn(train_df)
    
    def train_sklearn(self, train_df):
        features = []
        labels = []
        for i, row in train_df.iterrows():
            q = row['Q']
            k = row['K']
            try:
                ent, feats = self.get_features(q)
            except:
                continue
            if ent not in self.obj_dict:
                print('Cannot find an entity in a dictionary for', q)
                print(ent)
            labels.append(k)
            features.append(feats)
        model = Pipeline([
                    ('vec', DictVectorizer()),
                    ('clf', LogisticRegression(penalty='l1'))
        ])
        model.fit(features, labels)
        joblib.dump(model, 'qa_skl_model.pkl')
        self.qa_model = model
    
    def train_perc(self, train_df, n_iter=5):
        """
        train_df contains columns Q and A
        """
        self.qa_model.classes = self.classes
        for iteration in range(n_iter):
            print('Training iteration number', iteration+1)
            train_df = train_df.sample(len(train_df))
            for i, row in train_df.iterrows():
                q = row['Q']
                k = row['K']
                true_keys = []
                try:
                    ent, feats = self.get_features(q)
                except:
                    continue
                if ent not in self.obj_dict:
                    print('Cannot find an entity in a dictionary for', q)
                    print(ent)
                guess = self.qa_model.predict(feats)
                self.qa_model.update(k, guess, feats)
        self.qa_model.average_weights()
        self.qa_model.save('qa_model.pkl')
    
    def provide_gen_case(self, ent):
        if 'Назва в родовому відмінку' in self.obj_dict[ent].keys():
            if not 'нема інформації' in self.obj_dict[ent]['Назва в родовому відмінку']:
                return self.obj_dict[ent]['Назва в родовому відмінку']
        if len(ent.split()) == 1:
            w_parsed = morph.parse(ent)[0]
            return w_parsed.inflect({'gent'}).word.title()
        else:
            res = ''
            for w in ent.split():
                w_parsed = morph.parse(w)[0]
                gender = w_parsed.tag.gender
                if not gender:
                    res += w
                else:
                    res += w_parsed.inflect({gender, 'gent'}).word
            return res
    
    def answer_text(self, ent, pred_class):
        answer_template = '{pred} {ent} - {a} {units}'
        gen_name = self.provide_gen_case(ent)
        a = self.obj_dict[ent][pred_class]
        units = self.unit_dict.get(pred_class)
        if a == '' or ('нема інформації' in a):
            units = ''
        if not units:
            units = ''
        res = answer_template.format(pred=pred_class, 
                                     ent=gen_name,
                                     a=a,
                                     units=units).strip()
        return res
    
    def find_answer(self, q):
        try:
            ent, feats = self.get_features(q)
        except:
            return 'Відповідь не знайшлась.'
        if self.model_name == 'perceptron':
            all_classes = self.qa_model.get_scores(feats)
            pred_classes = self.qa_model.get_scored_classes(feats)
            for cl in pred_classes:
                if cl in self.obj_dict[ent] and cl in all_classes:
                    pred_class = cl
                    return self.answer_text(ent, pred_class)
        elif self.model_name == 'logistic':
            pred_class = self.qa_model.predict([feats])[0]
            if pred_class in self.obj_dict[ent]:
                return self.answer_text(ent, pred_class)
        return 'Відповідь не знайшлась.'

In [96]:
qp = QuestionParser(obj_dict, model='logistic')

In [70]:
ent, feats = qp.get_features_sklearn('у якому році відбулась хрещення Польщі')
results = qp.qa_model.predict_proba([feats])[0]
#prob_per_class_dictionary = dict(zip(qp.qa_model.classes_, results[0]))
results_ordered_by_probability = list(map(lambda x: x[0], sorted(zip(qp.qa_model.classes_, results), key=lambda x: x[1], reverse=True)))
#results_ordered_by_probability

In [98]:
qp.find_answer('де знаходиться Гватемала')

'Відповідь не знайшлась.'

In [66]:
with open('test_questions.txt', 'r') as f:
    tq = f.read().split('\n')

In [90]:
for q_text in tq:
    print('---')
    print(q_text)
    print(qp.find_answer(q_text))
    print('---')

---
яка площа Мексики
Площа Мексики - 1972550
---
---
яка площа території Португалії
Площа Португалії - нема інформації в базі
---
---
яка територія Гвінеї
Площа Гвінеї - 245.857
---
---
який розмір Гвінеї
Площа Гвінеї - 245.857
---
---
яка столиця Мексики
Столиця Мексики - Мехіко
---
---
яке місто є столиця Мексики
Столиця Мексики - Мехіко
---
---
яка офіційна мова Австралії
Офіційні мови Австралії - Англійська мова (англійська1)
---
---
яка мова визнана в Мексиці офіційною?
Офіційні мови Мексики - іспанська мова
---
---
яка форма правління Мексики
Форма правління Мексики - Федеративна республіка
---
---
хто є президентом України
Імена лідерів України - Порошенко Петро Олексійович, Гройсман Володимир Борисович, Парубій Андрій Володимирович
---
---
хто польский президент?
Не вдалось знайти географічний об'єкт!
Відповідь не знайшлась.
---
---
коли відбулося хрещення Гвінеї
Ключові дати в історії Гвінеї - 2 жовтня 1958
---
---
у якому році відбулось хрещення Гвінеї
Індекс розвитку (HDI) 

AttributeError: 'NoneType' object has no attribute 'word'