In [1]:
import time
import random
import pandas as pd
import gzip
import re
from collections import OrderedDict, defaultdict
import pickle
import string
import pymorphy2
morph = pymorphy2.MorphAnalyzer(lang='uk')

In [17]:
import nltk
from ukr_stemmer3 import UkrainianStemmer
from tokenize_uk import tokenize_words, tokenize_sents
from perceptron_tagger.tagger import PerceptronTagger
tagger = PerceptronTagger()
from sklearn.externals import joblib
from difflib import get_close_matches

In [63]:
class AveragedPerceptron(object):
    """
    A class for training a model for answering the questions.
    """

    def __init__(self):
        # Each feature gets its own weight vector, so weights is a dict-of-dicts
        self.weights = {}
        self.classes = set()
        # The accumulated values, for the averaging. These will be keyed by
        # feature/clas tuples
        self._totals = defaultdict(int)
        # The last time the feature was changed, for the averaging. Also
        # keyed by feature/clas tuples
        # (tstamps is short for timestamps)
        self._tstamps = defaultdict(int)
        # Number of instances seen
        self.i = 0

    def get_scores(self, features):
        '''Dot-product the features and current weights and return the best label.'''
        scores = defaultdict(float)
        for feat, value in features.items():
            if isinstance(value, list):
                wordlist = value
                for i, w in enumerate(wordlist):
                    wf, wfvalue = w
                    if wf not in self.weights or wfvalue == 0:
                        continue
                    weights = self.weights[wf]
                    for label, weight in weights.items():
                        scores[label] += wfvalue * weight
            else:
                if feat not in self.weights or value == 0:
                    continue
                weights = self.weights[feat]
                for label, weight in weights.items():
                    scores[label] += value * weight
        return scores
    
    def get_scored_classes(self, features):
        scores = self.get_scores(features)
        return sorted(self.classes, key=lambda label: (scores[label], label), reverse=True)
    
    def predict(self, features):
        scores = self.get_scores(features)
        return max(self.classes, key=lambda label: (scores[label], label))

    def update(self, truth, guess, features):
        '''Update the feature weights.'''
        def upd_feat(c, f, w, v):
            param = (f, c)
            self._totals[param] += (self.i - self._tstamps[param]) * w
            self._tstamps[param] = self.i
            self.weights[f][c] = w + v

        self.i += 1
        if truth == guess:
            return None
        for f in features:
            if isinstance(features[f], list):
                for w in features[f]:
                    wf = w[0]
                    weights = self.weights.setdefault(wf, {})
                    upd_feat(truth, wf, weights.get(truth, 0.0), 1.0)
                    upd_feat(guess, wf, weights.get(guess, 0.0), -1.0)
            else:
                weights = self.weights.setdefault(f, {})
                upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
                upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
        return None

    def average_weights(self):
        '''Average weights from all iterations.'''
        for feat, weights in self.weights.items():
            new_feat_weights = {}
            for clas, weight in weights.items():
                param = (feat, clas)
                total = self._totals[param]
                total += (self.i - self._tstamps[param]) * weight
                averaged = round(total / float(self.i), 3)
                if averaged:
                    new_feat_weights[clas] = averaged
            self.weights[feat] = new_feat_weights
        return None

    def save(self, path):
        '''Save the pickled model weights.'''
        return pickle.dump((dict(self.weights), self.classes), open(path, 'wb'))

    def load(self, path):
        '''Load the pickled model weights.'''
        self.weights = pickle.load(open(path))
        return None

In [4]:
# helper functions
def pos_chunk(sent, tagger, parser):
    """
    Use tagger and chunk parser to create chunks.
    """
    vps, nps  = [], []
    pos_sent = tagger.tag(sent)
    parsed_sent = parser.parse(pos_sent)
    for s in parsed_sent.subtrees():
        if s.label() == 'VP':
            vps.append(s.leaves())
        elif s.label() == 'NP':
            nps.append(s.leaves())
    return nps + vps

def pos_chunk2(sent, tagger, parser):
    vps, nps  = [], []
    pos_sent = tagger.tag(sent)
    parsed_sent = parser.parse(pos_sent)
    for s in parsed_sent.subtrees():
        if s.label() == 'VP':
            vps.append([w[0] for w in s.leaves()])
        elif s.label() == 'NP':
            nps.append([w[0] for w in s.leaves()])
    return [' '.join(phrase) for phrase in (nps + vps)]

# create grammar and initialize chunk parser
grammar = r"""
NP: {<DET><ADJ><NOUN>+}
    {<DET>*<ADJ><NOUN>+}
    {<DET><ADJ>*<NOUN>+}
    {<DET><ADJ><NOUN>*<PROPN>*}
    {<DET>*<ADJ>*<NOUN>+<ADJ>*<NOUN|PROPN>+}
    {<NOUN><NOUN|PROPN>(<CCONJ><NOUN|PROPN>)?}
    {<DET>*(<ADJ>|<ADJ><CCONJ><ADJ>)*(<NOUN|PROPN><CCONJ><NOUN|PROPN>|<NOUN|PROPN>)+}
VP: {<ADV>*<VERB>+}
    {<AUX>+<VERB>+}
"""
chunk_parser = nltk.RegexpParser(grammar)

In [5]:
# open cleaned data from saved dictionaries
with open('countries.pkl', 'rb') as f:
    country_dict = pickle.load(f)
    
allkeys = []
for c in country_dict:
    for k in country_dict[c]:
        allkeys.append(k)
allkeys = set(allkeys)

In [6]:
# open csv with pairs for training
traindf = pd.read_csv('train2.csv')
traindf.head()

Unnamed: 0,Q,K
0,Яка загальноприйнята назва Польща,Загальновживана назва
1,Яка поширена назва Польща,Загальновживана назва
2,Яка загальнопоширена назва Польща,Загальновживана назва
3,Яка довга назва Польща,Офіційна назва
4,Яка повна назва Польща,Офіційна назва


In [7]:
def fix_hyphens(sent):
    """
    sent is tokenized with tokenize_uk
    """
    new_sent = []
    i = 0
    while i < len(sent):
        w = sent[i]
        if w == '—' or w == '-':
            new_sent.pop()
            new_word = sent[i-1]+'-'+sent[i+1]
            new_sent.append(new_word)
            i += 1
        else:
            new_sent.append(w)
        i += 1
    return new_sent

In [8]:
def gender_agree(w_parsed):
    """
    Inflect noun phrase with adjective the right way
    """
    gender = w_parsed.tag.gender
    if not gender:
        return w_parsed.normal_form
    w = w_parsed.inflect({gender, 'nomn'}).word
    return w
   
def get_entity(q_text, lem_dict):
    """
    Look for (capitalized) entities in q_text.
    For this specific application pymorphy2 tagging is enough.
    """
    forbidden = ['ВВП', 'HDI', 'ISO', 'ООН', 'UN', 'UTC', 
                 'Utc-Поправка', 'Utc-Поправка']
    words = fix_hyphens(tokenize_words(q_text))
    phrase = []
    for i, w in enumerate(words[1:]):
        if w in forbidden:
            continue
        if w[0] == w[0].upper():
            w_parsed = morph.parse(w.strip(' ?'))[0]
            if w_parsed.normal_form not in lem_dict:
                continue
            if 'ADJF' in w_parsed.tag:
                phrase.append(gender_agree(w_parsed).title())
                phrase.append(morph.parse
                              (words[i+2].strip(' ?'))[0].normal_form)
                return ' '.join(phrase).title()
            elif 'NOUN' in w_parsed.tag:
                return w_parsed.normal_form.title()
            elif 'UNKN' in w_parsed.tag:
                return w_parsed.normal_form.title()
            else:
                continue
    return None

In [9]:
def lemmatize_phrase(phrase):
    """
    Also we can stem instead of lemmatizing...
    """
    words = fix_hyphens(tokenize_words(phrase))
    if len(words) == 1:
        return morph.parse(phrase)[0].normal_form
    else:
        new_phrase = ''
        for w in words:
            new_phrase += morph.parse(w)[0].normal_form + ' '
        return new_phrase.strip()

In [10]:
def parse_question(q, lem_dict):
    ent = get_entity(q, lem_dict)
    lem_sent = lemmatize_phrase(q)
    lem_ent = lemmatize_phrase(ent)
    new_sent = lem_sent.replace(lem_ent, '').replace('  ', ' ')
    new_sent = new_sent.replace('який', '')
    phrases = pos_chunk2(new_sent, tagger, cp)
    return new_sent.strip(), phrases

In [18]:
ner_model = joblib.load('NER_model.pkl')

pos_tagger = tagger
def get_ner_features(word, prev_word, next_word):
    features = {
        'word': word,
        'word_stem': UkrainianStemmer(word).stem_word(),
        'prev_word': prev_word,
        'next_word': next_word,
        'prev_stem': UkrainianStemmer(prev_word).stem_word(),
        'next_stem': UkrainianStemmer(next_word).stem_word(),
        'is_uppercase': word.title() == word,
        'is_after_punct': prev_word in string.punctuation,
        'is_after_uppercase': prev_word.title() == prev_word,
        'pos': pos_tagger.tag(' '.join([prev_word, word, next_word]))[1][1]
    }
    return features

def ner_recognize(sent, model):
    sent = sent.strip(string.punctuation)
    tokens = tokenize_words(sent)
    feats = []
    for (i, t) in enumerate(tokens):
        if i == 0:
            prev_word = '.'
        else:
            prev_word = tokens[i-1]
        if i == len(tokens)-1:
            next_word = '.'
        else:
            next_word = tokens[i+1]
        feats.append(get_ner_features(t, prev_word, next_word))
    labels = model.predict(feats)
    return list(zip(tokens, labels))

def ner_recognize(sent, model):
    tokens = tokenize_words(sent)
    feats = []
    for (i, t) in enumerate(tokens):
        if i == 0:
            prev_word = '.'
        else:
            prev_word = tokens[i-1]
        if i == len(tokens)-1:
            next_word = '.'
        else:
            next_word = tokens[i+1]
        feats.append(get_ner_features(t, prev_word, next_word))
    labels = model.predict(feats)
    return list(zip(tokens, labels))

In [19]:
ner_recognize('Яка висота Гімалаїв', ner_model)

[('Яка', '-'), ('висота', '-'), ('Гімалаїв', 'LOC')]

In [20]:
get_close_matches('Південна Корея', country_dict.keys(), cutoff=0.6)

['Південна Корея', 'Північна Корея', 'Південна Осетія']

In [37]:
def get_entity2(q, ner_model, info_dict, lem_dict):
    all_ents = info_dict.keys()
    parsed = ner_recognize(q, ner_model)
    entities = [e[0] for e in parsed if e[1] == 'LOC']
    if not entities:
        return get_entity(q, lem_dict)
    matches = get_close_matches(entities[0], all_ents)
    if not matches:
        print("Не вдалось знайти географічний об'єкт!")
        print(ent)
        return None
    return matches[0]

In [77]:
class QuestionParser():
    
    def __init__(self, obj_dict, load=True, model_loc = 'qa_model.pkl'):
        allkeys = []
        for c in obj_dict:
            for k in obj_dict[c]:
                allkeys.append(k)
        allkeys = set(allkeys)
        self.qa_model = AveragedPerceptron()
        self.classes = allkeys
        self.obj_dict = obj_dict
        self.lem_dict = [morph.parse(ent.split()[0])[0].normal_form 
                         for ent in self.obj_dict.keys()]
        try:
            self.ner_model = joblib.load('NER_model.pkl')
        except:
            self.ner_model = {}
            print('No NER model found!')
        if load:
            self.load(model_loc)
        else:
            print('Please train the model for QA.')
            
    def load(self, loc):
        try:
            weights, classes = pickle.load(open(loc, 'rb'))
        except IOError:
            msg = ("Missing a pickle file for QA model.")
        self.qa_model.weights = weights
        self.qa_model.classes = classes
        return None
    
    def get_features(self, q):
        """
        Given question, get features from it.
        """
        sent, phrases = parse_question(q, self.lem_dict)
        features = {}
        words = fix_hyphens(tokenize_words(sent))
        #phrases = ['_'.join([p for p in fix_hyphens(tokenize_words(phrase))]) for phrase in phrases]
        for i, w in enumerate(words):
            features['word_{i}={w}'.format(i=i, w=w)] = 1
        features['words'] = [('w={w}'.format(w=w), 1) for w in words]
        bigrams = ['_'.join(b) for b in nltk.bigrams(words)]
        features['bigrams'] = [('bg={bg}'.format(bg=bg), 1) for bg in bigrams]
        n = 3
        char_trigrams = [sent[i:i+n] for i in range(len(sent)-n+1)]
        features['trigrams'] = [('t={t}'.format(t=t), 1) for t in char_trigrams]
        #features['phrases'] = []
        #for ph in phrases:
        #    if not ph in bigrams:
        #        features['phrases'].append(('ph={ph}'.format(ph=ph), 1))
        return features

    def train(self, train_df, n_iter=5, save=True, loc='qa_model.pkl'):
        """
        obj_dict is currently country_dict.
        train_df contains columns Q and A
        """
        self.qa_model.classes = self.classes
        for iteration in range(n_iter):
            print('Training iteration number', iteration+1)
            train_df = train_df.sample(len(train_df))
            for i, row in train_df.iterrows():
                q = row['Q']
                k = row['K']
                ent = get_entity2(q, self.ner_model, self.obj_dict,
                                  self.lem_dict)
                if not ent:
                    print('Cannot find an entity for', q)
                    continue
                true_keys = []
                if ent not in self.obj_dict:
                    print('Cannot find an entity in a dictionary for', q)
                    print(ent)
                feats = self.get_features(q)
                guess = self.qa_model.predict(feats)
                self.qa_model.update(k, guess, feats)
        self.qa_model.average_weights()
        if save:
            self.qa_model.save(loc)
        return None
    
    def find_answer(self, q):
        ent = get_entity2(q, self.ner_model, self.obj_dict,
                          self.lem_dict)
        if not ent:
            return 'Відповідь не знайшлась.'
        feats = self.get_features(q)
        all_classes = self.qa_model.get_scores(feats)
        pred_classes = self.qa_model.get_scored_classes(feats)
        for cl in pred_classes:
            if cl in self.obj_dict[ent] and cl in all_classes:
                a = self.obj_dict[ent][cl]
                answer = '{cl} для {ent} - {a}'.format(cl=cl, ent=ent, a=a)
                return answer
        return 'Відповідь не знайшлась.'

In [78]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [79]:
def train2(train_df):
    def get_features(self, q):
        """
        Given question, get features from it.
        """
        sent, phrases = parse_question(q, lem_dict)
        features = {}
        words = fix_hyphens(tokenize_words(sent))
        for i, w in enumerate(words):
            features['word_{i}={w}'.format(i=i, w=w)] = 1
        features['words'] = [('w={w}'.format(w=w), 1) for w in words]
        bigrams = ['_'.join(b) for b in nltk.bigrams(words)]
        features['bigrams'] = [('bg={bg}'.format(bg=bg), 1) for bg in bigrams]
        n = 3
        char_trigrams = [sent[i:i+n] for i in range(len(sent)-n+1)]
        features['trigrams'] = [('t={t}'.format(t=t), 1) for t in char_trigrams]
        return features
    
    features, labels = [], []
    for i, row in train_df.iterrows():
        q = row['Q']
        k = row['K']
        ent = get_entity2(q, self.ner_model, self.obj_dict,
                          self.lem_dict)
        if not ent:
            print('Cannot find an entity for', q)
            continue
        feats = get_features(q)
        features.append(feats)
        labels.append(k)
    vec = DictVectorizer()
    train_feats = vec.fit_transform(features)
    clf = LogisticRegression(penalty='l1')
    clf.fit(train_feats, labels)
    return clf

In [80]:
lr_model = train2(traindf)

NameError: name 'self' is not defined

In [81]:
ner_recognize('Який Алжир за населенням у світі?', ner_model)

[('Який', '-'),
 ('Алжир', 'LOC'),
 ('за', '-'),
 ('населенням', '-'),
 ('у', '-'),
 ('світі', '-'),
 ('?', '-')]

In [82]:
qp = QuestionParser(country_dict, load=False)
qp.train(traindf, 20)

Please train the model for QA.
Training iteration number 1
Training iteration number 2
Training iteration number 3
Training iteration number 4
Training iteration number 5
Training iteration number 6
Training iteration number 7
Training iteration number 8
Training iteration number 9
Training iteration number 10
Training iteration number 11
Training iteration number 12
Training iteration number 13
Training iteration number 14
Training iteration number 15
Training iteration number 16
Training iteration number 17
Training iteration number 18
Training iteration number 19
Training iteration number 20


In [76]:
qp = QuestionParser(country_dict)
qp.find_answer('яка столиця Екваторіальної Гвінеї')

'Столиця для Екваторіальна Гвінея - Малабо'

In [55]:
feats = qp.get_features('яка столиця Екваторіальної Гвінеї')
qp.qa_model.get_scores(feats)
#feats

defaultdict(float,
            {'Столиця': 9.912000000000003,
             'Валюта': -4.952,
             'Густота населення': 1.966,
             'Офіційна назва': -0.999,
             'ВВП (ПКС)': -0.995,
             'Найбільше місто': -1.971,
             'Ключові дати в історії': -0.385,
             'Місце за густотою населення': 1.964,
             'Місце за населенням': -1.971,
             'Місце за площею': -0.962,
             'Ключові події і дати в історії': 0.382,
             'Офіційні мови': -0.992,
             'Посади лідерів': -0.996})

In [62]:
q = 'Яка столиця Польщі'
get_entity2(q, qp.ner_model, qp.obj_dict,
                          qp.lem_dict)
feats = qp.get_features(q)
all_classes = qp.qa_model.get_scores(feats)
pred_classes = qp.qa_model.get_scored_classes(feats)
#for cl in pred_classes:
#    if cl in self.obj_dict[ent] and cl in all_classes:
#        a = self.obj_dict[ent][cl]
#        answer = '{cl} для {ent} - {a}'.format(cl=cl, ent=ent, a=a)
#        return answer
qp.qa_model.classes

set()

In [79]:
with open('test_questions.txt', 'r') as f:
    tq = f.read().split('\n')

In [22]:
21/25

0.84

In [80]:
for q_text in tq:
    print('---')
    print(q_text)
    print(qp.find_answer(q_text))
    print('---')

---
яка площа Мексики
Площа для Мексика - 1972550
---
---
яка площа території Португалії
Площа для Португалія - нема інформації в базі
---
---
яка територія Гвінеї
Площа для Гвінея - 245.857
---
---
який розмір Гвінеї
Площа для Гвінея - 245.857
---
---
яка столиця Мексики
Столиця для Мексика - Мехіко
---
---
яке місто є столиця Мексики
Найбільше місто для Мексика - Мехіко
---
---
яка офіційна мова Австралії
Офіційні мови для Австралія - Англійська мова (англійська1)
---
---
яка мова визнана в Мексиці офіційною?
Офіційні мови для Мексика - іспанська мова
---
---
яка форма правління Мексики
Форма правління для Мексика - Федеративна республіка
---
---
хто є президентом України
Імена лідерів для Україна - Порошенко Петро Олексійович, Гройсман Володимир Борисович, Парубій Андрій Володимирович
---
---
хто польский президент?
Відповідь не знайшлась.
---
---
коли відбулося хрещення Гвінеї
Ключові дати в історії для Гвінея - 2 жовтня 1958
---
---
у якому році відбулось хрещення Гвінеї
UTC-попра

AttributeError: 'NoneType' object has no attribute 'lower'