In [1]:
from os import listdir
import jsonlines
import itertools
import nltk
from nltk.corpus import stopwords
import pymorphy2 as pm
import networkx as nx
import re
from bisect import bisect_left
#from joblib import Parallel, delayed
from collections import Counter
from tqdm import trange
from tqdm import tqdm
from math import log
from random import shuffle
from sys import exit
import numpy as np
import json
import codecs
import random

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.neighbors import NearestCentroid
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors
from sklearn.externals import joblib
from sklearn.metrics import recall_score, precision_score, f1_score

from scipy.sparse import load_npz
from scipy.sparse import hstack
from scipy.sparse import vstack
#nltk.download()
tknzr = nltk.TweetTokenizer()

In [5]:
path = listdir('C:/Users/Maxim/MyPy/wta/c')
path = sorted(path)[:9]
print(path)

data = []
for p in path:
    with jsonlines.open('C:/Users/Maxim/MyPy/wta/c/' + p, 'r') as f:
        for entry in f:
            data.append(entry)

['AA', 'AB', 'AD', 'AE', 'AF', 'AI', 'AJ', 'AK', 'AQ']


In [7]:
ok_set = set()
with open("sources/accepted_categories.txt", mode="r", encoding="utf-8") as inp:
    for line in inp:
        line = line[:-1]
        ok_set.add(line)

ok = sorted(ok_set)

categories_dict = {}
with open("sources/article_cat.json", mode="r") as input:
    categories_dict = json.loads(input.read())
    
category_article = {}
with open("sources/cat_article.json", mode="r") as input:
    category_article = json.loads(input.read())
    
texts = {item['id']: item['text'] for item in jsonlines.open('sources/normalized_texts.jl', 'r')}

In [8]:
ids = sorted([id for id in texts.keys()])

### Подготваливаем заголовки

In [9]:
sw_ru = nltk.corpus.stopwords.words('russian')
morph = pm.MorphAnalyzer()
id_title = {}

stems = []
unsup_symb = set('.,\(\):«»?!')
for i in trange(len(data)):
    id = data[i]['id']
    if id in ids:
        tokens = tknzr.tokenize(text=data[i]['title'])
        tokens = [t.lower() for t in tokens if t not in unsup_symb]
        tokens = [morph.parse(t)[0].normal_form for t in tokens]
        tokens = [t for t in tokens if t not in sw_ru]
        if len(tokens) > 0:
            id_title[id] = ' '.join(tokens)
            stems.extend(tokens)
    
stems = set(stems)
print("Done")

100%|█████████████████████████████████████████████████████████████████████████| 116584/116584 [02:54<00:00, 669.66it/s]


Done


In [10]:
title_id = {}
for id in id_title:
    if title_id.get(id_title[id]) is not None:
        title_id[id_title[id]].append(id)
    else:
        title_id[id_title[id]] = [id]

### Вычисляем словарь категорий

In [8]:
categories_vocab = {}

for id in tqdm(categories_dict):
    for c in categories_dict[id]:
        if c in ok:
            if id_title.get(id) is not None:
                if categories_vocab.get(c) is None:
                    categories_vocab[c] = id_title[id]
                else:
                    categories_vocab[c] += ' ' + id_title[id]


for c in categories_vocab:
    categories_vocab[c] = set(categories_vocab[c].split())

100%|██████████████████████████████████████████████████████████████████████████| 96794/96794 [00:27<00:00, 3554.66it/s]


In [11]:
'''with open("sources/categories_vocab", "w") as output:
    output.write(json.dumps({c: sorted(vocab) for c, vocab in categories_vocab.items()}))
'''
with open("sources/categories_vocab", "r") as input:
    categories_vocab = json.loads(input.read())
    categories_vocab = {c: set(vocab) for c, vocab in categories_vocab.items()}

In [12]:
ids = [id for id in ids if id in id_title.keys()]

In [13]:
for id in tqdm(ids):
    tokens = texts[id].split()
    tokens = [tok for tok in tokens if tok in stems]
    texts[id] = ' '.join(tokens)
    
data[:] = []

100%|██████████████████████████████████████████████████████████████████████████| 96769/96769 [00:11<00:00, 8232.73it/s]


### Начинаем анализировать документы

In [14]:
kf = KFold(n_splits=5, shuffle=True, random_state=27)

train_index, test_index = 0, 0
for item in kf.split(ids):
    train_index, test_index = item[0], item[1]
    break

print("TRAIN:", train_index, "TEST:", test_index)
print(len(train_index), len(test_index))

TRAIN: [    0     1     2 ... 96766 96767 96768] TEST: [    4     7    27 ... 96754 96755 96764]
77415 19354


1. Взвешиваем слова в документe по формуле $ R_w = ft_w \log{\frac{N}{cf_w}} $.
2. Взвешиваем заголовки по формуле $R_t = \sum_{w \rightarrow t} R_w \frac{1}{t_w} \frac{1}{a_t} \frac{S_t}{L_t}$.
3. Взвешиваем статьи $R_a = \max_{t \rightarrow a} R_t$ (почти так).
4. Взвештваем категории $R_c = \frac{v_c}{d_c} \sum_{a \rightarrow c}R_a$ и получам ответ.
5. Модификация: обновляем веса категорий по правилу $R_c' = R_c \frac{\sum_{w \in B_c}d_w}{|B_c|}$, $d'_w = \frac{d_w}{2}$ для $d_w \in B_c$ в порядке убывания весов категорий.

In [15]:
topn = 20


results = {}
for i in tqdm(test_index[:int(len(test_index)/6)]):
    document = texts[ids[i]]
    #Шаг 1
    words = re.split(r' ', document)
    words_set = set(words)
    d_w = {w: 1 for w in words_set}
    # Вычисляем tf_w
    R_w = dict(Counter(words))
    N = len(categories_dict)
    # Вычисляем log(...)
    for w in words_set:
        cf_w = 0
        for c in categories_vocab:
            if w in categories_vocab[c]:
                cf_w += 1
        if cf_w > 0:
            R_w[w] = R_w[w] * log(N / cf_w)
        else:
            R_w[w] = 0
    
    # Шаг 2
    R_t = dict()
    supp_w_t = {}
    title_vocab = dict(Counter(' '.join(title_id.keys()).split()))
    for title in title_id.keys():
        title_words = title.split()
        supp_words = []
        flag = 1
        for w in title_words:
            if w not in words_set:
                flag = flag - 1
            else:
                supp_words.append(w)
        if flag < 0:
            continue
        sub_sum = 0
        # Вынесем 1 / a_t * S_t / L_t как mutual_mul
        mutual_mul = len(supp_words) / (len(title_id[title]) * len(title_words))
        for w in supp_words:
            sub_sum += R_w[w] / title_vocab[w]
        
        R_t[title] = mutual_mul * sub_sum
        supp_w_t[title] = supp_words
    
    # Шаг 3
    #R_a = {d['id']: R_t.get(d['title']) for d in data if R_t.get(d['title']) is not None and R_t.get(d['title']) > 0.0}
    #supp_w_a = {d['id']: supp_w_t.get(d['title']) for d in data if supp_w_t.get(d['title']) is not None 
    #            and  R_t.get(d['title']) > 0.0}
    R_a = {id: R_t.get(title) for (id, title) in id_title.items() if R_t.get(title) is not None and R_t.get(title) > 0.0}
    supp_w_a = {id: supp_w_t.get(title) for (id, title) in id_title.items() if supp_w_t.get(title) is not None 
                and R_t.get(title) > 0.0}
    
    # Шаг 4
    R_c = {}
    for c in ok:
        v_c = 0
        d_c = len(categories_vocab[c])
        r_c = 0
        for id in category_article[c]:
            if R_a.get(id) is not None:
                r_c += R_a.get(id)
                v_c += len(supp_w_a.get(id))
        R_c[c] = r_c * v_c / d_c
    
    # Шаг 5    
    R__c = sorted([k for k in R_c.items() if k[1] > 0], key=lambda t: t[1], reverse=True)[:topn]
    R_c = {}
    for r_c in R__c:
        B_c = set(' '.join([' '.join(supp_w_a.get(id)) for id in category_article[r_c[0]] 
                            if supp_w_a.get(id) is not None]).split())
        sub_sum = 0
        for b in B_c:
            sub_sum += d_w[b]
            d_w[b] = d_w[b] / 2
        
        R_c[r_c[0]] = r_c[1] * sub_sum / len(B_c)
    
    # Влом исправлять
    results[ids[i]] = sorted([k for k in R_c.items() if k[1] > 0], key=lambda t: t[1], reverse=True)[:topn]

100%|████████████████████████████████████████████████████████████████████████████| 3225/3225 [1:26:15<00:00,  1.60s/it]


### Применяем бинарную логистическую регрессию

In [32]:
clf = joblib.load("sources/clf_binlogreg.pkl")
X = load_npz("sources/text_tfidf.npz")
class_centroids = load_npz("sources/sparce_centroids_nosvd.npz")

In [21]:
X.shape

(96794, 121303)

In [28]:
_ids = [id for id in texts.keys()]

In [56]:
precision = 0
recall = 0
f1 = 0
_confirmed = []
_true = []

N = len(results)
for result in tqdm(results):
    n = _ids.index(result)
    suggested = [res[0] for res in results[result]]
    rows = [ok.index(s) for s in suggested]
    X_right = class_centroids[rows]
    X_left = X[[n]*len(rows)]
    X_full = hstack([X_left, X_right])
    prediction = clf.predict(X_full)
    confirmed = [sugg for i, sugg in enumerate(suggested) if prediction[i] == 1]
    true = categories_dict[result]
    
    if len(confirmed) > 0:
        _confirmed.append([1 if cat in confirmed else 0 for cat in ok])
    else:
        _confirmed.append([1 if cat in suggested else 0 for cat in ok])
        
    _true.append([1 if cat in true else 0 for cat in ok])
    
    if len(confirmed) > 0:
        precision += metric(true, confirmed) / N
        recall += metric(confirmed, true) / N
    
#precision = precision_score(_true, _confirmed, average='micro')
#recall = recall_score(_true, _confirmed, average='micro')
#f1 = f1_score(_true, _confirmed, average='micro')
f1 = 2 * precision * recall / (precision + recall)

print(precision)
print(recall)
print(f1)

100%|██████████████████████████████████████████████████████████████████████████████| 2150/2150 [01:20<00:00, 26.65it/s]


0.635502592681878
0.1163376519681132
0.1966717796190236


In [27]:
def metric(res, true):
    hits = 0
    for c in res:
        if c in true:
            hits += 1
    return hits / len(res)

In [42]:
class MGIA:
    
    def __init__(self, G):
        self.G = G.to_undirected()
        self.flow = nx.DiGraph()
        
    def score(self, TRUE, PRED):
        if (len(TRUE) == 0) or (len(PRED) == 0):
            return 0
            
        self.flow.clear()
        M = len(PRED)
        N = len(TRUE)
        infinity = 130 #10 #G.number_of_edges()
        self.flow.add_node('source', demand=0)
        self.flow.add_node('sink', demand=M-N)
        self.flow.add_node('DP', demand=0)
        self.flow.add_node('DT', demand=0)
        self.flow.add_edge('source', 'DP', capacity=N, weight=0)
        self.flow.add_edge('DT', 'sink', capacity=M, weight=0)
        self.flow.add_edge('sink', 'source', capacity=(M)*(N+1), weight=0)
        for pred in PRED:
            _pred = 'p' + str(pred)
            self.flow.add_node(_pred, demand=-1)
            self.flow.add_edge('source', _pred, capacity=N, weight=0)
            self.flow.add_edge(_pred, 'DT', capacity=1, weight=infinity)
        for true in TRUE:
            _true = 't' + str(true)
            self.flow.add_node(_true, demand=1)
            self.flow.add_edge(_true, 'sink', capacity=M, weight=0)
            self.flow.add_edge('DP', _true, capacity=1, weight=infinity)
            for pred in PRED:
                k_ij = nx.shortest_path_length(self.G, source=pred, target=true)
                self.flow.add_edge('p' + str(pred), _true, capacity=1, weight=k_ij * k_ij * k_ij)

        flowCost, flowDict = nx.network_simplex(self.flow)
        low = (len(PRED.union(TRUE).difference(PRED.intersection(TRUE))))
        #print("Len = ", len(TRUE), len(PRED), low)
        if low == 0:
            accuracy = 1
        else:
            accuracy = 1 - flowCost / (low * infinity)
        #print(accuracy)
        return accuracy
    
cat_graph = nx.read_gpickle("sources/graph.gpickle")
mgia = MGIA(cat_graph)

In [44]:
cat_id = {}
with open("sources/cat_id.json", mode="r") as input:
    cat_id = json.loads(input.read())

In [94]:
prec, rec, f1 = 0, 0, 0
mgia_score, mean = 0, 0
p = 0.35
for r in tqdm(results):
    res = results[r][:10]
    decision_crit = p * max([_r[1] for _r in res])
    y_pred = []
    for _r in res:
        if _r[1] >= decision_crit:
            y_pred.append(_r[0])
    
    y_true = categories_dict[r]
    
    Y_true = set(cat_id[c] for c in y_true)
    Y_pred = set(cat_id[c] for c in y_pred)
    
    prec += metric(y_pred, y_true)
    rec += metric(y_true, y_pred)
    mgia_score += mgia.score(Y_true, Y_pred)
    mean += len(y_pred)

100%|█████████████████████████████████████████████████████████████████████████████| 3225/3225 [00:23<00:00, 135.09it/s]


In [95]:
prec = prec / len(results)
rec = rec / len(results)
mgia_score = mgia_score / len(results)
f1 = 2 * prec * rec / (prec + rec)
print(prec)
print(rec)
print(f1)
print(mgia_score)
print(mean / len(results))

0.46406829088224366
0.4278531468820072
0.4452254878407877
0.49337154340790573
2.7748837209302324
