In [1]:
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from tqdm import tqdm
from tqdm import trange
import jsonlines
from scipy import sparse
from os import listdir
import itertools
import networkx as nx
import re
from collections import Counter
import json
import pymorphy2 as pm
import matplotlib.pyplot as plt
from scipy.sparse import load_npz
import networkx as nx
from math import log

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestCentroid
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib


from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


# Russian stop words from the NLTK corpus, held in a set for fast membership tests.
stopwords_ru = set(stopwords.words('russian'))
# NLTK tweet tokenizer with default settings.
tknzr = TweetTokenizer()
# pymorphy2 morphological analyzer with default settings.
morph = pm.MorphAnalyzer()

In [2]:
# Accepted category names, one per line of the whitelist file.
ok_set = set()
with open("sources/accepted_categories.txt", mode="r", encoding="utf-8") as inp:
    for line in inp:
        # Remove only the line terminator: slicing off the last character
        # would truncate a final line that has no trailing newline.
        ok_set.add(line.rstrip("\n"))

# Sorted so that every accepted category has a stable position (column index).
ok = sorted(ok_set)

# The file handles below are named `fh` rather than `input`, so the builtin
# input() is not shadowed for the rest of the kernel session.

# Article id -> iterable of category names (indexed as categories_dict[ids[i]] later on).
with open("sources/article_cat.json", mode="r") as fh:
    categories_dict = json.load(fh)

# Category name -> node id used in the category graph (read as cat_id[c] later on).
with open("sources/cat_id.json", mode="r") as fh:
    cat_id = json.load(fh)

# Presumably the inverse mapping of cat_id; not used in the cells visible here — TODO confirm.
with open("sources/id_cat.json", mode="r") as fh:
    id_cat = json.load(fh)

# NOTE(review): read_gpickle unpickles the file, so only load graphs from a trusted source.
# NOTE(review): read_gpickle appears to have been removed in NetworkX 3.0 — confirm the
# pinned NetworkX version before upgrading.
cat_graph = nx.read_gpickle("sources/graph.gpickle")

In [3]:
# Pickled binary logistic-regression filters; the list position is later used
# as the cross-validation fold number (clf_binlogreg[k]).
_binlogreg_paths = (
    "sources/binlogreg_clfs/binlogreg_tfidf_tfidf0.pkl",
    "sources/binlogreg_clfs/binlogreg_tfidf_tfidf1.pkl",
    "sources/binlogreg_clfs/binlogreg_tfidf_tfidf2.pkl",
)
clf_binlogreg = [joblib.load(path) for path in _binlogreg_paths]

# Sparse TF-IDF document matrix.
X = load_npz("sources/tf_idf.npz")

# NOTE(review): both centroid matrices are read from the same file — confirm that
# filter_centroids is not meant to come from a different artefact.
class_centroids = load_npz("sources/centroids_tfidf.npz")
filter_centroids = load_npz("sources/centroids_tfidf.npz")

In [3]:
# SVD variant: running this cell replaces X and class_centroids with dense
# arrays read from .npy files, down-cast to float16.
X = np.load("sources/svd500.npy").astype(np.float16)
class_centroids = np.load("sources/centroids_svd500.npy").astype(np.float16)

In [4]:
# Sorted article ids from the normalized-text dump. The reader is used as a
# context manager so the underlying file is closed once the ids are collected
# (a bare jsonlines.open(...) inside a comprehension never closes it).
with jsonlines.open('sources/normalized_texts.jl', 'r') as reader:
    ids = sorted(item['id'] for item in reader)

In [5]:
class CustomKNN:
    """Multi-label k-NN that predicts the union of the neighbours' labels.

    A sample receives every label carried by at least one of its k nearest
    training samples (bitwise OR over the neighbours' label rows).
    """

    def __init__(self, n_neighbors=5):
        """Create the underlying neighbour index.

        n_neighbors: default number of neighbours consulted by predict().
        """
        self.clf = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=-1)
        self.n_neighbors = n_neighbors

    def fit(self, X_train, y_train):
        """Index the training samples and remember their label rows.

        X_train: training feature matrix, passed straight to the neighbour index.
        y_train: 2-D integer (0/1) label matrix, one row per training sample.
                 Nested lists are accepted as well as arrays.
        """
        self.clf.fit(X_train)
        # Coerce to ndarray so predict() can rely on .shape and fancy indexing
        # even when the labels were passed as a list of lists.
        self.categories = np.asarray(y_train)

    def predict(self, X_test, n_neighbors=None):
        """Return an (n_samples, n_labels) int array of predicted labels.

        X_test: feature matrix of the samples to label.
        n_neighbors: optional override of the neighbour count given at construction.
        """
        if n_neighbors is None:
            n_neighbors = self.n_neighbors

        kneighbors = self.clf.kneighbors(X_test, n_neighbors, return_distance=False)

        # Pre-allocating keeps the label dimension even when there is nothing to predict.
        res = np.zeros((len(kneighbors), self.categories.shape[1]), dtype=int)
        for row, neighs in enumerate(kneighbors):
            # OR all neighbour label rows in one reduction instead of a Python loop.
            res[row] = np.bitwise_or.reduce(self.categories[neighs], axis=0)

        return res
        

In [6]:
def metric(res, true):
    """Share of the items in `res` that are also contained in `true`.

    Called as metric(predicted, expected) this is precision; with the
    arguments swapped it is recall. An empty `res` yields 0.
    """
    total = len(res)
    if not total:
        return 0

    matched = sum(1 for item in res if item in true)
    return matched / total


In [7]:
class MGIA:
    """Graph-aware agreement score between a predicted and a true label set.

    The score is derived from a min-cost flow problem in which predicted
    labels are matched to true labels; matching a pair costs the cube of
    their shortest-path distance in the (undirected) category graph.
    """
    
    def __init__(self, G):
        """Keep an undirected copy of G and an empty, reusable flow network."""
        self.G = G.to_undirected()
        self.flow = nx.DiGraph()
        
    def score(self, TRUE, PRED):
        """Return the agreement score of PRED against TRUE.

        TRUE, PRED: sets of node identifiers of self.G (they must support
        .union/.intersection, and their elements are passed to
        nx.shortest_path_length).

        Returns 1 when the two sets are identical; otherwise
        1 - flow_cost / (|symmetric difference| * infinity).
        """
        # The flow network is rebuilt from scratch on every call.
        self.flow.clear()
        M = len(PRED)
        N = len(TRUE)
        # Weight of the edges that route flow through the dummy nodes 'DT'/'DP'
        # instead of a predicted->true pair. Previously tried values: 10, G.number_of_edges().
        infinity = 130
        # Node demands follow the networkx convention: negative = sends flow, positive = receives.
        self.flow.add_node('source', demand=0)
        self.flow.add_node('sink', demand=M-N)
        self.flow.add_node('DP', demand=0)
        self.flow.add_node('DT', demand=0)
        self.flow.add_edge('source', 'DP', capacity=N, weight=0)
        self.flow.add_edge('DT', 'sink', capacity=M, weight=0)
        # Zero-cost return edge from sink back to source.
        self.flow.add_edge('sink', 'source', capacity=(M)*(N+1), weight=0)
        for pred in PRED:
            # Predicted labels are prefixed with 'p' so they cannot collide with true-label nodes.
            _pred = 'p' + str(pred)
            self.flow.add_node(_pred, demand=-1)
            self.flow.add_edge('source', _pred, capacity=N, weight=0)
            self.flow.add_edge(_pred, 'DT', capacity=1, weight=infinity)
        for true in TRUE:
            _true = 't' + str(true)
            self.flow.add_node(_true, demand=1)
            self.flow.add_edge(_true, 'sink', capacity=M, weight=0)
            self.flow.add_edge('DP', _true, capacity=1, weight=infinity)
            for pred in PRED:
                # NOTE(review): shortest_path_length raises when the two nodes are not
                # connected — confirm the category graph is connected.
                k_ij = nx.shortest_path_length(self.G, source=pred, target=true)
                # Matching cost grows with the cube of the graph distance.
                self.flow.add_edge('p' + str(pred), _true, capacity=1, weight=k_ij * k_ij * k_ij)

        # flowDict (the per-edge flow) is not used; only the total cost matters here.
        flowCost, flowDict = nx.network_simplex(self.flow)
        # Size of the symmetric difference of the two label sets.
        low = (len(PRED.union(TRUE).difference(PRED.intersection(TRUE))))
        #print("Len = ", len(TRUE), len(PRED), low)
        if low == 0:
            # Identical sets: perfect score, and avoids dividing by zero below.
            accuracy = 1
        else:
            accuracy = 1 - flowCost / (low * infinity)
        #print(accuracy)
        return accuracy

### Тут просто custom knn

In [8]:
# Multi-hot label matrix: y[i, j] == 1 iff article ids[i] belongs to category ok[j].
# A dict gives O(1) column lookup; ok.index(c) rescans the whole list for every
# label. An unknown category now raises KeyError instead of ValueError.
ok_column = {category: column for column, category in enumerate(ok)}

y = np.zeros((len(ids), len(ok)), dtype=np.int8)
for i, article_id in enumerate(ids):
    for c in categories_dict[article_id]:
        y[i, ok_column[c]] = 1

In [9]:
# 3-fold cross-validation of CustomKNN on the document vectors.
# For every fold this prints example-averaged precision, recall, their harmonic
# mean (F1) and the mean MGIA score, in that order.
n_splits = 3
kf = KFold(n_splits=n_splits, shuffle=True, random_state=27)
mgia = MGIA(cat_graph)

for train_index, test_index in kf.split(ids):

    print("TRAIN:", train_index, "TEST:", test_index)
    print(len(train_index), len(test_index))

    n_neighbors = 10
    clf = CustomKNN(n_neighbors=n_neighbors)
    clf.fit(X[train_index], y[train_index])

    prec, rec = 0, 0
    mgia_score = 0
    # Predict in batches so the dense prediction matrix stays small.
    batch_size = 800
    for i1 in trange(0, len(test_index), batch_size):
        batch = test_index[i1:i1 + batch_size]
        res = clf.predict(X[batch])
        y_true = y[batch]

        # Per-batch partial sums, added to the fold totals once per batch
        # (keeps the summation order of the earlier runs).
        lprec, lrec = 0, 0
        lmgia_score = 0
        for i in range(len(res)):
            # Category names of the true / predicted labels of this article.
            _y_true = set(ok[j] for j in np.flatnonzero(y_true[i] == 1))
            _res = set(ok[j] for j in np.flatnonzero(res[i] == 1))

            # The same label sets as node ids of the category graph.
            _Y_true = set(cat_id[c] for c in _y_true)
            _Res = set(cat_id[c] for c in _res)

            lmgia_score += mgia.score(_Y_true, _Res)
            lprec += metric(_res, _y_true)
            lrec += metric(_y_true, _res)
        prec += lprec
        rec += lrec
        mgia_score += lmgia_score

    prec = prec / len(test_index)
    rec = rec / len(test_index)
    mgia_score = mgia_score / len(test_index)
    # Guard against 0/0 when nothing at all was predicted correctly.
    f1 = 2 * prec * rec / (prec + rec) if prec + rec else 0.0
    print(prec)
    print(rec)
    print(f1)
    print(mgia_score)

TRAIN: [    0     1     2 ... 96790 96792 96793] TEST: [    3     4     7 ... 96785 96789 96791]
64529 32265


100%|██████████████████████████████████████████████████████████████████████████████████| 41/41 [53:01<00:00, 77.60s/it]


0.23135548321005528
0.6340072339966272
0.33900478276539175
0.5352424581086974
TRAIN: [    2     3     4 ... 96791 96792 96793] TEST: [    0     1     5 ... 96787 96788 96790]
64529 32265


100%|██████████████████████████████████████████████████████████████████████████████████| 41/41 [53:23<00:00, 78.14s/it]


0.2258104856797174
0.6250803515181074
0.3317692272489149
0.5327344293795426
TRAIN: [    0     1     3 ... 96789 96790 96791] TEST: [    2     6     9 ... 96786 96792 96793]
64530 32264


 63%|████████████████████████████████████████████████████                              | 26/41 [37:07<21:24, 85.66s/it]

KeyboardInterrupt: 

### Тут custom knn на центроидах

In [8]:
# One-hot label rows for the centroids: row i marks category ok[i] only.
y_centroids = np.identity(len(ok), dtype=int)

In [12]:
# Number of nearest centroids whose categories are assigned to an article.
n_neighbors = 10
clf = CustomKNN(n_neighbors=n_neighbors)
# Graph-based scorer built on the category graph.
mgia = MGIA(cat_graph)

In [13]:
# The "training samples" are the class centroids, each labelled with its own
# one-hot row, so predict() returns the categories of the nearest centroids.
clf.fit(class_centroids, y_centroids)

In [14]:
# Evaluation of the centroid-based CustomKNN on the test part of each of 3 folds.
# NOTE(review): train_index is only printed here; clf was fitted beforehand on
# class_centroids. If those centroids were computed from all articles, the test
# articles have leaked into the model — confirm how the centroids were built.
n_splits = 3
kf = KFold(n_splits=n_splits, shuffle=True, random_state=27)

# O(1) category -> column lookup instead of rescanning `ok` with ok.index().
ok_column = {category: column for column, category in enumerate(ok)}

for train_index, test_index in kf.split(ids):

    print("TRAIN:", train_index, "TEST:", test_index)
    print(len(train_index), len(test_index))

    res = clf.predict(X[test_index])

    # Multi-hot true-label rows, aligned with the rows of `res`.
    y_true = []
    for i in test_index:
        _y_true = np.zeros(len(ok), dtype=int)
        for cat in categories_dict[ids[i]]:
            _y_true[ok_column[cat]] = 1
        y_true.append(_y_true)

    prec, rec, mgia_score = 0, 0, 0
    for i in trange(len(res)):
        # Category names of the true / predicted labels of this article.
        _y_true = set(ok[j] for j in np.flatnonzero(y_true[i] == 1))
        _res = set(ok[j] for j in np.flatnonzero(res[i] == 1))

        # The same label sets as node ids of the category graph.
        _Y_true = set(cat_id[c] for c in _y_true)
        _Res = set(cat_id[c] for c in _res)

        prec += metric(_res, _y_true)
        rec += metric(_y_true, _res)
        mgia_score += mgia.score(_Y_true, _Res)

    prec = prec / len(res)
    rec = rec / len(res)
    mgia_score = mgia_score / len(res)
    # Guard against 0/0 when nothing at all was predicted correctly.
    f1 = 2 * prec * rec / (prec + rec) if prec + rec else 0.0
    print("Precision", prec)
    print("Recall", rec)
    print("F1", f1)
    print("MGIA", mgia_score)

TRAIN: [    0     1     2 ... 96790 96792 96793] TEST: [    3     4     7 ... 96785 96789 96791]
64529 32265


100%|████████████████████████████████████████████████████████████████████████████| 32265/32265 [13:31<00:00, 39.78it/s]


Precision 0.2412924221293217
Recall 0.9365051081301189
F1 0.3837188991683869
MGIA 0.5154485635803016
TRAIN: [    2     3     4 ... 96791 96792 96793] TEST: [    0     1     5 ... 96787 96788 96790]
64529 32265


 24%|██████████████████▏                                                          | 7635/32265 [03:37<11:40, 35.14it/s]

KeyboardInterrupt: 

 24%|██████████████████▏                                                          | 7635/32265 [03:50<12:24, 33.07it/s]

### Тут custom knn + фильтрация результатов с помощью логистической регрессии

In [10]:
# Candidate categories come from the already-fitted `clf`; a per-fold binary
# logistic regression then keeps or rejects each (article, category) pair.
# For every fold this prints precision, recall, F1 and the mean number of
# categories kept per article, in that order.
# Assumes X and filter_centroids are scipy sparse matrices — TODO confirm
# (the SVD cell replaces X with a dense array).
n_splits = 3
kf = KFold(n_splits=n_splits, shuffle=True, random_state=27)

# O(1) category -> column lookup instead of rescanning `ok` with ok.index().
ok_column = {category: column for column, category in enumerate(ok)}

# k is the fold number and selects the matching pre-trained filter model.
for k, (train_index, test_index) in enumerate(tqdm(kf.split(ids), total=n_splits)):

    print("TRAIN:", train_index, "TEST:", test_index)
    print(len(train_index), len(test_index))

    res = clf.predict(X[test_index])

    for i in range(len(test_index)):
        ind2 = np.flatnonzero(res[i] == 1)
        if len(ind2) == 0:
            # No candidates for this article: nothing to filter.
            continue
        # One copy of the article vector per candidate. Sizing by len(ind2)
        # rather than the global n_neighbors keeps hstack's row counts equal
        # even when the number of candidates differs from n_neighbors.
        X_left = sparse.vstack([X[test_index[i]]] * len(ind2))
        X_full = sparse.hstack([X_left, filter_centroids[ind2]])
        # Probability of the positive class for every (article, candidate) pair.
        _proba = clf_binlogreg[k].predict_proba(X_full)[:, 1]
        # Drop every candidate whose probability is not above 0.5.
        res[i, ind2[~(_proba > 0.5)]] = 0

    print("Applied logistic regression")

    # Multi-hot true-label rows, aligned with the rows of `res`.
    y_true = []
    for i in test_index:
        _y_true = np.zeros(len(ok), dtype=int)
        for cat in categories_dict[ids[i]]:
            _y_true[ok_column[cat]] = 1
        y_true.append(_y_true)

    prec, rec, mean_res_len = 0, 0, 0
    for i in range(len(res)):
        ind1 = np.flatnonzero(y_true[i] == 1)
        ind2 = np.flatnonzero(res[i] == 1)
        mean_res_len += len(ind2)

        # Sets make the membership tests inside metric() O(1).
        _y_true = set(ok[j] for j in ind1)
        _res = set(ok[j] for j in ind2)

        prec += metric(_res, _y_true)
        rec += metric(_y_true, _res)

    prec = prec / len(res)
    rec = rec / len(res)
    # Guard against 0/0 when nothing at all was predicted correctly.
    f1 = 2 * prec * rec / (prec + rec) if prec + rec else 0.0
    print(prec)
    print(rec)
    print(f1)
    print(mean_res_len / len(res))

0it [00:00, ?it/s]

TRAIN: [    0     1     2 ... 96790 96792 96793] TEST: [    3     4     7 ... 96785 96789 96791]
64529 32265
Applied logistic regression
0.09874151065457365
0.16274867769990758
0.12291130611247503
1.622160235549357


1it [14:39, 879.87s/it]

TRAIN: [    2     3     4 ... 96791 96792 96793] TEST: [    0     1     5 ... 96787 96788 96790]
64529 32265
Applied logistic regression
0.10253198340134595
0.16662513070537002
0.12694745366423002
1.6812025414535874


2it [29:29, 884.78s/it]

TRAIN: [    0     1     3 ... 96789 96790 96791] TEST: [    2     6     9 ... 96786 96792 96793]
64530 32264
Applied logistic regression
0.10071588216656961
0.16340755757321265
0.12462155066498448
1.630424001983635


3it [44:22, 887.41s/it]


In [6]:
# Scratch check: stepping through a list in chunks of two leaves a shorter
# final chunk when the length is odd.
a = list(range(11))
i = 0
while i < len(a):
    print(a[i:i+2])
    i += 2

[0, 1]
[2, 3]
[4, 5]
[6, 7]
[8, 9]
[10]
