In [1]:
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from tqdm import tqdm
from tqdm import trange
import jsonlines
from scipy import sparse
from os import listdir
import itertools
import networkx as nx
import re
from collections import Counter
import json
import pymorphy2 as pm
import matplotlib.pyplot as plt
from scipy.sparse import load_npz

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestCentroid
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib


from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


# Text-processing helpers instantiated once for the whole notebook.
# NOTE(review): none of these three names is referenced by the cells visible
# in this notebook — confirm they are still needed before keeping them.

# Russian stop-word list from NLTK, stored as a set.
stopwords_ru = set(stopwords.words('russian'))
# NLTK TweetTokenizer created with its default options.
tknzr = TweetTokenizer()
# pymorphy2 morphological analyzer.
morph = pm.MorphAnalyzer()

In [2]:
# Accepted category names, one per line of the text file.
ok_set = set()
with open("sources/accepted_categories.txt", mode="r", encoding="utf-8") as inp:
    for line in inp:
        # Strip only the line terminator. Slicing with [:-1] would cut a real
        # character off the last line when the file has no trailing newline.
        ok_set.add(line.rstrip("\n"))

# Sorted list: position j in `ok` is used as the column index of a category.
ok = sorted(ok_set)

# Mapping read from JSON; later cells index it by article id and iterate the
# value as that article's categories.
# NOTE(review): no encoding is passed here, so the platform default applies —
# confirm how the file was written before pinning one.
with open("sources/article_cat.json", mode="r") as inp:
    categories_dict = json.load(inp)

In [3]:
# Three pickled classifiers, loaded in index order 0, 1, 2; the filtering cell
# below picks one per cross-validation fold via clf_binlogreg[k].
# NOTE(review): joblib.load unpickles arbitrary objects — only load trusted files.
clf_binlogreg = [
    joblib.load(pkl_path)
    for pkl_path in (
        "sources/binlogreg_clfs/binlogreg_tfidf_titles0.pkl",
        "sources/binlogreg_clfs/binlogreg_tfidf_titles1.pkl",
        "sources/binlogreg_clfs/binlogreg_tfidf_titles2.pkl",
    )
]

# SciPy sparse matrices stored as .npz: article features, per-class centroids,
# and the centroids used by the logistic-regression filter.
X, class_centroids, filter_centroids = (
    load_npz(npz_path)
    for npz_path in (
        "sources/tf_idf.npz",
        "sources/centroids_tfidf.npz",
        "sources/centroids_titles.npz",
    )
)

In [3]:
# Alternative feature set for the SVD experiments: dense arrays, cast to
# float16 right after loading.
# Running this cell rebinds X and class_centroids, replacing the sparse
# matrices loaded by the previous cell.
# NOTE(review): the logistic-regression filtering cell passes rows of X to
# sparse.vstack/sparse.hstack, which presumably needs the sparse X — confirm
# before running that cell after this one.
X = np.load("sources/svd1000.npy").astype(np.float16)
class_centroids = np.load("sources/centroids_svd1000.npy").astype(np.float16)

In [4]:
# Sorted article ids read from the JSON-lines file.
# NOTE(review): later cells index X and the label matrix with positions in
# `ids`, so row i of X is assumed to belong to ids[i] — confirm.
# The reader is opened in a `with` block so the file handle is closed once
# the ids have been collected (the bare jsonlines.open call never closed it).
with jsonlines.open('sources/normalized_texts.jl', 'r') as reader:
    ids = sorted(item['id'] for item in reader)

In [5]:
class CustomKNN:
    """Multi-label nearest-neighbour classifier.

    A sample is assigned the bitwise OR (the union, for 0/1 labels) of the
    label rows of its nearest training samples.
    """

    def __init__(self, n_neighbors=5):
        """Create the underlying neighbour index.

        Args:
            n_neighbors: default number of neighbours merged per prediction.
        """
        self.clf = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=-1)
        self.n_neighbors = n_neighbors

    def fit(self, X_train, y_train):
        """Index the training samples and remember their label rows.

        Args:
            X_train: training features, passed unchanged to NearestNeighbors.fit.
            y_train: 2-D integer label matrix, one row per training sample.
                Array-likes such as nested lists are converted to an ndarray.

        Returns:
            self, so calls can be chained.
        """
        self.clf.fit(X_train)
        # predict() needs .shape and fancy indexing, so coerce lists up front.
        self.categories = np.asarray(y_train)
        return self

    def predict(self, X_test, n_neighbors=None):
        """Return the OR-combined label rows of each sample's neighbours.

        Args:
            X_test: query features, passed unchanged to kneighbors().
            n_neighbors: neighbours to merge; defaults to the constructor value.

        Returns:
            Integer ndarray of shape (n_queries, n_label_columns).
        """
        if n_neighbors is None:
            n_neighbors = self.n_neighbors

        neighbor_idx = np.asarray(
            self.clf.kneighbors(X_test, n_neighbors=n_neighbors, return_distance=False)
        )
        merged = np.zeros((neighbor_idx.shape[0], self.categories.shape[1]), dtype=int)
        # One vectorised OR per neighbour rank instead of a Python loop over
        # every (sample, neighbour) pair; the temporary is never larger than
        # the result itself.
        for rank in range(neighbor_idx.shape[1]):
            np.bitwise_or(merged, self.categories[neighbor_idx[:, rank]], out=merged)

        return merged
        

In [6]:
def metric(res, true):
    """Return the share of items in `res` that also occur in `true`.

    Every element of `res` is counted, duplicates included. An empty `res`
    yields 0. Calling metric(predicted, actual) gives a per-sample precision;
    swapping the arguments gives a per-sample recall.
    """
    total = len(res)
    if not total:
        return 0

    matched = sum(1 for candidate in res if candidate in true)
    return matched / total


### Тут просто custom knn

In [7]:
# Multi-hot label matrix: y[i, j] == 1 when article ids[i] has category ok[j].
# `ok` is a sorted list of unique names, so a dict gives the same column as
# ok.index(c) without rescanning the list for every label.
# A category missing from `ok` now raises KeyError instead of ValueError.
category_to_col = {category: col for col, category in enumerate(ok)}
y = np.zeros((len(ids), len(ok)), dtype=np.int8)
for i in trange(len(ids)):
    for c in categories_dict[ids[i]]:
        y[i, category_to_col[c]] = 1

100%|██████████████████████████████████████████████████████████████████████████| 96794/96794 [00:32<00:00, 2976.87it/s]


In [8]:
# Cross-validation of CustomKNN fitted on the training rows of each fold.
# Depends on X, y, ids, CustomKNN and metric from earlier cells.
n_splits = 3
n_neighbors = 5
batch_size = 800  # test rows handed to predict() per call
kf = KFold(n_splits=n_splits, shuffle=True, random_state=27)

for train_index, test_index in kf.split(ids):

    print("TRAIN:", train_index, "TEST:", test_index)
    print(len(train_index), len(test_index))

    clf = CustomKNN(n_neighbors=n_neighbors)
    clf.fit(X[train_index], y[train_index])

    # Running sums of per-sample precision and recall over the test split.
    prec, rec = 0, 0
    for i1 in trange(0, len(test_index), batch_size):
        batch = test_index[i1:i1+batch_size]
        res = clf.predict(X[batch])
        y_true = y[batch]

        for i in range(len(res)):
            # Compare column indices directly: `ok` maps each index to exactly
            # one category name, so the hit counts match a name comparison.
            _y_true = np.flatnonzero(y_true[i] == 1).tolist()
            _res = np.flatnonzero(res[i] == 1).tolist()

            prec += metric(_res, _y_true)
            rec += metric(_y_true, _res)

    prec = prec / len(test_index)
    rec = rec / len(test_index)
    # Report 0 instead of raising ZeroDivisionError when nothing was hit.
    f1 = 2 * prec * rec / (prec + rec) if prec + rec > 0 else 0.0
    print(prec)
    print(rec)
    print(f1)

TRAIN: [    0     1     2 ... 96790 96792 96793] TEST: [    3     4     7 ... 96785 96789 96791]
64529 32265


100%|██████████████████████████████████████████████████████████████████████████████████| 41/41 [47:50<00:00, 70.02s/it]


0.2835922952123947
0.5592310200393454
0.37633892099806676
TRAIN: [    2     3     4 ... 96791 96792 96793] TEST: [    0     1     5 ... 96787 96788 96790]
64529 32265


 15%|████████████▏                                                                      | 6/41 [07:41<44:52, 76.93s/it]

KeyboardInterrupt: 

### Тут custom knn на центроидах

In [7]:
# One-hot labels for the centroids: row j marks only column j.
# NOTE(review): assumes row j of class_centroids is the centroid of ok[j] — confirm.
# int8 matches the dtype of the article label matrix `y` and stores one byte
# per entry instead of the platform default integer produced by np.diag([1] * n).
y_centroids = np.eye(len(ok), dtype=np.int8)

In [8]:
# Number of nearest neighbours whose label rows are merged into each prediction.
n_neighbors = 5
# Unfitted CustomKNN; a following cell fits it on the class centroids.
clf = CustomKNN(n_neighbors=n_neighbors)

In [9]:
# Index the class centroids; centroid row j is paired with label row j of
# y_centroids, so a prediction is the OR of the label rows of the nearest centroids.
clf.fit(class_centroids, y_centroids)

In [10]:
# Evaluate the centroid-based CustomKNN on the test part of each fold.
# NOTE(review): `clf` was fitted once on class_centroids and is not refitted
# here; train_index is only printed. If the centroids were computed from all
# articles, the test rows leaked into them — confirm.
n_splits = 3
kf = KFold(n_splits=n_splits, shuffle=True, random_state=27)

# Category name -> column index; replaces repeated ok.index() list scans.
category_to_col = {category: col for col, category in enumerate(ok)}

for train_index, test_index in tqdm(kf.split(ids)):

    print("TRAIN:", train_index, "TEST:", test_index)
    print(len(train_index), len(test_index))

    res = clf.predict(X[test_index])

    # Running sums of per-sample precision and recall.
    prec, rec = 0, 0
    for predicted_row, i in zip(res, test_index):
        # Unique column indices of the true categories, in ascending order.
        # Indices stand in for names because `ok` maps them one-to-one.
        _y_true = sorted({category_to_col[cat] for cat in categories_dict[ids[i]]})
        _res = np.flatnonzero(predicted_row == 1).tolist()

        prec += metric(_res, _y_true)
        rec += metric(_y_true, _res)

    prec = prec / len(res)
    rec = rec / len(res)
    # Report 0 instead of raising ZeroDivisionError when nothing was hit.
    f1 = 2 * prec * rec / (prec + rec) if prec + rec > 0 else 0.0
    print(prec)
    print(rec)
    print(f1)

0it [00:00, ?it/s]

TRAIN: [    0     1     2 ... 96790 96792 96793] TEST: [    3     4     7 ... 96785 96789 96791]
64529 32265
0.24454362312094718
0.6019977304362014
0.34780275173294944


1it [10:45, 645.01s/it]

TRAIN: [    2     3     4 ... 96791 96792 96793] TEST: [    0     1     5 ... 96787 96788 96790]
64529 32265
0.24589493258941247
0.6016326410306291
0.34910585169029457


2it [21:51, 655.74s/it]

TRAIN: [    0     1     3 ... 96789 96790 96791] TEST: [    2     6     9 ... 96786 96792 96793]
64530 32264


KeyboardInterrupt: 

### Тут custom knn + фильтрация результатов с помощью логистической регрессии

In [10]:
# Centroid k-NN followed by a logistic-regression filter: every category
# proposed by `clf` is kept only if the fold's binary classifier gives it a
# positive-class probability above 0.5.
# NOTE(review): X must support sparse.vstack on its rows, i.e. presumably the
# sparse matrix and not the dense SVD array — confirm which loading cell ran.
# NOTE(review): clf_binlogreg[k] is used for fold k; presumably it was trained
# on the matching training split of this same KFold — confirm.
n_splits = 3
kf = KFold(n_splits=n_splits, shuffle=True, random_state=27)

# Category name -> column index; replaces repeated ok.index() list scans.
category_to_col = {category: col for col, category in enumerate(ok)}

for k, (train_index, test_index) in enumerate(tqdm(kf.split(ids))):

    print("TRAIN:", train_index, "TEST:", test_index)
    print(len(train_index), len(test_index))

    res = clf.predict(X[test_index])

    for i in range(len(test_index)):
        ind2 = np.flatnonzero(res[i] == 1)
        if len(ind2) == 0:
            # Nothing was proposed for this article, so nothing to filter.
            continue
        # One copy of the article row per proposed category. Stacking
        # len(ind2) copies (not a fixed n_neighbors) keeps the row counts of
        # both hstack operands equal even if fewer categories were proposed.
        X_left = sparse.vstack([X[test_index[i]]] * len(ind2))
        X_full = sparse.hstack([X_left, filter_centroids[ind2]])
        # Column 1 holds the probability of the second class in classes_.
        _proba = clf_binlogreg[k].predict_proba(X_full)[:, 1]

        # Drop every proposed category that is not accepted (p > 0.5).
        res[i][ind2[~(_proba > 0.5)]] = 0

    print("Applied logistic regression")

    # Running sums of per-sample precision/recall and of prediction sizes.
    prec, rec, mean_res_len = 0, 0, 0
    for predicted_row, i in zip(res, test_index):
        # Unique column indices of the true categories; indices stand in for
        # names because `ok` maps them one-to-one.
        _y_true = sorted({category_to_col[cat] for cat in categories_dict[ids[i]]})
        _res = np.flatnonzero(predicted_row == 1).tolist()
        mean_res_len += len(_res)

        prec += metric(_res, _y_true)
        rec += metric(_y_true, _res)

    prec = prec / len(res)
    rec = rec / len(res)
    # Report 0 instead of raising ZeroDivisionError when nothing was hit.
    f1 = 2 * prec * rec / (prec + rec) if prec + rec > 0 else 0.0
    print(prec)
    print(rec)
    print(f1)
    print(mean_res_len / len(res))

0it [00:00, ?it/s]

TRAIN: [    0     1     2 ... 96790 96792 96793] TEST: [    3     4     7 ... 96785 96789 96791]
64529 32265
Applied logistic regression
0.09874151065457365
0.16274867769990758
0.12291130611247503
1.622160235549357


1it [14:39, 879.87s/it]

TRAIN: [    2     3     4 ... 96791 96792 96793] TEST: [    0     1     5 ... 96787 96788 96790]
64529 32265
Applied logistic regression
0.10253198340134595
0.16662513070537002
0.12694745366423002
1.6812025414535874


2it [29:29, 884.78s/it]

TRAIN: [    0     1     3 ... 96789 96790 96791] TEST: [    2     6     9 ... 96786 96792 96793]
64530 32264
Applied logistic regression
0.10071588216656961
0.16340755757321265
0.12462155066498448
1.630424001983635


3it [44:22, 887.41s/it]


In [6]:
# Scratch check of the batching idiom used in the evaluation loop: stepping
# through a sequence by 2 and slicing i:i+2 yields consecutive chunks, with a
# shorter final chunk when the length is odd.
a = list(range(11))
for i in range(0, len(a), 2):
    print(a[i:i + 2])

[0, 1]
[2, 3]
[4, 5]
[6, 7]
[8, 9]
[10]
