In [1]:
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from tqdm import tqdm
from tqdm import trange
import jsonlines
from scipy import sparse
from os import listdir
import itertools
import networkx as nx
import re
from collections import Counter
import json
import pymorphy2 as pm
import matplotlib.pyplot as plt
from scipy.sparse import load_npz

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestCentroid
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib


from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


# Text-preprocessing helpers created once at import time.
# NOTE(review): none of these three objects is referenced in the cells visible
# in this notebook chunk — confirm they are still needed before keeping them.
# Russian stop-word list from the NLTK corpus, as a set for O(1) membership tests.
stopwords_ru = set(stopwords.words('russian'))
# NLTK tweet-aware tokenizer with default settings.
tknzr = TweetTokenizer()
# pymorphy2 morphological analyzer with default settings.
morph = pm.MorphAnalyzer()

In [2]:
# Accepted category names, one per line.
with open("sources/accepted_categories.txt", mode="r", encoding="utf-8") as inp:
    # rstrip("\n") rather than line[:-1]: the last line of the file may have no
    # trailing newline, in which case slicing would cut off a real character.
    ok_set = {line.rstrip("\n") for line in inp}

# Sorted list of accepted categories; its order defines the label column order
# used by the evaluation cells below.
ok = sorted(ok_set)

# The JSON files are read as UTF-8 explicitly, matching the text file above,
# instead of relying on the platform default encoding.

# Looked up later as categories_dict[ids[i]] -> iterable of category names.
with open("sources/article_cat.json", mode="r", encoding="utf-8") as inp:
    categories_dict = json.load(inp)

# NOTE(review): cat_id / id_cat are not used in the visible cells; judging by
# the file names they map category <-> numeric id — confirm against the producer.
with open("sources/cat_id.json", mode="r", encoding="utf-8") as inp:
    cat_id = json.load(inp)

with open("sources/id_cat.json", mode="r", encoding="utf-8") as inp:
    id_cat = json.load(inp)

In [3]:
# Pre-computed artefacts loaded from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load .pkl files from a trusted source.
# NOTE(review): `joblib` is imported via sklearn.externals at the top of the
# notebook; that import path was removed in scikit-learn 0.23 — TODO switch to
# the standalone `joblib` package.
clf_binlogreg = joblib.load("sources/clf_binlogreg.pkl")
# Sparse feature matrix; later cells index its rows by position in `ids`.
# Presumably TF-IDF vectors of the article texts (per the file name) — verify.
X = load_npz("sources/text_tfidf.npz")
# Sparse matrix with one row per category; later cells index its rows with
# column positions of `ok`, so its row order is assumed to follow `ok` — verify.
class_centroids = load_npz("sources/sparce_centroids_nosvd.npz")

In [4]:
# Sorted article ids; later cells use the position in this list as the row index
# into X.
# NOTE(review): this assumes the rows of X were written in sorted-id order — confirm.
# The reader is opened in a `with` block so the underlying file handle is closed
# (the original one-liner never closed it).
with jsonlines.open('sources/normalized_texts.jl', 'r') as reader:
    ids = sorted(item['id'] for item in reader)

In [5]:
# One-hot label matrix for the centroids: row i marks only category ok[i].
# np.eye with an explicit integer dtype is the idiomatic form of
# np.diag([1] * n) and stays integer-typed even when `ok` is empty
# (np.diag([]) would be float, which np.bitwise_or rejects).
y_centroids = np.eye(len(ok), dtype=int)

In [7]:
class CustomKNN:
    """Multi-label k-nearest-neighbours predictor.

    For every query point the label rows of its k nearest training points are
    combined with a bitwise OR, so a label is predicted as soon as any single
    neighbour carries it.
    """

    def __init__(self, n_neighbors=5):
        """Create the predictor.

        Parameters
        ----------
        n_neighbors : int, default 5
            Default number of neighbours consulted by ``predict``.
        """
        self.clf = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=-1)
        self.n_neighbors = n_neighbors

    def fit(self, X_train, y_train):
        """Index the training points and remember their label rows.

        Parameters
        ----------
        X_train : array-like or sparse matrix
            Training points, passed unchanged to ``NearestNeighbors.fit``.
        y_train : array-like of shape (n_train, n_labels)
            Integer (0/1) label indicator rows, one per training point.

        Returns
        -------
        self : CustomKNN
            Returned to allow call chaining.

        Raises
        ------
        ValueError
            If ``y_train`` is not two-dimensional.
        """
        # Convert once so predict() can rely on .shape and fancy indexing even
        # when a plain list of lists is passed in.
        categories = np.asarray(y_train)
        if categories.ndim != 2:
            raise ValueError(
                "y_train must be 2-dimensional (n_train, n_labels), got ndim=%d"
                % categories.ndim
            )
        self.clf.fit(X_train)
        self.categories = categories
        return self

    def predict(self, X_test, n_neighbors=None):
        """Predict label indicator rows for ``X_test``.

        Parameters
        ----------
        X_test : array-like or sparse matrix
            Query points.
        n_neighbors : int, optional
            Number of neighbours to consult; defaults to the value given to
            the constructor.

        Returns
        -------
        numpy.ndarray of shape (n_queries, n_labels)
            Bitwise OR of the label rows of each query's nearest neighbours.
        """
        if n_neighbors is None:
            n_neighbors = self.n_neighbors

        neighbor_idx = np.asarray(
            self.clf.kneighbors(X_test, n_neighbors=n_neighbors, return_distance=False)
        )

        # OR-ing with an all-zero integer row (what the original loop started
        # from) promotes the labels to at least the default integer type.
        out_dtype = np.result_type(self.categories.dtype, int)
        res = np.zeros((neighbor_idx.shape[0], self.categories.shape[1]), dtype=out_dtype)

        # Reduce one query at a time: this keeps peak memory at k label rows
        # instead of materialising an (n_queries, k, n_labels) tensor.
        for row, neighs in enumerate(neighbor_idx):
            res[row] = np.bitwise_or.reduce(self.categories[neighs], axis=0)

        return res

In [8]:
def metric(res, true):
    """Return the share of items in ``res`` that also occur in ``true``.

    Called as ``metric(predicted, expected)`` this is precision; with the
    arguments swapped it is recall. An empty ``res`` yields 0.
    """
    total = len(res)
    if total == 0:
        return 0

    matched = sum(1 for item in res if item in true)
    return matched / total


In [14]:
# Earlier experiment, kept for reference (stock scikit-learn classifier):
#clf = KNeighborsClassifier(n_neighbors=1, n_jobs=-1, weights='distance')
# Number of nearest centroids whose categories are merged into one prediction.
# The logistic-regression filtering cell further down also reads this global.
n_neighbors = 15
clf = CustomKNN(n_neighbors=n_neighbors)

In [15]:
# Index the category centroids; each centroid is labelled with its own
# one-hot row from y_centroids.
clf.fit(class_centroids, y_centroids)

### Custom KNN на центроидах категорий (без дополнительной фильтрации)

In [28]:
# Evaluate the centroid-based CustomKNN on a held-out fold and print
# sample-averaged precision, recall and the F1 derived from those averages.
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=27)

# Category name -> label column, built once; replaces a linear ok.index()
# scan for every label of every test article.
cat_to_col = {cat: col for col, cat in enumerate(ok)}

# total= lets tqdm show real progress; kf.split() is a generator without a length.
for train_index, test_index in tqdm(kf.split(ids), total=n_splits):

    print("TRAIN:", train_index, "TEST:", test_index)
    print(len(train_index), len(test_index))

    # clf was fitted on the class centroids, so train_index is only reported.
    res = clf.predict(X[test_index])

    # Multi-hot ground truth, one row per test article.
    y_true = np.zeros((len(test_index), len(ok)), dtype=int)
    for row, i in enumerate(test_index):
        cols = [cat_to_col[cat] for cat in categories_dict[ids[i]]]
        y_true[row, cols] = 1

    # Per-sample precision / recall computed directly on the indicator
    # matrices; a sample with no predicted (resp. no true) labels scores 0.
    pred_mask = res == 1
    true_mask = y_true == 1
    hits = (pred_mask & true_mask).sum(axis=1)
    n_pred = pred_mask.sum(axis=1)
    n_true = true_mask.sum(axis=1)

    sample_prec = np.divide(hits, n_pred, out=np.zeros(len(res)), where=n_pred > 0)
    sample_rec = np.divide(hits, n_true, out=np.zeros(len(res)), where=n_true > 0)

    prec = sample_prec.mean()
    rec = sample_rec.mean()
    # Guard against 0/0 when nothing at all was predicted correctly.
    f1 = 2 * prec * rec / (prec + rec) if prec + rec > 0 else 0.0
    print(prec)
    print(rec)
    print(f1)
    # Only the first fold is evaluated; remove the break to run all folds.
    break

0it [00:00, ?it/s]

TRAIN: [    0     1     2 ... 96791 96792 96793] TEST: [    4     7    27 ... 96770 96779 96789]
77435 19359
0.3525388708093805
0.7522821981098591
0.48009351760657765


1it [05:09, 309.99s/it]

TRAIN: [    0     1     2 ... 96790 96792 96793] TEST: [    3    10    13 ... 96785 96788 96791]
77435 19359
0.40946329872404125
0.8488683547147046
0.5524464647361782


2it [10:09, 304.81s/it]

TRAIN: [    2     3     4 ... 96791 96792 96793] TEST: [    0     1     5 ... 96778 96781 96790]
77435 19359


KeyboardInterrupt: 

### Custom KNN + фильтрация результатов с помощью логистической регрессии

In [17]:
# Evaluate CustomKNN followed by a logistic-regression filter: every candidate
# category proposed by the KNN is kept only if the binary classifier scores the
# (article, category-centroid) pair above PROBA_THRESHOLD.
# NOTE(review): the recorded outputs of this cell and of the unfiltered cell
# appear to come from different kernel states (execution counts are out of
# order, and recall here is higher than without filtering) — re-run both
# before comparing the numbers.
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=27)

# Minimum probability in column 1 of predict_proba for a candidate to be kept.
PROBA_THRESHOLD = 0.3

# Category name -> label column, built once; replaces a linear ok.index()
# scan for every label of every test article.
cat_to_col = {cat: col for col, cat in enumerate(ok)}

# total= lets tqdm show real progress; kf.split() is a generator without a length.
for train_index, test_index in tqdm(kf.split(ids), total=n_splits):

    print("TRAIN:", train_index, "TEST:", test_index)
    print(len(train_index), len(test_index))

    # clf was fitted on the class centroids, so train_index is only reported.
    res = clf.predict(X[test_index])

    for row, i in enumerate(test_index):
        cand_cols = np.flatnonzero(res[row] == 1)
        if cand_cols.size == 0:
            # Nothing to filter (and vstack/hstack would fail on empty input).
            continue

        # Repeat the article vector once per candidate. The original repeated
        # it n_neighbors times, which only lines up with the centroid block
        # when every neighbour contributes a distinct category.
        X_left = sparse.vstack([X[i]] * len(cand_cols))
        X_full = sparse.hstack([X_left, class_centroids[cand_cols]])

        # Column 1 is taken as the "pair matches" class, as in the original
        # code — presumably clf_binlogreg.classes_ == [0, 1]; verify.
        proba = clf_binlogreg.predict_proba(X_full)[:, 1]

        # Drop every candidate that does not clear the threshold.
        rejected = ~(proba > PROBA_THRESHOLD)
        res[row, cand_cols[rejected]] = 0

    print("Applied logistic regression")

    # Multi-hot ground truth, one row per test article.
    y_true = np.zeros((len(test_index), len(ok)), dtype=int)
    for row, i in enumerate(test_index):
        cols = [cat_to_col[cat] for cat in categories_dict[ids[i]]]
        y_true[row, cols] = 1

    # Per-sample precision / recall computed directly on the indicator
    # matrices; a sample with no predicted (resp. no true) labels scores 0.
    pred_mask = res == 1
    true_mask = y_true == 1
    hits = (pred_mask & true_mask).sum(axis=1)
    n_pred = pred_mask.sum(axis=1)
    n_true = true_mask.sum(axis=1)

    sample_prec = np.divide(hits, n_pred, out=np.zeros(len(res)), where=n_pred > 0)
    sample_rec = np.divide(hits, n_true, out=np.zeros(len(res)), where=n_true > 0)

    prec = sample_prec.mean()
    rec = sample_rec.mean()
    # Guard against 0/0 when nothing at all was predicted correctly.
    f1 = 2 * prec * rec / (prec + rec) if prec + rec > 0 else 0.0
    print(prec)
    print(rec)
    print(f1)
    # Average number of categories left per article after filtering.
    print(n_pred.mean())
    # Only the first fold is evaluated; remove the break to run all folds.
    break

0it [00:00, ?it/s]

TRAIN: [    0     1     2 ... 96791 96792 96793] TEST: [    4     7    27 ... 96770 96779 96789]
77435 19359
Applied logistic regression
0.1483269734896763
0.8265121851528797
0.2515164678997413
14.784338033989359
