In [49]:
import os
import numpy as np
import pandas as pd
from multi_rake import Rake
from collections import Counter
from string import punctuation
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
# Characters stripped from token edges in normalize(): ASCII punctuation plus
# extra typographic symbols.
punct = punctuation+'«»—…“”*№–'
# NLTK's Russian stop-word list, stored as a set for membership tests.
stops = set(stopwords.words('russian'))
# pymorphy2 analyzer, created once and reused by normalize().
morph = MorphAnalyzer()

In [2]:
# Directory whose entries are listed and loaded as the input data files
# (relative to the working directory).
PATH_TO_DATA = './data'

In [3]:
# os.listdir() returns entries in arbitrary order; sorting them makes the file
# order -- and therefore any subset taken from this list -- reproducible
# across machines and runs.
files = [os.path.join(PATH_TO_DATA, file) for file in sorted(os.listdir(PATH_TO_DATA))]

In [4]:
# Load the first five JSON-lines files into one frame with a fresh 0..n-1 index.
# The path list is sliced *before* reading so that files beyond the fifth are
# never parsed just to be thrown away.
data = pd.concat(
    [pd.read_json(file, lines=True) for file in files[:5]],
    axis=0,
    ignore_index=True,
)

In [5]:
def evaluate(true_kws, predicted_kws):
    """Print macro-averaged keyword-extraction metrics.

    Each document's reference and predicted keywords are compared as sets
    (order and duplicates are ignored). Precision, recall, F1 and the Jaccard
    index are computed per document, averaged over all documents and printed
    rounded to two decimals.

    Args:
        true_kws: sequence (list, pandas Series, ...) of reference keyword
            collections, one per document.
        predicted_kws: sequence of predicted keyword collections; paired with
            ``true_kws`` by position.

    Raises:
        AssertionError: if the two inputs have different lengths.

    Returns:
        None. The four averaged metrics are written to stdout.
    """
    assert len(true_kws) == len(predicted_kws)

    precisions = []
    recalls = []
    f1s = []
    jaccards = []

    # zip() pairs documents positionally, so a pandas Series whose index is
    # not 0..n-1 is handled the same way as a plain list.
    for true_doc, predicted_doc in zip(true_kws, predicted_kws):
        true_kw = set(true_doc)
        predicted_kw = set(predicted_doc)

        tp = len(true_kw & predicted_kw)
        union = len(true_kw | predicted_kw)
        fp = len(predicted_kw - true_kw)
        fn = len(true_kw - predicted_kw)

        # Every ratio falls back to 0 when its denominator is empty.
        prec = tp / (tp + fp) if (tp + fp) else 0
        rec = tp / (tp + fn) if (tp + fn) else 0
        f1 = (2 * (prec * rec)) / (prec + rec) if (prec + rec) else 0
        # Both sets empty -> empty union; score 0 like the other metrics
        # instead of raising ZeroDivisionError.
        jac = tp / union if union else 0

        precisions.append(prec)
        recalls.append(rec)
        f1s.append(f1)
        jaccards.append(jac)

    print('Precision - ', round(np.mean(precisions), 2))
    print('Recall - ', round(np.mean(recalls), 2))
    print('F1 - ', round(np.mean(f1s), 2))
    print('Jaccard - ', round(np.mean(jaccards), 2))

In [6]:
def normalize(text):
    """Return the noun lemmas of ``text`` as a list, in order of appearance.

    The text is lower-cased and split on whitespace; every token has the
    characters in ``punct`` stripped from both ends. Empty tokens and stop
    words are dropped, the remaining tokens are analysed with ``morph`` and
    only those whose first parse is tagged ``NOUN`` are kept, in their
    normal form.
    """
    lemmas = []
    for raw_token in text.lower().split():
        token = raw_token.strip(punct)
        # Nothing left after stripping punctuation, or a stop word: skip.
        if not token or token in stops:
            continue
        first_parse = morph.parse(token)[0]
        if first_parse.tag.POS == 'NOUN':
            lemmas.append(first_parse.normal_form)
    return lemmas

In [7]:
# Noun lemmas of every article body, one list per row (see normalize()).
data['content_norm'] = data['content'].apply(normalize)

In [8]:
# Noun lemmas of every title.
# NOTE(review): 'title_norm' is not read by any cell visible in this notebook.
data['title_norm'] = data['title'].apply(normalize)

In [9]:
# Join each lemma list back into one space-separated string; this string
# column is what the TF-IDF vectorizer is fitted on.
data['content_norm_str'] = data['content_norm'].apply(' '.join)

In [10]:
# Baseline vectorizer: unigrams and bigrams, ignoring n-grams that occur in
# fewer than 5 documents.
tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=5)

In [11]:
# Learn the vocabulary and IDF weights from the normalized article texts.
tfidf.fit(data['content_norm_str'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=5, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [22]:
# Map column index -> n-gram. scikit-learn 1.0 introduced
# get_feature_names_out() and 1.2 removed get_feature_names(), so use
# whichever accessor this installation provides.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
id2word = dict(enumerate(feature_names))

In [23]:
# TF-IDF weights of the fitted vocabulary for every document
# (rows: documents, columns: vocabulary n-grams).
texts_vectors = tfidf.transform(data['content_norm_str'])

In [98]:
# For every document take the column indices of its 10 highest TF-IDF scores
# (ascending argsort read backwards), then translate indices to n-grams.
# NOTE(review): the slice always yields 10 indices per row, so a document with
# fewer than 10 non-zero scores still gets zero-weight n-grams as "keywords".
top_feature_ids = texts_vectors.toarray().argsort()[:, :-11:-1]
keywords = [
    [id2word[feature_id] for feature_id in doc_top_ids]
    for doc_top_ids in top_feature_ids
]

In [15]:
# Score the TF-IDF baseline against the reference keywords in data['keywords'].
evaluate(data['keywords'], keywords)

Precision -  0.13
Recall -  0.25
F1 -  0.16
Jaccard -  0.09


### 1. Counter и нормализация

In [16]:
# Frequency baseline: the 10 most common noun lemmas of each document.
counter_keywords = data['content_norm'].apply(
    lambda lemmas: [lemma for lemma, _count in Counter(lemmas).most_common(10)]
)
evaluate(data['keywords'], counter_keywords)

Precision -  0.09
Recall -  0.33
F1 -  0.13
Jaccard -  0.07


По сравнению с tf-idf бейзлайном Recall вырос (0.25 → 0.33), а Precision, F1 и Jaccard снизились.

### 2. RAKE

In [20]:
rake = Rake()
# NOTE(review): `keywords_rk` is consumed by the cells below, but the cell that
# built it is missing from the notebook, so a fresh-kernel run fails with
# NameError. Reconstructed here on the assumption that RAKE was applied to the
# raw article text -- confirm against the original experiment.
# Rake.apply() returns a list of (phrase, score) pairs for one text.
keywords_rk = [rake.apply(text) for text in data['content']]

In [42]:
# Keep only the first element (presumably the phrase of a (phrase, score)
# pair -- TODO confirm) of every RAKE result, one list per document.
# Built in a single assignment so that re-running the cell cannot append
# duplicates; documents with no RAKE output get an empty list.
keywords_clean = [
    [entry[0] for entry in doc_keywords]
    for doc_keywords in keywords_rk
]

In [48]:
# Score the RAKE phrases against the reference keywords in data['keywords'].
evaluate(data['keywords'], keywords_clean)

Precision -  0.01
Recall -  0.01
F1 -  0.01
Jaccard -  0.0


Все метрики оказались близки к нулю (около 0.01) — заметно хуже, чем у tf-idf и у Counter.

### 3. Измененный tf-idf

In [92]:
# Variant of the baseline vectorizer: min_df lowered to 3, n-grams present in
# more than 90% of documents dropped, vocabulary capped at 1200 features.
# NOTE(review): the saved output of the fit cell that follows reports
# max_features=2000, i.e. it is stale relative to this line -- re-run to refresh.
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9, max_features=1200)

In [93]:
# Re-learn vocabulary and IDF weights with the modified vectorizer settings.
tfidf.fit(data['content_norm_str'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.9, max_features=2000,
                min_df=3, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [94]:
# Rebuild the column index -> n-gram map for the re-fitted vectorizer.
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(), so use whichever accessor this installation provides.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
id2word = dict(enumerate(feature_names))

In [95]:
# TF-IDF weights under the modified vocabulary
# (rows: documents, columns: vocabulary n-grams).
texts_vectors = tfidf.transform(data['content_norm_str'])

In [96]:
# Column indices of the 10 highest TF-IDF scores per document (ascending
# argsort read backwards), translated to n-grams through id2word.
# NOTE(review): the slice always yields 10 indices per row, so a document with
# fewer than 10 non-zero scores still gets zero-weight n-grams as "keywords".
top_feature_ids = texts_vectors.toarray().argsort()[:, :-11:-1]
keywords = [
    [id2word[feature_id] for feature_id in doc_top_ids]
    for doc_top_ids in top_feature_ids
]

In [97]:
# Score the modified TF-IDF variant against the reference keywords.
evaluate(data['keywords'], keywords)

Precision -  0.12
Recall -  0.22
F1 -  0.15
Jaccard -  0.08


Изменение параметров min_df, max_df и max_features не дало улучшения: метрики лишь немного снизились по сравнению с бейзлайном.