In [1]:
import pandas as pd
from gensim.corpora.dictionary import Dictionary

import re
import numpy as np
from nltk.corpus import stopwords

from razdel import tokenize
import pymorphy2

import nltk

from itertools import product

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
#import itertools
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
news = pd.read_csv("articles.csv")
print(news.shape)
news.head(3)

(27000, 2)


Unnamed: 0,doc_id,title
0,6,Заместитель председателяnправительства РФnСерг...
1,4896,Матч 1/16 финала Кубка России по футболу был п...
2,4897,Форвард «Авангарда» Томаш Заборский прокоммент...


In [3]:
users = pd.read_csv("users_articles.csv")
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


In [4]:
stopword_ru = stopwords.words('russian')
len(stopword_ru)

morph = pymorphy2.MorphAnalyzer()

In [5]:
with open('stopwords.txt') as f:
    additional_stopwords = [w.strip() for w in f.readlines() if w]
stopword_ru += additional_stopwords
len(stopword_ru)

776

In [6]:
def clean_text(text):
    '''
    очистка текста
    
    на выходе очищенный текст
    
    '''
    if not isinstance(text, str):
        text = str(text)
    
    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    text = re.sub("-\s\r\n\|-\s\r\n|\r\n", '', str(text))

    text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip())
    
    #tokens = list(tokenize(text))
    #words = [_.text for _ in tokens]
    #words = [w for w in words if w not in stopword_ru]
    
    #return " ".join(words)
    return text

cache = {}

def lemmatization(text):
    '''
    лемматизация
        [0] если зашел тип не `str` делаем его `str`
        [1] токенизация предложения через razdel
        [2] проверка есть ли в начале слова '-'
        [3] проверка токена с одного символа
        [4] проверка есть ли данное слово в кэше
        [5] лемматизация слова
        [6] проверка на стоп-слова

    на выходе лист отлемматизированых токенов
    '''

    # [0]
    if not isinstance(text, str):
        text = str(text)
    
    # [1]
    tokens = list(tokenize(text))
    words = [_.text for _ in tokens]

    words_lem = []
    for w in words:
        if w[0] == '-': # [2]
            w = w[1:]
        if len(w)>1: # [3]
            if w in cache: # [4]
                words_lem.append(cache[w])
            else: # [5]
                temp_cach = cache[w] = morph.parse(w)[0].normal_form
                words_lem.append(temp_cach)
    
    words_lem_without_stopwords=[i for i in words_lem if not i in stopword_ru] # [6]
    
    return words_lem_without_stopwords

In [7]:
%%time
#Запускаем очистку текста. Будет долго...
news['title'] = news['title'].apply(lambda x: clean_text(x), 1)

  from ipykernel import kernelapp as app


Wall time: 25.8 s


In [8]:
%%time
#Запускаем лемматизацию текста. Будет очень долго...
news['title'] = news['title'].apply(lambda x: lemmatization(x), 1)

Wall time: 3min 10s


In [9]:
# news.head(3)

In [10]:
#сформируем список наших текстов, разбив еще и на пробелы
texts = [t for t in news['title'].values]

# Create a corpus from a list of texts
common_dictionary = Dictionary(texts)
common_corpus = [common_dictionary.doc2bow(text) for text in texts] #делаем мешок слов

In [11]:
# common_corpus[0]

In [12]:
%%time
from gensim.models import LdaModel
# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=25, id2word=common_dictionary, random_state = 42)#, passes=10)

Wall time: 33.4 s


In [13]:
# from gensim.test.utils import datapath
# # Save model to disk.
# temp_file = datapath("model.lda")
# lda.save(temp_file)

# # Load a potentially pretrained model from disk.
# lda = LdaModel.load(temp_file)

In [14]:
# # Create a new corpus, made of previously unseen documents.
# other_texts = [t for t in news['title'].iloc[:3]]
# other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]

# unseen_doc = other_corpus[2]
# print(other_texts[2])
# lda[unseen_doc] 

In [15]:
x = lda.show_topics(num_topics=25, num_words=7,formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]

#Below Code Prints Only Words 
for topic,words in topics_words:
    print("topic_{}: ".format(topic)+" ".join(words))

topic_0: год который проект население технология также область
topic_1: ракета пенсия мышь израиль сосед сша компьютер
topic_2: который год это украина свой украинский страна
topic_3: это мочь который всё человек научный компания
topic_4: антонов нация сербия близость сербский вечеринка серб
topic_5: военный банк который млрд год сша тыс
topic_6: год стать который млн тыс университет nn
topic_7: фестиваль мероприятие эстония латвия литва корь корзина
topic_8: фонд исследование пациент китай китайский изучение кризис
topic_9: рубль год суд млн доллар сумма размер
topic_10: район nn вода участок квартира авария катастрофа
topic_11: это год россия который страна nn российский
topic_12: рак взрыв смерть nn первый физика команда
topic_13: погибнуть миссия советовать майкл провинция взрывной южноафриканский
topic_14: газ запуск медицина испытание установка надёжный стандарт
topic_15: девочка вирус кг трансляция лечить сражение сибирь
topic_16: граница белоруссия активность остров доклад сниз

In [16]:
def get_lda_vector(text):
    unseen_doc = common_dictionary.doc2bow(text)
    lda_tuple = lda[unseen_doc]
    not_null_topics = dict(zip([i[0] for i in lda_tuple], [i[1] for i in lda_tuple]))

    output_vector = []
    for i in range(25):
        if i not in not_null_topics:
            output_vector.append(0)
        else:
            output_vector.append(not_null_topics[i])
    return np.array(output_vector)

In [17]:
topic_matrix = pd.DataFrame([get_lda_vector(text) for text in news['title'].values])
topic_matrix.columns = ['topic_{}'.format(i) for i in range(25)]
topic_matrix['doc_id'] = news['doc_id'].values
topic_matrix = topic_matrix[['doc_id']+['topic_{}'.format(i) for i in range(25)]]
topic_matrix.head(5)

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,6,0.353032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.013165,0.0,0.0,0.0,0.0,0.0,0.0,0.091835,0.0
1,4896,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.05525,0.0,0.0,0.0,0.057549,0.0,0.0,0.0,0.0
2,4897,0.0,0.048618,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.100141,0.119389,0.0
3,4898,0.0,0.0,0.0,0.627483,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4899,0.327728,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.572274,0.0,0.0,0.0,0.0


### векторные представления пользователей

In [18]:
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf0 = TfidfVectorizer()
tf0.fit(users['articles'])

TfidfVectorizer()

In [20]:
df_idf = dict(zip(np.array(tf0.get_feature_names(), dtype = np.int_), tf0.idf_))
df_idf

{10: 8.888709524182016,
 100: 7.9078802711702885,
 1000: 8.041411663794811,
 1001: 8.888709524182016,
 1002: 8.888709524182016,
 1003: 8.041411663794811,
 1005: 8.601027451730232,
 1006: 9.29417463229018,
 1007: 9.29417463229018,
 1008: 9.29417463229018,
 1009: 8.888709524182016,
 101: 7.589426540051753,
 1010: 8.601027451730232,
 1011: 9.29417463229018,
 1012: 8.377883900416023,
 1013: 8.888709524182016,
 1014: 8.888709524182016,
 1015: 9.29417463229018,
 1016: 8.888709524182016,
 1017: 8.888709524182016,
 1018: 8.888709524182016,
 1019: 8.377883900416023,
 102: 7.9078802711702885,
 1020: 8.888709524182016,
 1021: 9.29417463229018,
 1022: 9.29417463229018,
 1023: 8.601027451730232,
 1024: 9.29417463229018,
 1025: 8.888709524182016,
 1026: 8.888709524182016,
 1027: 8.888709524182016,
 1028: 8.888709524182016,
 103: 7.9078802711702885,
 1030: 8.601027451730232,
 1031: 8.377883900416023,
 1032: 8.377883900416023,
 1033: 8.601027451730232,
 1034: 8.377883900416023,
 1035: 8.88870952418201

In [21]:
# df_idf = pd.DataFrame(tf0.idf_, index=tf0.get_feature_names(),columns=["idf_weights"]) 
# df_idf.sort_values(by=['idf_weights'])

In [22]:
doc_dict = dict(zip(topic_matrix['doc_id'].values, topic_matrix[['topic_{}'.format(i) for i in range(25)]].values))

In [23]:
# d1 = {0: 1, 1: 3, 2: 2, 3: -1}
# d2 = {0: 2, 2: 4}
# [v * d2[k] for k, v in d1.items() if (k in d2 and k in [0, 1])]

In [24]:
# df_idf[323329]

In [25]:
# doc_dict[323329]

In [26]:
# doc_dict[323329] * df_idf[323329]

In [27]:
def get_user_embedding(user_articles_list, method = 'mean', weighted = False):
    user_articles_list = eval(user_articles_list)
    docs = {k: doc_dict[k] for k in user_articles_list}
    idf = {k: df_idf[k] for k in user_articles_list if k in df_idf}
    if weighted:
#         user_vector = np.array([doc_dict[doc_id] * df_idf[doc_id] for doc_id in user_articles_list])
#         user_vector = np.array([v * df_idf[k] for k, v in doc_dict.items() if (k in user_articles_list and k in df_idf)])
        user_vector = np.array([v * idf[k] for k, v in docs.items() if k in idf])
    else:
        user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    
    if method == 'median':
        user_vector = np.median(user_vector, 0)
    elif method == 'max':
        user_vector = np.max(user_vector, 0)
    else:
        user_vector = np.mean(user_vector, 0)
        
    return user_vector

In [28]:
pd.Series(get_user_embedding(users['articles'].iloc[33], weighted = True, method = 'median')) #33ий юзер

0     0.490969
1     0.000000
2     1.288101
3     1.102327
4     0.000000
5     0.252315
6     0.000000
7     0.000000
8     0.000000
9     0.000000
10    0.000000
11    1.509604
12    0.000000
13    0.000000
14    0.000000
15    0.000000
16    0.000000
17    0.000000
18    0.000000
19    0.000000
20    0.442631
21    0.078465
22    0.000000
23    0.000000
24    0.000000
dtype: float64

In [29]:
pd.concat([pd.DataFrame([(r[0], r[1]) for r in topics_words], columns = ['topic_id', 'words']), pd.Series(get_user_embedding(users['articles'].iloc[33], weighted = True, method = 'median'), name = 'articles')], axis = 1).sort_values(by = ['articles'], ascending=False).head(3)

Unnamed: 0,topic_id,words,articles
11,11,"[это, год, россия, который, страна, nn, россий...",1.509604
2,2,"[который, год, это, украина, свой, украинский,...",1.288101
3,3,"[это, мочь, который, всё, человек, научный, ко...",1.102327


In [30]:
# эмбеддинги для всех пользователей
def get_user_embeddings(weighted, method):
    user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding(x, weighted = weighted, method = method), 1)])
    user_embeddings.columns = ['topic_{}'.format(i) for i in range(25)]
    user_embeddings['uid'] = users['uid'].values
    user_embeddings = user_embeddings[['uid']+['topic_{}'.format(i) for i in range(25)]]
    
    return user_embeddings


In [31]:
target = pd.read_csv("users_churn.csv")
target.head(3)

Unnamed: 0,uid,churn
0,u107120,0
1,u102277,0
2,u102444,0


In [32]:
def get_metrics(y_test, preds):
    precision, recall, thresholds = precision_recall_curve(y_test, preds)
    fscore = (2 * precision * recall) / (precision + recall)
    # locate the index of the largest f score
    ix = np.argmax(fscore)
    
    return thresholds[ix], fscore[ix], precision[ix], recall[ix]
#     print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
#                                                                             fscore[ix],
#                                                                             precision[ix],
#                                                                             recall[ix]))

In [33]:
w = [True, False]
m = ['mean', 'median', 'max']
# w = [False]
# m = ['mean']
res = {}
for w_, m_ in product(w, m):
    user_embeddings = get_user_embeddings(weighted = w_, method = m_)
    X = pd.merge(user_embeddings, target, 'left')
    X_train, X_test, y_train, y_test = train_test_split(X[['topic_{}'.format(i) for i in range(25)]], 
                                                    X['churn'], random_state=42)
    logreg = LogisticRegression(max_iter = 500)
    logreg.fit(X_train, y_train)
    preds = logreg.predict_proba(X_test)[:, 1]
    
    threshold, fscore, precision, recall = get_metrics(y_test, preds)
    
    y_pred = (preds>= threshold).astype('int')
    roc_auc = roc_auc_score(y_test, y_pred)
    
#     print(fscore, precision, recall, roc_auc)
    
    res['w_' + format(w_) + ' ' + 'm_' + format(m_)] = [fscore, precision, recall, roc_auc]

0.8024948024948024 0.8464912280701754 0.7628458498023716 0.8714057526058223
0.8228782287822878 0.7716262975778547 0.8814229249011858 0.9218219375507646
0.7210884353741497 0.6328358208955224 0.8379446640316206 0.8837691265206759
0.6265060240963854 0.5548780487804879 0.7193675889328063 0.8178978757486011
0.7245283018867924 0.6931407942238267 0.758893280632411 0.8551192218846085
0.7595419847328245 0.7343173431734318 0.7865612648221344 0.8726738779748908


In [34]:
pd.DataFrame.from_dict(res, orient='index', columns = ['fscore', 'precision', 'recall', 'roc_auc'])

Unnamed: 0,fscore,precision,recall,roc_auc
w_True m_mean,0.802495,0.846491,0.762846,0.871406
w_True m_median,0.822878,0.771626,0.881423,0.921822
w_True m_max,0.721088,0.632836,0.837945,0.883769
w_False m_mean,0.626506,0.554878,0.719368,0.817898
w_False m_median,0.724528,0.693141,0.758893,0.855119
w_False m_max,0.759542,0.734317,0.786561,0.872674


Со взвешиванием статей и метрики качества моделей выше, а с медианой в качестве метода f1 и roc_auc максимальны