In [1]:
import ast

import pandas as pd

In [2]:
# Load the articles table from materials.csv, print its (rows, columns) shape
# and preview the first three rows.
news = pd.read_csv("materials.csv")
print(news.shape)
news.head(3)

(27000, 2)


Unnamed: 0,doc_id,title
0,6,Заместитель председателяnправительства РФnСерг...
1,4896,Матч 1/16 финала Кубка России по футболу был п...
2,4897,Форвард «Авангарда» Томаш Заборский прокоммент...


In [3]:
# Load the users table from users_articles.csv and preview the first three rows.
users = pd.read_csv("users_articles.csv")
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


In [4]:
#from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary

In [5]:
!pip install razdel
!pip install pymorphy2



In [6]:
#предобработка текстов
import re
import numpy as np
import nltk
# Download the NLTK 'stopwords' corpus that stopwords.words() reads later on.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from razdel import tokenize # https://github.com/natasha/razdel

import pymorphy2 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ekate\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Base Russian stop-word list from NLTK.
stopword_ru = stopwords.words('russian')

# Morphological analyser whose parse() results provide the lemmas.
morph = pymorphy2.MorphAnalyzer()

In [8]:
# Extend the stop-word list with the extra words from stopwords.txt (one per line).
# NOTE(review): open() uses the platform default encoding here - confirm it
# matches the encoding of stopwords.txt.
with open('stopwords.txt') as f:
    # Strip before filtering: a raw line still carries its newline, so testing
    # the unstripped line never rejects blank lines and '' ends up as a stop word.
    additional_stopwords = [w.strip() for w in f if w.strip()]
stopword_ru += additional_stopwords
len(stopword_ru)

776

In [9]:
def clean_text(text):
    '''
    Normalise a raw document before tokenisation.

    Steps: convert non-string input with ``str``, lower-case, drop CRLF line
    breaks, remove digits and punctuation, turn remaining line breaks (real
    ones and the literal two-character sequences ``\\n`` / ``\\s``) into
    spaces, and collapse runs of whitespace / soft hyphens into one space.

    Parameters
    ----------
    text : str or any object convertible with ``str``

    Returns
    -------
    str
        The cleaned text.
    '''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    # Raw string avoids the invalid "\s" / "\|" escapes of a normal literal.
    # NOTE(review): the first alternative only matches when a literal "|" sits
    # between two "-<space><CR><LF>" sequences; an unescaped "|" may have been
    # intended. Pattern kept as written.
    text = re.sub(r"-\s\r\n\|-\s\r\n|\r\n", '', text)

    # A single character class with escaped brackets removes the same
    # characters as the former alternation, without the "possible nested set"
    # FutureWarning raised by "[[]" and without the empty trailing alternative.
    text = re.sub(r"[0-9\-—.,:;_%©«»?*!@#№$^•·&()+=\[\]/]", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    # "[\s+]" was a character class (whitespace or a literal "+"), so runs of
    # whitespace were never collapsed; the quantifier now sits outside the class.
    text = re.sub(r'[\xad\s]+', ' ', text.strip())

    return text

# Memo of already lemmatised word forms: surface form -> normal form.
cache = {}

def lemmatization(text):
    '''
    Tokenise a text and return its lemmas without stop words.

        [0] non-`str` input is converted to `str`
        [1] the text is tokenised with razdel
        [2] a single leading '-' is removed from each token
        [3] tokens of one character or less are skipped
        [4] the lemma is taken from `cache` when the word was seen before
        [5] otherwise it is the normal form of the first pymorphy2 parse
        [6] lemmas present in `stopword_ru` are dropped

    Returns a list of lemmatised tokens in their original order.
    '''

    # [0]
    if not isinstance(text, str):
        text = str(text)

    # Snapshot the stop words as a set once per call: membership tests against
    # the list would rescan it for every lemma.
    stop_words = set(stopword_ru)

    lemmas = []
    for token in tokenize(text):  # [1]
        word = token.text
        if word.startswith('-'):  # [2]
            word = word[1:]
        if len(word) <= 1:  # [3]
            continue
        lemma = cache.get(word)  # [4]
        if lemma is None:  # [5]
            lemma = cache[word] = morph.parse(word)[0].normal_form
        if lemma not in stop_words:  # [6]
            lemmas.append(lemma)

    return lemmas

In [10]:
%%time
#Запускаем очистку текста. Будет долго...
news['title'] = news['title'].apply(lambda x: clean_text(x), 1)

  text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)


CPU times: total: 34.3 s
Wall time: 40.6 s


In [11]:
%%time
#Запускаем лемматизацию текста. Будет очень долго...
news['title'] = news['title'].apply(lambda x: lemmatization(x), 1)

CPU times: total: 4min 44s
Wall time: 7min 20s


In [12]:
# Inspect the column, which now holds a list of lemmas per document.
news['title']

0        [заместитель, председатель, правительство, рф,...
1        [матч, финал, кубок, россия, футбол, приостано...
2        [форвард, авангард, томаш, заборский, прокомме...
3        [главный, тренер, кубань, юрий, красножанин, п...
4        [решение, попечительский, совет, владивостокск...
                               ...                        
26995    [учёный, токийский, университет, морской, наук...
26996    [глава, кафедра, отечественный, история, xx, в...
26997    [американский, учёный, уточнить, возраст, расп...
26998    [последний, год, тропический, углеродный, цикл...
26999    [жить, примерно, тыс, год, назад, территория, ...
Name: title, Length: 27000, dtype: object

In [13]:
# Collect the per-document token lists into a plain Python list.
texts = news['title'].tolist()

# Build the token <-> id mapping and the bag-of-words corpus from those lists.
common_dictionary = Dictionary(texts)
common_corpus = list(map(common_dictionary.doc2bow, texts))

In [14]:
# Look up the token stored under id 10 in the dictionary.
common_dictionary[10]

'ватутин'

In [15]:
%%time
from gensim.models import LdaModel
# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=25, id2word=common_dictionary)#, passes=10)

CPU times: total: 1min 44s
Wall time: 1min 59s


In [16]:
from gensim.test.utils import datapath
# Save model to disk.
# datapath() resolves to gensim's bundled test-data directory inside the
# installed package, so the model is written to the working directory instead.
temp_file = "model.lda"
lda.save(temp_file)

# Load a potentially pretrained model from disk.
lda = LdaModel.load(temp_file)

In [17]:
# Build a small corpus from the first three documents of news['title'].
other_texts = news['title'].iloc[:3].tolist()
other_corpus = list(map(common_dictionary.doc2bow, other_texts))

# Show the tokens of the third document and the topic distribution the model
# assigns to it.
unseen_doc = other_corpus[2]
print(other_texts[2])
lda[unseen_doc]

['форвард', 'авангард', 'томаш', 'заборский', 'прокомментировать', 'игра', 'свой', 'команда', 'матч', 'чемпионат', 'кхл', 'против', 'атланта', 'nnnn', 'провести', 'плохой', 'матч', 'нижний', 'новгород', 'против', 'торпедо', 'настраиваться', 'первый', 'минута', 'включиться', 'работа', 'сказать', 'заборский', 'получиться', 'забросить', 'быстрый', 'гол', 'задать', 'хороший', 'темп', 'поединок', 'мочь', 'играть', 'ещё', 'хороший', 'сторона', 'пять', 'очко', 'выезд', 'девять', 'это', 'хороший']


[(0, 0.037430983), (3, 0.5120158), (5, 0.06961071), (12, 0.3621793)]

In [18]:
# Seven highest-weighted (word, weight) pairs for each of the 25 topics.
x = lda.show_topics(num_topics=25, num_words=7, formatted=False)
topics_words = [
    (topic_id, [term for term, _weight in weighted_terms])
    for topic_id, weighted_terms in x
]

# Print only the words, one topic per line.
for topic, words in topics_words:
    print("topic_{}: ".format(topic) + " ".join(words))

topic_0: космос год работник лётчик вино особенность разбираться
topic_1: двигатель топливо авария скорость блок теория бомба
topic_2: университет наука километр физика студент рекорд норматив
topic_3: статья товар космонавт атмосферный австралия ст бензин
topic_4: россия год который российский военный президент сша
topic_5: женщина журнал дом возраст температура день год
topic_6: год компания проект рубль это млрд рост
topic_7: год млн это тыс фонд который рынок
topic_8: альянс груз коллекция предсказать провал проявляться машина
topic_9: мозг памятник сон девочка лауреат диск лекарство
topic_10: карта тенденция выражение страдать больной автомобиль вес
topic_11: погибнуть рак nn человек форум солнце стать
topic_12: это год который свой мочь весь всё
topic_13: берег горный порт лодка медведь фонд маршрут
topic_14: суд дело иск решение приговор судья подать
topic_15: исследование который система газ также это территория
topic_16: украина гражданин сша россия nn российский украинский
to

In [19]:
#text = news['title'].iloc[0]

def get_lda_vector(text):
    """Return the dense topic-probability vector of one tokenised document.

    Parameters
    ----------
    text : list of str
        Document tokens, converted to bag-of-words with ``common_dictionary``.

    Returns
    -------
    numpy.ndarray
        Float array with one entry per topic of ``lda``; topics that the model
        does not report for the document stay at 0.
    """
    bow = common_dictionary.doc2bow(text)
    # Size the vector from the model instead of a hard-coded topic count, and
    # start from float zeros so an all-zero document is not an integer array.
    topic_vector = np.zeros(lda.num_topics)
    for topic_id, probability in lda[bow]:
        topic_vector[topic_id] = probability
    return topic_vector

In [20]:
# One row per document: doc_id followed by its 25 topic weights.
topic_names = ['topic_{}'.format(i) for i in range(25)]
topic_matrix = pd.DataFrame([get_lda_vector(doc_tokens) for doc_tokens in news['title'].values])
topic_matrix.columns = topic_names
topic_matrix.insert(0, 'doc_id', news['doc_id'].values)
topic_matrix.head(5)

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,6,0.0,0.0,0.0,0.014036,0.273324,0.0,0.045293,0.156415,0.0,...,0.0,0.425063,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4896,0.0,0.0,0.0,0.298605,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.622183,0.0,0.0,0.0,0.0,0.0
2,4897,0.037434,0.0,0.0,0.51204,0.0,0.069634,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4898,0.016026,0.0,0.0,0.213826,0.058345,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4899,0.122963,0.0,0.0,0.0,0.851418,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Следующий шаг - векторные представления пользователей

In [21]:
# Reminder of the users table layout: a uid and a string-encoded list of article ids.
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


In [22]:
# Map each doc_id to the row of its 25 topic weights.
topic_columns = ['topic_{}'.format(i) for i in range(25)]
doc_dict = {
    doc_id: topic_weights
    for doc_id, topic_weights in zip(topic_matrix['doc_id'].values, topic_matrix[topic_columns].values)
}

In [23]:
# Spot-check: topic vector stored for document 293622.
doc_dict[293622]

array([0.        , 0.        , 0.        , 0.        , 0.30761334,
       0.05346847, 0.        , 0.08442589, 0.        , 0.        ,
       0.        , 0.24379148, 0.28086957, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.01829056])

In [24]:
# Sample input for get_user_embedding: the raw `articles` value of the row at position 33.
user_articles_list = users['articles'].iloc[33]

def get_user_embedding(user_articles_list, agg=np.mean, doc_vectors=None):
    """Aggregate the topic vectors of a user's articles into one vector.

    Parameters
    ----------
    user_articles_list : str or sequence
        Article ids, either as a list or as its string form (e.g. "[1, 2]").
    agg : callable, default ``np.mean``
        Reduction called as ``agg(matrix, 0)`` on the stacked article vectors.
    doc_vectors : mapping, optional
        doc_id -> topic vector; defaults to the notebook-level ``doc_dict``.

    Returns
    -------
    numpy.ndarray
        The column-wise aggregate of the article vectors.

    Raises
    ------
    KeyError
        If an article id is missing from ``doc_vectors``.
    ValueError, SyntaxError
        If a string input is not a valid Python literal.
    """
    if isinstance(user_articles_list, str):
        # literal_eval parses literals only, so text read from the CSV can
        # never execute code the way eval() would.
        user_articles_list = ast.literal_eval(user_articles_list)
    if doc_vectors is None:
        doc_vectors = doc_dict
    article_vectors = np.array([doc_vectors[doc_id] for doc_id in user_articles_list])
    return agg(article_vectors, 0)


In [25]:
# One row per user: uid followed by the aggregated 25 topic weights.
# The former `.apply(lambda ..., 1)` passed `1` as Series.apply's
# `convert_dtype` argument (there is no axis); a plain comprehension avoids it.
topic_columns = ['topic_{}'.format(i) for i in range(25)]
user_embeddings = pd.DataFrame(
    [get_user_embedding(articles) for articles in users['articles']],
    columns=topic_columns,
)
user_embeddings.insert(0, 'uid', users['uid'].values)
user_embeddings.head(3)


Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,u105138,0.0,0.003176,0.00219,0.0,0.100624,0.014038,0.078655,0.054662,0.0,...,0.029886,0.097591,0.0,0.014744,0.075281,0.0,0.062277,0.033897,0.105566,0.003048
1,u108690,0.0,0.0,0.0,0.0,0.18995,0.017741,0.028539,0.041573,0.0,...,0.021474,0.095385,0.005918,0.014895,0.065017,0.0,0.103886,0.022063,0.062748,0.002573
2,u108339,0.0,0.002058,0.0,0.0,0.170225,0.060433,0.069542,0.017036,0.0,...,0.031859,0.046967,0.004303,0.049532,0.160287,0.0,0.09307,0.002195,0.126375,0.001921


In [26]:
# Load the churn labels from users_churn.csv and preview the first three rows.
target = pd.read_csv("users_churn.csv")
target.head(3)

Unnamed: 0,uid,churn
0,u107120,0
1,u102277,0
2,u102444,0


In [27]:
# Left-join the churn labels onto the user embeddings (on their shared columns).
X = user_embeddings.merge(target, how='left')

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [29]:
# Split topic features and churn labels into train and test parts (fixed seed).
feature_columns = [f'topic_{i}' for i in range(25)]
X_train, X_test, y_train, y_test = train_test_split(
    X[feature_columns],
    X['churn'],
    random_state=0,
)

In [30]:
# Fit a logistic regression with default settings on the training split.
logreg = LogisticRegression()
logreg.fit(X=X_train, y=y_train)


LogisticRegression()

In [31]:
# Keep the second column of predict_proba as the score and preview ten values.
class_probabilities = logreg.predict_proba(X_test)
preds = class_probabilities[:, 1]
preds[:10]


array([0.28833479, 0.15213626, 0.37191185, 0.29982558, 0.10340564,
       0.06752824, 0.15102673, 0.07259135, 0.06735003, 0.05362726])

In [32]:
# Empty results table; one row is added per aggregation variant further down.
metric_columns = ['thresholds', 'f-score', 'precision', 'recall', 'ROC AUC']
metrics = pd.DataFrame(columns=metric_columns)
metrics

Unnamed: 0,thresholds,f-score,precision,recall,ROC AUC


In [33]:
from sklearn.metrics import (f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix)

In [34]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
# F-score is 0/0 wherever precision + recall == 0; score such points as 0
# instead of NaN, because np.argmax would otherwise select the NaN.
pr_sum = precision + recall
fscore = np.divide(2 * precision * recall, pr_sum, out=np.zeros_like(pr_sum), where=pr_sum > 0)
# precision/recall have one more entry than thresholds (the final point has no
# threshold), so search only the positions that index into thresholds.
ix = np.argmax(fscore[:-1])

print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix],
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))


Best Threshold=0.269614, F-Score=0.711, Precision=0.669, Recall=0.759


In [35]:
# DataFrame.append is deprecated and removed in pandas 2; concatenate a
# one-row frame instead (resulting column order is unchanged).
metrics = pd.concat([metrics, pd.DataFrame([{
    'model': 'mean',
    'thresholds': thresholds[ix],
    'f-score': fscore[ix],
    'precision': precision[ix],
    'recall': recall[ix],
    'ROC AUC': roc_auc_score(y_test, preds)
}])], ignore_index=True)

metrics

  metrics = metrics.append({


Unnamed: 0,thresholds,f-score,precision,recall,ROC AUC,model
0,0.269614,0.711281,0.669065,0.759184,0.955288,mean


In [36]:
def get_user_embedding(user_articles_list, agg=np.median, doc_vectors=None):
    """Aggregate the topic vectors of a user's articles (median by default).

    Parameters
    ----------
    user_articles_list : str or sequence
        Article ids, either as a list or as its string form (e.g. "[1, 2]").
    agg : callable, default ``np.median``
        Reduction called as ``agg(matrix, 0)`` on the stacked article vectors.
    doc_vectors : mapping, optional
        doc_id -> topic vector; defaults to the notebook-level ``doc_dict``.

    Returns
    -------
    numpy.ndarray
        The column-wise aggregate of the article vectors.

    Raises
    ------
    KeyError
        If an article id is missing from ``doc_vectors``.
    ValueError, SyntaxError
        If a string input is not a valid Python literal.
    """
    if isinstance(user_articles_list, str):
        # literal_eval parses literals only, so text read from the CSV can
        # never execute code the way eval() would.
        user_articles_list = ast.literal_eval(user_articles_list)
    if doc_vectors is None:
        doc_vectors = doc_dict
    article_vectors = np.array([doc_vectors[doc_id] for doc_id in user_articles_list])
    return agg(article_vectors, 0)

In [37]:
# One row per user: uid followed by the aggregated 25 topic weights.
# The former `.apply(lambda ..., 1)` passed `1` as Series.apply's
# `convert_dtype` argument (there is no axis); a plain comprehension avoids it.
topic_columns = ['topic_{}'.format(i) for i in range(25)]
user_embeddings = pd.DataFrame(
    [get_user_embedding(articles) for articles in users['articles']],
    columns=topic_columns,
)
user_embeddings.insert(0, 'uid', users['uid'].values)
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,u105138,0.0,0.0,0.0,0.0,0.033087,0.0,0.009286,0.007587,0.0,...,0.0,0.021896,0.0,0.0,0.0,0.0,0.0,0.0,0.084868,0.0
1,u108690,0.0,0.0,0.0,0.0,0.120854,0.020607,0.0,0.00886,0.0,...,0.009342,0.068008,0.005507,0.011331,0.070333,0.0,0.079129,0.0,0.04388,0.0
2,u108339,0.0,0.0,0.0,0.0,0.160668,0.046313,0.031638,0.008736,0.0,...,0.031942,0.017787,0.0,0.025362,0.160235,0.0,0.104554,0.0,0.122107,0.0


In [38]:
# Left-join the churn labels onto the user embeddings (on their shared columns).
X = user_embeddings.merge(target, how='left')

In [39]:
# Split topic features and churn labels into train and test parts (fixed seed).
feature_columns = [f'topic_{i}' for i in range(25)]
X_train, X_test, y_train, y_test = train_test_split(
    X[feature_columns],
    X['churn'],
    random_state=0,
)

In [40]:
# Fit a logistic regression with default settings on the training split.
logreg = LogisticRegression()
logreg.fit(X=X_train, y=y_train)

LogisticRegression()

In [41]:
# Keep the second column of predict_proba as the score and preview ten values.
class_probabilities = logreg.predict_proba(X_test)
preds = class_probabilities[:, 1]
preds[:10]

array([0.24139704, 0.07199252, 0.484805  , 0.41658028, 0.16270937,
       0.03063461, 0.06657358, 0.15566892, 0.10233704, 0.03326436])

In [42]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
# F-score is 0/0 wherever precision + recall == 0; score such points as 0
# instead of NaN, because np.argmax would otherwise select the NaN.
pr_sum = precision + recall
fscore = np.divide(2 * precision * recall, pr_sum, out=np.zeros_like(pr_sum), where=pr_sum > 0)
# precision/recall have one more entry than thresholds (the final point has no
# threshold), so search only the positions that index into thresholds.
ix = np.argmax(fscore[:-1])
# (A bare roc_auc_score call whose result was discarded has been removed.)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix],
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.294980, F-Score=0.788, Precision=0.788, Recall=0.788


In [43]:
# DataFrame.append is deprecated and removed in pandas 2; concatenate a
# one-row frame instead (resulting column order is unchanged).
metrics = pd.concat([metrics, pd.DataFrame([{
    'model': 'median',
    'thresholds': thresholds[ix],
    'f-score': fscore[ix],
    'precision': precision[ix],
    'recall': recall[ix],
    'ROC AUC': roc_auc_score(y_test, preds)
}])], ignore_index=True)

metrics

  metrics = metrics.append({


Unnamed: 0,thresholds,f-score,precision,recall,ROC AUC,model
0,0.269614,0.711281,0.669065,0.759184,0.955288,mean
1,0.29498,0.787755,0.787755,0.787755,0.973903,median


In [44]:
def get_user_embedding(user_articles_list, agg=np.max, doc_vectors=None):
    """Aggregate the topic vectors of a user's articles (maximum by default).

    Parameters
    ----------
    user_articles_list : str or sequence
        Article ids, either as a list or as its string form (e.g. "[1, 2]").
    agg : callable, default ``np.max``
        Reduction called as ``agg(matrix, 0)`` on the stacked article vectors.
    doc_vectors : mapping, optional
        doc_id -> topic vector; defaults to the notebook-level ``doc_dict``.

    Returns
    -------
    numpy.ndarray
        The column-wise aggregate of the article vectors.

    Raises
    ------
    KeyError
        If an article id is missing from ``doc_vectors``.
    ValueError, SyntaxError
        If a string input is not a valid Python literal.
    """
    if isinstance(user_articles_list, str):
        # literal_eval parses literals only, so text read from the CSV can
        # never execute code the way eval() would.
        user_articles_list = ast.literal_eval(user_articles_list)
    if doc_vectors is None:
        doc_vectors = doc_dict
    article_vectors = np.array([doc_vectors[doc_id] for doc_id in user_articles_list])
    return agg(article_vectors, 0)

In [45]:
# One row per user: uid followed by the aggregated 25 topic weights.
# The former `.apply(lambda ..., 1)` passed `1` as Series.apply's
# `convert_dtype` argument (there is no axis); a plain comprehension avoids it.
topic_columns = ['topic_{}'.format(i) for i in range(25)]
user_embeddings = pd.DataFrame(
    [get_user_embedding(articles) for articles in users['articles']],
    columns=topic_columns,
)
user_embeddings.insert(0, 'uid', users['uid'].values)
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,u105138,0.0,0.019058,0.013138,0.0,0.307613,0.053468,0.273498,0.228372,0.0,...,0.104249,0.370916,0.0,0.088465,0.349707,0.0,0.196536,0.203383,0.241711,0.018291
1,u108690,0.0,0.0,0.0,0.0,0.595983,0.037105,0.111274,0.126428,0.0,...,0.077905,0.247679,0.012264,0.046199,0.153318,0.0,0.334863,0.132376,0.16959,0.015438
2,u108339,0.0,0.012349,0.0,0.0,0.36255,0.172964,0.183059,0.049994,0.0,...,0.070928,0.173521,0.015162,0.126279,0.322578,0.0,0.210965,0.013173,0.24931,0.011528


In [46]:
# Left-join the churn labels onto the user embeddings (on their shared columns).
X = user_embeddings.merge(target, how='left')

In [47]:
# Split topic features and churn labels into train and test parts (fixed seed).
feature_columns = [f'topic_{i}' for i in range(25)]
X_train, X_test, y_train, y_test = train_test_split(
    X[feature_columns],
    X['churn'],
    random_state=0,
)

In [48]:
# Fit a logistic regression with default settings on the training split.
logreg = LogisticRegression()
logreg.fit(X=X_train, y=y_train)

LogisticRegression()

In [49]:
# Keep the second column of predict_proba as the score and preview ten values.
class_probabilities = logreg.predict_proba(X_test)
preds = class_probabilities[:, 1]
preds[:10]

array([0.28216515, 0.00504246, 0.41647596, 0.56061546, 0.03163345,
       0.03833237, 0.17659563, 0.00853554, 0.00234008, 0.11032024])

In [50]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
# F-score is 0/0 wherever precision + recall == 0; score such points as 0
# instead of NaN, because np.argmax would otherwise select the NaN.
pr_sum = precision + recall
fscore = np.divide(2 * precision * recall, pr_sum, out=np.zeros_like(pr_sum), where=pr_sum > 0)
# precision/recall have one more entry than thresholds (the final point has no
# threshold), so search only the positions that index into thresholds.
ix = np.argmax(fscore[:-1])
# (A bare roc_auc_score call whose result was discarded has been removed.)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix],
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.402301, F-Score=0.746, Precision=0.812, Recall=0.690


In [51]:
# DataFrame.append is deprecated and removed in pandas 2; concatenate a
# one-row frame instead (resulting column order is unchanged).
metrics = pd.concat([metrics, pd.DataFrame([{
    'model': 'max',
    'thresholds': thresholds[ix],
    'f-score': fscore[ix],
    'precision': precision[ix],
    'recall': recall[ix],
    'ROC AUC': roc_auc_score(y_test, preds)
}])], ignore_index=True)

metrics

  metrics = metrics.append({


Unnamed: 0,thresholds,f-score,precision,recall,ROC AUC,model
0,0.269614,0.711281,0.669065,0.759184,0.955288,mean
1,0.29498,0.787755,0.787755,0.787755,0.973903,median
2,0.402301,0.746137,0.8125,0.689796,0.954216,max
