## Домашнее задание

In [4]:
# Загрузка необходимых библиотек

import re
import numpy as np
import pandas as pd

import nltk
import pymorphy2

from gensim.corpora.dictionary import Dictionary
from gensim.test.utils import common_texts
from gensim.models import LdaModel
from gensim.test.utils import datapath

from nltk.corpus import stopwords
from razdel import tokenize

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

import matplotlib.pyplot as plt

%matplotlib inline

In [24]:
# Загрузка данных

df_news = pd.read_csv('articles.csv')
df_users = pd.read_csv('users_articles.csv')
df_target = pd.read_csv('users_churn.csv')

---

### Действия с вебинара

In [6]:
nltk.download('stopwords')

stopword_ru = stopwords.words('russian')

# Проверка

len(stopword_ru)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ilya.ivolgin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


151

In [7]:
morph = pymorphy2.MorphAnalyzer()

In [8]:
with open('stopwords.txt') as f:
    
    additional_stopwords = [w.strip() for w in f.readlines() if w]
    
stopword_ru += additional_stopwords

# Проверка

len(stopword_ru)

776

In [9]:
def clean_text(text):
    '''
    очистка текста
    
    на выходе очищеный текст
    
    '''
    if not isinstance(text, str):
        
        text = str(text)
    
    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    text = re.sub("-\s\r\n\|-\s\r\n|\r\n", '', str(text))

    text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip())

    return text

cache = {}

def lemmatization(text):
    '''
    лемматизация
        [0] если зашел тип не `str` делаем его `str`
        [1] токенизация предложения через razdel
        [2] проверка есть ли в начале слова '-'
        [3] проверка токена с одного символа
        [4] проверка есть ли данное слово в кэше
        [5] лемматизация слова
        [6] проверка на стоп-слова

    на выходе лист отлемматизированых токенов
    '''

    # [0]
    if not isinstance(text, str):
        
        text = str(text)
    
    # [1]
    tokens = list(tokenize(text))
    words = [_.text for _ in tokens]

    words_lem = []
    for w in words:
        
        if w[0] == '-': # [2]
            
            w = w[1:]
            
        if len(w)>1: # [3]
            
            if w in cache: # [4]
                
                words_lem.append(cache[w])
                
            else: # [5]
                
                temp_cach = cache[w] = morph.parse(w)[0].normal_form
                words_lem.append(temp_cach)
    
    words_lem_without_stopwords = [i for i in words_lem if not i in stopword_ru] # [6]
    
    return words_lem_without_stopwords

In [10]:
# Очистка текста

df_news['title'] = df_news['title'].apply(lambda x: clean_text(x), 1)

%time

  text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)


CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs


In [11]:
# Лемматизация текста

df_news['title'] = df_news['title'].apply(lambda x: lemmatization(x), 1)

%time

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 4.05 µs


In [13]:
# Сформируем список наших текстов, разбив еще и на пробелы

texts = [t for t in df_news['title'].values]

# Create a corpus from a list of texts

common_dictionary = Dictionary(texts)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]

In [14]:
# Train model on the corpus

lda = LdaModel(common_corpus, num_topics = 25, id2word = common_dictionary)

# Save model to disk

temp_file = datapath("model.lda")

lda.save(temp_file)

# Load a potentially pretrained model from disk

lda = LdaModel.load(temp_file)

In [16]:
# Create a new corpus, made of previously unseen documents

other_texts = [t for t in df_news['title'].iloc[:3]]
other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]

unseen_doc = other_corpus[2]

print(other_texts[2])

lda[unseen_doc]

['форвард', 'авангард', 'томаш', 'заборский', 'прокомментировать', 'игра', 'команда', 'матч', 'чемпионат', 'кхл', 'против', 'атланта', 'nnnn', 'плохой', 'матч', 'нижний', 'новгород', 'против', 'торпедо', 'настраиваться', 'первый', 'минута', 'включиться', 'заборский', 'получиться', 'забросить', 'быстрый', 'гол', 'задать', 'хороший', 'темп', 'поединок', 'играть', 'хороший', 'сторона', 'пять', 'очко', 'выезд', 'девять', 'хороший']


[(2, 0.35098556),
 (5, 0.11448648),
 (6, 0.03015216),
 (12, 0.05263341),
 (13, 0.040538915),
 (17, 0.024619753),
 (21, 0.07387488),
 (22, 0.29475683)]

In [17]:
x = lda.show_topics(num_topics = 25, num_words = 7, formatted = False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]

# Below Code Prints Only Words

for topic,words in topics_words:
    
    print("topic_{}: ".format(topic)+" ".join(words))

topic_0: фонд статья земля работник мвд ск грузия
topic_1: женщина наука nn мужчина следствие девочка вуз
topic_2: исследование nn пенсия связанный расследование форум американский
topic_3: рубль банк федеральный область nn депутат департамент
topic_4: всё большой очень район проблема рынок дом
topic_5: японский япония южный северный корея лётчик китай
topic_6: ii орден су армения офицер сопровождать задать
topic_7: украина политический египет фотография граница партия писать
topic_8: китай китайский рейтинг место теория сон лауреат
topic_9: россия сша российский nn новый глава власть
topic_10: смерть эксперимент взрыв пострадать чиновник причина задержать
topic_11: военный россия газ nn население первый статья
topic_12: мозг станция тыс закон рейс принять nn
topic_13: ракета россия фестиваль российский флот виза риа
topic_14: научный температура изучение nn скорость русский вода
topic_15: новый россия армия проект доход ссср первый
topic_16: земля ребёнок остров российский механизм ис

In [18]:
def get_lda_vector(text):
    
    unseen_doc = common_dictionary.doc2bow(text)
    lda_tuple = lda[unseen_doc]
    not_null_topics = dict(zip([i[0] for i in lda_tuple], [i[1] for i in lda_tuple]))

    output_vector = []
    
    for i in range(25):
        
        if i not in not_null_topics:
            
            output_vector.append(0)
            
        else:
            
            output_vector.append(not_null_topics[i])
            
    return np.array(output_vector)

In [19]:
topic_matrix = pd.DataFrame([get_lda_vector(text) for text in df_news['title'].values])
topic_matrix.columns = ['topic_{}'.format(i) for i in range(25)]
topic_matrix['doc_id'] = df_news['doc_id'].values
topic_matrix = topic_matrix[['doc_id']+['topic_{}'.format(i) for i in range(25)]]

# Проверка

topic_matrix.head(10)

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,6,0.0,0.049258,0.0,0.491971,0.0,0.025118,0.0,0.0,0.0,...,0.0,0.23847,0.0,0.0,0.0,0.0,0.01199,0.041746,0.0,0.0
1,4896,0.096547,0.0,0.0,0.0,0.0,0.254693,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4897,0.0,0.0,0.348976,0.0,0.0,0.107974,0.03022,0.0,0.093045,...,0.0,0.0,0.024863,0.0,0.0,0.0,0.0,0.31423,0.0,0.0
3,4898,0.0,0.0,0.0,0.0,0.273331,0.049796,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.091573,0.0,0.0
4,4899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.105871,0.0,0.149084,0.217083,0.0
5,4900,0.0,0.235192,0.409069,0.0,0.0,0.131271,0.0,0.0,0.0,...,0.094908,0.0,0.0,0.0,0.0,0.0,0.0,0.115508,0.0,0.0
6,4901,0.654593,0.0,0.0,0.0,0.0,0.282824,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,4902,0.0,0.0,0.0,0.0,0.0,0.380073,0.0,0.0,0.215007,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068518,0.0,0.0
8,4903,0.0,0.204798,0.0,0.0,0.20391,0.229796,0.0,0.0,0.099742,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,4904,0.0,0.565015,0.0,0.0,0.0,0.034469,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.115077,0.0,0.0


In [20]:
# Doc dict

doc_dict = dict(zip(topic_matrix['doc_id'].values, topic_matrix[['topic_{}'.format(i) for i in range(25)]].values))

---

### Домашнее задание

1. Модифицировать код функции ``get_user_embedding`` таким образом, чтобы считалось не среднее (как в примере ``np.mean``), а медиана.
2. Применить такое преобразование к данным, обучить модель прогнозирования оттока и посчитать метрики качества и сохранить их: ``roc auc``, ``precision/recall/f_score``(для 3 последних - подобрать оптимальный порог с помощью ``precision_recall_curve``, как это делалось на уроке)

In [22]:
# Добавляем еще один аргумент

def get_user_embedding(user_articles_list, method):
    
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    user_vector = method(user_vector, 0)
    
    return user_vector

In [33]:
def get_user_embeddings(data, method):
    
    user_embeddings = pd.DataFrame([i for i in data.apply(lambda x: get_user_embedding(x, method), 1)])
    user_embeddings.columns = ['topic_{}'.format(i) for i in range(25)]
    user_embeddings['uid'] = df_users['uid'].values
    
    return user_embeddings[['uid']+['topic_{}'.format(i) for i in range(25)]]

In [34]:
# Проверка df_target

df_target.head(10)

Unnamed: 0,uid,churn
0,u107120,0
1,u102277,0
2,u102444,0
3,u103439,0
4,u104300,0
5,u102598,0
6,u107753,0
7,u103650,0
8,u106926,0
9,u103486,0


In [35]:
# Train, test

def get_trains_tests(df_users, df_target, method_list=[np.mean, np.median, np.max]):
    
    trains_tests = []
    
    for method in method_list:
        
        user_embeddings = get_user_embeddings(df_users['articles'], method)
        X = pd.merge(user_embeddings, df_target, 'left')
        X_train, X_test, y_train, y_test = train_test_split(
            X[['topic_{}'.format(i) for i in range(25)]], 
            X['churn'], random_state = 0
        )

        trains_tests += [(X_train, X_test, y_train, y_test, method.__name__,)]
        
    return trains_tests

In [36]:
# Проверка

train_tests = get_trains_tests(df_users, df_target)

In [37]:
results = []

for train_test in train_tests:
    
    X_train, X_test, y_train, y_test, method_name = train_test

    logreg = LogisticRegression()
    
    # Обучение
    
    logreg.fit(X_train, y_train)

    preds = logreg.predict_proba(X_test)[:, 1]

    precision, recall, thresholds = precision_recall_curve(y_test, preds)
    
    fscore = (2 * precision * recall) / (precision + recall)
    
    ix = np.argmax(fscore)

    roc_auc = roc_auc_score(y_true=y_test, y_score=preds)

    results += [roc_auc, precision[ix], recall[ix], fscore[ix], thresholds[ix]]

    print('Method %s: ROC_AUC=%f Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (method_name,
                                                                            roc_auc,
                                                                            thresholds[ix], 
                                                                            fscore[ix],
                                                                            precision[ix],
                                                                            recall[ix]))

Method mean: ROC_AUC=0.958865 Best Threshold=0.224340, F-Score=0.713, Precision=0.636, Recall=0.812
Method median: ROC_AUC=0.986918 Best Threshold=0.260489, F-Score=0.849, Precision=0.829, Recall=0.869
Method amax: ROC_AUC=0.978171 Best Threshold=0.381780, F-Score=0.817, Precision=0.878, Recall=0.763


In [38]:
df_results = pd.DataFrame(results)

In [39]:
df_results.to_csv('results.csv')

**Вывод:** если требуется увеличить ``Recall``, то ``median`` для рассчета embedding. Если ``Precison``, то ``max``.