## Импорты

In [1]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [2]:
!pip install pyLDAvis

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [3]:
!pip install plotly.express

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [4]:
!pip install pymorphy3

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [5]:
import pandas as pd
import typing as tp
import numpy as np
import matplotlib.pyplot as plt
import pytz
import gensim
import pyLDAvis.gensim_models
from gensim import corpora
from sklearn.feature_extraction.text import CountVectorizer
import pymorphy3
import inspect
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')
nltk.download('stopwords')
russian_stopwords = stopwords.words("russian")
nltk.download('punkt')
from tqdm import tqdm
from gensim.models import LdaMulticore, CoherenceModel
from gensim.models import Phrases

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/lldckv/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lldckv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/lldckv/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Обработка данных

In [11]:
# Load the three raw tables, keeping only the columns used in this notebook.
video_info = pd.read_csv('video_info_v2.csv').loc[:, ['title', 'rutube_video_id']]
train_events_df = pd.read_csv('train_events.csv').loc[:, ['viewer_uid', 'rutube_video_id']]
train_targets_df = pd.read_csv('train_targets.csv').loc[:, ['viewer_uid']]

# Viewers from events and targets are combined with an outer join on viewer_uid.
merged_df_v1 = train_events_df.merge(train_targets_df, how='outer', on='viewer_uid')

# Titles are attached with a left join, so a video that is absent from
# video_info keeps its row but gets a missing title.
merged_df_v2 = merged_df_v1.merge(video_info, how='left', on='rutube_video_id')

# One row per distinct (video id, title) pair.
video_data = merged_df_v2.loc[:, ['rutube_video_id', 'title']].drop_duplicates()

video_data.head()

Unnamed: 0,rutube_video_id,title
0,video_133074,Папа с особенностями. Мужское / Женское. Выпус...
1,video_61152,День защиты детей. Мужское / Женское. Выпуск о...
2,video_96775,"Новая Битва экстрасенсов, 24 сезон, 11 выпуск"
3,video_402535,Свадьба башкирских стиляг VS Свадьба в стиле бохо
4,video_180483,Мультфильм История игрушек: Большой побег I To...


In [None]:
# Single shared pymorphy3 analyzer; text_to_wordlist uses it to reduce every token to its normal form.
analyzer = pymorphy3.MorphAnalyzer()

In [None]:
# Words filtered in addition to NLTK's Russian stop words (compared against lemmas).
_EXTRA_STOPWORDS = ("серия", "сезон", "смотреть", "бесплатно", "фильм",
                    "новый", "часть", "мультсериал", "кино", "последний",
                    "озвучка", "шоу", "выпуск", "сериал")
# English words that are filtered only when bi is False.
_ENGLISH_STOPWORDS = ("and", "the")
# Stop-word sets keyed by the bi flag, filled lazily on first use.
_STOPWORDS_CACHE: tp.Dict[bool, tp.FrozenSet[str]] = {}


def _get_stopwords(bi: bool) -> tp.FrozenSet[str]:
    """Return the stop-word set for the given mode, building it only once per mode."""
    key = bool(bi)
    if key not in _STOPWORDS_CACHE:
        stops = set(stopwords.words("russian"))
        stops.update(_EXTRA_STOPWORDS)
        if not key:
            stops.update(_ENGLISH_STOPWORDS)
        _STOPWORDS_CACHE[key] = frozenset(stops)
    return _STOPWORDS_CACHE[key]


def text_to_wordlist(text: str, remove_stopwords: bool=False, bi: bool=False) -> tp.List[str]:
    """Turn a raw title into a list of lower-cased, lemmatised tokens.

    Every character that is not a Cyrillic or Latin letter is replaced by a
    space, the result is tokenised with ``word_tokenize``, lower-cased and
    lemmatised with the module-level ``analyzer``. Tokens shorter than three
    characters are always dropped.

    Args:
        text: The title to process. Non-string values (for example the NaN
            left behind by a merge that found no title) yield an empty list.
        remove_stopwords: When True, drop NLTK's Russian stop words plus the
            extra words in ``_EXTRA_STOPWORDS``.
        bi: When False (the default), "and" and "the" are removed as well;
            when True they are kept. Presumably meant for bigram mode - TODO
            confirm with the author, no visible caller passes True.

    Returns:
        The normalised tokens in their original order.
    """
    # Missing titles arrive as float NaN / None; re.sub would raise on them.
    if not isinstance(text, str):
        return []

    cleaned = re.sub(r'[^а-яА-ЯёЁa-zA-Z]', " ", text)
    if not cleaned.strip():
        # Nothing but separators is left, so there is nothing to tokenise.
        return []

    lemmas = [analyzer.parse(token.lower())[0].normal_form
              for token in word_tokenize(cleaned)]

    if remove_stopwords:
        stops = _get_stopwords(bi)
        lemmas = [lemma for lemma in lemmas if lemma not in stops]

    return [lemma for lemma in lemmas if len(lemma) >= 3]

In [None]:
# Tokenise and lemmatise every title, with stop words removed.
video_data["content"] = video_data["title"].apply(
    lambda title: text_to_wordlist(title, remove_stopwords=True)
)
video_data.head()

### LDA с униграммами

In [None]:
# Token lists, one per video; shared by the dictionary, the corpus and the coherence scoring.
texts = video_data["content"].tolist()
# Token <-> integer id mapping built from all documents.
id2word_uni = corpora.Dictionary(texts)
# Bag-of-words representation of every document.
corpus_uni = list(map(id2word_uni.doc2bow, texts))

In [None]:
# Default grid of candidate topic counts; the functions below iterate over range(st, lim, stp).
st=2  # first candidate number of topics
lim=20  # upper bound of the grid (exclusive, as in range)
stp=2  # increment between consecutive candidates


def compute_coherence_values(dictionary, corp, texts, start:int=st, limit:int = lim, step:int=stp):
    """Train one LDA model per candidate topic count and score it with c_v coherence.

    Args:
        dictionary: Gensim dictionary, passed as ``id2word`` to the model and
            as ``dictionary`` to the coherence scorer.
        corp: Gensim bag-of-words corpus the models are trained on.
        texts: Tokenised documents used by the coherence scorer.
        start: First number of topics to try.
        limit: Upper bound for the number of topics (exclusive).
        step: Increment between consecutive candidates.

    Returns:
        A ``(model_list, coherence_values)`` tuple of two lists aligned with
        ``range(start, limit, step)``: the trained models and their c_v scores.
    """
    trained_models, scores = [], []
    for n_topics in tqdm(range(start, limit, step)):
        # A fresh RandomState(42) per model gives every candidate the same seed.
        lda = LdaMulticore(
            corpus=corp,
            id2word=dictionary,
            num_topics=n_topics,
            random_state=np.random.RandomState(42),
        )
        trained_models.append(lda)
        scorer = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
        scores.append(scorer.get_coherence())
    return trained_models, scores


In [None]:
# Train and score one LDA model per candidate topic count on the unigram corpus.
model_list_uni, coherence_values_uni = compute_coherence_values(dictionary=id2word_uni, corp=corpus_uni, texts=texts)

In [None]:
def plot_coherence_values(coherence_values: tp.Sequence[float], start:int=st, limit:int = lim, step:int=stp, bi:bool=False) -> int:
    """Plot coherence against the number of topics and return the best topic count.

    Args:
        coherence_values: Coherence scores aligned with
            ``range(start, limit, step)``.
        start: First number of topics on the x axis.
        limit: Upper bound of the x axis (exclusive).
        step: Increment between consecutive topic counts.
        bi: Selects the x-axis label: bigrams when True, unigrams otherwise.

    Returns:
        The number of topics with the highest coherence among the values
        passed in (the first one on ties).

    Raises:
        ValueError: If ``coherence_values`` is empty.
    """
    # Materialise once so that lists, tuples and numpy arrays are all accepted.
    scores = list(coherence_values)
    x = range(start, limit, step)
    name = 'Биграммы' if bi else 'Униграммы'

    # Label the line itself: passing a bare string to legend() is iterated
    # character by character and would label the curve "c".
    plt.plot(x, scores, label="coherence_values")
    plt.xlabel(f"Количество тем: {name}")
    plt.ylabel("Согласованность")
    plt.legend(loc='best')
    plt.show()

    # Use the scores that were passed in, not a notebook-level variable, so
    # the result is correct for any corpus the function is called with.
    return x[scores.index(max(scores))]


In [None]:
# Plot the unigram coherence curve and keep the topic count returned by plot_coherence_values.
n_topics_uni = plot_coherence_values(coherence_values=coherence_values_uni)

In [None]:
# Seed the final model the same way as the candidates scored in
# compute_coherence_values; an unseeded model is not reproducible and is not
# the model whose coherence was used to choose n_topics_uni.
lda_model_uni = LdaMulticore(
    corpus=corpus_uni,
    id2word=id2word_uni,
    num_topics=n_topics_uni,
    random_state=np.random.RandomState(42),
)
lda_model_uni.print_topics()[:3]

In [None]:
# Switch pyLDAvis to notebook display mode, then build the visualisation data
# for the unigram model from its corpus and dictionary.
pyLDAvis.enable_notebook()
LDAvis_prepared_uni = pyLDAvis.gensim_models.prepare(lda_model_uni, corpus_uni, id2word_uni)

In [None]:
# Export the unigram visualisation to an HTML file (relative path, so it lands in the working directory).
pyLDAvis.save_html(LDAvis_prepared_uni, 'lda_visualization_unigram.html')

In [None]:
# Show the prepared unigram visualisation as the cell output.
LDAvis_prepared_uni

### LDA с биграммами

In [None]:
# Learn collocations from the unigram token lists and wrap them in a Phraser.
bigram = Phrases(texts, threshold=1, min_count=2)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Re-tokenise every document through the phrase model.
texts_with_bigrams = [bigram_mod[doc] for doc in texts]

# Dictionary and bag-of-words corpus for the bigram-augmented documents.
id2word_b = corpora.Dictionary(texts_with_bigrams)
corpus_b = list(map(id2word_b.doc2bow, texts_with_bigrams))

# Train and score one LDA model per candidate topic count (2, 4, ..., 18).
model_list_b, coherence_values_b = compute_coherence_values(
    dictionary=id2word_b,
    corp=corpus_b,
    texts=texts_with_bigrams,
    start=2,
    limit=20,
    step=2,
)

In [None]:
# bi=True selects the bigram axis label; without it this plot is captioned as the unigram one.
n_topics_b = plot_coherence_values(coherence_values=coherence_values_b, bi=True)

In [None]:
# Seed the final model the same way as the candidates scored in
# compute_coherence_values; an unseeded model is not reproducible and is not
# the model whose coherence was used to choose n_topics_b.
lda_model_b = LdaMulticore(
    corpus=corpus_b,
    id2word=id2word_b,
    num_topics=n_topics_b,
    random_state=np.random.RandomState(42),
)
lda_model_b.print_topics()[:3]

In [None]:
# Switch pyLDAvis to notebook display mode, then build the visualisation data
# for the bigram model from its corpus and dictionary.
pyLDAvis.enable_notebook()
LDAvis_prepared_b = pyLDAvis.gensim_models.prepare(lda_model_b, corpus_b, id2word_b)

In [None]:
# pyLDAvis.save_html(LDAvis_prepared_b, 'lda_visualization_bigram.html')

In [None]:
# Show the prepared bigram visualisation as the cell output.
LDAvis_prepared_b