# Домашнее задание  № 5. Матричные разложения/Тематическое моделирование

### Задание № 1 (4 балла)

Попробуйте матричные разложения с 5 классификаторами - SGDClassifier, KNeighborsClassifier,  RandomForest, ExtraTreesClassifier (про него подробнее почитайте в документации, он похож на RF). Используйте и NMF и SVD. Сравните результаты на кросс-валидации и выберите лучшее сочетание.

В итоге у вас должно получиться, как минимум 10 моделей (два разложения на каждый классификатор). Используйте 1 и те же параметры кросс-валидации. Параметры векторизации, параметры K в матричных разложениях, параметры классификаторов могут быть разными между экспериментами.

Можете взять поменьше данных, если все будет обучаться слишком долго (не ставьте параметр K слишком большим в NMF, иначе точно будет слишком долго)

In [34]:
import gensim
import pandas as pd
import numpy as np
from pymorphy2 import MorphAnalyzer
import pyLDAvis.gensim_models
from collections import Counter
from string import punctuation
from razdel import tokenize as razdel_tokenize
from sklearn.decomposition import TruncatedSVD, NMF, PCA, LatentDirichletAllocation
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, StratifiedKFold

In [18]:
morph = MorphAnalyzer()

In [10]:
data = pd.read_csv('avito_category_classification.csv')

In [11]:
def normalize(text):
    normalized_text = [word.text.strip(punctuation) for word in razdel_tokenize(text)]
    normalized_text = [word.lower() for word in normalized_text if word and len(word) < 20 ]
    normalized_text = [morph.parse(word)[0].normal_form for word in normalized_text]
    return ' '.join(normalized_text)

In [19]:
data['description_norm'] = data['description'].apply(normalize)

In [23]:
def eval_table(X, y, pipeline, N=6):
    # зафиксируем порядок классов
    labels = list(set(y))
    
    # метрики отдельных фолдов будет хранить в табличке
    fold_metrics = pd.DataFrame(index=labels)
    # дополнительно также соберем таблицу ошибок
    errors = np.zeros((len(labels), len(labels)))
    
    # создаем стратегию кросс-валидации
    # shuffle=True (перемешивание) - часто критично важно указать
    # т.к. данные могут быть упорядочены и модель на этом обучится
    kfold = StratifiedKFold(n_splits=N, shuffle=True, )
    
    for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
        # fit-predict как и раньше, но сразу пайплайном
        pipeline.fit(X[train_index], y[train_index])
        preds = pipeline.predict(X[test_index])
        
        # записываем метрику и индекс фолда
        fold_metrics[f'precision_{i}'] = precision_score(y[test_index], preds, labels=labels, average=None)
        fold_metrics[f'recall_{i}'] = recall_score(y[test_index], preds, labels=labels, average=None)
        fold_metrics[f'f1_{i}'] = f1_score(y[test_index], preds, labels=labels, average=None)
        errors += confusion_matrix(y[test_index], preds, labels=labels, normalize='true')
    
    # таблица для усредненных значений
    # тут мы берем колонки со значениями и усредняем их
    # часто также все метрики сразу суммируют и в конце просто делят на количество фолдов
    # но мы тут помимо среднего также хотим посмотреть на стандартное отклонение
    # чтобы понять как сильно варьируются оценки моделей
    result = pd.DataFrame(index=labels)
    result['precision'] = fold_metrics[[f'precision_{i}' for i in range(N)]].mean(axis=1).round(2)
    result['precision_std'] = fold_metrics[[f'precision_{i}' for i in range(N)]].std(axis=1).round(2)
    
    result['recall'] = fold_metrics[[f'recall_{i}' for i in range(N)]].mean(axis=1).round(2)
    result['recall_std'] = fold_metrics[[f'recall_{i}' for i in range(N)]].std(axis=1).round(2)
    
    result['f1'] = fold_metrics[[f'f1_{i}' for i in range(N)]].mean(axis=1).round(2)
    result['f1_std'] = fold_metrics[[f'f1_{i}' for i in range(N)]].std(axis=1).round(2)
    
    # добавим одну колонку со средним по всем классам
    result.loc['mean'] = result.mean().round(2)
    # проценты ошибок просто усредняем
    errors /= N
    
    return result, errors

**SGDClassifier**

In [35]:
pipeline_nmf_sgd = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), ngram_range=(1,2), min_df=5, max_df=0.4)),
    ('nmf', NMF(200)),
    ('clf', SGDClassifier(max_iter=200))
])

pipeline_svd_sgd = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), ngram_range=(1,2), min_df=5, max_df=0.4)),
    ('svd', TruncatedSVD(200)),
    ('clf', SGDClassifier(max_iter=200))
])

In [36]:
metrics_nmf_sgd, errors_bow_sgd = eval_table(data['description_norm'], data['category_name'], pipeline_nmf_sgd)
metrics_svd_sgd, errors_svd_sgd = eval_table(data['description_norm'], data['category_name'], pipeline_svd_sgd)

In [37]:
metrics_nmf_sgd

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
"Одежда, обувь, аксессуары",0.52,0.09,0.63,0.1,0.56,0.03
Мебель и интерьер,0.73,0.14,0.14,0.08,0.22,0.11
Бытовая техника,0.54,0.21,0.12,0.08,0.19,0.1
Предложение услуг,0.58,0.19,0.55,0.19,0.5,0.08
Детская одежда и обувь,0.63,0.09,0.62,0.09,0.62,0.02
Телефоны,0.7,0.1,0.49,0.09,0.57,0.03
Ремонт и строительство,0.56,0.13,0.19,0.08,0.27,0.08
Квартиры,0.76,0.2,0.91,0.07,0.81,0.14
Товары для детей и игрушки,0.6,0.11,0.39,0.09,0.46,0.04
Автомобили,0.73,0.28,0.71,0.16,0.66,0.2


In [38]:
metrics_svd_sgd

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
"Одежда, обувь, аксессуары",0.65,0.03,0.79,0.04,0.71,0.02
Мебель и интерьер,0.64,0.07,0.56,0.06,0.59,0.03
Бытовая техника,0.63,0.09,0.42,0.09,0.5,0.08
Предложение услуг,0.72,0.06,0.71,0.07,0.71,0.02
Детская одежда и обувь,0.77,0.05,0.69,0.03,0.73,0.02
Телефоны,0.81,0.09,0.78,0.03,0.79,0.04
Ремонт и строительство,0.55,0.09,0.44,0.08,0.48,0.04
Квартиры,0.95,0.03,0.95,0.02,0.95,0.01
Товары для детей и игрушки,0.68,0.11,0.63,0.08,0.65,0.04
Автомобили,0.85,0.05,0.89,0.05,0.87,0.02


In [39]:
metrics_nmf_sgd - metrics_svd_sgd

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
"Одежда, обувь, аксессуары",-0.13,0.06,-0.16,0.06,-0.15,0.01
Мебель и интерьер,0.09,0.07,-0.42,0.02,-0.37,0.08
Бытовая техника,-0.09,0.12,-0.3,-0.01,-0.31,0.02
Предложение услуг,-0.14,0.13,-0.16,0.12,-0.21,0.06
Детская одежда и обувь,-0.14,0.04,-0.07,0.06,-0.11,0.0
Телефоны,-0.11,0.01,-0.29,0.06,-0.22,-0.01
Ремонт и строительство,0.01,0.04,-0.25,0.0,-0.21,0.04
Квартиры,-0.19,0.17,-0.04,0.05,-0.14,0.13
Товары для детей и игрушки,-0.08,0.0,-0.24,0.01,-0.19,0.0
Автомобили,-0.12,0.23,-0.18,0.11,-0.21,0.18


По всем метрикам SVD гораздо лучше.

**KNeighborsClassifier**

In [136]:
pipeline_nmf_knc = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), ngram_range=(1,2), max_features = 2000)),
    ('nmf', NMF(300)),
    ('clf', KNeighborsClassifier())
])

pipeline_svd_knc = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), ngram_range=(1,2), max_features = 2000)),
    ('svd', TruncatedSVD(300)),
    ('clf', KNeighborsClassifier())
])

In [138]:
metrics_nmf_knc, errors_bow_knc = eval_table(data['description_norm'], data['category_name'], pipeline_nmf_knc)
metrics_svd_knc, errors_svd_knc = eval_table(data['description_norm'], data['category_name'], pipeline_svd_knc)

In [139]:
metrics_nmf_knc

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
"Одежда, обувь, аксессуары",0.48,0.03,0.59,0.02,0.53,0.01
Мебель и интерьер,0.31,0.07,0.25,0.05,0.27,0.06
Бытовая техника,0.19,0.03,0.2,0.02,0.19,0.02
Предложение услуг,0.47,0.04,0.46,0.08,0.46,0.06
Детская одежда и обувь,0.5,0.03,0.58,0.05,0.53,0.04
Телефоны,0.59,0.05,0.37,0.09,0.45,0.07
Ремонт и строительство,0.29,0.1,0.17,0.06,0.21,0.07
Квартиры,0.8,0.06,0.62,0.06,0.69,0.05
Товары для детей и игрушки,0.41,0.05,0.24,0.05,0.3,0.05
Автомобили,0.45,0.02,0.58,0.08,0.51,0.04


In [140]:
metrics_svd_knc

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
"Одежда, обувь, аксессуары",0.47,0.02,0.59,0.02,0.52,0.02
Мебель и интерьер,0.34,0.08,0.25,0.02,0.28,0.03
Бытовая техника,0.28,0.05,0.32,0.04,0.3,0.04
Предложение услуг,0.57,0.03,0.57,0.07,0.57,0.04
Детская одежда и обувь,0.49,0.02,0.6,0.04,0.54,0.03
Телефоны,0.68,0.11,0.33,0.04,0.44,0.05
Ремонт и строительство,0.28,0.05,0.15,0.03,0.2,0.03
Квартиры,0.93,0.02,0.75,0.04,0.83,0.02
Товары для детей и игрушки,0.49,0.07,0.24,0.02,0.32,0.03
Автомобили,0.58,0.08,0.61,0.05,0.6,0.06


In [141]:
metrics_nmf_knc - metrics_svd_knc

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
"Одежда, обувь, аксессуары",0.01,0.01,0.0,0.0,0.01,-0.01
Мебель и интерьер,-0.03,-0.01,0.0,0.03,-0.01,0.03
Бытовая техника,-0.09,-0.02,-0.12,-0.02,-0.11,-0.02
Предложение услуг,-0.1,0.01,-0.11,0.01,-0.11,0.02
Детская одежда и обувь,0.01,0.01,-0.02,0.01,-0.01,0.01
Телефоны,-0.09,-0.06,0.04,0.05,0.01,0.02
Ремонт и строительство,0.01,0.05,0.02,0.03,0.01,0.04
Квартиры,-0.13,0.04,-0.13,0.02,-0.14,0.03
Товары для детей и игрушки,-0.08,-0.02,0.0,0.03,-0.02,0.02
Автомобили,-0.13,-0.06,-0.03,0.03,-0.09,-0.02


По F-мере незначительно лучше SVD.

**RandomForestClassifier**

In [48]:
pipeline_nmf_rf = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), ngram_range=(1,2), min_df=5, max_df=0.4)),
    ('nmf', NMF(200)),
    ('clf', RandomForestClassifier(n_estimators=50, max_depth=10))
])

pipeline_svd_rf = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), ngram_range=(1,2), min_df=5, max_df=0.4)),
    ('svd', TruncatedSVD(200)),
    ('clf', RandomForestClassifier(n_estimators=50, max_depth=10))
])

In [49]:
metrics_nmf_rf, errors_bow_rf = eval_table(data['description_norm'], data['category_name'], pipeline_nmf_rf)
metrics_svd_rf, errors_svd_rf = eval_table(data['description_norm'], data['category_name'], pipeline_svd_rf)

In [50]:
metrics_nmf_rf

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
"Одежда, обувь, аксессуары",0.45,0.02,0.82,0.01,0.58,0.02
Мебель и интерьер,0.76,0.07,0.31,0.08,0.44,0.08
Бытовая техника,0.0,0.0,0.0,0.0,0.0,0.0
Предложение услуг,0.73,0.06,0.51,0.04,0.6,0.05
Детская одежда и обувь,0.67,0.02,0.68,0.02,0.67,0.02
Телефоны,0.81,0.08,0.61,0.08,0.69,0.06
Ремонт и строительство,0.78,0.2,0.05,0.03,0.1,0.05
Квартиры,0.9,0.02,0.94,0.02,0.92,0.02
Товары для детей и игрушки,0.86,0.06,0.37,0.04,0.52,0.05
Автомобили,0.9,0.04,0.75,0.07,0.81,0.04


In [51]:
metrics_svd_rf

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
"Одежда, обувь, аксессуары",0.47,0.01,0.73,0.02,0.57,0.01
Мебель и интерьер,0.78,0.23,0.06,0.02,0.1,0.03
Бытовая техника,0.84,0.2,0.08,0.06,0.14,0.1
Предложение услуг,0.68,0.07,0.59,0.05,0.63,0.04
Детская одежда и обувь,0.47,0.01,0.67,0.02,0.55,0.01
Телефоны,0.89,0.06,0.41,0.02,0.56,0.03
Ремонт и строительство,0.63,0.09,0.12,0.02,0.2,0.04
Квартиры,0.91,0.04,0.89,0.04,0.9,0.02
Товары для детей и игрушки,0.73,0.07,0.24,0.05,0.36,0.07
Автомобили,0.8,0.02,0.67,0.05,0.73,0.04


In [52]:
metrics_nmf_rf - metrics_svd_rf

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
"Одежда, обувь, аксессуары",-0.02,0.01,0.09,-0.01,0.01,0.01
Мебель и интерьер,-0.02,-0.16,0.25,0.06,0.34,0.05
Бытовая техника,-0.84,-0.2,-0.08,-0.06,-0.14,-0.1
Предложение услуг,0.05,-0.01,-0.08,-0.01,-0.03,0.01
Детская одежда и обувь,0.2,0.01,0.01,0.0,0.12,0.01
Телефоны,-0.08,0.02,0.2,0.06,0.13,0.03
Ремонт и строительство,0.15,0.11,-0.07,0.01,-0.1,0.01
Квартиры,-0.01,-0.02,0.05,-0.02,0.02,0.0
Товары для детей и игрушки,0.13,-0.01,0.13,-0.01,0.16,-0.02
Автомобили,0.1,0.02,0.08,0.02,0.08,0.0


По F-мере чуть лучше NMF.

**ExtraTreesClassifier**

In [53]:
pipeline_nmf_et = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), ngram_range=(1,2), min_df=7, max_df=0.4)),
    ('nmf', NMF(100)),
    ('clf', ExtraTreesClassifier(n_estimators = 50, max_depth=5))
])

pipeline_svd_et = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), ngram_range=(1,2), min_df=7, max_df=0.4)),
    ('svd', TruncatedSVD(100)),
    ('clf', ExtraTreesClassifier(n_estimators = 50, max_depth=5))
])

In [54]:
metrics_nmf_et, errors_bow_et = eval_table(data['description_norm'], data['category_name'], pipeline_nmf_et)
metrics_svd_et, errors_svd_et = eval_table(data['description_norm'], data['category_name'], pipeline_svd_et)

In [55]:
metrics_nmf_et

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
"Одежда, обувь, аксессуары",0.35,0.05,0.82,0.08,0.49,0.03
Мебель и интерьер,0.0,0.0,0.0,0.0,0.0,0.0
Бытовая техника,0.0,0.0,0.0,0.0,0.0,0.0
Предложение услуг,0.24,0.37,0.01,0.01,0.02,0.02
Детская одежда и обувь,0.44,0.1,0.56,0.05,0.49,0.05
Телефоны,1.0,0.0,0.06,0.03,0.12,0.05
Ремонт и строительство,0.17,0.41,0.01,0.01,0.01,0.03
Квартиры,0.96,0.02,0.64,0.1,0.77,0.06
Товары для детей и игрушки,0.95,0.06,0.07,0.04,0.13,0.07
Автомобили,1.0,0.0,0.12,0.05,0.21,0.08


In [56]:
metrics_svd_et

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
"Одежда, обувь, аксессуары",0.34,0.01,0.78,0.05,0.47,0.01
Мебель и интерьер,0.33,0.52,0.01,0.01,0.01,0.02
Бытовая техника,0.8,0.4,0.04,0.03,0.07,0.05
Предложение услуг,0.17,0.41,0.0,0.01,0.01,0.02
Детская одежда и обувь,0.37,0.03,0.59,0.02,0.45,0.02
Телефоны,1.0,0.0,0.06,0.02,0.11,0.04
Ремонт и строительство,0.82,0.22,0.03,0.01,0.05,0.03
Квартиры,0.83,0.08,0.25,0.06,0.38,0.07
Товары для детей и игрушки,0.78,0.29,0.03,0.02,0.07,0.04
Автомобили,0.98,0.06,0.06,0.02,0.11,0.04


In [57]:
metrics_nmf_et - metrics_svd_et

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
"Одежда, обувь, аксессуары",0.01,0.04,0.04,0.03,0.02,0.02
Мебель и интерьер,-0.33,-0.52,-0.01,-0.01,-0.01,-0.02
Бытовая техника,-0.8,-0.4,-0.04,-0.03,-0.07,-0.05
Предложение услуг,0.07,-0.04,0.01,0.0,0.01,0.0
Детская одежда и обувь,0.07,0.07,-0.03,0.03,0.04,0.03
Телефоны,0.0,0.0,0.0,0.01,0.01,0.01
Ремонт и строительство,-0.65,0.19,-0.02,0.0,-0.04,0.0
Квартиры,0.13,-0.06,0.39,0.04,0.39,-0.01
Товары для детей и игрушки,0.17,-0.23,0.04,0.02,0.06,0.03
Автомобили,0.02,-0.06,0.06,0.03,0.1,0.04


По F-мере незначительно лучше NMF.

### Задание № 2 (6 баллов)

В Gensim тоже можно добавить нграммы и tfidf. Постройте 1 модель без них (как в семинаре) и еще 3 модели (1 с нграммами, 1 с tfidf и 1 с нграммами и с tfidf). Сранивте качество с помощью метрик (перплексия, когерентность) и на глаз. Определите лучшую модель. Для каждой модели выберите 1 самую красивую на ваш взгляд тему.

Используйте данные википедии из семинара. Можете взять поменьше данных, если все обучается долго.

Важное требование - получившиеся модели не должны быть совсем плохими. Если хороших тем не получается, попробуйте настроить гиперпараметры, отфильтровать словарь по-другому. 

**Обычная модель**

In [68]:
texts = open('wiki_data.txt', encoding='utf-8').read().splitlines()[:5000]
texts = ([normalize(text) for text in texts])

In [77]:
texts = [text.split() for text in texts]

In [78]:
dictionary = gensim.corpora.Dictionary(texts)
dictionary.filter_extremes(no_above=0.1, no_below=10)
dictionary.compactify()

In [80]:
corpus = [dictionary.doc2bow(text) for text in texts]

In [81]:
lda = gensim.models.LdaMulticore(corpus, 
                                 100,
                                 alpha='asymmetric',
                                 id2word=dictionary, 
                                 passes=10) 

In [82]:
lda.print_topics()

[(99,
  '0.013*"доска" + 0.008*"пение" + 0.006*"школа" + 0.006*"тропа" + 0.006*"памятник" + 0.005*"голосование" + 0.005*"сша" + 0.005*"скульптор" + 0.005*"нидерландский" + 0.005*"премьер-министр"'),
 (98,
  '0.057*"река" + 0.041*"км" + 0.039*"озеро" + 0.032*"берег" + 0.013*"сельский" + 0.012*"поселение" + 0.010*"высота" + 0.010*"численность" + 0.009*"флаг" + 0.009*"иметься"'),
 (97,
  '0.028*"университет" + 0.015*"строительство" + 0.009*"мост" + 0.009*"здание" + 0.008*"студент" + 0.008*"проект" + 0.007*"общество" + 0.007*"рубль" + 0.007*"дом" + 0.006*"построить"'),
 (96,
  '0.038*"театр" + 0.014*"сергей" + 0.014*"александр" + 0.012*"дом" + 0.010*"московский" + 0.010*"москва" + 0.009*"дмитрий" + 0.009*"русский" + 0.009*"театральный" + 0.008*"спектакль"'),
 (95,
  '0.014*"штат" + 0.009*"опера" + 0.008*"партия" + 0.007*"задание" + 0.006*"оперный" + 0.006*"способность" + 0.005*"предприятие" + 0.005*"телеканал" + 0.005*"сеть" + 0.004*"италия"'),
 (94,
  '0.026*"казахский" + 0.024*"сср" + 0.

**Модель с n-граммами**

In [84]:
ph = gensim.models.Phrases(texts, scoring='npmi', threshold=0.4)
p = gensim.models.phrases.Phraser(ph)
ngrammed_texts = p[texts] 

In [85]:
dictionary_ngram = gensim.corpora.Dictionary(ngrammed_texts)
dictionary_ngram.filter_extremes(no_above=0.1, no_below=10)
dictionary_ngram.compactify()

In [86]:
corpus_ngrams = [dictionary_ngram.doc2bow(text) for text in texts]

In [87]:
lda_ngrams = gensim.models.LdaMulticore(corpus, 
                                        100,
                                        alpha='asymmetric',
                                        id2word=dictionary, 
                                        passes=10) 

In [88]:
lda_ngrams.print_topics()

[(99,
  '0.030*"военный" + 0.020*"суд" + 0.017*"ян" + 0.014*"должность" + 0.013*"лихтенштейн" + 0.013*"прокурор" + 0.011*"судья" + 0.011*"прокуратура" + 0.011*"израиль" + 0.010*"главное"'),
 (97,
  '0.029*"церковь" + 0.021*"храм" + 0.014*"памятник" + 0.013*"епархия" + 0.012*"собор" + 0.012*"здание" + 0.011*"культура" + 0.011*"епископ" + 0.010*"православный" + 0.009*"религиозный"'),
 (98,
  '0.054*"волость" + 0.023*"1-й" + 0.018*"брянский" + 0.014*"2-й" + 0.013*"венесуэла" + 0.011*"уезд" + 0.011*"образовать" + 0.011*"1977" + 0.011*"округ" + 0.010*"1978"'),
 (96,
  '0.029*"смотреть" + 0.025*"длина" + 0.023*"мм" + 0.016*"лист" + 0.016*"диаметр" + 0.015*"дерево" + 0.014*"высота" + 0.014*"эвкалипт" + 0.011*"ширина" + 0.011*"семейство"'),
 (95,
  '0.018*"игрок" + 0.015*"уровень" + 0.014*"угол" + 0.010*"каждый" + 0.009*"приём" + 0.008*"направление" + 0.008*"измерение" + 0.007*"игровой" + 0.007*"способ" + 0.007*"друг"'),
 (93,
  '0.065*"зимний" + 0.020*"1994" + 0.016*"2006" + 0.016*"чили" + 0.

**Модель с tf-idf**

In [89]:
tfidf = gensim.models.TfidfModel(corpus, id2word=dictionary, )
corpus_tfidf = tfidf[corpus]

In [90]:
lda_tfidf = gensim.models.LdaMulticore(corpus_tfidf, 
                                        100,
                                        alpha='asymmetric',
                                        id2word=dictionary, 
                                        passes=10)

In [92]:
lda_tfidf.print_topics()

[(99,
  '0.000*"рыцарский" + 0.000*"крест" + 0.000*"командир" + 0.000*"1944" + 0.000*"дубовый" + 0.000*"танковый" + 0.000*"наградить" + 0.000*"бой" + 0.000*"генерал-майор" + 0.000*"вернер"'),
 (98,
  '0.000*"хабаровский" + 0.000*"культура" + 0.000*"еврейский" + 0.000*"керамика" + 0.000*"э" + 0.000*"пещера" + 0.000*"регион" + 0.000*"автономный" + 0.000*"александр" + 0.000*"назад"'),
 (97,
  '0.009*"белград" + 0.000*"тим" + 0.000*"университет" + 0.000*"концлагерь" + 0.000*"грузовик" + 0.000*"лагерь" + 0.000*"ричард" + 0.000*"де" + 0.000*"снаряд" + 0.000*"ирландия"'),
 (96,
  '0.071*"полюс" + 0.000*"епархия" + 0.000*"нью-йоркский" + 0.000*"экспедиция" + 0.000*"руб" + 0.000*"собака" + 0.000*"церковь" + 0.000*"я" + 0.000*"кинокритик" + 0.000*"york"'),
 (94,
  '0.026*"файл" + 0.006*"вставка" + 0.003*"директива" + 0.000*"цикл" + 0.000*"беженец" + 0.000*"сериал" + 0.000*"параметр" + 0.000*"сыграть" + 0.000*"оон" + 0.000*"альбом"'),
 (95,
  '0.001*"переходный" + 0.000*"мост" + 0.000*"хутор" + 0

**Модель с n-граммами и tf-idf**

In [93]:
corpus_ngrams_tfidf = tfidf[corpus_ngrams]

In [94]:
lda_ngrams_tfidf = gensim.models.LdaMulticore(corpus, 
                                        100,
                                        alpha='asymmetric',
                                        id2word=dictionary, 
                                        passes=10)

In [95]:
lda_ngrams_tfidf.print_topics()

[(99,
  '0.021*"билет" + 0.011*"портрет" + 0.011*"выпуск" + 0.011*"рубль" + 0.009*"образец" + 0.008*"печать" + 0.008*"сторона" + 0.008*"знак" + 0.007*"макс" + 0.007*"серия"'),
 (98,
  '0.032*"лагерь" + 0.018*"проведение" + 0.015*"№" + 0.015*"ярославль" + 0.012*"ян" + 0.011*"заключить" + 0.010*"дата" + 0.010*"концентрационный" + 0.009*"акция" + 0.008*"городской"'),
 (97,
  '0.040*"музей" + 0.023*"дом" + 0.014*"художник" + 0.012*"искусство" + 0.012*"улица" + 0.012*"н" + 0.011*"произведение" + 0.010*"здание" + 0.009*"москва" + 0.009*"общество"'),
 (96,
  '0.029*"снукер" + 0.026*"lotus" + 0.020*"литературный" + 0.019*"писатель" + 0.013*"одесский" + 0.012*"одесса" + 0.012*"союз" + 0.011*"чемпионат" + 0.010*"профессиональный" + 0.009*"английский"'),
 (95,
  '0.021*"россия" + 0.011*"российский" + 0.009*"опера" + 0.009*"премия" + 0.008*"экономический" + 0.008*"смит" + 0.007*"международный" + 0.007*"общество" + 0.007*"музыкальный" + 0.007*"школа"'),
 (94,
  '0.058*"князь" + 0.027*"василий" + 0.

**Сравнение результатов по метрикам**

In [109]:
def coherence(lda):
    topics = []
    for topic_id, topic in lda.show_topics(num_topics=100, formatted=False):
        topic = [word for word, _ in topic]
        topics.append(topic)
        
    coherence_model_lda = gensim.models.CoherenceModel(topics=topics, 
                                                   texts=texts, 
                                                   dictionary=dictionary, coherence='c_v')
    return coherence_model_lda.get_coherence()

In [97]:
def perplexity(corpus, lda):
    return np.exp2(-lda.log_perplexity(corpus[:1000]))

In [120]:
comparison = pd.DataFrame()
comparison['Обычная модель'] = [perplexity(corpus, lda), coherence(lda)]
comparison['N-gramms'] = [perplexity(corpus, lda_ngrams), coherence(lda_ngrams)]
comparison['Tf-idf'] = [perplexity(corpus, lda_tfidf), coherence(lda_tfidf)]
comparison['N-gramms_tf_idf'] = [perplexity(corpus, lda_ngrams_tfidf), coherence(lda_ngrams_tfidf)]

In [126]:
comparison.rename({0 : 'Перплексия', 1: 'Когерентность'}).apply(lambda x: round(x, 3))

Unnamed: 0,Обычная модель,N-gramms,Tf-idf,N-gramms_tf_idf
Перплексия,5631.453,5612.887,1881.794,5579.385
Когерентность,0.536,0.525,0.384,0.547


**Самые красивые темы**

*Обычная модель (тема - политика)*: 

In [128]:
lda.print_topics()[-3]

(2,
 '0.015*"партия" + 0.009*"правительство" + 0.007*"политический" + 0.006*"власть" + 0.005*"политика" + 0.005*"против" + 0.004*"движение" + 0.004*"право" + 0.004*"организация" + 0.004*"выборы"')

*Модель с n-граммами (тема - религия)*:

In [130]:
lda_ngrams.print_topics()[1]

(97,
 '0.029*"церковь" + 0.021*"храм" + 0.014*"памятник" + 0.013*"епархия" + 0.012*"собор" + 0.012*"здание" + 0.011*"культура" + 0.011*"епископ" + 0.010*"православный" + 0.009*"религиозный"')

*Модель с tf-ifd (тема - война, армия)*:

In [132]:
lda_tfidf.print_topics()[0]

(99,
 '0.000*"рыцарский" + 0.000*"крест" + 0.000*"командир" + 0.000*"1944" + 0.000*"дубовый" + 0.000*"танковый" + 0.000*"наградить" + 0.000*"бой" + 0.000*"генерал-майор" + 0.000*"вернер"')

*Модель с n-граммами и tf-idf (тема - типография)*:

In [135]:
lda_ngrams_tfidf.print_topics()[0]

(99,
 '0.021*"билет" + 0.011*"портрет" + 0.011*"выпуск" + 0.011*"рубль" + 0.009*"образец" + 0.008*"печать" + 0.008*"сторона" + 0.008*"знак" + 0.007*"макс" + 0.007*"серия"')

**Лучшая модель**

Каждая из моделей выдает одну или несколько адекватных тем. Если смотреть глазами, выбрать лучшую сложно.
С учетом метрик я бы сказал, что с небольшим отрывом побеждает модель с n-граммами и tf-idf.