In [3]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
!pip  install pyLDAvis



In [4]:
import nltk; nltk.download('stopwords')


!python3 -m spacy download en

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [5]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [6]:
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
print(df.target_names.unique())
df.head()

['rec.autos' 'comp.sys.mac.hardware' 'comp.graphics' 'sci.space'
 'talk.politics.guns' 'sci.med' 'comp.sys.ibm.pc.hardware'
 'comp.os.ms-windows.misc' 'rec.motorcycles' 'talk.religion.misc'
 'misc.forsale' 'alt.atheism' 'sci.electronics' 'comp.windows.x'
 'rec.sport.hockey' 'rec.sport.baseball' 'soc.religion.christian'
 'talk.politics.mideast' 'talk.politics.misc' 'sci.crypt']


Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


In [7]:
# Convert to list
data = df.content.values.tolist()


#YOURCODE

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

pprint(data[:1])

['From: (wheres my thing) Subject: WHAT car is this!? Nntp-Posting-Host: '
 'rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: '
 '15 I was wondering if anyone out there could enlighten me on this car I saw '
 'the other day. It was a 2-door sports car, looked to be from the late 60s/ '
 'early 70s. It was called a Bricklin. The doors were really small. In '
 'addition, the front bumper was separate from the rest of the body. This is '
 'all I know. If anyone can tellme a model name, engine specs, years of '
 'production, where this car is made, history, or whatever info you have on '
 'this funky looking car, please e-mail. Thanks, - IL ---- brought to you by '
 'your neighborhood Lerxst ---- ']


In [8]:
#tikenize

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])

[['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp', 'posting', 'host', 'rac', 'wam', 'umd', 'edu', 'organization', 'university', 'of', 'maryland', 'college', 'park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']]


In [10]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])



['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp_posting_host', 'rac_wam_umd_edu', 'organization', 'university', 'of', 'maryland_college_park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front_bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']


In [11]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [12]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['where', 'thing', 'car', 'nntp_poste', 'host', 'park', 'line', 'wonder', 'could', 'enlighten', 'car', 'see', 'day', 'door', 'sport', 'car', 'look', 'late', 'early', 'call', 'bricklin', 'door', 'really', 'small', 'addition', 'separate', 'rest', 'body', 'know', 'tellme', 'model', 'name', 'engine', 'year', 'production', 'car', 'make', 'history', 'info', 'funky', 'look', 'car', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst']]


In [13]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 5), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 2), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1)]]


Кажется, маллет не сработал

In [14]:
# Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
mallet_path = r'C:\Users\Лера\Downloads\mallet-2.0.8\bin\mallet' # update this path
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word)


# Show Topics
pprint(ldamallet.show_topics(formatted=False))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


CalledProcessError: ignored

In [14]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('addition', 1),
  ('body', 1),
  ('bricklin', 1),
  ('bring', 1),
  ('call', 1),
  ('car', 5),
  ('could', 1),
  ('day', 1),
  ('door', 2),
  ('early', 1),
  ('engine', 1),
  ('enlighten', 1),
  ('funky', 1),
  ('history', 1),
  ('host', 1),
  ('info', 1),
  ('know', 1),
  ('late', 1),
  ('lerxst', 1),
  ('line', 1),
  ('look', 2),
  ('mail', 1),
  ('make', 1),
  ('model', 1),
  ('name', 1),
  ('neighborhood', 1),
  ('nntp_poste', 1),
  ('park', 1),
  ('production', 1),
  ('really', 1),
  ('rest', 1),
  ('see', 1),
  ('separate', 1),
  ('small', 1),
  ('sport', 1),
  ('tellme', 1),
  ('thank', 1),
  ('thing', 1),
  ('where', 1),
  ('wonder', 1),
  ('year', 1)]]

In [15]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [16]:
def optimum_topics(num_topics):
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    num_acc = []
    num_acc.append(num_topics)
    num_acc.append(coherence_lda)
    return num_acc

In [42]:
def compare_accuracy (list_of_accs):
  values = []
  for el in list_of_accs:
      values.append(el[1])
  maximum = max(values)
  return maximum

In [49]:
great_list = []
for i in range (10, 30, 5):
    list_of_accs = optimum_topics(i)
    great_list.append(list_of_accs)
    print('ready')

ready
ready
ready
ready


In [50]:
great_list

[[10, 0.4973497938960755],
 [15, 0.44582805103485196],
 [20, 0.4392813747423439],
 [25, 0.431570330596339]]

In [51]:
maximum = compare_accuracy(great_list)
for el in great_list:
  if el[1] == maximum:
    print('Оптимальное число групп ', el[0])

Оптимальное число групп  10


In [58]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.028*"team" + 0.027*"year" + 0.026*"game" + 0.020*"play" + 0.018*"win" + '
  '0.014*"player" + 0.010*"run" + 0.010*"last" + 0.009*"good" + 0.009*"hit"'),
 (1,
  '0.022*"go" + 0.015*"time" + 0.013*"day" + 0.012*"come" + 0.011*"take" + '
  '0.011*"back" + 0.011*"get" + 0.009*"say" + 0.008*"see" + 0.008*"first"'),
 (2,
  '0.014*"space" + 0.008*"cost" + 0.008*"year" + 0.007*"high" + '
  '0.007*"research" + 0.007*"low" + 0.006*"item" + 0.006*"also" + 0.006*"test" '
  '+ 0.005*"large"'),
 (3,
  '0.033*"car" + 0.021*"drive" + 0.013*"bike" + 0.011*"power" + 0.011*"wire" + '
  '0.011*"slave" + 0.010*"reality" + 0.009*"speed" + 0.009*"engine" + '
  '0.009*"light"'),
 (4,
  '0.090*"ax" + 0.077*"max" + 0.018*"di_di" + 0.015*"tumor" + '
  '0.012*"homosexual" + 0.011*"gay" + 0.009*"taste" + 0.008*"liar" + '
  '0.007*"marry" + 0.006*"homosexuality"'),
 (5,
  '0.017*"government" + 0.013*"people" + 0.012*"gun" + 0.011*"state" + '
  '0.010*"kill" + 0.008*"year" + 0.007*"public" + 0.007*"attack"

In [17]:
topics = lda_model.show_topics(10, num_words=50, formatted=False)

In [18]:
from collections import Counter

In [19]:
texts_and_topics = []
n=0
for text in texts:
    n+=1
    counter = Counter()
    for word in text:
        for topic in topics:
            topic_id = topic[0]
            for i in range(len(topic[1])):
                if topic[1][i][0] == word:
                    counter[topic_id]+= topic[1][i][1]
    list_of_commons = list(counter.most_common())
    max = 0
    new_i = 0
    for i in list_of_commons:
      if i[1] > max:
        max = i[1]
        new_i = i[0]
      new_list = []
      new_list.append(n)
      new_list.append(new_i)
    texts_and_topics.append(new_list)

In [20]:
topic_1 = []
topic_2 = []
topic_3 = []
topic_4 = []
topic_5 = []
topic_6 = []
topic_7 = []
topic_8 = []
topic_9 = []
topic_10 = []
for el in texts_and_topics:
  if el[1] == 0:
    topic_1.append(el[0])
  if el[1] == 1:
    topic_2.append(el[0])
  if el[1] == 2:
    topic_3.append(el[0])
  if el[1] == 3:
    topic_4.append(el[0])
  if el[1] == 4:
    topic_5.append(el[0])
  if el[1] == 5:
    topic_6.append(el[0])
  if el[1] == 6:
    topic_7.append(el[0])
  if el[1] == 7:
    topic_8.append(el[0])
  if el[1] == 8:
    topic_9.append(el[0])
  if el[1] == 9:
    topic_10.append(el[0])

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
def idx_to_texts(topic, text):
  texts = []
  new_idx = []
  for el in topic:
      l = el - 1
      new_idx.append(l)
  for i in topic:
      texts.append()

In [23]:
new_idx_1 = []
for i in topic_1:
    k = i - 1
    new_idx_1.append(k)
texts_1 = []
for i in new_idx_1:
    texts_1.append(texts[i])

new_idx_2 = []
for i in topic_2:
    k = i - 1
    new_idx_2.append(k)
texts_2 = []
for i in new_idx_1:
    texts_2.append(texts[i])

new_idx_3 = []
for i in topic_3:
    k = i - 1
    new_idx_3.append(k)
texts_3 = []
for i in new_idx_3:
    texts_3.append(texts[i])

new_idx_4 = []
for i in topic_4:
    k = i - 1
    new_idx_4.append(k)
texts_4 = []
for i in new_idx_4:
    texts_4.append(texts[i])

new_idx_5 = []
for i in topic_5:
    k = i - 1
    new_idx_5.append(k)
texts_5 = []
for i in new_idx_5:
    texts_5.append(texts[i])

new_idx_6 = []
for i in topic_6:
    k = i - 1
    new_idx_6.append(k)
texts_6 = []
for i in new_idx_6:
    texts_6.append(texts[i])

new_idx_7 = []
for i in topic_7:
    k = i - 1
    new_idx_7.append(k)
texts_7 = []
for i in new_idx_7:
    texts_7.append(texts[i])

new_idx_8= []
for i in topic_8:
    k = i - 1
    new_idx_8.append(k)
texts_8 = []
for i in new_idx_8:
    texts_8.append(texts[i])

new_idx_9 = []
for i in topic_9:
    k = i - 1
    new_idx_9.append(k)
texts_9 = []
for i in new_idx_9:
    texts_9.append(texts[i])

new_idx_10 = []
for i in topic_10:
    k = i - 1
    new_idx_10.append(k)
texts_10 = []
for i in new_idx_10:
    texts_10.append(texts[i])

In [24]:
list_of_texts = [texts_1, texts_2, texts_3, texts_4, texts_5, texts_6, texts_7, texts_8, texts_9, texts_10]

In [25]:
#list_of_dfs_1 = []
texts_1_1 = []
for text in texts_1:
    t = ' '.join(text)
    #print(t)
    texts_1_1.append(t)
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(texts_1_1)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)
#list_of_dfs_1.append(df)

In [26]:
df

Unnamed: 0,aaa,ab,abandon,abberation,abbot,abbotts,abbreviation,ability,able,abs,absolut,absolute,absolutely,absorbing,abstract,absurd,abuse,accelerator,accept,acceptable,acceptably,access,accident,acclimate,accolade,accompany,accomplish,accomplished,accomplishment,accord,according,account,accrue,accuracy,accurate,accurately,accuse,ace,achievement,acker,...,write,writer,writing,wrong,ws,wt,wtem,wwl,wynn,yake,yank,yankee,yankke,yankovic,yard,yawney,year,years_ago,yesterday,yet,yield,yike,yl_nen,young,youngbucs,youngster,youth,ysebaert,yserbeart,yuck,yup,yve,yzerman,zalapski,zap,zaphod,zombie,zombo,zone,zubov
0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.072540,0.0,0.272642,0.0,0.0,0.0,0.0,0.000000,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.096710,0.0,0.000000,0.0,0.0,0.0,0.0,0.101994,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.07123,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.057043,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.245210,0.0,0.000000,0.0,0.0,0.0,0.0,0.172404,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.129136,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.054654,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.341661,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.050039,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.17083,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.044636,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
438,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.278412,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
439,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.118828,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.157460,0.0,0.131515,0.0,0.0,0.0,0.0,0.000000,0.0,0.00000,0.0,0.0,0.0,0.178576,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
440,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.106617,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.045124,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043105,0.0,0.108007,0.0,0.0,0.0,0.0,0.000000,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
summas = df.sum(axis=1) #tf-idf для каждого текста в texts_1

In [28]:
summas

0       8.150972
1       5.842721
2       8.061653
3       5.861019
4       7.111053
         ...    
437     6.472547
438     3.559441
439     6.103233
440     6.885943
441    10.277057
Length: 442, dtype: float64

In [29]:
df['summa'] = summas

In [30]:
texts_2_2 = []
for text in texts_2:
    t = ' '.join(text)
    #print(t)
    texts_2_2.append(t)
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(texts_2_2)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df2 = pd.DataFrame(denselist, columns=feature_names)

In [31]:
summas2 = df2.sum(axis=1) #tf-idf для каждого текста в texts_2
df2['summa'] = summas2

In [32]:
texts_3_3 = []
for text in texts_3:
    t = ' '.join(text)
    #print(t)
    texts_3_3.append(t)
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(texts_3_3)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df3 = pd.DataFrame(denselist, columns=feature_names)

In [33]:
summas3 = df3.sum(axis=1) #tf-idf для каждого текста в texts_3
df3['summa'] = summas3

In [34]:
texts_4_4 = []
for text in texts_4:
    t = ' '.join(text)
    #print(t)
    texts_4_4.append(t)
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(texts_4_4)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df4 = pd.DataFrame(denselist, columns=feature_names)

In [35]:
summas4 = df4.sum(axis=1) #tf-idf для каждого текста в texts_4
df4['summa'] = summas4

In [36]:
texts_5_5 = []
for text in texts_5:
    t = ' '.join(text)
    #print(t)
    texts_5_5.append(t)
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(texts_5_5)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df5 = pd.DataFrame(denselist, columns=feature_names)

In [38]:
summas5 = df5.sum(axis=1) #tf-idf для каждого текста в texts_5
df5['summa'] = summas5

In [39]:
texts_6_6 = []
for text in texts_6:
    t = ' '.join(text)
    #print(t)
    texts_6_6.append(t)
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(texts_6_6)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df6 = pd.DataFrame(denselist, columns=feature_names)

In [40]:
summas6 = df6.sum(axis=1) #tf-idf для каждого текста в texts_6
df6['summa'] = summas6

In [41]:
texts_7_7 = []
for text in texts_7:
    t = ' '.join(text)
    #print(t)
    texts_7_7.append(t)
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(texts_7_7)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df7 = pd.DataFrame(denselist, columns=feature_names)

In [42]:
summas7 = df7.sum(axis=1) #tf-idf для каждого текста в texts_7
df7['summa'] = summas7

In [None]:
texts_8_8 = []
for text in texts_8:
    t = ' '.join(text)
    #print(t)
    texts_8_8.append(t)
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(texts_8_8)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df8 = pd.DataFrame(denselist, columns=feature_names)

In [None]:
summas8 = df8.sum(axis=1) #tf-idf для каждого текста в texts_8
df8['summa'] = summas8

In [None]:
texts_9_9 = []
for text in texts_9:
    t = ' '.join(text)
    #print(t)
    texts_9_9.append(t)
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(texts_9_9)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df9 = pd.DataFrame(denselist, columns=feature_names)

In [None]:
summas9 = df9.sum(axis=1) #tf-idf для каждого текста в texts_9
df9['summa'] = summas9

In [None]:
texts_10_10 = []
for text in texts_10:
    t = ' '.join(text)
    #print(t)
    texts_10_10.append(t)
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(texts_10_10)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df10 = pd.DataFrame(denselist, columns=feature_names)

In [None]:
summas10 = df10.sum(axis=1) #tf-idf для каждого текста в texts_10
df10['summa'] = summas10

Coherence score: 
мера того, насколько хорошо выполнено тематическое моделирование. Coherence score темы оценивают одну тему, измеряя степень семантического сходства между словами с высокой оценкой в этой теме. Чем выше Coherence score, тем лучше выполнено тематическое моделирование.
