In [50]:
import nltk; nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ultra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [125]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

import en_core_web_sm
from collections import Counter

In [3]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [4]:
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
print(df.target_names.unique())
df.head()

['rec.autos' 'comp.sys.mac.hardware' 'rec.motorcycles' 'misc.forsale'
 'comp.os.ms-windows.misc' 'alt.atheism' 'comp.graphics'
 'rec.sport.baseball' 'rec.sport.hockey' 'sci.electronics' 'sci.space'
 'talk.politics.misc' 'sci.med' 'talk.politics.mideast'
 'soc.religion.christian' 'comp.windows.x' 'comp.sys.ibm.pc.hardware'
 'talk.politics.guns' 'talk.religion.misc' 'sci.crypt']


Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
10,From: irwin@cmptrc.lonestar.org (Irwin Arnstei...,8,rec.motorcycles
100,From: tchen@magnus.acs.ohio-state.edu (Tsung-K...,6,misc.forsale
1000,From: dabl2@nlm.nih.gov (Don A.B. Lindbergh)\n...,2,comp.os.ms-windows.misc


In [5]:
# Convert to list
data = df.content.values.tolist()


#YOURCODE

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

pprint(data[:1])

['From: (wheres my thing) Subject: WHAT car is this!? Nntp-Posting-Host: '
 'rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: '
 '15 I was wondering if anyone out there could enlighten me on this car I saw '
 'the other day. It was a 2-door sports car, looked to be from the late 60s/ '
 'early 70s. It was called a Bricklin. The doors were really small. In '
 'addition, the front bumper was separate from the rest of the body. This is '
 'all I know. If anyone can tellme a model name, engine specs, years of '
 'production, where this car is made, history, or whatever info you have on '
 'this funky looking car, please e-mail. Thanks, - IL ---- brought to you by '
 'your neighborhood Lerxst ---- ']


In [6]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])

[['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp', 'posting', 'host', 'rac', 'wam', 'umd', 'edu', 'organization', 'university', 'of', 'maryland', 'college', 'park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']]


In [7]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp_posting_host', 'rac_wam_umd_edu', 'organization', 'university', 'of', 'maryland_college_park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front_bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']


In [8]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [9]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
#nlp = spacy.load('en', disable=['parser', 'ner'])
nlp = spacy.load("en_core_web_sm")

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['where', 'thing', 'car', 'nntp_poste', 'host', 'park', 'line', 'wonder', 'could', 'enlighten', 'car', 'see', 'day', 'door', 'sport', 'car', 'look', 'late', 'early', 'call', 'door', 'really', 'small', 'addition', 'separate', 'rest', 'body', 'know', 'model', 'name', 'engine', 'spec', 'year', 'production', 'car', 'make', 'history', 'info', 'funky', 'look', 'car', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst']]


In [10]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 5), (5, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 2), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1)]]


In [11]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('addition', 1),
  ('body', 1),
  ('bring', 1),
  ('call', 1),
  ('car', 5),
  ('could', 1),
  ('day', 1),
  ('door', 2),
  ('early', 1),
  ('engine', 1),
  ('enlighten', 1),
  ('funky', 1),
  ('history', 1),
  ('host', 1),
  ('info', 1),
  ('know', 1),
  ('late', 1),
  ('lerxst', 1),
  ('line', 1),
  ('look', 2),
  ('mail', 1),
  ('make', 1),
  ('model', 1),
  ('name', 1),
  ('neighborhood', 1),
  ('nntp_poste', 1),
  ('park', 1),
  ('production', 1),
  ('really', 1),
  ('rest', 1),
  ('see', 1),
  ('separate', 1),
  ('small', 1),
  ('spec', 1),
  ('sport', 1),
  ('thank', 1),
  ('thing', 1),
  ('where', 1),
  ('wonder', 1),
  ('year', 1)]]

In [19]:
mallet_path = '/mallet-2.0.8/bin/mallet' # update this path
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word)

CalledProcessError: Command '/mallet-2.0.8/bin/mallet import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "\S+" --input C:\Users\ultra\AppData\Local\Temp\65eb86_corpus.txt --output C:\Users\ultra\AppData\Local\Temp\65eb86_corpus.mallet' returned non-zero exit status 1.

как видите сверху, возвращает non-zero exit status 1, так что вот...не работает. я пыталась!

In [20]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [21]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.122*"drug" + 0.083*"door" + 0.051*"pin" + 0.046*"panel" + 0.046*"patient" '
  '+ 0.045*"medicine" + 0.032*"acquire" + 0.028*"programmer" + 0.023*"justify" '
  '+ 0.022*"newsletter"'),
 (1,
  '0.125*"line" + 0.055*"host" + 0.051*"nntp_poste" + 0.035*"thank" + '
  '0.029*"organization" + 0.028*"post" + 0.023*"reply" + 0.022*"mail" + '
  '0.021*"help" + 0.020*"send"'),
 (2,
  '0.054*"use" + 0.035*"system" + 0.022*"run" + 0.022*"problem" + '
  '0.020*"information" + 0.020*"also" + 0.018*"available" + 0.018*"bit" + '
  '0.017*"work" + 0.017*"program"'),
 (3,
  '0.043*"price" + 0.042*"cost" + 0.041*"high" + 0.038*"low" + 0.033*"sale" + '
  '0.032*"sell" + 0.025*"new" + 0.025*"machine" + 0.023*"rate" + '
  '0.022*"product"'),
 (4,
  '0.121*"key" + 0.045*"encryption" + 0.030*"bus" + 0.028*"development" + '
  '0.026*"scripture" + 0.026*"security" + 0.025*"clipper" + 0.024*"homosexual" '
  '+ 0.024*"technology" + 0.024*"marriage"'),
 (5,
  '0.109*"law" + 0.088*"gun" + 0.071*"government

In [22]:
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.4494927348421709


In [25]:
def opti(x, y):
    ddd = {}
    for i in range(x, y):
        lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=i, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)
        coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()
        ddd[i]= coherence_lda
    return ddd

In [29]:
ddd = opti(15,26)

In [32]:
max(ddd, key=ddd.get) #cамое оптимальное число топиков

17

In [12]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=17, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [28]:
pprint(lda_model.print_topics())

[(0,
  '0.079*"trial" + 0.075*"safety" + 0.067*"pin" + 0.060*"panel" + '
  '0.026*"identity" + 0.016*"warm" + 0.011*"vw" + 0.000*"ground" + '
  '0.000*"wire" + 0.000*"witness"'),
 (1,
  '0.061*"phone" + 0.052*"graphic" + 0.049*"science" + 0.042*"box" + '
  '0.040*"monitor" + 0.030*"blue" + 0.028*"bus" + 0.024*"king" + 0.023*"pain" '
  '+ 0.019*"band"'),
 (2,
  '0.042*"use" + 0.032*"problem" + 0.026*"run" + 0.025*"card" + 0.025*"system" '
  '+ 0.024*"drive" + 0.022*"window" + 0.021*"work" + 0.019*"bit" + '
  '0.016*"set"'),
 (3,
  '0.072*"law" + 0.057*"gun" + 0.029*"rate" + 0.027*"protect" + '
  '0.026*"package" + 0.025*"police" + 0.025*"criminal" + 0.020*"serve" + '
  '0.020*"black" + 0.019*"arrest"'),
 (4,
  '0.038*"life" + 0.034*"man" + 0.027*"child" + 0.025*"die" + '
  '0.020*"christian" + 0.020*"kill" + 0.018*"human" + 0.016*"love" + '
  '0.016*"death" + 0.014*"believe"'),
 (5,
  '0.077*"posting" + 0.045*"medical" + 0.035*"patient" + 0.029*"treatment" + '
  '0.028*"disease" + 0.027

In [61]:
dic_t = {}
for idt, topic in lda_model.show_topics(formatted=False, num_topics=17):
    dic_t[idt] = {t[0]:t[1] for t in topic} #словарь {id топика: {слово:вес}}

In [154]:
ans = [] #список каунтеров для каждого текста
only_topic_ans = [] #список из самых встречающихся топиков текстов
for text in data_lemmatized:
    countit =  Counter()
    for word in text:
        for topic in dic_t:
            if word in dic_t[topic].keys():
                countit[topic] += dic_t[topic][word]
    ans.append(countit)
    only_topic_ans.append(countit.most_common(1))

In [169]:
hhh = []
only_topic_ans[0][0][0]
for i in only_topic_ans:
    for j in i:
        for h in j:
            hhh.append(h)

In [171]:
yyy = hhh[::2]

In [235]:
ddd = (dict(zip(yyy, data_lemmatized))) #словарь текстов разделенных по группах (топикам)

In [239]:
sss = [' '.join(value) for value in ddd.values()] 

In [236]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [243]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(sss)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)

In [244]:
df

Unnamed: 0,ability,able,abused,accept,accepting,account,acre,act,active,activist,...,would,write,writer,wrong,year,yet,young,zip,zoroaster,zoroastrian
0,0.0,0.0,0.0,0.0,0.031547,0.0,0.031547,0.0,0.0,0.0,...,0.031334,0.0,0.031547,0.031547,0.222371,0.031547,0.031547,0.0,0.094642,0.347019
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.044563,0.039446,0.0,0.0,0.0,0.0,0.0,0.269203,0.0,0.0
2,0.0,0.0,0.0,0.094268,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.047458,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.07309,0.07309,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.064259,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.242287,0.042893,0.0,0.0,0.038211,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.093507,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.016479,0.0,0.065918,0.032959,0.049438,...,0.008184,0.014488,0.0,0.0,0.167788,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.098851,0.0875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.039341,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


coherence score - это оценка, которая вычисляет, есть ли смысл у слов, объединенных одним топиком. Если coherence score высокая, то это значит, что в каждом топике будет больше связанных между собой слов, и этот топик будет иметь больше смысла. в (кровать, сон, кошмар, ночник) слова более тесно связаны чем в (кровать, учеба, зарядка, платье), а значит и coherence score будет больше.