In [7]:
import pandas as pd
import re
import gensim
import en_core_web_lg
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from gensim.utils import  simple_preprocess
import spacy
import gensim.corpora as corpora
from pprint import pprint
from gensim.models import CoherenceModel
from collections import Counter
from matplotlib import pyplot as plt
from gensim.models import Word2Vec

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Extra words appended to NLTK's English stop-word list. Order and contents
# (including the repeated 'even') are kept as-is; only membership matters to
# the cells that consume `stop_words`.
extra_stop_words = [
    'well', 'let', 'from', 'subject', 'use', 'not', 'would', 'say', 'could',
    'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some',
    'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot',
    'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line',
    'even', 'also', 'may', 'take', 'come',
]

# Plain list: NLTK's English stop words followed by the extras above.
stop_words = stopwords.words('english') + extra_stop_words

In [3]:
# Read the whole corpus file into a single string.
with open('Shakespeare.txt', encoding="utf8") as f:
    lines = f.read()

# Keep only the non-empty lines, in file order.
docs = [row for row in lines.split("\n") if row != ""]

# The non-empty line just before every "Contents" line is taken as a play
# title; `indexes` stores the positions of those title lines.
indexes = [pos - 1 for pos, row in enumerate(docs) if row == "Contents"]
titles = [docs[pos] for pos in indexes]

# Sentinel end position so the last play extends to the end of the file.
indexes.append(len(docs))

# A play is everything after its title line up to (not including) the next title.
plays = [docs[start + 1:stop] for start, stop in zip(indexes[:-1], indexes[1:])]

# One row per play: its title and its full text joined into a single string.
plays_df = pd.DataFrame()
plays_df['Titles'] = titles
plays_df['Plays'] = [" ".join(play) for play in plays]


In [4]:
def sent_to_words(sentences):
    """Yield one token list per input text.

    Each text is cleaned before tokenisation:
    tokens containing ``@`` (e-mail-like) are removed, runs of whitespace are
    collapsed to a single space, and apostrophes are stripped. The cleaned
    text is then tokenised with ``gensim.utils.simple_preprocess`` using
    ``deacc=True``.

    Parameters
    ----------
    sentences : iterable of str
        Raw texts to tokenise.

    Yields
    ------
    list of str
        The tokens returned by ``simple_preprocess`` for each text, in input
        order.
    """
    for sent in sentences:
        # Raw-string patterns: '\S' and '\s' are not valid string escapes and
        # trigger warnings on recent Python versions when written unescaped.
        sent = re.sub(r'\S*@\S*\s?', '', sent)   # drop e-mail-like tokens
        sent = re.sub(r'\s+', ' ', sent)         # collapse whitespace/newlines
        sent = re.sub(r"'", "", sent)            # strip apostrophes
        yield gensim.utils.simple_preprocess(str(sent), deacc=True)

# Tokenise the text of every play; materialised as a list of token lists
# (one per row of plays_df) rather than left as a generator.
data_words = [tokens for tokens in sent_to_words(plays_df['Plays'].tolist())]

In [5]:
# Load the spaCy pipeline once; process_words reuses this instance instead of
# loading the model again on every call.
nlp = en_core_web_lg.load()

def process_words(texts, stop_words=stop_words, allowed_postags=['NOUN']):
    """Lemmatise tokenised documents and remove stop words.

    Parameters
    ----------
    texts : iterable of list of str
        One token list per document.
    stop_words : iterable of str, optional
        Words dropped after lemmatisation. Defaults to the notebook-level
        ``stop_words`` list.
    allowed_postags : list of str, optional
        Currently ignored: part-of-speech filtering is disabled, so the lemma
        of every token is kept. The parameter is retained so existing calls
        keep working.

    Returns
    -------
    list of list of str
        For each input document, the lemmas re-tokenised with
        ``simple_preprocess`` and filtered against ``stop_words``.
    """
    stop_set = set(stop_words)  # constant-time membership tests
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        # Join the lemmas into plain text before re-tokenising; going through
        # str(list) would feed the list repr (quotes, escapes) to the tokenizer.
        lemma_text = " ".join(token.lemma_ for token in doc)
        texts_out.append(
            [word for word in simple_preprocess(lemma_text) if word not in stop_set]
        )
    return texts_out

data_ready = process_words(data_words)



In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors


# One colour per play. The palettes are concatenated as lists so the colour
# order is the same on every run (a set union of the dict items is unordered).
cols = list(mcolors.TABLEAU_COLORS.values()) + list(mcolors.XKCD_COLORS.values())

# Size the grid from the data instead of assuming a fixed number of plays.
n_plays = len(data_ready)
n_cols = 2
n_rows = (n_plays + n_cols - 1) // n_cols  # ceiling division

# Roughly 5x5 inches per panel so the square clouds and their titles stay legible.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(10, 5 * n_rows),
                         sharex=True, sharey=True, squeeze=False)

for i, ax in enumerate(axes.flatten()):
    if i >= n_plays:
        # Spare panel when the number of plays does not fill the grid.
        ax.axis('off')
        continue
    print(i)  # progress indicator: rendering each 4000x4000 cloud is slow
    color = cols[i % len(cols)]
    # `colormap` is omitted: it is ignored once `color_func` is supplied.
    # The colour is bound per cloud rather than read from the loop variable
    # at call time.
    cloud = WordCloud(stopwords=stop_words,
                      background_color='white',
                      width=4000,
                      height=4000,
                      max_words=20,
                      color_func=lambda *args, _color=color, **kwargs: _color,
                      prefer_horizontal=1.0)
    cloud.generate_from_text(" ".join(data_ready[i]))
    ax.imshow(cloud)
    # data_ready[i] holds the words of play i, so label the panel with the
    # play's title rather than a topic number.
    ax.set_title(plays_df['Titles'].iloc[i], fontdict=dict(size=16))
    ax.axis('off')
    


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25


In [None]:
# Vocabulary: maps every distinct token in the processed plays to an integer id.
id2word = corpora.Dictionary(data_ready)

# Bag-of-words representation of each document in data_ready.
corpus = list(map(id2word.doc2bow, data_ready))

In [None]:

# Not updated in this cell -- presumably placeholders for a later
# model-selection step (TODO confirm they are still needed).
best_model = None
best_score = 0

# LDA hyper-parameters gathered in one place so they are easy to review/tune.
lda_params = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=8,
    random_state=100,      # fixed seed for repeatable topics
    update_every=1,
    chunksize=1,
    passes=10,
    alpha='symmetric',
    iterations=100,
    per_word_topics=True,
)

lda_model = gensim.models.ldamodel.LdaModel(**lda_params)

# Show the top words of each topic.
pprint(lda_model.print_topics())

In [None]:
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors


# One colour per topic, cycled if there are more topics than colours.
cols = list(mcolors.TABLEAU_COLORS.values())

# Request every topic explicitly (num_topics=-1) so none is silently left out.
topics = lda_model.show_topics(num_topics=-1, formatted=False)

# Size the grid from the number of topics instead of a fixed 2x3 layout,
# which would only display six of them.
n_topics = len(topics)
n_cols = 3
n_rows = (n_topics + n_cols - 1) // n_cols  # ceiling division

fig, axes = plt.subplots(n_rows, n_cols, figsize=(10,10),
                         sharex=True, sharey=True, squeeze=False)

for i, ax in enumerate(axes.flatten()):
    if i >= n_topics:
        # Spare panel when the topics do not fill the grid.
        ax.axis('off')
        continue
    topic_id, word_weights = topics[i]
    color = cols[i % len(cols)]
    # `colormap` is omitted: it is ignored once `color_func` is supplied.
    # The colour is bound per cloud rather than read from the loop variable
    # at call time.
    cloud = WordCloud(stopwords=stop_words,
                      background_color='white',
                      width=1800,
                      height=1500,
                      max_words=10,
                      color_func=lambda *args, _color=color, **kwargs: _color,
                      prefer_horizontal=1.0)
    cloud.generate_from_frequencies(dict(word_weights), max_font_size=300)
    ax.imshow(cloud)
    ax.set_title('Topic ' + str(topic_id), fontdict=dict(size=16))
    ax.axis('off')

# tight_layout recomputes the subplot spacing, so no manual subplots_adjust
# call is made before it.
fig.tight_layout()
plt.show()