In [None]:
from nltk.corpus import PlaintextCorpusReader
# Build a corpus reader over the "sonnets" root, selecting files whose names
# match the pattern. The pattern is a raw string so that "\." is handed to the
# regex engine as an escaped dot instead of being an invalid Python string
# escape (which triggers a warning on current Python versions).
sonnetsCorpus = PlaintextCorpusReader("sonnets", r".*\.txt")
# Report how many files the reader picked up.
print(len(sonnetsCorpus.fileids()))

In [None]:
import nltk

def get_lists_of_words(corpus, **kwargs):
    """Return one list of lower-cased word tokens per file in ``corpus``.

    Only tokens whose first character is alphabetic are kept. The optional
    filters below are then applied in the order listed.

    Args:
        corpus: object exposing ``fileids()`` and ``words(fileid)``.
        **kwargs: optional filters; a missing or falsy value disables a filter.
            minLen: keep only words with at least this many characters.
            stopwords: collection of (lower-case) words to drop.
            pos: collection of part-of-speech tags; only words whose tag from
                ``nltk.pos_tag`` is in this collection are kept.

    Returns:
        A list with one entry per file id, each entry being the list of
        remaining words for that file, in their original order.
    """
    min_len = kwargs.get("minLen")
    stopwords = kwargs.get("stopwords")
    pos_tags = kwargs.get("pos")

    # Build the lookup set once, outside the per-file loop, so each membership
    # test is a hash lookup instead of a scan over the whole stopword list.
    stopword_set = frozenset(stopwords) if stopwords else None

    documents = []
    for fileid in corpus.fileids():
        # token[:1] instead of token[0]: an empty token is skipped rather than
        # raising IndexError.
        words = [token.lower() for token in corpus.words(fileid) if token[:1].isalpha()]

        if min_len:
            words = [word for word in words if len(word) >= min_len]

        if stopword_set:
            words = [word for word in words if word not in stopword_set]

        if pos_tags:
            # Tagging runs on the already lower-cased and filtered word list.
            tagged = nltk.pos_tag(words)
            words = [word for word, tag in tagged if tag in pos_tags]

        documents.append(words)
    return documents

In [None]:
# Start from NLTK's English stopword list and add three archaic pronouns
# that the list does not contain.
sonnetsStopwords = nltk.corpus.stopwords.words('english')
sonnetsStopwords += ["thee", "thou", "thy"]
# One filtered word list per corpus file: stopwords removed, words of at
# least three characters kept.
sonnetsWords = get_lists_of_words(sonnetsCorpus, stopwords=sonnetsStopwords, minLen=3)

# Iterate over the documents that were actually loaded instead of a
# hard-coded count, so a corpus of a different size neither raises
# IndexError nor gets silently truncated.
for i, words in enumerate(sonnetsWords):
    print("document", str(i), words)

In [None]:
from gensim import corpora, models


In [None]:
def get_lda_from_list_of_words(lists_of_words, **kwargs):
    """Fit an LDA topic model on TF-IDF-weighted bag-of-words documents.

    Args:
        lists_of_words: one list of word tokens per document.
        **kwargs: forwarded to ``models.LdaModel``. Any ``id2word`` entry is
            replaced by the dictionary built inside this function.

    Returns:
        The ``models.LdaModel`` instance built from the weighted corpus.
    """
    # Dictionary built from the tokens of all documents.
    id2word = corpora.Dictionary(lists_of_words)

    # Each document expressed in bag-of-words form.
    bow_corpus = list(map(id2word.doc2bow, lists_of_words))

    # Weight the bag-of-words corpus with a TF-IDF model fitted on it.
    weighted_corpus = models.TfidfModel(bow_corpus)[bow_corpus]

    # The locally built dictionary always wins over a caller-supplied id2word.
    lda_options = dict(kwargs, id2word=id2word)
    return models.LdaModel(weighted_corpus, **lda_options)

In [None]:
# Sanity check: build a dictionary over all sonnet word lists, convert the
# first document to bag-of-words form, and print the result.
dictionary = corpora.Dictionary(sonnetsWords)
test2bow = dictionary.doc2bow(sonnetsWords[0])
print(test2bow)

In [None]:
# Fit a 10-topic LDA model with 20 training passes. The keyword arguments are
# forwarded to the LDA model constructor; random_state pins the random seed
# so the stochastic fit does not change from one run of the notebook to the next.
sonnetsLda = get_lda_from_list_of_words(sonnetsWords, num_topics=10, passes=20, random_state=42)
print(sonnetsLda)

In [None]:
def print_top_terms(lda, num_terms=10):
    """Print the leading terms of every topic in ``lda``, one line per topic.

    Args:
        lda: model exposing ``num_topics`` and ``show_topic(topic_id, n)``.
        num_terms: how many terms to request for each topic.
    """
    for i in range(lda.num_topics):
        # The first element of each pair is used as the term.
        # NOTE(review): assumes show_topic yields (term, weight) pairs - confirm
        # against the installed gensim version.
        terms = [term for term, _ in lda.show_topic(i, num_terms)]
        # Report the requested count instead of a hard-coded "10" so the label
        # stays correct when num_terms is overridden.
        print("Top", str(num_terms), "terms for topic #", str(i), ": ", ", ".join(terms))

In [None]:
# Inspect topic 5; as the last expression in the cell, the value returned by
# print_topic is shown as the cell output.
sonnetsLda.print_topic(5)

In [None]:
# Print what the model's show_topics returns when asked for 10 topics.
print(sonnetsLda.show_topics(num_topics=10))

In [None]:
# Print the top terms of every topic (default num_terms=10) via print_top_terms.
print_top_terms(sonnetsLda)

In [None]:
import networkx as nx

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Small demo graph: nodes A, B and C are each linked to X and/or Y.
G = nx.Graph()
G.add_edges_from([("A", "X"), ("A", "Y"), ("B", "X"), ("C", "Y")])
# Draw the graph with networkx's default drawing settings.
nx.draw(G)

In [None]:
# Redraw the demo graph piece by piece: compute node positions once, then
# draw the labels and the edges separately from that same layout.
pos = nx.spring_layout(G)
nx.draw_networkx_labels(G, pos, font_color='r') # font colour is "r" for red
nx.draw_networkx_edges(G, pos, alpha=0.1) # set the line alpha transparency to .1
plt.axis('off') # don't show the axes for this plot
plt.show()

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

def graph_terms_to_topics(lda, num_terms=10):
    """Draw a network that links every topic of ``lda`` to its top terms.

    Topic nodes are labelled "topic <n>" and drawn in red; term nodes use the
    default label colour. Edges are drawn faintly and the axes are hidden.

    Args:
        lda: model exposing ``num_topics`` and ``show_topic(topic_id, n)``.
        num_terms: how many terms to request for each topic.
    """
    # Open a 10x10 figure for the drawing and start from an empty graph.
    plt.figure(figsize=(10,10))
    network = nx.Graph()

    # One edge from each topic node to each of its terms.
    for topic_id in range(lda.num_topics):
        topic_node = "topic "+str(topic_id)
        # The first element of each pair returned by show_topic is the node name.
        top_terms = [term for term, _ in lda.show_topic(topic_id, num_terms)]
        for term in top_terms:
            network.add_edge(topic_node, term)

    # Positions for all nodes.
    layout = nx.spring_layout(network)

    # Separate topic nodes from term nodes so their labels can be drawn in
    # different colours.
    topic_nodes, term_nodes = [], []
    for node in layout:
        if "topic " in str(node):
            topic_nodes.append(node)
        else:
            term_nodes.append(node)

    nx.draw_networkx_labels(network.subgraph(topic_nodes), layout, font_color='r')
    nx.draw_networkx_labels(network.subgraph(term_nodes), layout)

    # Edges are drawn over the whole graph at low opacity.
    nx.draw_networkx_edges(network, layout, edgelist=network.edges(), alpha=0.1)

    plt.axis('off')
    plt.show()



In [None]:
# Draw the topic-to-term network for the fitted sonnets model.
graph_terms_to_topics(sonnetsLda)