In [14]:
import os
import os.path
from gensim import corpora
from gensim.models import LsiModel
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt
import pandas as pd

#Input  : path and file_name
#Purpose: loading text file
#Output : list of paragraphs/documents and
             title(initial 100 words considred as title of document)

filename = 'articles.txt'
path = 'C:/Users/Shefali/Desktop'

file = open("C:/Users/Shefali/Desktop/articles.txt","r")
print(file.read())

path = 'C:/Users/Shefali/Desktop/'
file_name = 'articles.csv'

In [15]:
def load_data(path,file_name):
    documents_list = []
    titles=[]
    with open( os.path.join(path, file_name) ,"r") as fin:
        for line in fin.readlines():
            text = line.strip()
            documents_list.append(text)
    print("Total Number of Documents:",len(documents_list))
    titles.append( text[0:min(len(text),100)] )
    return documents_list,titles

In [16]:
number_of_topics=7
words=10
document_list,titles=load_data("C:/Users/Shefali/Desktop","articles.txt")

Total Number of Documents: 1


#documents_list,titles=load_data("C:/Users/Shefali/Desktop","articles.txt")

file = open("C:/Users/Shefali/Desktop/articles.txt","r")
with open("C:/Users/Shefali/Desktop/articles.txt","r") as f:
    contents = f.read()

    

 #Input  : docuemnt list
 #Purpose: preprocess text (tokenize, removing stopwords, and stemming)
 #Output : preprocessed text

In [17]:
def preprocess_data(doc_set):
    
    # initialize regex tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    # create English stop words list
    en_stop = set(stopwords.words('english'))
    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()
    # list for tokenized documents in loop
    texts = []
    # loop through document list
    for i in doc_set:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]
        # stem tokens
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        # add tokens to list
        texts.append(stemmed_tokens)
    return texts,tokens

In [18]:
clean_text=preprocess_data(document_list)

#Input  : clean document
#Purpose: create term dictionary of our courpus and Converting list of documents (corpus) into Document Term Matrix
#Output : term dictionary and Document Term Matrix

In [19]:
def prepare_corpus(doc_set):
   
    # Creating the term dictionary of our courpus, where every unique term is assigned an index. dictionary = corpora.Dictionary(doc_clean)
    dictionary = corpora.Dictionary(doc_set)
    # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    # generate LDA model
    return dictionary,doc_term_matrix

In [20]:
#Input  : clean document, number of topics and number of words associated with each topic
#Purpose: create LSA model using gensim
#Output : return LSA model

In [21]:
def create_gensim_lsa_model(doc_clean,number_of_topics,words):
    
    dictionary,doc_term_matrix=prepare_corpus(doc_clean)
    # generate LSA model
    lsamodel = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word = dictionary)  # train model
    print(lsamodel.print_topics(num_topics=number_of_topics, num_words=words))
    return lsamodel

In [22]:
model=create_gensim_lsa_model(clean_text,number_of_topics,words)


TypeError: decoding to str: need a bytes-like object, list found

#Input   : dictionary : Gensim dictionary
              corpus : Gensim corpus
              texts : List of input texts
              stop : Max num of topics
#purpose : Compute c_v coherence for various number of topics
#Output  : model_list : List of LSA topic models
              coherence_values : Coherence values corresponding to the LDA model with respective number of topics

In [23]:
def compute_coherence_values(dictionary, doc_term_matrix, doc_clean, stop, start=2, step=3):
    
    coherence_values = []
    model_list = []
    for num_topics in range(start, stop, step):
        # generate LSA model
        model = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word = dictionary)  # train model
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=doc_clean, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

In [24]:
def plot_graph(doc_clean,start, stop, step):
    dictionary,doc_term_matrix=prepare_corpus(doc_clean)
    model_list, coherence_values = compute_coherence_values(dictionary, doc_term_matrix,doc_clean,
                                                            stop, start, step)
    # Show graph
    x = range(start, stop, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Number of Topics")
    plt.ylabel("Coherence score")
    plt.legend(("coherence_values"), loc='best')
    plt.show()

In [25]:
start,stop,step=2,12,1

plot_graph(doc_clean,start,stop,step)

NameError: name 'doc_clean' is not defined