In [None]:
import os.path
from gensim import corpora
from gensim.models import LsiModel
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt


In [None]:

def load_data(path,file_name):
    """
    Read a UTF-8 text file that holds one document per line.

    Input   : path : directory that contains the file ("" for the working directory)
              file_name : name of the file to read
    Output  : documents_list : whitespace-stripped text of every line, in file order
              titles : first 1000 characters of each document, aligned with documents_list
    """
    documents_list = []
    titles = []
    # Build the location from the arguments instead of a hard-coded file name.
    with open(os.path.join(path, file_name), mode="r", encoding="utf-8") as fp:
        for line in fp:
            text = line.strip()
            documents_list.append(text)
            # One title per document; kept inside the loop so an empty file
            # simply yields two empty lists.
            titles.append(text[:1000])
    print("Número total de documentos:",len(documents_list))
    return documents_list,titles

In [None]:
### Add words to stopwords list
import nltk
from nltk.corpus import stopwords

# Start from NLTK's Spanish stop-word list and extend it with the extra words
# listed one per line in stopwords.txt.
# NOTE: this rebinds `stopwords` from the NLTK corpus reader to a plain list;
# preprocess_data reads that global, so the name has to stay.
stopwords = stopwords.words('spanish')
with open("stopwords.txt", mode="r", encoding="utf-8") as newSWlist:
    for nuevapalabra in newSWlist:
        palabritas = nuevapalabra.strip()
        if palabritas:  # a blank line is not a stop word
            stopwords.append(palabritas)
# Preview the tail of the list, including the very last word.
print(stopwords[-15:])

In [None]:
def preprocess_data(doc_set):
    """
    Tokenize, remove stop words from, and stem every document.

    Input   : doc_set : iterable of raw document strings
    Output  : texts : list with one list of stemmed tokens per document

    Reads the module-level `stopwords` list for the words to drop.
    """
    tokenizer = RegexpTokenizer(r"\w+")
    # Build the lookup set once: membership tests against a list would scan
    # the whole stop-word list for every token.
    sp_stop = set(stopwords)
    # NOTE(review): PorterStemmer targets English while the stop words are
    # Spanish — confirm whether a Spanish stemmer was intended.
    ps = PorterStemmer()
    texts = []

    for doc in doc_set:
        tokens = tokenizer.tokenize(doc.lower())
        stemmed_tokens = [ps.stem(tok) for tok in tokens if tok not in sp_stop]
        texts.append(stemmed_tokens)

    return texts


In [None]:
def prepare_corpus(doc_clean):
    """
    Build the gensim inputs for topic modelling.

    Input   : doc_clean : list of tokenized documents
    Output  : dictionary : corpora.Dictionary built from doc_clean
              doc_term_matrix : list with the doc2bow() result of every document
    """
    vocab = corpora.Dictionary(doc_clean)
    bow_corpus = list(map(vocab.doc2bow, doc_clean))
    return vocab, bow_corpus

In [None]:
def create_gensim_lsa_model(doc_clean, number_of_topics, words):
    """
    Train an LSA model on the tokenized documents and print its topics.

    Input   : doc_clean : list of tokenized documents
              number_of_topics : number of topics to extract
              words : number of words printed per topic
    Output  : the trained LsiModel
    """
    vocab, bow_corpus = prepare_corpus(doc_clean)
    # Fit the model on the bag-of-words corpus.
    lsa = LsiModel(bow_corpus, id2word=vocab, num_topics=number_of_topics)
    topic_summary = lsa.print_topics(num_topics=number_of_topics, num_words=words)
    print(topic_summary)
    return lsa

In [None]:
def compute_coherence_values(dictionary, doc_term_matrix, doc_clean, stop, start=2, step=3):
    """
    Input   : dictionary : Gensim dictionary
              doc_term_matrix : Gensim bag-of-words corpus
              doc_clean : List of tokenized input texts
              stop : Upper bound (exclusive) on the number of topics
              start : First number of topics to try
              step : Increment between successive numbers of topics
    purpose : Compute c_v coherence for various number of topics
    Output  : model_list : List of LSA topic models, one per value in range(start, stop, step)
              coherence_values : Coherence values corresponding to the LSA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, stop, step):
        # Train with the loop's own topic count; reading a global here would
        # make every model in the sweep identical in size.
        model = LsiModel(doc_term_matrix, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=doc_clean, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

In [None]:
def plot_graph(doc_clean,start, stop, step):
    """
    Plot the coherence score against the number of topics.

    Input   : doc_clean : list of tokenized documents
              start, stop, step : numbers of topics to evaluate, as range(start, stop, step)
    Output  : None; the figure is shown with plt.show()
    """
    dictionary,doc_term_matrix=prepare_corpus(doc_clean)
    model_list, coherence_values = compute_coherence_values(dictionary, doc_term_matrix,doc_clean,
                                                            stop, start, step)
    # Show graph
    x = range(start, stop, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Number of Topics")
    plt.ylabel("Coherence score")
    # The labels must be a sequence: a bare string would be iterated character
    # by character and the curve would be labelled "c".
    plt.legend(["coherence_values"], loc='best')
    plt.show()

In [None]:
# LSA model: load the tweets, preprocess them and fit the model.
# number_of_topics is the number of topics to extract; words is the number of
# words printed per topic.
number_of_topics, words = 5, 5
document_list, titles = load_data("", "tweets.txt")
clean_text = preprocess_data(document_list)
model = create_gensim_lsa_model(clean_text, number_of_topics, words)

In [None]:
# Numbers of topics to evaluate: range(start, stop, step).
start = 2
stop = 12
step = 1
plot_graph(clean_text, start, stop, step)

In [None]:
# In this section we build a word cloud: an image in which the size of each
# word indicates how often it appears in our corpus.

# Import the wordcloud library
from wordcloud import WordCloud

# Flatten the tokenized documents into one list of tokens. The list is built
# in a single pass over the whole corpus, so the cell terminates and is safe
# to re-run.
lista_definitiva = [token for doc in clean_text for token in doc]
print(lista_definitiva[:20])  # preview only; the full list can be very long

# Join the processed tokens into one string for the word cloud.
long_string = ','.join(lista_definitiva)

# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=5000, contour_width=2, contour_color='steelblue')

# Generate a word cloud
wordcloud.generate(long_string)

# Visualize the word cloud
wordcloud.to_image()

In [None]:
# Flatten the tokenized corpus into a single list holding every token.
# The list is rebuilt from scratch instead of appended to, so re-running the
# cell does not duplicate tokens or mix them with earlier contents.
print(clean_text[:5])  # preview of the first tokenized documents
lista_definitiva = [x for y in clean_text for x in y]
print(lista_definitiva[:20])  # preview of the first tokens