# Lecture 7 tutorial

In this tutorial we will look at how to use LSA, LDA and word embeddings for text analysis on a dataset of StackOverflow question titles. First, we install and import the necessary libraries.

In [3]:
import sys
print(sys.executable)
!pip install nltk
!pip install gensim

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

/Users/xliu/opt/miniconda3/envs/cs589/bin/python
Collecting nltk
  Using cached nltk-3.6.5-py3-none-any.whl (1.5 MB)
Collecting click
  Using cached click-8.0.3-py3-none-any.whl (97 kB)
Collecting regex>=2021.8.3
  Downloading regex-2021.10.23-cp39-cp39-macosx_10_9_x86_64.whl (288 kB)
[K     |████████████████████████████████| 288 kB 5.0 MB/s eta 0:00:01
[?25hInstalling collected packages: regex, click, nltk
Successfully installed click-8.0.3 nltk-3.6.5 regex-2021.10.23
Collecting gensim
  Downloading gensim-4.1.2-cp39-cp39-macosx_10_9_x86_64.whl (24.0 MB)
[K     |████████████████████████████████| 24.0 MB 6.0 MB/s eta 0:00:011
Collecting smart-open>=1.8.1
  Using cached smart_open-5.2.1-py3-none-any.whl (58 kB)
Installing collected packages: smart-open, gensim
Successfully installed gensim-4.1.2 smart-open-5.2.1


In [4]:
import nltk
# Fetch the NLTK data packages used by the preprocessing cells below:
# the "punkt" tokenizer data, the stop word lists, and WordNet.
nltk.download("punkt")
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/xliu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/xliu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/xliu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
from nltk.corpus import stopwords
# English stop words, stored as a set for the membership test used when filtering tokens.
stop_words = set(stopwords.words('english'))

After importing, we load the dataset lsa_example_2.txt, which can be downloaded from https://stevens0-my.sharepoint.com/:t:/g/personal/xliu127_stevens_edu/EUMGnDc7BAZOoEFaFiVGl5MBcu4KpMPdt1JCWfakCh4QMA?e=JQhLKE. We perform lemmatization and stopwords removal. 

lsa_example_2.txt consists of ~2400 documents. Each document is the title of a StackOverflow (SO) post. The first 800 documents are about Python, the next 800 are about Java, and the remaining ones are about JavaScript. 

In [7]:
# Read one document (a post title) per line. The context manager closes the file
# even if reading raises.
with open("lsa_example_2.txt", "r") as fin:
    documents_list = fin.readlines()

# Lemmatizer
lemmatizer = WordNetLemmatizer()

# For every document: lowercase and tokenize, drop English stop words, then
# lemmatize each remaining token as a noun. Documents are kept in file order
# (one entry per line) so that list indices keep identifying the same document.
processed_list = []
for doc in documents_list:
    tokens = word_tokenize(doc.lower())
    stopped_tokens = [token for token in tokens if token not in stop_words]
    lemmatized_tokens = [lemmatizer.lemmatize(token, pos="n") for token in stopped_tokens]
    processed_list.append(lemmatized_tokens)

# Show the corpus size and a short sample instead of dumping every document.
print(len(processed_list))
print(processed_list[:5])



Set the number of topics to 5:

In [8]:
# Number of topics passed to the LSA and LDA calls in the cells below.
NUM_TOPICS=5

# Using LSA for text analysis in a corpus

Next, let's use LSA to find the most similar document to a document, and print the topic. The code below is developed based on https://dsfabric.org/topic-modeling-in-python-latent-semantic-analysis

In [35]:
from gensim.corpora import Dictionary

def get_lsa_topic(processed_list, NUM_TOPICS):
    """
        Fit a gensim LSI (LSA) model on the tokenized corpus and print its topics.

        processed_list: list of token lists, one per document.
        NUM_TOPICS: number of topics to fit and to display.

        Each topic is printed unformatted, as returned by show_topics, and is
        followed by an empty line. Nothing is returned.
    """
    from gensim.models import LsiModel
    vocab = Dictionary(processed_list)
    bow_corpus = list(map(vocab.doc2bow, processed_list))
    model = LsiModel(corpus=bow_corpus, num_topics=NUM_TOPICS, id2word=vocab)
    for topic in model.show_topics(num_topics=NUM_TOPICS, formatted=False):
        print(topic, end="\n\n")

def get_lsa_doc_vec(processed_list, NUM_TOPICS):
    """
        Compute LSA factors with a truncated SVD of the word-document matrix.

        processed_list: list of token lists, one per document.
        NUM_TOPICS: number of singular triplets to keep (passed to svds as k).

        Every document is reduced to the ids of the words it contains; the term
        counts returned by doc2bow are dropped, so the decomposed matrix only
        records presence/absence.

        Returns the tuple (U, s, VT) produced by scipy.sparse.linalg.svds for the
        word-document matrix. Callers in this notebook treat column j of VT as
        the low-dimensional vector of document j.
    """
    from scipy.sparse.linalg import svds
    word_dictionary = Dictionary(processed_list)
    # Keep only the word id of each (word id, count) pair.
    doc_wd_mat = [[word_id for word_id, _ in word_dictionary.doc2bow(document)] for document in processed_list]
    wd_doc_mat = transpose_matrix(doc_wd_mat)
    # Truncated singular-value decomposition
    U, s, VT = svds(wd_doc_mat, k=NUM_TOPICS)
    return U, s, VT

def transpose_matrix(doc_wd_mat):
    """
        Transpose the document word matrix into the word document matrix

        doc_wd_mat: list with one entry per document, each entry being the list
        of word ids occurring in that document.

        Returns a dense float array with one row per word id (0 .. largest id
        seen) and one column per document; an entry is 1.0 where the word occurs
        in the document (an id repeated inside one document is summed).

        The number of columns always equals len(doc_wd_mat), including when the
        last documents are empty, so column j keeps referring to document j.
        If no document contains any word, a (0, number of documents) array is returned.
    """
    from scipy.sparse import csr_matrix
    import numpy as np
    n_docs = len(doc_wd_mat)
    row = np.array([doc_idx for doc_idx, word_ids in enumerate(doc_wd_mat) for _ in word_ids], dtype=int)
    col = np.array([word_id for word_ids in doc_wd_mat for word_id in word_ids], dtype=int)
    if col.size == 0:
        # No entries at all: there is no vocabulary, but the document axis is kept.
        return np.zeros((0, n_docs), dtype=float)
    data = np.ones(col.size, dtype=float)
    n_words = int(col.max()) + 1
    # Pass the shape explicitly: when it is inferred from the largest row index,
    # trailing empty documents are silently dropped from the matrix.
    doc_wd = csr_matrix((data, (row, col)), shape=(n_docs, n_words), dtype=float)
    return doc_wd.toarray().transpose()

def get_most_similar(doc_idx, VT):
    """
        Get the most similar document of a document (specified by doc_idx) based on the low dimensional vector computed by LSA.

        doc_idx: column index of the query document in VT.
        VT: 2-D array whose column j is the vector of document j.

        Returns the column index with the highest cosine similarity to the query
        column, never the query itself unless it is the only column.

        Columns with zero norm have no defined cosine similarity; they are given
        the lowest possible similarity (-1) so they are never preferred over a
        real match. Left as NaN, such an entry would be picked by argmax.
    """
    import numpy as np
    vecs = np.asarray(VT, dtype=float)
    query = vecs[:, doc_idx]
    norms = np.linalg.norm(vecs, axis=0)
    denom = norms * norms[doc_idx]
    # Cosine similarity of the query against every column at once; entries with
    # a zero denominator keep the preset value of -1.
    simty = np.full(vecs.shape[1], -1.0)
    np.divide(query @ vecs, denom, out=simty, where=denom > 0)
    # Exclude the query document itself from the ranking.
    simty[doc_idx] = -np.inf
    max_idx = np.argmax(simty)
    return max_idx

def print_most_similar_doc(VT, processed_list, query_idx):
    """
        Show a query document next to its nearest neighbour.

        Prints, in order: the shape of VT, the tokens of the document at
        query_idx, and the tokens of the document that get_most_similar
        selects for it.
    """
    print(VT.shape)
    query_doc = processed_list[query_idx]
    print(query_doc)
    nearest_idx = get_most_similar(query_idx, VT)
    nearest_doc = processed_list[nearest_idx]
    print(nearest_doc)

# use LSA to retrieve the most similar document to one document:
# decompose the word-document matrix, then look up the nearest neighbour of
# document 0 using the columns of VT as document vectors (U and s are not used further here)
U, s, VT = get_lsa_doc_vec(processed_list, NUM_TOPICS)
print_most_similar_doc(VT, processed_list, 0)

# use LSA to obtain the NUM_TOPICS (5) topics in the corpus; this fits a separate gensim LSI model
get_lsa_topic(processed_list, NUM_TOPICS)



(5, 2403)
['removing', 'word', 'number', 'text', 'file', 'python']
['python', 'storing', 'printing', 'data', 'text', 'file']
(0, [('javascript', -0.5865947288201491), ('python', -0.5263652965136045), ('java', -0.44423259239026625), ('using', -0.1901587413748863), ('file', -0.1306464366447001), ('function', -0.10541142409015869), ('string', -0.10012792808789805), ('array', -0.09577503285900521), ('object', -0.07953098663087799), ('list', -0.07285526387279281)])

(1, [('javascript', 0.6959326480033186), ('python', -0.6853882068937498), ('java', -0.1094706026154582), ('list', -0.07313473318579801), ('function', 0.050608582540178935), ('object', 0.046231185186997016), ('jquery', 0.044740897951970164), ('file', -0.04365775517136299), ('array', 0.04039457608431171), ('script', -0.029268829895845647)])

(2, [('java', 0.8529021476372963), ('python', -0.4190172079141435), ('javascript', -0.2710413502878443), ('function', -0.05230291072707668), ('class', 0.048640532507850595), ('lang', 0.0375183

# Using LDA for text analysis

Next, we also use LDA to get the document-topic vectors and analyze the topics in the corpus. For LDA, we can also use gensim. After getting the results from gensim, I tested them on a small number of documents and empirically checked whether each document and its most similar document share the same language. The results weren't very impressive. For example, when I looked up the most similar document for document 2 (Python), the most similar document was a JavaScript one. 

So I went on and tried mallet, and it looks better than the result by gensim. 

Because mallet is written in Java, you should first download the Java mallet library and unpack it into a local directory, then run the executable bin/mallet (inside the unpacked mallet directory) on the documents in lsa_example_2. Please follow the tutorial from the official mallet website for this: http://mallet.cs.umass.edu/topics.php

Essentially, you just have to first run

In [None]:
bin/mallet import-dir --input /data/topic-input --output topic-input.mallet --keep-sequence --remove-stopwords

where /data/topic-input is a directory containing the 2403 documents 0.txt, ..., 2402.txt, and x.txt contains the x-th line (counting from 0) of lsa_example_2.txt; then run

In [None]:
bin/mallet train-topics --input topic-input.mallet --num-topics NUM_TOPICS --output-doc-topics doc-topic.txt --output-topic-keys topic.txt

where the document-topic vectors will be stored in doc-topic.txt and the topics will be stored in topic.txt

In [43]:
def get_gensim_lda_doc_vec(processed_list, NUM_TOPICS, passes):
    """
        Get the document-topic vectors using python gensim library of lda.

        processed_list: list of token lists, one per document.
        NUM_TOPICS: number of LDA topics to fit.
        passes: number of training passes over the corpus.

        Returns a (NUM_TOPICS, number of documents) array; entry [t, d] is the
        probability of topic t in document d. Topics that gensim leaves out of a
        document's result (those below its minimum probability) stay at 0.
    """
    # Local imports keep the function usable on a fresh kernel; it previously
    # relied on `np`, `gensim` and `dictionary` existing as globals.
    import numpy as np
    from gensim.corpora import Dictionary
    from gensim.models.ldamodel import LdaModel
    dictionary = Dictionary(processed_list)
    corpus = [dictionary.doc2bow(text) for text in processed_list]
    ldamodel = LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary, passes=passes)
    VT = np.zeros((NUM_TOPICS, len(processed_list)))
    # get_document_topics yields, per document, a sparse list of
    # (topic id, probability) pairs. The topic id from each pair must pick the
    # row: a pair's position in the list is not its topic id once any topic
    # has been left out.
    for doc_idx, doc_topics in enumerate(ldamodel.get_document_topics(bow=corpus)):
        for topic_idx, prob in doc_topics:
            VT[topic_idx, doc_idx] = prob
    return VT
    
    
def get_gensim_lda_topic(processed_list, NUM_TOPIC, passes=10, num_words=10):
    """
        Get the topic-word vectors using python gensim library of lda

        processed_list: list of token lists, one per document.
        NUM_TOPIC: number of LDA topics to fit and to print.
        passes: number of training passes over the corpus (default 10).
        num_words: number of top words shown per topic (default 10).

        Prints one (topic id, formatted word list) entry per topic; returns nothing.
    """
    # Local imports: the names `corpora` and `gensim` are not bound at module
    # level before this cell runs.
    from gensim.corpora import Dictionary
    from gensim.models.ldamodel import LdaModel
    dictionary = Dictionary(processed_list)
    corpus = [dictionary.doc2bow(text) for text in processed_list]
    # Use the NUM_TOPIC argument, not the global NUM_TOPICS.
    ldamodel = LdaModel(corpus=corpus, num_topics=NUM_TOPIC, id2word=dictionary, passes=passes)
    # The topic count selects how many topics are listed; the number of words
    # per topic is a separate setting.
    topics = ldamodel.print_topics(num_topics=NUM_TOPIC, num_words=num_words)
    for each_topic in topics:
        print(each_topic)

def get_mallet_lda_doc_vec(processed_list, NUM_TOPICS, doc_topic_path="mallet-2.0.8/doc-topic.txt"):
    """
        Get the document-topic vectors using the mallet library (java)

        processed_list: list of documents; only its length is used, to size the result.
        NUM_TOPICS: number of topics; the last NUM_TOPICS tab-separated fields
        of every line are read as that document's topic proportions.
        doc_topic_path: path of the mallet --output-doc-topics file.

        The second tab-separated field of each line must be a document name
        ending in "<index>.txt"; <index> selects the column that is filled.

        Returns a (NUM_TOPICS, len(processed_list)) array. Documents that do
        not appear in the file keep an all-zero column.
    """
    # Local import: numpy is not imported at module level in this notebook.
    import numpy as np
    VT = np.zeros((NUM_TOPICS, len(processed_list)))
    with open(doc_topic_path, "r") as fin:
        for line in fin:
            line = line.strip("\n")
            # Skip blank lines and '#' comment/header lines instead of failing on them.
            if not line.strip() or line.startswith("#"):
                continue
            tokens = line.split("\t")
            # Drop the ".txt" suffix and any directory prefix to get the document index.
            line_idx = int(tokens[1][:-4].split("/")[-1])
            docvec = [float(tokens[x]) for x in range(-NUM_TOPICS, 0, 1)]
            VT[:, line_idx] = docvec
    return VT
    
# Document-topic vectors from gensim LDA, trained with 10 passes.
VT = get_gensim_lda_doc_vec(processed_list, NUM_TOPICS, 10)
# Alternative: uncomment to use the vectors produced by mallet instead.
#VT = get_mallet_lda_doc_vec(processed_list, NUM_TOPICS)
# Print document 2 and the document whose topic vector is closest to it.
print_most_similar_doc(VT, processed_list, 2)
        

(5, 2403)
['python', 'perl', 'match', 'capture', 'within', 'block']
['javascript', 'isnan', 'null', 'return', 'false']


Why mallet has more stable results than gensim: the mallet implementation uses Gibbs sampling, which is known to be a fast way to implement LDA; mallet can finish hundreds of iterations within seconds. On the other hand, gensim uses the online algorithm for LDA inference (https://radimrehurek.com/gensim/models/ldamodel.html), which may differ in inference efficiency from Gibbs sampling. 

# Using word embedding for text analysis

In [33]:
import gensim.downloader
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
# Load the pre-trained 'glove-twitter-25' vectors through gensim's downloader.
glove_vectors = gensim.downloader.load('glove-twitter-25')
# Print the nearest neighbours explicitly: a bare expression is only displayed
# when it is the last line of a cell, so the result was silently discarded.
print(glove_vectors.most_similar('twitter'))
# Print the 25-dimensional vector of a single word.
print(glove_vectors["ice"])

[-0.76812  -0.27978   0.60382   0.56937  -0.57322   0.9908    1.2969
 -0.34127   0.7435    0.48104   0.9561   -0.32841  -3.6687   -0.66473
  0.37543  -0.077737  1.0418   -0.83902  -0.052313  0.59333  -1.1597
  0.35368   0.41539   0.49875   0.89548 ]
