# INFS 770 - Advanced Data Mining Application
## Assignment 3
### John Herbert

## T0: Import Libraries

In [1]:
import os
import re

import nltk
from nltk import word_tokenize 
from nltk.stem import WordNetLemmatizer

import gensim
import pandas as pd
import numpy as np
from gensim.models import LdaModel, LsiModel
from sklearn.feature_extraction.text import TfidfVectorizer

from gensim import corpora
from gensim.models import CoherenceModel
from statistics import mean

from scipy.linalg import svd
from sklearn.decomposition import TruncatedSVD

## T1: Import Data

In [2]:
# Importing articles for assignment
docs = []
path = './tp_dataset/'
dirs = os.listdir(path)

for filename in dirs:
    with open(path+filename,'r') as f:
        docs.append(f.read())
print(docs)

['White House officials are preparing to present President Biden with a roughly $3 trillion infrastructure and jobs package that includes high profile domestic policy priorities such as free community college and universal prekindergarten, according to three people familiar with internal discussions.\n\nAfter completing the $1.9 trillion coronavirus relief package this month, Biden administration officials are piecing together the next major legislative priority. While no final announcement has been made, the White House is expected to push a multitrillion jobs and infrastructure plan as the centerpiece of the president’s “Build Back Better” agenda.\n\nThat effort is expected to be broken into two parts — one focused on infrastructure, and the other focused on other domestic priorities, such as expanding the newly expanded child tax credit for several years. The people, who spoke on the condition of anonymity to describe private conversations, stressed planning was preliminary and subj

## T2: Write code to tokenize the docs

In [3]:
# Creating tokenization function to convert alll words to lowercase, remove punctions, remove numbers, conduct lemmatizationn
# and remoe stop words

def before_token(documents):
    # conver words to lower case
    lower = map(str.lower, documents)
    # remove puntuations
    punctuationless = list(map(lambda x: " ".join(re.findall('\\b\\w\\w+\\b',x)), lower))
    # remove numbers
    return list(map(lambda x:re.sub('\\b[0-9]+\\b', '', x), punctuationless))
docs1 = before_token(docs)

class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t,"v") for t in word_tokenize(doc)]
stopwords = nltk.corpus.stopwords.words("english")

# Setting the min_df parameter to 3, to remove words that appears in only 1 or 2 documents
# Setting the max_df parameter to 0.5 to remove words that appear in over 50% of the documents
vectorizer = TfidfVectorizer(tokenizer=LemmaTokenizer(),norm='l2',stop_words=stopwords,min_df=3,max_df=0.5)

#vectorizer = TfidfVectorizer(tokenizer=LemmaTokenizer(),stop_words=stopwords)
corpus_vect = vectorizer.fit_transform(docs1)
#print(corpus_vect) # sparse matrix
df_vect = pd.DataFrame(corpus_vect.toarray(), columns=vectorizer.get_feature_names())
print(df_vect)

     accord  actually  advisers    africa       age  american     among  \
0  0.029190  0.029190  0.000000  0.000000  0.000000  0.066075  0.066075   
1  0.090013  0.030004  0.135834  0.000000  0.000000  0.033959  0.000000   
2  0.000000  0.047302  0.107073  0.000000  0.000000  0.000000  0.000000   
3  0.069686  0.000000  0.000000  0.078870  0.197176  0.078870  0.039435   
4  0.000000  0.000000  0.035535  0.035535  0.106606  0.000000  0.071071   
5  0.000000  0.000000  0.000000  0.031479  0.062957  0.000000  0.000000   
6  0.000000  0.057728  0.000000  0.000000  0.000000  0.000000  0.000000   
7  0.100709  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
8  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   

      apple     april    around  ...  university   vaccine  vaccines  \
0  0.000000  0.000000  0.033037  ...    0.000000  0.000000  0.000000   
1  0.000000  0.060008  0.000000  ...    0.000000  0.000000  0.000000   
2  0.000000  0.000000  0.000000  



## T3: Discus the meaning of TF-IDF

TF-IDF is a text mining method to determine if certain terms, or this case words, are useful in each document within a corpus and if they are good indications of the document's topic. It assigns a score based on the number of times it appears within a document, but also the number of times it appears within the corpus. The first part $tf$ (the number of times it appears in a specific document) multiplied by the number of documents within the corpus divided by the frequency it appears within all documents. Therefore, the score will go down the more it appears in other documents, since we are attempting to seperate it from other documents. The formula is as follows:

$$tf.log\biggl(\frac{N}{df(w)}\biggr)$$

## T4: Convert the rectorized data to a gensum corpus object

In [4]:
# convert the vectorized data to a gensim corpus object
from gensim import corpora
word2id = dict((k, v) for k, v in vectorizer.vocabulary_.items())
id2word = dict((v, k) for k,v in vectorizer.vocabulary_.items())
d=corpora.Dictionary()
d.id2token = id2word
d.token2id = word2id
corpus = gensim.matutils.Sparse2Corpus(corpus_vect, documents_columns=False)
print(id2word)

{177: 'white', 67: 'house', 101: 'officials', 116: 'prepare', 117: 'president', 15: 'biden', 168: 'trillion', 71: 'infrastructure', 74: 'job', 106: 'package', 46: 'domestic', 121: 'priorities', 26: 'college', 171: 'universal', 0: 'accord', 55: 'familiar', 35: 'coronavirus', 111: 'piece', 99: 'next', 85: 'major', 82: 'legislative', 126: 'push', 58: 'focus', 163: 'tax', 181: 'years', 30: 'condition', 40: 'describe', 34: 'conversations', 170: 'unclear', 157: 'still', 176: 'weeks', 143: 'second', 6: 'among', 167: 'top', 39: 'democrats', 134: 'republicans', 68: 'however', 87: 'many', 64: 'hike', 154: 'spend', 61: 'give', 14: 'beyond', 54: 'emergency', 107: 'pandemic', 130: 'real', 148: 'since', 162: 'system', 36: 'could', 5: 'american', 92: 'measure', 145: 'senate', 118: 'press', 73: 'jen', 124: 'psaki', 155: 'statement', 156: 'step', 164: 'team', 32: 'consider', 129: 'range', 114: 'potential', 104: 'options', 72: 'invest', 56: 'families', 132: 'reform', 25: 'code', 138: 'reward', 175: 'wea

## T5: Compute choerence scores for different # of topics

In [5]:
# determine the optimal number of topics using coherence method
# Setting the iterations to 60 instead of 30 as I was getting different results each run 
# for the 3 highest topic (between 4 and 5 topics)

for i in [2,3,4,5]:
    cs = 0
    for j in range(60):
        lda = LdaModel(corpus, num_topics=i,id2word=id2word, passes=50)
        # print(lda.print_topics())
        coherence_model_lda = CoherenceModel(model=lda, corpus=corpus, dictionary=d, coherence='u_mass')
        coherence_lda = coherence_model_lda.get_coherence()
        cs += coherence_lda
    print('Coherence Score for %d topics: %f' % (i,cs/60))

Coherence Score for 2 topics: -5.380243
Coherence Score for 3 topics: -3.409956
Coherence Score for 4 topics: -2.601411
Coherence Score for 5 topics: -2.255650


## T6: Comparing number of topics to determine optimal

### Coherence Scores for 2 Topics

In [6]:
# building the LDA model after for the optimal model: Topics = 2
lda = LdaModel(corpus, num_topics=2,id2word=id2word, random_state=10, passes=50) # passing 50 times per instruction

print(lda.print_topics())

[(0, '0.016*"biden" + 0.014*"infrastructure" + 0.011*"tax" + 0.011*"house" + 0.010*"spend" + 0.009*"trillion" + 0.009*"white" + 0.008*"proposal" + 0.008*"next" + 0.008*"republicans"'), (1, '0.020*"vaccine" + 0.017*"apple" + 0.015*"homepod" + 0.011*"astrazeneca" + 0.010*"data" + 0.010*"efficacy" + 0.009*"device" + 0.009*"dose" + 0.008*"mini" + 0.008*"could"')]


In [7]:
# Printing the document/topic matrix as a data frame

# use the lda model to transform documents
lda_docs = lda[corpus]
# extract the scores and round them to 3 decimal places
scores = np.round([[doc[1] for doc in row] for row in lda_docs], 3)
# convert the documents scores into a data frame
df_lda = pd.DataFrame(scores, columns=["topic 1", "topic 2"])
df_lda

Unnamed: 0,topic 1,topic 2
0,0.922,0.078
1,0.922,0.078
2,0.923,0.077
3,0.092,0.908
4,0.093,0.907
5,0.096,0.904
6,0.133,0.867
7,0.117,0.883
8,0.118,0.882


### Coherence Scores for 3 Topics

In [8]:
# build the lda model after for the 2nd optimal model: Topics = 3
lda2 = LdaModel(corpus, num_topics=3,id2word=id2word, random_state=10, passes=50) # passing 50 times per instruction
print(lda2.print_topics())

[(0, '0.019*"biden" + 0.018*"infrastructure" + 0.013*"tax" + 0.012*"house" + 0.012*"spend" + 0.010*"trillion" + 0.010*"white" + 0.009*"proposal" + 0.009*"next" + 0.009*"republicans"'), (1, '0.026*"apple" + 0.023*"homepod" + 0.013*"device" + 0.011*"mini" + 0.010*"might" + 0.010*"speakers" + 0.009*"code" + 0.009*"could" + 0.009*"speaker" + 0.009*"measure"'), (2, '0.029*"vaccine" + 0.015*"astrazeneca" + 0.013*"data" + 0.013*"efficacy" + 0.012*"dose" + 0.009*"countries" + 0.009*"age" + 0.009*"find" + 0.009*"participants" + 0.008*"result"')]


In [9]:
# Printing the document/topic matrix as a data frame

# use the LDA model to transform documents
lda2_docs = lda2[corpus]
# extract the scores and round them to 3 decimal places
scores2 = np.round([[doc[1] for doc in row] for row in lda2_docs], 3)
# convert the documents scores into a data frame
df_lda2 = pd.DataFrame(scores2, columns=["topic 1", "topic 2","topic 3"])
df_lda2

Unnamed: 0,topic 1,topic 2,topic 3
0,0.911,0.045,0.044
1,0.911,0.045,0.044
2,0.911,0.045,0.044
3,0.053,0.053,0.894
4,0.053,0.053,0.894
5,0.056,0.056,0.888
6,0.079,0.842,0.08
7,0.068,0.864,0.068
8,0.069,0.862,0.069


### Coherence Scores for 4 Topics

In [10]:
# build the lda model after for the 3rd optimal model: Topics = 4
lda3 = LdaModel(corpus, num_topics=4,id2word=id2word, random_state=10, passes=50) # passing 50 times per instruction
print(lda3.print_topics())

[(0, '0.022*"biden" + 0.021*"infrastructure" + 0.015*"tax" + 0.014*"house" + 0.014*"spend" + 0.011*"trillion" + 0.011*"white" + 0.010*"proposal" + 0.010*"next" + 0.009*"republicans"'), (1, '0.032*"apple" + 0.027*"homepod" + 0.015*"device" + 0.013*"mini" + 0.011*"might" + 0.011*"speakers" + 0.010*"code" + 0.010*"could" + 0.010*"speaker" + 0.010*"measure"'), (2, '0.034*"vaccine" + 0.017*"astrazeneca" + 0.015*"data" + 0.014*"efficacy" + 0.013*"dose" + 0.010*"countries" + 0.010*"age" + 0.010*"participants" + 0.010*"find" + 0.009*"result"'), (3, '0.005*"see" + 0.005*"question" + 0.005*"key" + 0.005*"april" + 0.005*"accord" + 0.005*"relate" + 0.005*"could" + 0.005*"available" + 0.005*"describe" + 0.005*"since"')]


In [11]:
# Printing the document/topic matrix as a data frame

# use the LDA model to transform documents
lda3_docs = lda3[corpus]
# extract the scores and round them to 3 decimal places
scores3 = np.round([[doc[1] for doc in row] for row in lda3_docs], 3)
# convert the documents scores into a data frame
df_lda3 = pd.DataFrame(scores3, columns=["topic 1", "topic 2","topic 3","topic 4"])
df_lda3

Unnamed: 0,topic 1,topic 2,topic 3,topic 4
0,0.905,0.032,0.032,0.032
1,0.904,0.032,0.032,0.032
2,0.903,0.032,0.032,0.032
3,0.038,0.038,0.885,0.038
4,0.038,0.038,0.887,0.037
5,0.04,0.041,0.879,0.04
6,0.057,0.829,0.057,0.057
7,0.049,0.853,0.049,0.049
8,0.051,0.849,0.05,0.05


## T7: Choosing the Optimal # of topics

According to the analysis above, I would choose 3 topics to run the SVD model. While 2 topics has the highest average coherence scores (the first topic for the first 2 documents has a mean of 0.922, and the remaining documents have a mean of 0.894), the 3 topic option does not have much tradeoff. The Topic 2 score only drops by 2%, and topic 3 has a score of .894. Therefore, while there is probably 2 topics within the all the documents, there appears to be another topic as well that can derive more detail into the documents and would be undiscoverd if the 2 topic option was chosen.

The 4 topic model appears to be the worst score as there does not appear to be a cluster of documents that have good scores for the 4th topic that I can derive, so this is not a option to consider.

In [12]:
# Calculating document mean of top scores for 2 topics
a = [['Topic 1',round(mean(df_lda.iloc[0:2,0]),3)],
      ['Topic 2',round(mean(df_lda.iloc[3:8,1]),3)]]
summ_df = pd.DataFrame(a,columns = ['Topic','Mean'])
print('2 Topic Mean Top Coherence Scores')
summ_df

2 Topic Mean Top Coherence Scores


Unnamed: 0,Topic,Mean
0,Topic 1,0.922
1,Topic 2,0.894


In [13]:
# Calculating document mean of top scores for 3 topics
a2 = [['Topic 1',round(mean(df_lda2.iloc[0:2,0]),3)],
      ['Topic 2',round(mean(df_lda2.iloc[6:8,1]),3)],
      ['Topic 3',round(mean(df_lda2.iloc[3:5,2]),3)]]
summ_df2 = pd.DataFrame(a2,columns = ['Topic','Mean Score'])
print('3 Topic Mean Top Coherence Scores')
summ_df2

3 Topic Mean Top Coherence Scores


Unnamed: 0,Topic,Mean Score
0,Topic 1,0.911
1,Topic 2,0.853
2,Topic 3,0.894


## T8: Run a Truncated SVD

In [14]:
# Running a truncated SVD model
U, s, V = svd(corpus_vect.toarray())
tsvd = TruncatedSVD(n_components=3) # set the number of topics = 3
tsvd.fit(corpus_vect)
print(np.round(tsvd.transform(corpus_vect), 3))
print(tsvd.singular_values_)

[[ 0.721 -0.56  -0.22 ]
 [ 0.706 -0.548 -0.2  ]
 [ 0.701 -0.579 -0.213]
 [ 0.577  0.736 -0.167]
 [ 0.595  0.7   -0.127]
 [ 0.549  0.679 -0.147]
 [ 0.272 -0.006  0.729]
 [ 0.312 -0.013  0.87 ]
 [ 0.296 -0.036  0.787]]
[1.66051184 1.56322451 1.45171996]


In [15]:
# Printing SVD scores per word
df_comp = pd.DataFrame(tsvd.components_, columns=vectorizer.get_feature_names())
df_comp = df_comp.apply(lambda x: np.round(x,3))
print(df_comp)

   accord  actually  advisers  africa    age  american  among  apple  april  \
0   0.057     0.033     0.070   0.030  0.077     0.042  0.041  0.170  0.035   
1  -0.006    -0.025    -0.046   0.043  0.107     0.001  0.017 -0.013  0.006   
2   0.024     0.009    -0.026  -0.011 -0.027    -0.016 -0.014  0.606  0.010   

   around  ...  university  vaccine  vaccines  wealth  weeks  white   wing  \
0   0.030  ...       0.039    0.392     0.062   0.031  0.071  0.128  0.040   
1   0.012  ...       0.055    0.544     0.085  -0.028  0.028 -0.097 -0.035   
2   0.016  ...      -0.014   -0.131    -0.020  -0.012 -0.025 -0.049 -0.015   

   world   year  years  
0  0.053  0.029  0.058  
1  0.073 -0.021  0.020  
2 -0.017  0.015 -0.022  

[3 rows x 182 columns]


In [16]:
# using the LsiModel class in gensim
lsi = LsiModel(corpus=corpus, id2word=id2word, num_topics=3)
lsi.print_topics(3)

[(0,
  '0.392*"vaccine" + 0.325*"biden" + 0.291*"infrastructure" + 0.184*"tax" + 0.176*"house" + 0.172*"spend" + 0.170*"apple" + 0.168*"astrazeneca" + 0.144*"data" + 0.136*"homepod"'),
 (1,
  '0.544*"vaccine" + -0.275*"biden" + -0.261*"infrastructure" + 0.232*"astrazeneca" + 0.202*"data" + 0.186*"efficacy" + -0.165*"tax" + 0.161*"dose" + -0.157*"house" + -0.155*"spend"'),
 (2,
  '-0.606*"apple" + -0.484*"homepod" + -0.233*"device" + -0.187*"mini" + -0.140*"speakers" + 0.131*"vaccine" + 0.125*"biden" + -0.124*"might" + -0.121*"speaker" + -0.116*"launch"')]

In [17]:
# Print the LSI topic scores
lsi_docs = lsi[corpus]
scores = np.round([[doc[1] for doc in row] for row in lsi_docs], 3)
df_lsi = pd.DataFrame(scores, columns=["topic 1", "topic 2","topic 3"])
df_lsi

Unnamed: 0,topic 1,topic 2,topic 3
0,0.721,-0.56,0.22
1,0.706,-0.548,0.2
2,0.701,-0.579,0.213
3,0.577,0.736,0.167
4,0.595,0.7,0.127
5,0.549,0.679,0.147
6,0.272,-0.006,-0.729
7,0.312,-0.013,-0.87
8,0.296,-0.036,-0.787
