In [2]:
# Josh Shell
# HITS to find dense submatrices of the term-document matrix
# Usage: python IR24A.py
import pandas as pd
import numpy as np
import random
from sklearn.feature_extraction.text import CountVectorizer
from spacy.lang.en.stop_words import STOP_WORDS as stopwords
# Extend spaCy's English stop-word set (in place, as before) with Twitter/URL
# residue that would otherwise dominate the vocabulary.
stopwords.update(('amp', 'https', 'http', 'co', 'rt'))
df = pd.read_csv('tweets_01-08-2021.csv')
# binary=True records presence/absence only, so X is a 0/1 matrix with one row
# per tweet and one column per term; min_df=10 drops terms that occur in fewer
# than 10 tweets.
# stop_words is passed as a list: recent scikit-learn releases validate the
# parameter and reject a set.
biv = CountVectorizer(min_df=10, stop_words=sorted(stopwords), binary=True)
X = biv.fit_transform(df['text'])
ones = X.count_nonzero()
print(X.getnnz())  # non-zeros in the term-document matrix
shape = X.shape  # (number of tweets, number of terms)
# the density of the term-document matrix
print('Term-document matrix density', ones / shape[0] / shape[1])
# Pick a random column index of X, i.e. a random term of the vocabulary.
seed = random.randrange(shape[1])
# get_feature_names() was removed from newer scikit-learn releases in favour of
# get_feature_names_out(); support both so the script runs on either.
if hasattr(biv, 'get_feature_names_out'):
    terms = list(biv.get_feature_names_out())
else:
    terms = biv.get_feature_names()
print('\nThe randomly selected term for the initialization')
print(seed, terms[seed])  # a query
a = dict()  # authority score for terms, keyed by column index of X
h = dict()  # hub score for documents, keyed by row index of X
a[seed] = 1  # all the initial weight sits on the seed term
# Propagate scores back and forth between terms (authorities) and documents
# (hubs) for 3 rounds, starting from whatever is already in `a` and `h`.
# Only terms/documents reachable from the seed ever get an entry.
#
# Column slicing is slow on the row-oriented matrix returned by
# fit_transform, so convert once to CSC for the per-term lookups.
X_by_term = X.tocsc()
# NOTE(review): scores are accumulated on top of the previous round and never
# normalised, so they grow very quickly; textbook HITS recomputes and
# normalises each round -- confirm the accumulation is intended.
for _ in range(3):
    # Hub update: every document containing term t receives t's authority.
    for t in a:
        for d in X_by_term.getcol(t).nonzero()[0]:
            h[d] = h.get(d, 0) + a[t]
    # Authority update: every term in document d receives d's hub score.
    for d in h:
        for t in X.getrow(d).nonzero()[1]:
            a[t] = a.get(t, 0) + h[d]
# The scored terms and documents define a submatrix of X grown from the query.
# How many terms and documents to keep is a free choice;
# here we display the top 20 terms and the top 10 tweets.
sa = sorted(a.items(), key=lambda x: x[1], reverse=True)  # (term index, score)
print('\nTop 20 words after 3 iterations')
for term_id, score in sa[0:20]:
    print(terms[term_id], score)
ha = sorted(h.items(), key=lambda x: x[1], reverse=True)  # (row index, score)
# The heading now matches the 10 tweets actually printed below.
print('\nTop 10 tweets after 3 iterations')
for doc_id, _ in ha[0:10]:
    tweet = df.iloc[doc_id]  # row positions of X line up with rows of df
    print(tweet['date'], tweet['text'])
# we settle down on the top 20 x top 20 submatrix and compute its density
rows = {doc_id for doc_id, _ in ha[0:20]}  # row indices of the top documents
top_terms = [term_id for term_id, _ in sa[0:20]]  # column indices of top terms
# Count the non-zero cells of X that fall inside (top documents) x (top terms).
n = sum(
    1
    for term_id in top_terms
    for doc_id in X.getcol(term_id).nonzero()[0]
    if doc_id in rows
)
# Divide by the real size of the selection: it equals 400 in the usual case,
# but stays correct if fewer than 20 documents or terms were reached.
cells = len(rows) * len(top_terms)
print("\n 20x20 submatrix density: ", n / cells if cells else 0.0)

490607
Term-document matrix density 0.0014115253352350733

The randomly selected term for the initialization
3249 lets

Top 20 words after 3 iterations
realdonaldtrump 6304983694
great 3539642213
trump 2992836412
president 2256036509
thank 1242123157
people 1036887141
america 1016695777
donald 863129656
country 802414548
thanks 793929473
new 677729131
run 578099502
like 578074023
vote 561060171
time 543495119
big 527630639
good 494107744
today 492304210
mr 462910948
news 461906740

Top 20 tweets after 3 iterations
2015-04-22 00:52:55 """@WSGbeme: @realDonaldTrump I'm with these great people Mr Trump. Please run for President. We need a completely new direction. #Trump2016"""
2014-11-01 01:21:13 """@Gregory_Gains: @realDonaldTrump The world needs a strong US President, I think Mr. Trump would do great. Thank you."
2015-03-31 22:14:31 """@MegaPandaduck: @realDonaldTrump @oreillyfactor We desperately need another great president. Come on Mr. Trump, America needs you!"""
2015-06-29 11:11:2