In [None]:
import numpy as np
# Seed NumPy's global random number generator once, before any other work in the notebook.
np.random.seed(0)

import nltk
# Fetch the WordNet data that WordNetLemmatizer looks words up in.
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer, SnowballStemmer
# Fetch the stop-word corpus that is read just below.
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stop words, used to filter tokens during preprocessing.
stop_words = stopwords.words('english')

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
# NOTE(review): CoherenceModel and LdaModel are not used in the cells visible here — confirm
# whether they are still needed.
from gensim.models import CoherenceModel, LsiModel, LdaModel

In [None]:
from google.colab import drive

# Mount the user's Google Drive under /content/drive (Colab only).
drive.mount('/content/drive')
# Folder that holds the documents to analyse, one file per document.
# NOTE(review): the name `dir` shadows Python's builtin dir(); it is kept because other cells read
# it. Keep the trailing slash: downstream code may build file paths by plain string concatenation.
dir = "/content/drive/My Drive/project/AUP_project/AUPs/"

In [None]:
import os

def preprocess(doc):
  """Turn one raw document string into a list of normalised tokens.

  The text is tokenised with gensim's ``simple_preprocess``; tokens found in ``stop_words`` are
  dropped, and every remaining token is lemmatized as a noun (WordNet) and then stemmed
  (Snowball, English).

  Args:
    doc: raw text of a single document.

  Returns:
    list of str: the processed tokens, in document order, duplicates kept.
  """
  # Build the stemmer and lemmatizer once per call instead of once per token.
  stemmer = SnowballStemmer('english')
  lemmatizer = WordNetLemmatizer()
  # stop_words is a list; a set makes the per-token membership test constant time.
  stop_set = set(stop_words)
  return [stemmer.stem(lemmatizer.lemmatize(token, pos='n'))
          for token in simple_preprocess(doc) if token not in stop_set]

# Three parallel lists: position i in each one refers to the same source file.
orig_urls = []    # file names as found in the data folder
orig_texts = []   # raw text of each file
docs = []         # token lists produced by preprocess()

# os.listdir() returns entries in arbitrary order. Sorting makes document positions stable from
# run to run, so the hard-coded document indices used later in the notebook always refer to
# positions in this sorted order.
for fname in sorted(os.listdir(dir)):
    path = os.path.join(dir, fname)
    # Skip sub-directories (e.g. notebook checkpoint folders); open() would fail on them.
    if not os.path.isfile(path):
        continue
    # NOTE(review): 'unicode_escape' interprets backslash sequences in the file and decodes the
    # remaining bytes as Latin-1 — confirm this matches how the files were written.
    with open(path, 'r', encoding='unicode_escape') as fh:
        doc = fh.read()
    orig_urls.append(fname)
    orig_texts.append(doc)
    docs.append(preprocess(doc))

In [None]:
# Assign an integer id to every distinct token seen in the preprocessed documents.
dictionary = corpora.Dictionary(docs)
# Bag-of-words form of each document, in the same order as `docs`.
corpus = list(map(dictionary.doc2bow, docs))

# Fit a 10-topic LSI model on the bag-of-words corpus and display its topics.
lsi = LsiModel(corpus, id2word=dictionary, num_topics=10)
lsi.show_topics()

In [None]:
from gensim import similarities

# Transform the corpus to LSI space and index it for similarity queries.
# num_features is passed explicitly so the index width always equals the model's topic count;
# otherwise it is inferred by an extra scan over the transformed corpus, from the largest topic
# id that happens to occur there.
index = similarities.MatrixSimilarity(lsi[corpus], num_features=lsi.num_topics)

In [None]:
# Query the index with one of the corpus documents. To search with free text instead, build the
# bag-of-words from preprocess(<text>) rather than from docs[QUERY_DOC].
QUERY_DOC = 110   # position in `docs` of the document used as the query
TOP_N = 5         # how many best matches to print

vec_bow = dictionary.doc2bow(docs[QUERY_DOC])

vec_lsi = lsi[vec_bow] # convert the query to LSI space

sims = index[vec_lsi]  # perform a similarity query against the corpus

# Pair every score with its document position and order by descending similarity.
# The query document is itself part of the indexed corpus, so it is included in the ranking.
sims = sorted(enumerate(sims), key=lambda item: -item[1])
for doc_number, similarity in sims[:TOP_N]:
    print('doc number:',doc_number,'   has similarity of',similarity)

## TF-IDF and K-Means Clustering

In [None]:
# Copy the gensim dictionary into a plain {token_id: token} dict and keep only the token strings.
id_to_token = dict(dictionary)
vocabulary = id_to_token.values()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# `vocabulary` holds the stemmed/lemmatized tokens produced by preprocess(), so the vectorizer has
# to count those same tokens. Running the raw texts through the default analyzer would leave every
# vocabulary entry whose stem differs from its surface form with an all-zero column.
# The identity analyzer makes the vectorizer treat each pre-tokenised document as its own
# feature sequence.
tfidf = TfidfVectorizer(vocabulary=vocabulary, analyzer=lambda tokens: tokens)
tfs = tfidf.fit_transform(docs)

In [None]:
from sklearn.cluster import KMeans
from collections import defaultdict
k = 10  # number of clusters

# Fixed random_state so that re-running this cell reproduces the same cluster assignment instead
# of depending on how much of the global NumPy random stream has already been consumed.
kmeans = KMeans(n_clusters=k, random_state=0)
res = kmeans.fit_predict(tfs)

# Group document positions by the cluster they were assigned to: {cluster id: [doc index, ...]}.
clusters = defaultdict(list)
for i, cluster in enumerate(res):
  clusters[int(cluster)].append(i)

In [None]:
# Show every cluster as a (cluster id, [document indices]) pair, ordered by cluster id.
print(sorted(clusters.items()))

In [None]:
def cmp_aups(idx1, idx2):
  """Print two documents one after the other so they can be compared by eye.

  For each index, in the order given, this writes the entry from ``orig_urls``, the entry from
  ``orig_texts`` and then an empty line.

  Args:
    idx1: position of the first document in ``orig_urls`` / ``orig_texts``.
    idx2: position of the second document.
  """
  for position in (idx1, idx2):
    print(orig_urls[position])
    print(orig_texts[position])
    print()

In [None]:
# Print documents 17 and 86 back to back.
# NOTE(review): presumably a pair picked from the cluster listing — confirm the indices still apply.
cmp_aups(17,86)

In [None]:
# Print documents 7 and 164 back to back.
# NOTE(review): presumably a pair picked from the cluster listing — confirm the indices still apply.
cmp_aups(7,164)

In [None]:
# Print documents 52 and 73 back to back.
# NOTE(review): presumably a pair picked from the cluster listing — confirm the indices still apply.
cmp_aups(52,73)

In [None]:
# Print documents 9 and 53 back to back.
# NOTE(review): presumably a pair picked from the cluster listing — confirm the indices still apply.
cmp_aups(9,53)