## Step 1: Setup

In [None]:
from IPython.display import HTML, display

def set_css(*_args, **_kwargs):
  """Emit a <style> block that makes <pre> elements wrap long lines.

  Intended to be registered as an IPython ``pre_run_cell`` callback. Any
  positional or keyword arguments are accepted and ignored: recent IPython
  versions call ``pre_run_cell`` callbacks with an execution-info object,
  while older versions call them with no arguments.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
# Register set_css as a 'pre_run_cell' callback so it is invoked before each cell executes.
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
import os
from google.colab import drive
# Mount Google Drive at /content/drive (uses google.colab, so this only works inside Colab).
drive.mount('/content/drive')

# Folder holding the documents to analyse. The trailing slash matters: file
# paths are later built by plain string concatenation (f'{dir}{fname}').
# NOTE(review): `dir` shadows Python's built-in dir(); renaming it would also
# require updating the loading cell that reads it.
dir = "/content/drive/My Drive/project/AUP_project/AUPs/"

In [None]:
# Loading Gensim and nltk libraries
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
# Seed NumPy's global random generator so a fresh run starts from a fixed state.
np.random.seed(400)

import nltk
# Fetch the WordNet corpus, which WordNetLemmatizer (imported above) relies on.
nltk.download('wordnet')

## Step 2: Data Preprocessing ##

Lowercase and tokenize each document, remove stopwords, and optionally lemmatize and stem the remaining tokens.

In [None]:
# Read every file in `dir` into `docs` (one string per document).
# - sorted(): os.listdir() returns entries in arbitrary order; sorting keeps the
#   document order (and everything derived from it) stable between runs.
# - isfile(): skip sub-directories, which open() cannot read.
# - with: close each file handle as soon as it has been read.
docs = []
for fname in sorted(os.listdir(dir)):
    path = f'{dir}{fname}'
    if not os.path.isfile(path):
        continue
    with open(path, 'r', encoding='unicode_escape') as aup_file:
        doc = aup_file.read()
    docs.append(doc)

# Join with a newline so the last token of one document cannot fuse with the
# first token of the next when the whole corpus is tokenized.
corpus = "\n".join(docs)

In [None]:
from collections import Counter

# Tokenize the whole corpus and drop gensim's built-in English stopwords.
corpus_words = [token for token in gensim.utils.simple_preprocess(corpus)
                if token not in gensim.parsing.preprocessing.STOPWORDS]
# Map each remaining word to its number of occurrences in the corpus.
freqs = dict(Counter(corpus_words))
# (word, count) pairs, highest count first; equal counts are ordered by word,
# descending, because the (count, word) key is sorted in reverse.
sorted_freqs = sorted(freqs.items(), key=lambda p: (p[1], p[0]), reverse=True)

n = 8  # top n most frequent words
print(sorted_freqs[:n])
# Keep only the words of the top-n pairs. Unlike zip(*pairs)[0], this also
# works (yielding an empty list) when the corpus produced no words at all.
custom_stopwords = [word for word, _count in sorted_freqs[:n]]

def get_custom_stopwords(n):
  """Return the `n` most frequent corpus words, for use as extra stopwords.

  Reads the module-level `sorted_freqs` list of (word, count) pairs, which is
  ordered from most to least frequent.

  Args:
    n: how many of the most frequent words to return. Zero or a negative
      value means "no custom stopwords".

  Returns:
    A new list of at most `n` words. It is empty when `n <= 0` or when
    `sorted_freqs` is empty.
  """
  if n <= 0:
    return []

  # A comprehension (rather than zip(*pairs)[0]) stays valid for an empty list.
  return [word for word, _count in sorted_freqs[:n]]

In [None]:
import matplotlib.pyplot as plt

# Distribution of word frequencies, drawn over two separate ranges
# (0-100 and 100-1000 occurrences). Words that occur more than 1000 times lie
# outside both bin ranges and therefore do not appear in either figure.
word_counts = list(freqs.values())  # materialise the dict view into a plain sequence for hist()

fig_low, ax_low = plt.subplots()
ax_low.hist(word_counts, bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
ax_low.set(title="Word frequencies (0-100 occurrences)",
           xlabel="Occurrences in corpus",
           ylabel="Number of distinct words");

fig_high, ax_high = plt.subplots()
ax_high.hist(word_counts, bins=[100, 200, 300, 400, 500, 1000])
ax_high.set(title="Word frequencies (100-1000 occurrences)",
            xlabel="Occurrences in corpus",
            ylabel="Number of distinct words");

In [None]:
stemmer = SnowballStemmer("english")
# Created once and shared by every call, rather than once per token.
lemmatizer = WordNetLemmatizer()

def lemmatize_stemming(text):
    """Lemmatize `text` as a verb (pos='v'), then stem the result with the English Snowball stemmer.

    Args:
        text: a single token.

    Returns:
        The stemmed form of the token's verb lemma.
    """
    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))

# Hand-written list of corpus-specific stopwords. Assigning it here replaces
# whatever value `custom_stopwords` held before this cell ran.
custom_stopwords = [
    'use',
    'services',
    'policy',
    'service',
    'information',
    'acceptable',
    'content',
    'including',
]

# Tokenize, filter stopwords and optionally lemmatize/stem
def preprocess(text, custom_stopwords=None, lem_stem=False):
    """Tokenize `text` and drop stopwords, optionally lemmatizing and stemming.

    Args:
        text: raw document text.
        custom_stopwords: iterable of extra words to remove in addition to
            gensim's built-in STOPWORDS. None or an empty iterable removes
            only the built-in ones.
        lem_stem: when True, each kept token is passed through
            lemmatize_stemming(); when False, tokens are returned as produced
            by gensim's simple_preprocess().

    Returns:
        A list of the kept tokens, in document order.
    """
    # Merge both stopword sources into one set up front so that every token
    # needs a single hash lookup instead of a scan through a list.
    stopwords = gensim.parsing.preprocessing.STOPWORDS.union(custom_stopwords or ())
    tokens = [token for token in gensim.utils.simple_preprocess(text)
              if token not in stopwords]
    if lem_stem:
        # Stopwords are filtered before stemming, so the check is on surface forms.
        return [lemmatize_stemming(token) for token in tokens]
    return tokens

In [None]:
# Baseline preprocessing: no corpus-specific stopwords are passed, and lem_stem
# keeps its default, so tokens are neither lemmatized nor stemmed.
processed_docs = [preprocess(raw_text, []) for raw_text in docs]

## Step 3: Bag of words on the dataset

In [None]:
# Build the token <-> integer-id mapping from all processed documents.
dictionary = gensim.corpora.Dictionary(processed_docs)

# Optional pruning step (disabled): filter the dictionary's extremes with
# no_below=2, no_above=0.1 and keep_n=100000.
# dictionary.filter_extremes(no_below=2, no_above=0.1, keep_n= 100000)

# Convert each processed document to its bag-of-words form via doc2bow.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

## Step 4: Running LDA using Bag of Words ##

In [None]:
# Fit an 8-topic LDA model on the bag-of-words corpus, using 10 passes.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=8,
    id2word=dictionary,
    passes=10,
)

# Print every topic returned by print_topics(-1), one blank line between topics.
for topic_id, topic_terms in lda_model.print_topics(-1):
    print("Topic {}:\n{}".format(topic_id, topic_terms))
    print()

## Run Experiments

In [None]:
def experiment(num_stopwords, num_topics, lem_stem=False, remove_rare=False,
               passes=10, random_state=None):
  """Train an LDA model under one preprocessing configuration and print its topics.

  Works on the module-level `docs` list and leaves the module-level
  `processed_docs`, `dictionary`, `bow_corpus` and `lda_model` untouched
  (all intermediate results are local to this function).

  Args:
    num_stopwords: number of most frequent corpus words to remove as extra
      stopwords (passed to get_custom_stopwords).
    num_topics: number of LDA topics to fit.
    lem_stem: forwarded to preprocess(); when True tokens are lemmatized and
      stemmed.
    remove_rare: when True, prune the dictionary with
      filter_extremes(no_below=2, no_above=0.1, keep_n=100000) before
      building the bag-of-words corpus.
    passes: number of training passes over the corpus (default 10).
    random_state: forwarded to LdaMulticore; pass an int to pin the model's
      random generator for a given run. None keeps gensim's default behaviour.

  Returns:
    None. The topics are printed, one per paragraph.
  """
  custom_stopwords = get_custom_stopwords(num_stopwords)
  processed_docs = [preprocess(doc, custom_stopwords, lem_stem) for doc in docs]

  dictionary = gensim.corpora.Dictionary(processed_docs)
  if remove_rare:
    dictionary.filter_extremes(no_below=2, no_above=0.1, keep_n=100000)
  bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

  lda_model = gensim.models.LdaMulticore(
      bow_corpus,
      num_topics=num_topics,
      id2word=dictionary,
      passes=passes,
      random_state=random_state,
  )
  for idx, topic in lda_model.print_topics(-1):
    print("Topic {}:\n{}".format(idx, topic))
    print()

In [None]:
experiment(num_stopwords = 0, num_topics = 8)

In [None]:
experiment(num_stopwords = 15, num_topics = 8)