In this notebook, we are going to use Latent Dirichlet Allocation(LDA) for topic modeling to retrieve topics from 20K different Wikipedia abstracts in an unsuperivsed way.  Click here [here](https://raw.githubusercontent.com/vinid/data/master/dbpedia_sample_abstract_20k_unprep.txt) to download the data.

In [1]:
import sys
import re
import nltk
import string
import pandas as pd
import numpy as np
import gensim
from gensim.corpora import Dictionary
import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pprint
import warnings
import random
from time import time



warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', None)

%matplotlib inline

In [2]:
print(sys.version)
print('numpy:',np.__version__)
print('pandas:',pd.__version__)
print('nltk:',nltk.__version__)
print('gensim:',gensim.__version__)

3.8.8 (default, Feb 24 2021, 13:46:16) 
[Clang 10.0.0 ]
numpy: 1.19.2
pandas: 1.2.3
nltk: 3.5
gensim: 3.8.3


In [3]:
with open("./dbpedia_sample_abstract_20k_unprep.txt", 'r') as fr_unprep:
    text = [line.strip() for line in fr_unprep.readlines()]

In [4]:
random.shuffle(text)
print('total number of Wikipedia abstracts:',len(text))

total number of Wikipedia abstracts: 20000


In [5]:
# print the first sentence
pprint(text[0])

('The Microregion of São José do Rio Preto (Portuguese: Microrregião de São '
 'José do Rio Preto) is located on the north of São Paulo state, Brazil, and '
 'is made up of 29 municipalities. It belongs to the Mesoregion of São José do '
 'Rio Preto. The microregion has a population of')


### Data cleaning & preprocessing

In [6]:
def cleanPunc(sentence):
    '''Clean a sentence of any punctuation or special characters'''
    cleaned = re.sub(r'[?|!|\'|"|#|”|’]',r'',sentence)
    cleaned = re.sub(r'[.|,|(|)|\|/|-|–]',r' ',cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n","")
    cleaned = re.sub('\s+',' ',cleaned) #replace multiple spaces with a single one
    return cleaned

In [7]:
def preprocess(sentence):
    '''Preprocess text, get rid of special chars, convert to lower, remove stop words.'''
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    sentence = re.sub('\s+',' ',sentence)
    sentence = sentence.replace("\n","")
    sentence = sentence.lower()
    sentence = [word for word in sentence.split() if word not in stopwords and len(word) > 1]
    return ' '.join(sentence)

In [8]:
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
stopwords = set(stopwords)

[nltk_data] Downloading package stopwords to /Users/isra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
text = list(map(cleanPunc, text))
text = list(map(preprocess, text))

In [10]:
# print the first sentence again
pprint(text[0])

('microregion jos rio preto portuguese microrregi de jos rio preto located '
 'north paulo state brazil made municipalities belongs mesoregion jos rio '
 'preto microregion population')


In [11]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/isra/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### Split sentences into tokens (words)

In [12]:
from nltk import word_tokenize
tokens = list(map(word_tokenize, text))

In [13]:
df=pd.DataFrame()
df['abstract']=text
df['tokens']=tokens
df.head()

Unnamed: 0,abstract,tokens
0,microregion jos rio preto portuguese microrregi de jos rio preto located north paulo state brazil made municipalities belongs mesoregion jos rio preto microregion population,"[microregion, jos, rio, preto, portuguese, microrregi, de, jos, rio, preto, located, north, paulo, state, brazil, made, municipalities, belongs, mesoregion, jos, rio, preto, microregion, population]"
1,ultra low floor tram ulf low floor tram operating vienna austria oradea romania lowest floor height vehicle contrast low floor trams floor interior ulf sidewalk height cm inches,"[ultra, low, floor, tram, ulf, low, floor, tram, operating, vienna, austria, oradea, romania, lowest, floor, height, vehicle, contrast, low, floor, trams, floor, interior, ulf, sidewalk, height, cm, inches]"
2,november german revolution german november eine deutsche revolution tetralogy novels german writer alfred blin german revolution four volumes vol rger und soldaten citizens soldiers vol ii verratenes volk people betrayed vol iii heimkehr der fronttruppen return,"[november, german, revolution, german, november, eine, deutsche, revolution, tetralogy, novels, german, writer, alfred, blin, german, revolution, four, volumes, vol, rger, und, soldaten, citizens, soldiers, vol, ii, verratenes, volk, people, betrayed, vol, iii, heimkehr, der, fronttruppen, return]"
3,livingnow australias largest holistic magazine measured distribution estimated readership magazine monthly independent periodical mainly local australian content international content magazines editor chief elizabeth jewell started magazine whole person predecessor livingnow estimated,"[livingnow, australias, largest, holistic, magazine, measured, distribution, estimated, readership, magazine, monthly, independent, periodical, mainly, local, australian, content, international, content, magazines, editor, chief, elizabeth, jewell, started, magazine, whole, person, predecessor, livingnow, estimated]"
4,dangerous seed denj rasu sh vertical scrolling shooter arcade game released namco japan runs namco system hardware later sega mega drive version also developed released like arcade original,"[dangerous, seed, denj, rasu, sh, vertical, scrolling, shooter, arcade, game, released, namco, japan, runs, namco, system, hardware, later, sega, mega, drive, version, also, developed, released, like, arcade, original]"


In [14]:
# Create a dictionary representation of the documents.
dictionary = Dictionary(df["tokens"])
dictionary

<gensim.corpora.dictionary.Dictionary at 0x11a595f40>

In [15]:
len(dictionary)

72883

#### Filter out tokens that appear in less than no_below abstracts (absolute number) or more than no_above fraction of total abstracts

In [16]:
dictionary.filter_extremes(no_below=25, no_above=0.5)

In [17]:
len(dictionary)

3098

In [34]:
?Dictionary
?dictionary.filter_extremes
?dictionary.doc2bow

####  Convert `document` into the bag-of-words (BoW) format = list of `(token_id, token_count)` tuples.

In [18]:
corpus = [dictionary.doc2bow(abstract) for abstract in df["tokens"]]

### Apply LDA model

In [None]:
from gensim.models import LdaModel

# Build LDA model
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=50, random_state=43,
                chunksize=200, passes=100)

In [None]:
lda_model.print_topics()[:10]

In [None]:
#https://www.kaggle.com/ktakuma/topic-modeling-of-ml-papers-with-lda
#plot word cloud for each topic