In [1]:
import pandas as pd
import spacy
import seaborn as sns
from ast import literal_eval
import re
import gensim
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from collections import defaultdict

In [2]:
fn = "/Users/yaya/Desktop/Liu/Data/wikipedia"
df = pd.read_csv(fn + "/singapore_wiki_list.csv").drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'dist', 'primary', 'ns']) # simplify dataframe
print(df.columns)
wiki = df[['pageid', 'title', 'text']] # extract context with page id
# wiki = wiki.sample(10)
wiki.head()

Index(['lat', 'lon', 'pageid', 'title', 'text'], dtype='object')


Unnamed: 0,pageid,title,text
0,13108718,Masjid Al-Mawaddah,"[""Masjid Al-Mawaddah (Jawi: مسجد المودّه; Arab..."
1,1838457,Port of Tanjung Pelepas,['Coordinates: 01°21′58.85″N 103°32′54.12″E\uf...
2,6170311,Pulai River,['Pulai River (Malay: Sungai Pulai) is a river...
3,5073671,Gelang Patah,"['Gelang Patah is a suburb in Iskandar Puteri,..."
4,5974898,Tanjung Kupang,['Tanjung Kupang is a mukim in Iskandar Puteri...


# Preprocessing

* Replace some string like `\n`, `[` .etc in order to have clean text.
* Remove coordinates binding to some articles.
* Remove the names in different language.

It will return a nested list named `articles` as corpus.

In [3]:
def preprocessing(text):
    data = literal_eval(text)
    data = [re.sub(r'\[\d+\]|Coordinates:.*|\((.*?)(Chinese|pinyin|Malay|Abbreviation|Tamil|Jawi|Arabic|Hebrew|Latin)(.*?)\)', '', x) for x in data] # remove [1],[2].etc & coordinates
    data = [x.strip() for x in data] # remove \n and empty entries
    data = [x.replace(u'\xa0', u' ') for x in data]
    return " ".join(data).strip()
wiki['text'] = wiki['text'].apply(preprocessing)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [4]:
articles = wiki['text'].to_list()

# Tokenization

Here will use pre-trained model from `spacy`. After tokenization and lemmatization, it will create a nested list excluding stop words, punctuations, white spaces and numbers. 

In [5]:
nlp = spacy.load("en_core_web_sm")

In [6]:
articles = [doc for doc in articles]
g = (doc for doc in nlp.pipe(articles))
g1 = next(g)

In [7]:
def filter_word(doc):
    for w in doc:
        if not w.is_stop:
            if not w.is_punct:
                if not w.is_space:
                    if w.pos_ not in ["NUM"]:
                        yield w

In [8]:
g_corpus = ( [w.lemma_.lower() for w in filter_word(doc)] for doc in nlp.pipe(articles))
texts = [next(g_corpus) for i in range(len(articles))]

In [9]:
print(len(texts))
print(len(texts[0]))
print(" ".join(texts[0]))

1233
256
masjid al mawaddah mosque locate sengkang junction sengkang east road compassvale bow open mosque give majlis ugama islam singapura derive arabic word مودة literally mean affectionate take quran surah ar rum verse sign create mate ye dwell tranquillity love mercy heart verily sign reflect mosque build phase mosque building fund programme cater large demographic young family buangkok sengkang area aside numerous family friendly feature young old boast number environmentally friendly feature energy conserving mosque attain greenmark certification building construction authority open concept family friendly theme propose mosque promise programme facility enhance community bond different race family different age group mosque capacity increase worshiper peak period serve religious need muslim community institution provide social service line concept mosque multi functional institution prime agent facilitate excellent muslim community encompass core area pillar mosque activity usta

# Bigram & BOW

In [10]:
bigram = gensim.models.Phrases(texts)
texts = [bigram[line] for line in texts]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

Both LSI and LDA will return topics for classifying articles and their compositions.

# Latent Semantic Indexing

In [11]:
lsimodel = LsiModel(corpus=corpus, num_topics=10, id2word=dictionary)

In [15]:
lsimodel.show_topics()

[(0,
  '0.692*"singapore" + 0.271*"school" + 0.142*"student" + 0.125*"year" + 0.117*"include" + 0.107*"building" + 0.089*"government" + 0.089*"parliament" + 0.086*"new" + 0.083*"country"'),
 (1,
  '0.748*"school" + 0.304*"student" + -0.304*"singapore" + -0.127*"parliament" + 0.120*"programme" + 0.102*"high_school" + 0.098*"college" + 0.092*"year" + 0.080*"campus" + 0.071*"institution"'),
 (2,
  '0.578*"parliament" + 0.266*"mp" + -0.257*"singapore" + 0.206*"speaker" + 0.177*"bill" + 0.130*"mps" + 0.117*"house" + 0.103*"motion" + 0.102*"government" + 0.102*"vote"'),
 (3,
  '0.503*"north_korea" + 0.371*"summit" + 0.278*"trump" + 0.152*"meeting" + 0.149*"kim" + 0.147*"united_states" + -0.138*"singapore" + 0.132*"north_korean" + 0.127*"nuclear" + 0.126*"say"'),
 (4,
  '0.382*"building" + -0.270*"singapore" + 0.248*"station" + -0.167*"country" + 0.164*"area" + 0.158*"new" + 0.155*"road" + 0.154*"park" + -0.150*"student" + 0.136*"build"'),
 (5,
  '0.433*"school" + -0.406*"college" + -0.315*"s

# Latent Dirichlet allocation

In [13]:
ldamodel = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary)

In [14]:
ldamodel.show_topics()

[(0,
  '0.017*"singapore" + 0.007*"school" + 0.006*"building" + 0.005*"locate" + 0.004*"park" + 0.003*"road" + 0.003*"year" + 0.003*"include" + 0.003*"area" + 0.003*"new"'),
 (1,
  '0.015*"singapore" + 0.009*"school" + 0.007*"station" + 0.004*"year" + 0.003*"student" + 0.003*"house" + 0.003*"new" + 0.003*"locate" + 0.003*"include" + 0.003*"road"'),
 (2,
  '0.022*"singapore" + 0.006*"school" + 0.006*"park" + 0.005*"building" + 0.005*"station" + 0.004*"road" + 0.004*"new" + 0.003*"year" + 0.003*"area" + 0.003*"include"'),
 (3,
  '0.020*"singapore" + 0.006*"station" + 0.005*"locate" + 0.005*"area" + 0.004*"new" + 0.004*"road" + 0.003*"centre" + 0.003*"include" + 0.003*"building" + 0.003*"school"'),
 (4,
  '0.018*"singapore" + 0.011*"school" + 0.007*"student" + 0.005*"building" + 0.004*"new" + 0.004*"year" + 0.004*"house" + 0.004*"include" + 0.003*"build" + 0.003*"centre"'),
 (5,
  '0.013*"singapore" + 0.013*"school" + 0.005*"student" + 0.004*"year" + 0.004*"area" + 0.004*"building" + 0.00