# Import libraries and prepare files

Import necessary libraries and load all the seedly and mycarforum forum posts in the folders. Check that folders are correct.

In [61]:
import nltk
import re
from nltk.corpus import stopwords
import pandas
import gensim

# Get all filenames of seedly forum posts
seedly_files = nltk.corpus.PlaintextCorpusReader('../WebScraper/seedly-forum/', '.+\.csv')
fids = seedly_files.fileids()

# Get all filenames of mycarforum posts
mycarforum_files = nltk.corpus.PlaintextCorpusReader('../WebScraper/mycarforum/', '.+\.txt')
mycarforum_fids = mycarforum_files.fileids()

fids

  seedly_files = nltk.corpus.PlaintextCorpusReader('../WebScraper/seedly-forum/', '.+\.csv')
  mycarforum_files = nltk.corpus.PlaintextCorpusReader('../WebScraper/mycarforum/', '.+\.txt')


['99.co Event CPF Property Investments 1168.csv',
 '99.co Event Condominium Property 4341.csv',
 '99.co Event Condominium Property Investments 1167.csv',
 '99.co Event EC Condominium Condominium Property 4331.csv',
 '99.co Event General Investments Property 1132.csv',
 '99.co Event HDB BTO Investments Property 1124.csv',
 '99.co Event HDB BTO Investments Property 1169.csv',
 '99.co Event Investments HDB BTO Property 1166.csv',
 '99.co Event Investments Loans 1140.csv',
 '99.co Event Investments Property 1121.csv',
 '99.co Event Investments Property 1128.csv',
 '99.co Event Investments Property 1129.csv',
 '99.co Event Investments Property 1131.csv',
 '99.co Event Investments Property 1133.csv',
 '99.co Event Investments Property 1135.csv',
 '99.co Event Investments Property 1137.csv',
 '99.co Event Investments Property 1139.csv',
 '99.co Event Investments Property General 1123.csv',
 '99.co Event Investments Property General 1126.csv',
 '99.co Event Investments Property HDB BTO 1125.cs

# Prepare corpus: Load Seedly forum posts

For each file, extract the relevant fields: question, responses. Filter and remove authors, dates and words that are not provided by the users.

In [62]:
def findnth(haystack, needle, n):
    parts= haystack.split(needle, n+1)
    if len(parts)<=n+1:
        return -1
    return len(haystack)-len(parts[-1])-len(needle)


def process_response(line):
    # Extract date
    temp = line[line.find('Answered on '):]
    x = findnth(temp, ' ', 3)
    answer_date = temp[:x + 5]

    # Extract answer
    answer = line[line.find(answer_date) + len(answer_date):]
    
    return answer_date, answer


corpus = []

for fid in fids:
    thread = []
    
    with open('../WebScraper/seedly-forum/' + fid, encoding='utf-8') as file:
        # Process each line and extract title, date and content of article
        for i, line in enumerate(file):
            if i == 2:
                # Handle question
                first_comma = line.find(',')
                line = line[first_comma + 1:]
                if line[0] == '"':
                    line = line[1:]
                last_qnmark = line.rfind('?')
                line = line[:last_qnmark + 1]
                
                # Extract author
                author = line[line.find('Asked by '):line.find('Asked on')]
                
                # Extract post date
                temp = line[line.find('Asked on '):]
                x = findnth(temp, ' ', 3)
                ask_date = temp[:x + 5]
                
                # Extract title
                title = line[:line.find(author)]
                
                # Extract question
                question = line.replace(author, '').replace(ask_date, '').replace(title, '')
                thread.append(question)
                    
            elif i >= 6:
                # Handle responses
                # Extract response number. If no number, means current line is part of previous response
                temp = line.find(',')
                num = line[:temp]
                is_num = False
                
                try:
                    int(num)
                    is_num = True
                except ValueError:
                    is_num = False
                    
                # If part of previous response, append entire line. If new response, filter the answerer and date
                if not is_num:
                    thread.append(line)
                else:
                    answer_date, answer = process_response(line)
                    thread.append(answer)
    
    corpus.append(thread)

# Prepare corpus: Load MyCarForum posts

In [63]:
for fid in mycarforum_fids:
    mycarforum_thread = []
    with open('../WebScraper/mycarforum/' + fid, encoding="utf8") as file:
        lines = file.readlines()
        post_set = set()
        if lines:
            thread_string = ' '.join(lines)
            thread_string = thread_string.split('\t\t')
            for i, post in enumerate(thread_string[1:5]):
                post_content = post
                
                # Strip username, tooltips, user post count and reputation
                id1 = post.find('\n') + 2
                id2 = post.find('Go to ')
                post_content = post_content[id1:id2]
                for item in post_set:
                    if item in post_content:
                        post_content = post_content.replace(item, '')
                post_set.add(post_content)
                
                mycarforum_thread.append(post_content)
                
        # Append to combined corpus
        corpus.append(thread)

# Preprocess corpus

For each post, perform Tokenization, Stop words removal, and lowercase all words.

In [71]:
# Load stop words
stop_list = nltk.corpus.stopwords.words('english')
my_stopwords = []
with open('stop_words.txt') as file:
    my_stopwords = [s.replace('\n', '') for s in file.readlines()]
    

# Tokenization
tokenized = []

for thread in corpus:
    tokens = [nltk.word_tokenize(post) for post in thread]
    tokenized.append(tokens)
    
    
# Pre-processing
cleaned_corpus = []
    
for thread in tokenized:
    cleaned_thread = [[w.lower() for w in post] for post in thread]
    cleaned_thread = [[w for w in post if re.search('^[a-z]+$', w)] for post in cleaned_thread]
    cleaned_thread = [[w for w in post if w not in stop_list and w not in my_stopwords] for post in cleaned_thread]
    cleaned_corpus.append(cleaned_thread)
    
cleaned_corpus

[[['way',
   'increase',
   'cpf',
   'amount',
   'buy',
   'property',
   'want',
   'cpf',
   'amount',
   'buy',
   'property',
   'rather',
   'taking',
   'large',
   'loan',
   'way'],
  ['hi',
   'douglas',
   'keynote',
   'afraid',
   'pump',
   'higher',
   'paying',
   'job',
   'assuming',
   'hit',
   'ceiling',
   'cap',
   'cpf'],
  ['always', 'connect', 'via', 'https', 'simply', 'company', 'hotline'],
  ['remember',
   'plan',
   'finances',
   'properly',
   'overstretch',
   'homework',
   'enjoy',
   'property',
   'investment',
   'journey'],
  ['regards'],
  ['douglas'],
  []],
 [['pick', 'property', 'enbloc', 'potential'],
  ['condos',
   'potential',
   'enough',
   'holding',
   'question',
   'long',
   'much',
   'earn',
   'longthe',
   'timing',
   'depends',
   'two',
   'factors',
   'resident',
   'demographic',
   'market',
   'residents',
   'condo',
   'ultimately',
   'determines',
   'property',
   'goes',
   'see',
   'residents',
   'know',
   'go

# Prepare model inputs

Prepare the inputs the LDA model needs to train on. Create the dictionary and term document frequency.

In [72]:
# Create dictionary
dictionary = gensim.corpora.Dictionary()
for thread in cleaned_corpus:
    dictionary.add_documents(thread)

# Collapse thread into bow
corpus_bow = []
for thread in cleaned_corpus:
    thread_bow = []
    for post in thread:
        for word in post:
            thread_bow.append(word)
    corpus_bow.append(thread_bow)

# Term document frequency
tdf = [dictionary.doc2bow(thread) for thread in corpus_bow]
tdf

[[(0, 2),
  (1, 2),
  (2, 3),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 3),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 2),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 2),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1)],
 [(3, 2),
  (6, 5),
  (9, 2),
  (17, 2),
  (27, 1),
  (40, 2),
  (41, 1),
  (42, 3),
  (43, 1),
  (44, 2),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 3),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 3),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 2),
  (66, 2),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 3),
  (72, 1),
  (73, 1),
  (74, 2),
  (75, 1),
  (76, 1),
  (77, 2),
  (78, 2),
  (79, 1),
  (80, 1),
  (81, 3),
  (82, 2),
  (83, 1),
  (84, 1),
  (85, 1),
  (86, 1),


# Build LDA model

In [73]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=tdf,
                                           id2word=dictionary,
                                           num_topics=4, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=150,
                                           passes=80,
                                           alpha='auto',
                                           per_word_topics=True)
#4 topics, 150 chunk size, 80 passes

In [74]:
# Print top keywords for each topic
print(lda_model.print_topics())

[(0, '0.010*"market" + 0.009*"investment" + 0.009*"investing" + 0.008*"risk" + 0.008*"stocks" + 0.008*"invest" + 0.007*"good" + 0.006*"stock" + 0.006*"company" + 0.006*"buy"'), (1, '0.027*"account" + 0.019*"interest" + 0.014*"bank" + 0.013*"savings" + 0.012*"card" + 0.011*"credit" + 0.009*"dbs" + 0.008*"month" + 0.008*"using" + 0.007*"monthly"'), (2, '0.087*"data" + 0.069*"unlimited" + 0.043*"plan" + 0.035*"time" + 0.026*"talk" + 0.026*"pay" + 0.026*"speed" + 0.026*"high" + 0.017*"phone" + 0.017*"internet"'), (3, '0.011*"cpf" + 0.009*"money" + 0.009*"insurance" + 0.008*"life" + 0.008*"need" + 0.007*"income" + 0.007*"years" + 0.007*"plan" + 0.006*"retirement" + 0.006*"time"')]


In [75]:
# Prepare data for visualization
import pyLDAvis.gensim

prep_data = pyLDAvis.gensim.prepare(lda_model, tdf, dictionary)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [76]:
# Visualize LDA model

pyLDAvis.display(prep_data)

In [78]:
pyLDAvis.save_html(prep_data, 'Topic Modelling.html')