# Import libraries and prepare files

Import the necessary libraries and list all the forum-post files in the folder. Check that the folder path is correct.

In [265]:
import nltk
import re
from nltk.corpus import stopwords
import pandas
import gensim

# Get all filenames
# The file pattern is a raw string so that `\.` reaches the regex engine as an
# escaped dot instead of being parsed as an (invalid) Python string escape.
# This reader is only used to list file ids; `corpus` is rebound to a plain
# list when the threads are parsed further down.
corpus = nltk.corpus.PlaintextCorpusReader('../WebScraper/seedly-forum/', r'.+\.csv')
fids = corpus.fileids()

# NOTE(review): not referenced in the cells visible here -- confirm it is still needed.
isVectorBuild = False

fids

['99.co Event CPF Property Investments 1168.csv',
 '99.co Event Condominium Property 4341.csv',
 '99.co Event Condominium Property Investments 1167.csv',
 '99.co Event EC Condominium Condominium Property 4331.csv',
 '99.co Event General Investments Property 1132.csv',
 '99.co Event HDB BTO Investments Property 1124.csv',
 '99.co Event HDB BTO Investments Property 1169.csv',
 '99.co Event Investments HDB BTO Property 1166.csv',
 '99.co Event Investments Loans 1140.csv',
 '99.co Event Investments Property 1121.csv',
 '99.co Event Investments Property 1128.csv',
 '99.co Event Investments Property 1129.csv',
 '99.co Event Investments Property 1131.csv',
 '99.co Event Investments Property 1133.csv',
 '99.co Event Investments Property 1135.csv',
 '99.co Event Investments Property 1137.csv',
 '99.co Event Investments Property 1139.csv',
 '99.co Event Investments Property General 1123.csv',
 '99.co Event Investments Property General 1126.csv',
 '99.co Event Investments Property HDB BTO 1125.cs

# Prepare corpus

For each file, extract the relevant fields: question, responses. Filter and remove authors, dates and words that are not provided by the users.

In [269]:
def findnth(haystack, needle, n):
    """Return the index of the n-th occurrence of needle in haystack.

    Occurrences are counted from 0 and do not overlap. Returns -1 when
    haystack holds fewer than n + 1 occurrences.
    """
    # Splitting at most n + 1 times leaves everything after the n-th
    # occurrence in the final piece.
    pieces = haystack.split(needle, n + 1)
    if len(pieces) > n + 1:
        remainder = pieces[-1]
        return len(haystack) - len(remainder) - len(needle)
    return -1


def process_response(line):
    """Split a numbered response line into its date stamp and answer text.

    The date stamp starts at the literal marker 'Answered on ' and is taken
    to end four characters after the fourth space counted from the start of
    the marker (e.g. 'Answered on 24 May 2019').

    Returns a tuple ``(answer_date, answer)`` where ``answer`` is everything
    in ``line`` after the date stamp. When the marker is missing, or fewer
    than four spaces follow its start, the date cannot be located and
    ``('', line)`` is returned so the text is kept instead of being lost.
    """
    marker_at = line.find('Answered on ')
    if marker_at == -1:
        # Without this guard the -1 would be used as a slice index and the
        # answer would come back empty or truncated at an arbitrary spot.
        return '', line

    # Extract date
    temp = line[marker_at:]
    x = findnth(temp, ' ', 3)
    if x == -1:
        return '', line
    answer_date = temp[:x + 5]

    # Extract answer: everything after the date stamp
    answer = line[marker_at + len(answer_date):]

    return answer_date, answer


# Build the corpus: one entry per thread file, each a list of strings holding
# the question text followed by the response text.
corpus = []

for fid in fids[:20]:  # only the first 20 files are processed
    thread = []

    with open('../WebScraper/seedly-forum/' + fid, encoding="utf8") as file:
        # Row 2 holds the question; rows 6 and onwards hold the responses.
        # All other rows are ignored.
        for i, line in enumerate(file):
            if i == 2:
                # Handle question: drop the first comma-separated column and an
                # opening quote, then cut everything after the last '?'.
                line = line[line.find(',') + 1:]
                # startswith() instead of line[0]: no IndexError when nothing
                # follows the first comma.
                if line.startswith('"'):
                    line = line[1:]
                line = line[:line.rfind('?') + 1]

                # Extract author: text from 'Asked by ' up to 'Asked on'
                author = line[line.find('Asked by '):line.find('Asked on')]
                print(author)

                # Extract post date: from 'Asked on ' to four characters past
                # the fourth space that follows
                temp = line[line.find('Asked on '):]
                ask_date = temp[:findnth(temp, ' ', 3) + 5]
                print(ask_date)

                # Extract title: everything in front of the author
                title = line[:line.find(author)]
                print(title)

                # Extract question: what remains once author, date and title
                # are removed
                question = line.replace(author, '').replace(ask_date, '').replace(title, '')
                print(question)
                thread.append(question)

            elif i >= 6:
                # Handle responses
                # A new response starts with an integer in the first column.
                # Any other line is a continuation of the previous response.
                num = line[:line.find(',')]
                try:
                    int(num)
                except ValueError:
                    # Continuation line: keep it whole
                    thread.append(line)
                else:
                    # New response: keep only the text after the date stamp
                    _, answer = process_response(line)
                    thread.append(answer)

    corpus.append(thread)

# corpus

Asked by Anonymous
Asked on 24 May 2019
99.co Event CPF Property Investments 
Is there any way to increase my CPF amount so that I can buy more than one property?I want to use more of my CPF amount to buy property rather than taking a large loan, is there any way to do so?
Asked by Kuriakin J. Zeng
Asked on 21 May 2019
99.co Event Condominium Property 
How to pick a property with enbloc potential?
Asked by Anonymous
Asked on 24 May 2019
99.co Event Condominium Property Investments 
I was at a Condo launch recently and my property agent recommended me a banker. Should I trust him to settle my mortgage or should I look on my own?Don't know whether I'm overthinking but why did this property agent recommend me to go to this banker. Is there something in it for them?
Asked by Anonymous
Asked on 24 May 2019
99.co Event EC Condominium Condominium Property 
For a first time buyer as a couple, should we wait for ECs or jump straight into Condominiums for our first home?
Asked by Anonymous
Asked

# Preprocess corpus

For each post, tokenize the text, lowercase all words, keep only purely alphabetic tokens, and remove stop words.

In [270]:
# Stop words: NLTK's English list plus boilerplate tokens from the forum pages
stop_list = nltk.corpus.stopwords.words('english')
my_stopwords = ['comments', 'answerAnswerBump']

# Tokens are lowercased before they are filtered, so the custom entries must be
# lowercased too; otherwise a mixed-case entry such as 'answerAnswerBump' can
# never match. A set also makes each membership test O(1).
stop_words = set(stop_list) | {w.lower() for w in my_stopwords}

# Compiled once: accepts tokens made of the letters a-z only
alpha_only = re.compile('^[a-z]+$')

# Tokenization: every thread becomes a list of token lists, one per post
tokenized = [[nltk.word_tokenize(post) for post in thread] for thread in corpus]

# Pre-processing: lowercase, keep alphabetic tokens, drop stop words
cleaned_corpus = []

for thread in tokenized:
    cleaned_thread = []
    for post in thread:
        lowered = (w.lower() for w in post)
        cleaned_thread.append(
            [w for w in lowered if alpha_only.search(w) and w not in stop_words]
        )
    cleaned_corpus.append(cleaned_thread)

cleaned_corpus

[[['way',
   'increase',
   'cpf',
   'amount',
   'buy',
   'one',
   'property',
   'want',
   'use',
   'cpf',
   'amount',
   'buy',
   'property',
   'rather',
   'taking',
   'large',
   'loan',
   'way'],
  ['hi',
   'douglas',
   'keynote',
   'afraid',
   'pump',
   'higher',
   'paying',
   'job',
   'assuming',
   'hit',
   'ceiling',
   'cap',
   'cpf'],
  ['always', 'connect', 'via', 'https', 'simply', 'company', 'hotline'],
  ['remember',
   'plan',
   'finances',
   'properly',
   'overstretch',
   'homework',
   'enjoy',
   'property',
   'investment',
   'journey'],
  ['regards'],
  ['douglas'],
  []],
 [['pick', 'property', 'enbloc', 'potential'],
  ['condos',
   'potential',
   'enough',
   'holding',
   'question',
   'long',
   'much',
   'earn',
   'longthe',
   'timing',
   'depends',
   'two',
   'factors',
   'resident',
   'demographic',
   'market',
   'residents',
   'condo',
   'ultimately',
   'determines',
   'property',
   'goes',
   'see',
   'residents

# Prepare model inputs

Prepare the inputs that the LDA model is trained on: create the dictionary and the bag-of-words term frequencies for each thread.

In [273]:
# Create dictionary: every post of every thread is registered as one document
dictionary = gensim.corpora.Dictionary()
for posts in cleaned_corpus:
    dictionary.add_documents(posts)

# Collapse thread into bow: flatten the posts of each thread into one word list
corpus_bow = [
    [token for post in posts for token in post]
    for posts in cleaned_corpus
]

# Term document frequency: (token id, count) pairs for each thread
tdf = [dictionary.doc2bow(words) for words in corpus_bow]
tdf

[[(0, 2),
  (1, 2),
  (2, 3),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 3),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 2),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 2),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1)],
 [(3, 2),
  (7, 5),
  (11, 2),
  (19, 2),
  (29, 1),
  (42, 2),
  (43, 1),
  (44, 3),
  (45, 2),
  (46, 1),
  (47, 2),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 3),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 3),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 2),
  (69, 1),
  (70, 2),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 3),
  (76, 1),
  (77, 1),
  (78, 2),
  (79, 1),
  (80, 1),
  (81, 2),
  (82, 2),
  (83, 1),
  (84, 1),
  (85, 1),
  (86, 3),

# Build LDA model

In [274]:
# Build LDA model on the per-thread bag-of-words corpus
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=tdf,
    id2word=dictionary,
    num_topics=3,
    random_state=100,      # fixed seed
    update_every=1,        # online updates
    chunksize=3,           # documents per training chunk
    passes=10,             # passes over the corpus
    alpha='auto',          # prior learned from the corpus
    per_word_topics=True,
)

In [275]:
# Print the top keywords of each topic (the model is built with num_topics=3)
topic_keywords = lda_model.print_topics()
print(topic_keywords)

[(0, '0.014*"rah" + 0.014*"renting" + 0.013*"money" + 0.010*"investment" + 0.007*"leverage" + 0.007*"seminars" + 0.007*"means" + 0.007*"buying" + 0.007*"financial" + 0.007*"pay"'), (1, '0.025*"property" + 0.018*"loan" + 0.011*"buying" + 0.010*"take" + 0.009*"one" + 0.009*"point" + 0.008*"much" + 0.007*"long" + 0.007*"depends" + 0.007*"seems"'), (2, '0.031*"property" + 0.024*"douglas" + 0.020*"investment" + 0.014*"company" + 0.014*"homework" + 0.014*"plan" + 0.014*"always" + 0.014*"regards" + 0.013*"overstretch" + 0.013*"simply"')]
