--> Required packages for LDA Implementation

In [1]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
import xlrd

tokenizer = RegexpTokenizer(r'\w+')

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

--> extracting data from the excel sheets and storing it in a dictionary.

In [2]:
workbook = xlrd.open_workbook('master_boq_from_table.xls')
worksheet = workbook.sheet_by_name('Sheet1')

columns = []
n = worksheet.ncols

for i in range(0, n):
    columns.append(worksheet.cell(0, i))

boq_codes = []
for i in range(0, worksheet.nrows):
    text = str(worksheet.cell(i, 0))
    boq_codes.append(text[5:])
    
description_dict = {}
for i in range(1, worksheet.nrows):
    description = str(worksheet.cell(i, 2))
    description_dict[boq_codes[i]] = description[5:]

for each in description_dict.keys():
    print(each)
    print(description_dict[each])

'01.01'
'Site Clearance Clearing & Grubbing'
'01.02'
'Site Clearance Removal of stumps and roots'
'01.02.a'
'Site Clearance Removal of stumps and roots 300mm to 600mm'
'01.02.b'
'Site Clearance Removal of stumps and roots 600mm to 900mm'
'01.02.c'
'Site Clearance Removal of stumps and roots 900mm to 1800mm'
'01.02.d'
'Site Clearance Removal of stumps and roots 1800mm and above'
'01.03'
'Site Clearance Dismantling of unserviceable material'
'01.03.a'
'Site Clearance Dismantling of unserviceable material Brick Masonry '
'01.03.b'
'Site Clearance Dismantling of unserviceable material PCC'
'01.03.c'
'Site Clearance Dismantling of unserviceable material RCC'
'01.03.d'
'Site Clearance Dismantling of unserviceable material Stone Masonry'
'01.03.e'
'Site Clearance Dismantling of unserviceable material Hand rail'
'01.03.f'
'Site Clearance Dismantling of unserviceable material Kilometer Stones'
'01.03.g'
'Site Clearance Dismantling of unserviceable material Hectometer Stones'
'01.03.h'
'Site Cle

--> Creating a list containing master BOQ descriptions

In [3]:
document_set = []
for each in description_dict.keys():
    document_set.append(description_dict[each])
    

--> Preprossing the descriptions and creating a document-term matrix for converting into LDA model.

In [4]:
stop_words = set(stopwords.words('english'))

# list for tokenized documents in loop
texts = []

# loop through document list
for i in document_set:
    
    # clean and tokenize document string
    raw = str(i).lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in stop_words]
    
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    
    # add tokens to list
    texts.append(stemmed_tokens)

for each in texts:
    print(each)

['site', 'clearanc', 'clear', 'grub']
['site', 'clearanc', 'remov', 'stump', 'root']
['site', 'clearanc', 'remov', 'stump', 'root', '300mm', '600mm']
['site', 'clearanc', 'remov', 'stump', 'root', '600mm', '900mm']
['site', 'clearanc', 'remov', 'stump', 'root', '900mm', '1800mm']
['site', 'clearanc', 'remov', 'stump', 'root', '1800mm']
['site', 'clearanc', 'dismantl', 'unservic', 'materi']
['site', 'clearanc', 'dismantl', 'unservic', 'materi', 'brick', 'masonri']
['site', 'clearanc', 'dismantl', 'unservic', 'materi', 'pcc']
['site', 'clearanc', 'dismantl', 'unservic', 'materi', 'rcc']
['site', 'clearanc', 'dismantl', 'unservic', 'materi', 'stone', 'masonri']
['site', 'clearanc', 'dismantl', 'unservic', 'materi', 'hand', 'rail']
['site', 'clearanc', 'dismantl', 'unservic', 'materi', 'kilomet', 'stone']
['site', 'clearanc', 'dismantl', 'unservic', 'materi', 'hectomet', 'stone']
['site', 'clearanc', 'dismantl', 'unservic', 'materi', 'granular', 'surfac']
['site', 'clearanc', 'dismantl', '

In [46]:
# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)

# print(dictionary)
# for each in dictionary:
#     print(each)
#     print(dictionary[each])

# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

# for content in corpus:
#     print(content)

[(0, 1), (1, 1), (2, 1), (3, 1)]
[(0, 1), (1, 1), (4, 1), (5, 1), (6, 1)]
[(0, 1), (1, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)]
[(0, 1), (1, 1), (4, 1), (5, 1), (6, 1), (8, 1), (9, 1)]
[(0, 1), (1, 1), (4, 1), (5, 1), (6, 1), (9, 1), (10, 1)]
[(0, 1), (1, 1), (4, 1), (5, 1), (6, 1), (10, 1)]
[(0, 1), (1, 1), (11, 1), (12, 1), (13, 1)]
[(0, 1), (1, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1)]
[(0, 1), (1, 1), (11, 1), (12, 1), (13, 1), (16, 1)]
[(0, 1), (1, 1), (11, 1), (12, 1), (13, 1), (17, 1)]
[(0, 1), (1, 1), (11, 1), (12, 1), (13, 1), (15, 1), (18, 1)]
[(0, 1), (1, 1), (11, 1), (12, 1), (13, 1), (19, 1), (20, 1)]
[(0, 1), (1, 1), (11, 1), (12, 1), (13, 1), (18, 1), (21, 1)]
[(0, 1), (1, 1), (11, 1), (12, 1), (13, 1), (18, 1), (22, 1)]
[(0, 1), (1, 1), (11, 1), (12, 1), (13, 1), (23, 1), (24, 1)]
[(0, 1), (1, 1), (11, 1), (12, 1), (13, 1), (25, 1), (26, 1)]
[(0, 1), (1, 1), (24, 1), (25, 1), (27, 1), (28, 1)]
[(0, 1), (1, 1), (11, 1), (13, 1), (29, 1), (30, 1)]
[(0, 1), (1, 1

In [47]:
# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=12, id2word = dictionary, passes=20)

In [48]:
# ldamodel.print_topics(num_topics=12, num_words=2)

for each in ldamodel.print_topics(num_topics=12, num_words=4):
    print(each)


(0, '0.123*"bridg" + 0.116*"vup" + 0.114*"pup" + 0.113*"cup"')
(1, '0.159*"bear" + 0.078*"pot" + 0.077*"ptfe" + 0.075*"rob"')
(2, '0.072*"repair" + 0.055*"mainten" + 0.055*"work" + 0.054*"reinforc"')
(3, '0.060*"site" + 0.060*"clearanc" + 0.057*"dismantl" + 0.057*"electr"')
(4, '0.100*"stone" + 0.074*"kilomet" + 0.044*"misc" + 0.043*"safeti"')
(5, '0.224*"signag" + 0.186*"safeti" + 0.184*"misc" + 0.053*"board"')
(6, '0.175*"work" + 0.103*"drainag" + 0.087*"day" + 0.076*"protect"')
(7, '0.135*"pile" + 0.075*"vertic" + 0.066*"bore" + 0.052*"diamet"')
(8, '0.161*"work" + 0.067*"electr" + 0.065*"protect" + 0.065*"drainag"')
(9, '0.070*"bridg" + 0.054*"fli" + 0.051*"over" + 0.051*"rob"')
(10, '0.101*"rcc" + 0.073*"0" + 0.066*"concret" + 0.049*"rob"')
(11, '0.081*"barrier" + 0.075*"crash" + 0.043*"culvert" + 0.027*"beam"')


--> Assigns the topics to the documents in corpus


In [22]:
lda_corpus = ldamodel[corpus]

# for each in lda_corpus:
#     print(each)

[(0, 0.0166666671147923), (1, 0.416668053234174), (2, 0.4166652795882923), (3, 0.016666666669256779), (4, 0.016666666675679304), (5, 0.01666666667703106), (6, 0.016666666671668079), (7, 0.0166666666860457), (8, 0.016666666671983812), (9, 0.01666666666934527), (10, 0.016666666669459269), (11, 0.016666666672271943)]
[(0, 0.51410981936071798), (1, 0.34700129173848338), (2, 0.013888888889961351), (3, 0.013888888889417332), (4, 0.013888888890261847), (5, 0.013888888890485349), (6, 0.013888888889651401), (7, 0.013888888891865738), (8, 0.01388888889065896), (9, 0.01388888888930199), (10, 0.013888888889325149), (11, 0.013888888889869682)]
[(0, 0.55803396438756059), (1, 0.33779936893179824), (2, 0.010416666667934858), (3, 0.010416666667243273), (4, 0.010416666668413068), (5, 0.01041666666868162), (6, 0.010416666667637063), (7, 0.010416666670419961), (8, 0.010416666668072404), (9, 0.010416666667188369), (10, 0.010416666667237184), (11, 0.010416666667813629)]
[(0, 0.41343452544779496), (1, 0.4823

[(9, 0.92361081931792754)]
[(9, 0.92361083258388688)]
[(9, 0.92361082699916297)]
[(9, 0.90833294170064449)]
[(9, 0.92948695116856084)]
[(9, 0.92948695573258899)]
[(9, 0.92948695211514543)]
[(0, 0.14163039450734083), (9, 0.79426701151272072)]
[(6, 0.083373921171808349), (9, 0.85252323526564144)]
[(9, 0.92948695321237507)]
[(9, 0.92948694323222114)]
[(9, 0.92948695500431766)]
[(9, 0.92948695187121144)]
[(9, 0.92948695050409091)]
[(9, 0.92948695348812782)]
[(9, 0.92948695358408662)]
[(6, 0.083371375391210034), (9, 0.85252579013268404)]
[(1, 0.10836298731612951), (9, 0.80830323194599596)]
[(1, 0.083349480522455194), (9, 0.8525476490381767)]
[(1, 0.08334949313281334), (9, 0.85254768912602685)]
[(1, 0.083349474194668816), (9, 0.85254770333529506)]
[(9, 0.908332941415957)]
[(9, 0.9294869450918174)]
[(9, 0.92948695480936816)]
[(9, 0.9294869543016836)]
[(9, 0.92948694829099965)]
[(9, 0.92948694886657113)]
[(6, 0.083373917097879821), (9, 0.85252327080780221)]
[(3, 0.089915655960012553), (9, 0.84

--> Inputing Project BOQ and preprocessing the input BOQs'.


--> Converting the input BOQs into document term matrix and testing them with the pretrained LDAModel.

In [51]:
stop_words = set(stopwords.words('english'))

# id2word = gensim.corpora.Dictionary()

query_corpus = []
processed_queries = []
query1 = input("enter a Project BOQ : ")
query2 = input("enter another Project BOQ: ")

query_corpus.append(query1)
query_corpus.append(query2)
# print(query_corpus)

for query in query_corpus:
    raw = str(query).lower()
    tokens = tokenizer.tokenize(raw)
    # remove stop words from query
    stopped_query = [i for i in tokens if not i in stop_words]
    # stem query
    processed_query = [p_stemmer.stem(i) for i in stopped_query]
    processed_queries.append(processed_query)
    
# print(processed_queries)

# turn our tokenized documents into a id <-> term dictionary
query2id = corpora.Dictionary(processed_queries)
print(query2id)

# convert tokenized documents into a document-term matrix
query_corpus = [query2id.doc2bow(processed_query) for processed_query in processed_queries]
# for each in query_corpus:
#     print(each)

# assigns topics to the input porject BOQs'
topics_query = ldamodel[query_corpus]
for each_query in topics_query:
    print("the topics and their probability distributions in the Project BOQ are : ")
    print(each_query)

enter a Project BOQ : Footpath at Gap Slab location is to be repair
enter another Project BOQ: To plaster the superstructure with polymer modified cement mortar
Dictionary(11 unique tokens: ['footpath', 'gap', 'slab', 'locat', 'repair']...)
the topics and their probability distributions in the Project BOQ are : 
[(0, 0.34721657816417861), (1, 0.013888888893205961), (2, 0.013889232232989339), (3, 0.51389418951183408), (4, 0.013888888906346089), (5, 0.013888888897079869), (6, 0.013888888899300547), (7, 0.013888888893766113), (8, 0.013888888900436143), (9, 0.013888888897375169), (10, 0.013888888898020106), (11, 0.013888888905467812)]
the topics and their probability distributions in the Project BOQ are : 
[(0, 0.011904761905205586), (1, 0.011904761905118475), (2, 0.011904761905641565), (3, 0.8690476190387646), (4, 0.011904761906206814), (5, 0.01190476190543931), (6, 0.011904761905623021), (7, 0.011904761905172196), (8, 0.011904761905714375), (9, 0.011904761905461973), (10, 0.0119047619055