#### Problem Statement: Topic Modeling

Extract relevant words to identify the business a company is engaged in, depending on the description.

In [147]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
import string
import gensim
from gensim import corpora
import pickle as pkl

In [148]:
# read json file into dataframe

df = pd.read_json('data.json')

In [149]:
# removing newline character \n at the beginning of each line in about column

df['about'] = df['about'].apply(lambda x: x[1:])

In [150]:
df.head()

Unnamed: 0,about,title
0,ASKPL (erstwhile KE Technical Textiles Pvt Ltd...,Amer-Sil Ketex Private Limited
1,"Set up in 2016, AA is setting up a non-basmati...",Amrit Agro
2,"5 Core is part of the Five Core group, which m...",5 Core Acoustics Private Limited
3,Incorporated in 1986 as Vora Packaging Private...,3D Technopack Limited
4,"Established in May, 2015, 2Getherments Infra P...",2Getherments Infra Private Limited


In [151]:
# download stopwords and wordnet

nltk.download("stopwords")
nltk.download("wordnet")

# nltk.download("dpt")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/himani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/himani/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [152]:
# Define function to remove proper nouns and numerics

def remove_proper_nouns(sentence):
    tagged_sentence = nltk.tag.pos_tag(sentence.split())
    edited_sentence = [word for word,tag in tagged_sentence if tag != 'NNP' and tag != 'NNPS']
    
    return(' '.join(edited_sentence))

In [153]:
# Remove proper nouns
df['about2'] = df['about'].apply(remove_proper_nouns)

In [154]:
df.head(20)

Unnamed: 0,about,title,about2
0,ASKPL (erstwhile KE Technical Textiles Pvt Ltd...,Amer-Sil Ketex Private Limited,"incorporated in 1991, manufactures technical t..."
1,"Set up in 2016, AA is setting up a non-basmati...",Amrit Agro,"up in 2016, is setting up a non-basmati parboi..."
2,"5 Core is part of the Five Core group, which m...",5 Core Acoustics Private Limited,"5 Core is part of the group, which manufacture..."
3,Incorporated in 1986 as Vora Packaging Private...,3D Technopack Limited,Incorporated in 1986 as was taken over by the ...
4,"Established in May, 2015, 2Getherments Infra P...",2Getherments Infra Private Limited,"Established in 2015, 2Getherments is engaged i..."
5,Delhi-based JPP is a partnership firm set up i...,J P Polymers,Delhi-based is a partnership firm set up in 19...
6,JPRM was setup in 1994 as proprietorship conce...,J. P. Rice Mills - Fatehabad,was setup in 1994 as proprietorship concern of...
7,Incorporated in 1987 and promoted by Mr. Chand...,20 Microns Limited,Incorporated in 1987 and promoted by 20 manufa...
8,Incorporated in 1987 and promoted by Mr. Chand...,20 Microns Nano Minerals Limited,Incorporated in 1987 and promoted by 20 manufa...
9,"Established in 2016, 70 Microns is a partnersh...",70 Microns,"Established in 2016, 70 is a partnership firm ..."


In [155]:
# compile 'about' of all companies in one list

raw_corpus = df['about2'].tolist()

In [156]:
# remove stopwords, punctuations, numerics,lemmatizer (to get stem of a word e.g. stem for 'caring' is 'care')
# remove numbers, but not words that contain numbers, and words with only one character

stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

# add custom words to stop

stop.update(('r','pvt','ltd','mr','sri', 'crore', 'lakhs', 'crores', 'lakh', 'net'
             ,'visã','m','wife','promoted','company','private','limited','income'))
# print(stop)

# clean raw corpus

def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
#     num_free = " ".join([token for token in punc_free.split() if not token.isnumeric()])
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

# clean data

processed_corpus = [clean(doc).split() for doc in raw_corpus] 

#### Preparing Document- Term matrix

In [157]:
# Creating the term dictionary of processed_corpus, where every unique term is assigned an index.

dictionary = corpora.Dictionary(processed_corpus)

In [158]:
# Converting processed_corpus into Document Term Matrix using dictionary prepared above.

doc_term_matrix = [dictionary.doc2bow(doc) for doc in processed_corpus]

#### Running Latent Dirichlet Allocation (LDA)model

uses matrix factorization

In [123]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

np.random.seed(1) # setting random seed to get the same results each time.

# Running and Trainig LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=50, id2word = dictionary, passes=50)

In [124]:
# save model to file

pkl.dump(ldamodel, open('lda_model_50_wo_proper_noun.pkl', 'wb'))
# pkl.dump(ldamodel, open('lda_model_50_wo_proper_noun_numerics.pkl', 'wb'))
# pkl.dump(ldamodel, open('lda_model_100_wo_proper_noun_numerics.pkl', 'wb'))


In [159]:
# load model

ldamodel = pkl.load(open('lda_model_50_wo_proper_noun.pkl', 'rb'))
# ldamodel = pkl.load(open('lda_model_100_wo_proper_noun_numerics.pkl', 'rb'))

In [160]:
topics_prob = ldamodel.print_topics(num_topics=100, num_words=5)
print(topics_prob[:2])

[(0, '0.137*"operation" + 0.065*"commenced" + 0.059*"incorporated" + 0.050*"commercial" + 0.041*"company"'), (1, '0.032*"asset" + 0.024*"bank" + 0.022*"interest" + 0.020*"total" + 0.014*"net"')]


In [161]:
topicid_keywords_map = {}
for w in topics_prob:
    topicid_keywords_map[w[0]] = [x.split('*')[1].replace("\"",'').strip() for x in w[1].split('+')]

In [162]:
topicid_keywords_map

{0: ['operation', 'commenced', 'incorporated', 'commercial', 'company'],
 1: ['asset', 'bank', 'interest', 'total', 'net'],
 2: ['sheet', 'book', 'product', 'deal', 'variety'],
 3: ['dealer', 'authorised', 'company', 'vehicle', 'incorporated'],
 4: ['facility', '0', 'total', 'bank', 'million'],
 5: ['school', 'run', 'society', 'surplus', 'agent'],
 6: ['million', 'sale', 'profit', '201415', 'year'],
 7: ['estate', 'project', 'real', 'residential', 'director'],
 8: ['revenue', 'fiscal', '2015', 'profit', '2016'],
 9: ['interest', 'net', 'product', 'expense', 'wheat'],
 10: ['product', 'group', 'brand', 'food', 'operates'],
 11: ['distributor', 'range', 'product', 'automobile', 'wide'],
 12: ['income', 'fiscal', 'operating', 'profit', '2016'],
 13: ['manufacture', 'chemical', 'pharmaceutical', 'company', 'manufacturing'],
 14: ['one', 'district', 'rated', 'crisil', 'five'],
 15: ['developing', 'education', 'develops', 'complex', 'land'],
 16: ['cotton', 'seed', 'oil', 'family', 'unit'],


In [163]:
# get topics for docs
    
# all_topics = ldamodel.get_document_topics(doc_term_matrix, per_word_topics=True)

# for doc_topics, word_topics, phi_values in all_topics:
#     print('New Document \n')
#     print ('Document topics:', ' '.join(d[doc_topics[0][0]]))
# #     print ('Word topics:', word_topics)
# #     print ('Phi values:', phi_values)
#     print(" ")
#     print('-------------- \n')

In [164]:
# get topics for docs

for doc in doc_term_matrix[:1]:
    print([(topicid_keywords_map[x[0]],x[1]) for x in sorted(ldamodel.get_document_topics(doc), key=lambda x: x[1], reverse=True)])

[(['company', 'manufacturing', 'incorporated', 'family', 'manufacture'], 0.46705792888585573), (['manufacture', 'yarn', 'fabric', 'garment', 'company'], 0.19453918443584528), (['marketing', 'refinery', 'pipeline', '20000', 'research'], 0.17692524891602737), (['company', 'limited', 'private', 'reconstituted', 'name'], 0.09070840699304146)]


In [165]:
# print(ldamodel.get_document_topics(doc_term_matrix[0]))

In [166]:
# get topics for docs

for idx, doc in enumerate(doc_term_matrix[:10]):
    print(df.iloc[idx]['about'])
    print('         ')
    print(processed_corpus[idx])
    print('         ')
    print([(topicid_keywords_map[x[0]],x[1]) for x in sorted(ldamodel[doc], key=lambda x: x[1], reverse=True)][0])
    print('         ')
    print('---------')
    print('         ')

ASKPL (erstwhile KE Technical Textiles Pvt Ltd), incorporated in 1991, manufactures technical textiles from fiberglass, polyester, and other synthetic materials. Its operations are managed by Mr. Sukumar Roy and Yashwant Roy. Its facility is in Kharagpur, West Bengal.
         
['incorporated', '1991', 'manufacture', 'technical', 'textile', 'fiberglass', 'polyester', 'synthetic', 'material', 'operation', 'managed', 'facility']
         
(['company', 'manufacturing', 'incorporated', 'family', 'manufacture'], 0.46789333718377524)
         
---------
         
Set up in 2016, AA is setting up a non-basmati parboiled rice mill with capacity of 3 tph at Kurud, (Chhattisgarh). Operations are managed by Mrs. Hemeshwari Bhaghel and her husband, Mr Dharmendra Chandrakar. Production is expected to commence from April 2017.AA is expected to generate profit of Rs 3 lakh on revenue of Rs 11 crore in fiscal 2018.
         
['2016', 'setting', 'nonbasmati', 'parboiled', 'rice', 'mill', 'capacity', '3