# Topic Modeling — Approach 1:

## Using LDA 

- Documents that have similar words usually have the same topic
- Documents that have groups of words frequently occurring together usually have the same topic
 - Documents are probability distributions over latent topics
 - Topics are probability distributions over words

In [5]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Load the sample workbook of item descriptions and report (rows, columns).
sample_path = 'bert_sample.xlsx'
Xeeva_data = pd.read_excel(sample_path)
print(Xeeva_data.shape)

(10000, 2)


In [4]:
# Peek at one raw item description (row label 100) to see what the text looks like.
Xeeva_data.loc[100, 'ITEM_NAME']

'1000A Feed Bar Springs - (Medium Load) Color = White / Ivory'

In [6]:
# Bag-of-words counts for every item name.
# max_df=0.8 drops terms present in more than 80% of documents, min_df=2 drops
# terms present in fewer than 2 documents; English stop words are removed.
item_names = Xeeva_data['ITEM_NAME'].values.astype('U')  # force every entry to str
count_vect = CountVectorizer(stop_words='english', min_df=2, max_df=0.8)
doc_term_matrix = count_vect.fit_transform(item_names)

In [9]:
# Show the sparse count matrix summary (documents x vocabulary terms).
doc_term_matrix

<10000x6829 sparse matrix of type '<class 'numpy.int64'>'
	with 64769 stored elements in Compressed Sparse Row format>

In [12]:
from sklearn.decomposition import LatentDirichletAllocation
import random

### Defining LDA model

In [11]:
# Fit a 4-topic LDA model on the count matrix; the fixed seed keeps runs repeatable.
lda_params = dict(n_components=4, random_state=42)
LDA = LatentDirichletAllocation(**lda_params)
LDA.fit(doc_term_matrix)

LatentDirichletAllocation(n_components=4, random_state=42)

In [13]:
# Print 10 randomly chosen vocabulary terms as a sanity check of the vectorizer.
# Build the vocabulary once instead of twice per iteration.
try:
    feature_names = count_vect.get_feature_names_out()
except AttributeError:  # scikit-learn < 1.0 only provides get_feature_names()
    feature_names = count_vect.get_feature_names()

for _ in range(10):
    # randrange excludes the upper bound; randint(0, len(...)) could return
    # len(...) itself and raise IndexError.
    random_id = random.randrange(len(feature_names))
    print(feature_names[random_id])

rcdrbhac00003
lhmaher
147211
601017
segment
0212a
inox
fraser
retention
165175


In [16]:
# Term weights of the first LDA topic: one value per vocabulary term.
# These are unnormalised weights rather than probabilities (values exceed 1).
first_topic = LDA.components_[0, :]
first_topic

array([65.41365053,  0.25513902,  0.25001753, ...,  0.25007349,
        3.13359635,  0.25172162])

In [18]:
# Vocabulary indices of the 10 heaviest terms for the first topic.
# argsort is ascending, so the strongest term is the LAST entry.
ascending_ids = np.argsort(first_topic)
top_topic_words = ascending_ids[-10:]
top_topic_words

array([5453, 1938,   55, 4451, 3684, 4864, 3823, 5142, 6034, 5515],
      dtype=int64)

In [19]:
# Map the top-term indices back to the actual words via the 'count_vect' vocabulary.
# The vocabulary is fetched once (it was rebuilt on every iteration before), and
# the lookup works on both old and new scikit-learn releases.
try:
    feature_names = count_vect.get_feature_names_out()
except AttributeError:  # scikit-learn < 1.0 only provides get_feature_names()
    feature_names = count_vect.get_feature_names()

for word_id in top_topic_words:
    print(feature_names[word_id])


pn
55
01
insert
drum
make
engrave
new
serial
print


In [20]:
# Print the 10 highest-weighted words for each of the four LDA topics
# (ascending order, so the strongest word is last in each list).
try:
    feature_names = count_vect.get_feature_names_out()
except AttributeError:  # scikit-learn < 1.0 only provides get_feature_names()
    feature_names = count_vect.get_feature_names()

for topic_idx, topic in enumerate(LDA.components_):
    # Distinct names for the topic index and the word index; the original
    # reused `i` for both, which made the f-string and the comprehension confusing.
    top_word_ids = topic.argsort()[-10:]
    print(f'Top 10 words for topic #{topic_idx}:')
    print([feature_names[word_id] for word_id in top_word_ids])
    print('\n')

Top 10 words for topic #0:
['pn', '55', '01', 'insert', 'drum', 'make', 'engrave', 'new', 'serial', 'print']


Top 10 words for topic #1:
['pn', 'nf', '32', 'tools', 'gt', 'mfg', 'diam', 'desc', 'drill', '015']


Top 10 words for topic #2:
['best', 'diamond', 'en', 'boring', 'repair', 'bar', 'tools', 'para', 'insert', '015']


Top 10 words for topic #3:
['air', 'desc', 'd2', 'charges', 'oil', 'carbide', 'drive', 'end', 'type', '015']




## Add a column to the original data frame that will store the topic for the text.

In [21]:
# Per-document topic weights from the fitted LDA model:
# one row per item, one column per topic.
topic_values = LDA.transform(doc_term_matrix)
np.shape(topic_values)

(10000, 4)

In [22]:
# Label every item with its dominant topic (column index of the largest weight).
Xeeva_data['Topic'] = np.argmax(topic_values, axis=1)

In [23]:
# First five rows, now including the assigned LDA topic.
Xeeva_data.head(n=5)

Unnamed: 0,ITEM_NAME,CATEGORY_ID,Topic
0,CALIBRACION TRANSDUCER 75 nm,CAPITAL ASSEMBLY,3
1,for pusher whskey,CAPITAL ASSEMBLY,1
2,Stat 40B Press Head Cup to Carrier from Stati...,CAPITAL ASSEMBLY,1
3,TRANSD. Cable (4145097103) scrw,CAPITAL ASSEMBLY,1
4,"ZT200 7,5BAR,13BAR60HZ NUMERO DE SERIE: AIF09...",CAPITAL ASSEMBLY,1


In [24]:
# Size of each labelled category, for comparison with the topic sizes.
Xeeva_data['CATEGORY_ID'].value_counts()

CUTTING TOOLS        5000
CHEMICALS            2000
CAPITAL ASSEMBLY     2000
LOGISTICS SERVICE    1000
Name: CATEGORY_ID, dtype: int64

In [25]:
# Number of items assigned to each LDA topic.
Xeeva_data['Topic'].value_counts()

3    2662
2    2606
1    2415
0    2317
Name: Topic, dtype: int64

## Cross-verifying with original data

In [30]:
# Confirm the column types; 'Topic' is the integer label added from the LDA output.
Xeeva_data.dtypes

ITEM_NAME      object
CATEGORY_ID    object
Topic           int64
dtype: object

In [35]:
# Split out the items assigned to LDA topics 0 and 1 for manual inspection.
is_topic0 = Xeeva_data["Topic"].eq(0)
is_topic1 = Xeeva_data["Topic"].eq(1)
topic0_data = Xeeva_data.loc[is_topic0]
topic1_data = Xeeva_data.loc[is_topic1]

In [48]:
# First 20 items of LDA topic 0, shown next to their true category.
topic0_data.iloc[:20]

Unnamed: 0,ITEM_NAME,CATEGORY_ID,Topic
11,# 5637351,CAPITAL ASSEMBLY,0
12,# 5637377,CAPITAL ASSEMBLY,0
13,# 5637385,CAPITAL ASSEMBLY,0
14,# 5637393,CAPITAL ASSEMBLY,0
15,# 5637406,CAPITAL ASSEMBLY,0
18,#VMHI0201 BOMBA ACEITE,CAPITAL ASSEMBLY,0
27,(1) Knob Purchase Alter,CAPITAL ASSEMBLY,0
30,(1) V-nest,CAPITAL ASSEMBLY,0
34,(2) Keepers,CAPITAL ASSEMBLY,0
44,(51) Serial Pallets,CAPITAL ASSEMBLY,0


In [42]:
# Last 20 items of LDA topic 1, shown next to their true category.
topic1_data.iloc[-20:]

Unnamed: 0,ITEM_NAME,CATEGORY_ID,Topic
9853,DIFERENCIAL 2 P. 25 A DIFERENCIAL 2 P. 25 A,LOGISTICS SERVICE,1
9854,Diferencial 6F15,LOGISTICS SERVICE,1
9855,DIFERENCIAL SUPERINMUNIZ DIFERENCIAL SUPERINMUNIZ,LOGISTICS SERVICE,1
9858,DISC - | DESC: BRAKE DISC (REF. CALVEK),LOGISTICS SERVICE,1
9862,DISCO LAMINADO G-60 DISCO LAMINADO G-60,LOGISTICS SERVICE,1
9876,Domestic Axle Skid,LOGISTICS SERVICE,1
9877,Domestic Axle Skid - HD (Heavy Duty),LOGISTICS SERVICE,1
9878,DOMESTIC AXLE SKID-HD,LOGISTICS SERVICE,1
9879,DOMESTIC AXLE SKIDS,LOGISTICS SERVICE,1
9880,DOMESTIC AXLE SKIDS 78x22,LOGISTICS SERVICE,1


# Approach 2:
## NMF for Topic Modeling 

#### Non-negative matrix factorization (NMF) is also an unsupervised learning technique; it performs clustering as well as dimensionality reduction. It can be used in combination with the TF-IDF scheme to perform topic modeling.

In [55]:
#Importing all the required libraries

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import random

### using tf-idf for vectorization

In [50]:
# TF-IDF weights for every item name, with the same vocabulary filters as the
# count vectorizer (max_df=0.8, min_df=2, English stop words).
# NOTE: this rebinds `doc_term_matrix`; from here on it holds TF-IDF values,
# not the raw counts that were used to fit the LDA model.
item_names = Xeeva_data['ITEM_NAME'].values.astype('U')  # force every entry to str
tfidf_vect = TfidfVectorizer(stop_words='english', min_df=2, max_df=0.8)
doc_term_matrix = tfidf_vect.fit_transform(item_names)

In [52]:
# Show the sparse TF-IDF matrix summary (documents x vocabulary terms).
doc_term_matrix

<10000x6829 sparse matrix of type '<class 'numpy.float64'>'
	with 64769 stored elements in Compressed Sparse Row format>

#### The 10000 documents (item names) are represented in a 6829-dimensional term space

In [53]:
# Print 10 randomly chosen vocabulary terms from the TF-IDF vectorizer.
# Build the vocabulary once instead of twice per iteration.
try:
    feature_names = tfidf_vect.get_feature_names_out()
except AttributeError:  # scikit-learn < 1.0 only provides get_feature_names()
    feature_names = tfidf_vect.get_feature_names()

for _ in range(10):
    # randrange excludes the upper bound; randint(0, len(...)) could return
    # len(...) itself and raise IndexError.
    random_id = random.randrange(len(feature_names))
    print(feature_names[random_id])

volumetric
kick
ordering
asb
desengraxante
eze
ejecutivos
rtv
1160
regal


#### Defining nmf model object

In [57]:
# Fit a 4-component NMF model on the TF-IDF matrix; the fixed seed keeps runs repeatable.
nmf_params = dict(n_components=4, random_state=42)
nmf = NMF(**nmf_params)
nmf.fit(doc_term_matrix)

NMF(n_components=4, random_state=42)

In [60]:
# Vocabulary indices of the ten highest-weighted terms for the first NMF topic.
# NMF components are non-negative weights rather than probabilities; argsort is
# ascending, so the strongest term is the LAST entry.
first_topic = nmf.components_[0, :]
top_topic_words_nmf = np.argsort(first_topic)[-10:]
top_topic_words_nmf

array([3392, 2776, 6612, 4857, 2846, 4451, 2742, 5787, 6475,   68],
      dtype=int64)

In [62]:
# Map the top-term indices of the first NMF topic back to words.
# The vocabulary is fetched once (it was rebuilt on every iteration before), and
# the lookup works on both old and new scikit-learn releases.
try:
    feature_names = tfidf_vect.get_feature_names_out()
except AttributeError:  # scikit-learn < 1.0 only provides get_feature_names()
    feature_names = tfidf_vect.get_feature_names()

for word_id in top_topic_words_nmf:
    print(feature_names[word_id])

cutting
best
unit
maher
boring
insert
bar
repair
tools
015


#### printing the top words for all topics

In [63]:
# Print the 10 highest-weighted words for each NMF topic
# (ascending order, so the strongest word is last in each list).
try:
    feature_names = tfidf_vect.get_feature_names_out()
except AttributeError:  # scikit-learn < 1.0 only provides get_feature_names()
    feature_names = tfidf_vect.get_feature_names()

for topic_idx, topicNMF in enumerate(nmf.components_):
    # Distinct names for the topic index and the word index; the original
    # reused `i` for both, which made the f-string and the comprehension confusing.
    top_word_ids = topicNMF.argsort()[-10:]
    print(f'Top 10 words for topic #{topic_idx}:')
    print([feature_names[word_id] for word_id in top_word_ids])
    print('\n')

Top 10 words for topic #0:
['cutting', 'best', 'unit', 'maher', 'boring', 'insert', 'bar', 'repair', 'tools', '015']


Top 10 words for topic #1:
['ro', 'perlube', 'armor', 'perkool', '5464', 'dubois', 'condat', '013658', 'condaforge', 'bol']


Top 10 words for topic #2:
['tool', 'block', 'build', 'calvek', 'ref', 'pn', 'gt', 'type', 'mfg', 'desc']


Top 10 words for topic #3:
['11', 'tools', '16', 'bit', 'carbide', 'shank', 'body', 'hss', '32', 'drill']




In [70]:
# Per-document NMF topic weights, then the dominant topic for every item.
# NOTE: `topic_values` is rebound here and no longer holds the LDA output.
topic_values = nmf.transform(doc_term_matrix)
Xeeva_data['TopicNMF'] = np.argmax(topic_values, axis=1)


In [69]:
# NMF-only view of the data: same rows, without the LDA 'Topic' column.
Xeeva_dataNMF = Xeeva_data.drop(columns='Topic')


In [71]:
# First five rows with the NMF topic label.
Xeeva_dataNMF.head()

Unnamed: 0,ITEM_NAME,CATEGORY_ID,TopicNMF
0,CALIBRACION TRANSDUCER 75 nm,CAPITAL ASSEMBLY,2
1,for pusher whskey,CAPITAL ASSEMBLY,2
2,Stat 40B Press Head Cup to Carrier from Stati...,CAPITAL ASSEMBLY,2
3,TRANSD. Cable (4145097103) scrw,CAPITAL ASSEMBLY,2
4,"ZT200 7,5BAR,13BAR60HZ NUMERO DE SERIE: AIF09...",CAPITAL ASSEMBLY,2


In [74]:
# Split out the items assigned to NMF topics 0 and 1 for manual inspection.
NMF_Topic0 = Xeeva_dataNMF.query("TopicNMF == 0")
NMF_Topic1 = Xeeva_dataNMF.query("TopicNMF == 1")

In [75]:
# First 20 items of NMF topic 0, shown next to their true category.
NMF_Topic0.iloc[:20]

Unnamed: 0,ITEM_NAME,CATEGORY_ID,TopicNMF
5,"""6126061 TPF1588R01 IN2505 (5/8"""" INSERT)""",CAPITAL ASSEMBLY,0
9,"""Repair SERVICE TO DEBURR CONVEYOR CONVEYOR BR...",CAPITAL ASSEMBLY,0
10,"""SERVICIO DE REPARACION A TRANSPORTADOR DE REB...",CAPITAL ASSEMBLY,0
11,# 5637351,CAPITAL ASSEMBLY,0
12,# 5637377,CAPITAL ASSEMBLY,0
13,# 5637385,CAPITAL ASSEMBLY,0
14,# 5637393,CAPITAL ASSEMBLY,0
15,# 5637406,CAPITAL ASSEMBLY,0
19,#VMHI0248 CIL EYC INF,CAPITAL ASSEMBLY,0
20,#VMHI0248 CIL EYEC INF,CAPITAL ASSEMBLY,0


In [77]:
# First 15 items of NMF topic 1, shown next to their true category.
NMF_Topic1.iloc[:15]

Unnamed: 0,ITEM_NAME,CATEGORY_ID,TopicNMF
2832,Armor Protective Dry Coat RP. BOL #8020476.,CHEMICALS,1
3255,CCMP-D CONDAT CONDACLEAN MP,CHEMICALS,1
3570,CONDAFORGE 635 CONDAFORGE 635,CHEMICALS,1
3571,CONDAT CONDACLEAN MP,CHEMICALS,1
3573,Condat condaforge. BOL #47418.,CHEMICALS,1
3574,Condat condaforge. BOL #47902.,CHEMICALS,1
3575,Condat condaforge. bol=46942,CHEMICALS,1
3577,CONDAT LATEX PAINT PUMP AND,CHEMICALS,1
3578,CONDAT MECAGREEN 450,CHEMICALS,1
3876,dubios / perkool,CHEMICALS,1


<div class="alert alert-block alert-success">
<b>TopicModeling:</b> I have used two models for topic segmentation. LDA is the model predominantly used for topic modeling; I have also tried NMF. Whichever model gives the better classification on our data samples can be used for our data.
</div>