### In this notebook we will perform the word embedding & topic modeling & Cosine Similarity

***we merged the **three** chapters to perform the topic modeling, in order to perform cosine similarity to select which chapter the new input should go with.***

In [2]:
import pandas as pd
import numpy as np
import pickle

# gensim
from gensim import corpora, models, similarities, matutils

# sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF

### Read the data and pickle file

In [3]:
df02 = pd.read_csv('merged_file.csv')

In [4]:
# reading the stop words list with pickle
with open ('stop_words.ob', 'rb') as fp:
    stop_words = pickle.load(fp)

In [5]:
df02.columns

Index(['string_values'], dtype='object')

In [6]:
# Declare a list that is to be converted into a column
ch_no = ['ear_nose', 'musculoskeletal', 'respiratory']
 
# Using 'ch_no' as the column name
# and equating it to the list
df02['Ch_No'] = ch_no

In [7]:
df02

Unnamed: 0,string_values,Ch_No
0,ear nose introduction ear nose rarely prove ex...,ear_nose
1,introduction bone muscle ligament musculos sha...,musculoskeletal
2,introduction distribute addition oxygen remova...,respiratory


### Word Embedding

In [8]:
df02['string_values']

0    ear nose introduction ear nose rarely prove ex...
1    introduction bone muscle ligament musculos sha...
2    introduction distribute addition oxygen remova...
Name: string_values, dtype: object

In [9]:
# Create a CountVectorizer for parsing/counting words
count_vectorizer = CountVectorizer(stop_words=stop_words)

doc_word_cv = count_vectorizer.fit_transform(df02['string_values'])



In [10]:
pd.DataFrame(doc_word_cv.toarray(), index=df02['Ch_No'], columns = count_vectorizer.get_feature_names_out()).head()

Unnamed: 0_level_0,aap,abdomen,abdu,abduct,abducted,abducting,abduction,abductor,abgs,ability,...,ysis,ysitis,ze,zealand,zheng,zinc,zone,zoster,zygote,µm
Ch_No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ear_nose,0,0,0,0,1,0,0,0,0,2,...,1,0,0,0,0,0,0,1,0,0
musculoskeletal,0,1,1,2,2,2,10,1,0,2,...,1,1,0,1,0,0,0,0,1,0
respiratory,1,1,0,0,0,0,0,0,1,5,...,0,0,1,1,1,1,6,0,0,5


In [11]:
# Create a TfidfVectorizer for parsing/counting words
tfidf = TfidfVectorizer(stop_words=stop_words)

doc_word_tfidf = tfidf.fit_transform(df02['string_values'])



In [12]:
pd.DataFrame(doc_word_tfidf.toarray(), index=df02['Ch_No'], columns = tfidf.get_feature_names_out()).head()

Unnamed: 0_level_0,aap,abdomen,abdu,abduct,abducted,abducting,abduction,abductor,abgs,ability,...,ysis,ysitis,ze,zealand,zheng,zinc,zone,zoster,zygote,µm
Ch_No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ear_nose,0.0,0.0,0.0,0.0,0.003086,0.0,0.0,0.0,0.0,0.004793,...,0.003086,0.0,0.0,0.0,0.0,0.0,0.0,0.004058,0.0,0.0
musculoskeletal,0.0,0.003021,0.003972,0.007943,0.006041,0.007943,0.039717,0.003972,0.0,0.004691,...,0.003021,0.003972,0.0,0.003021,0.0,0.0,0.0,0.0,0.003972,0.0
respiratory,0.004692,0.003568,0.0,0.0,0.0,0.0,0.0,0.0,0.004692,0.013855,...,0.0,0.0,0.004692,0.003568,0.004692,0.004692,0.02815,0.0,0.0,0.023458


### Topic Modeling: **LDA**

In [13]:
# Convert sparse matrix of counts to a gensim corpus
corpus = matutils.Sparse2Corpus(doc_word_cv)

In [14]:
id2word = dict((v, k) for k, v in count_vectorizer.vocabulary_.items())

In [15]:
# Create lda model (equivalent to "fit" in sklearn)
lda = models.LdaModel(corpus=corpus, num_topics=3, id2word=id2word, passes=5)

In [16]:
lda.print_topics(3)

[(0,
  '0.000*"abdu" + 0.000*"abdomen" + 0.000*"aap" + 0.000*"perfectly" + 0.000*"perception" + 0.000*"percussed" + 0.000*"percentage" + 0.000*"perforated" + 0.000*"perfor" + 0.000*"perforates"'),
 (1,
  '0.769*"abdomen" + 0.038*"abdu" + 0.017*"aap" + 0.000*"perfectly" + 0.000*"perception" + 0.000*"percussed" + 0.000*"percentage" + 0.000*"perforated" + 0.000*"perfor" + 0.000*"perforates"'),
 (2,
  '0.502*"abdu" + 0.384*"aap" + 0.000*"abdomen" + 0.000*"perfectly" + 0.000*"perception" + 0.000*"percussed" + 0.000*"percentage" + 0.000*"perforated" + 0.000*"perfor" + 0.000*"perforates"')]

### Performing CorEx:

In [18]:
from corextopic import corextopic as ct
from corextopic import vis_topic as vt

words = list(np.asarray(count_vectorizer.get_feature_names_out()))


In [19]:
topic_model = ct.Corex(n_hidden=3, words=words, seed=1)
topic_model.fit(doc_word_cv, words=words, docs=df02['string_values'])



<corextopic.corextopic.Corex at 0x21ec5053be0>

In [20]:
topics = topic_model.get_topics()
for n,topic in enumerate(topics):
    topic_words,_,_ = zip(*topic)
    print('{}: '.format(n) + ','.join(topic_words))

0: aap,normalized,normalize,norm,nontension,nonsmoker,nonpurulent,nonmotile,noncardiac,nodosum
1: jelly,outcome,creating,secr,otherwise,cream,crackling,secured,securely,sedation
2: abdomen,instance,integrity,intensify,interruption,invade,invariably,inversion,involvement,keeping


### Topic Modeling: LSA

In [21]:
lsa = TruncatedSVD(3)
doc_topic = lsa.fit_transform(doc_word_cv)
print(lsa.explained_variance_ratio_)

[0.04530854 0.5561219  0.39856955]


In [22]:
topic_word = pd.DataFrame(lsa.components_.round(3),
             index = ['component'+str(i) for i in range(3)],
             columns = count_vectorizer.get_feature_names_out())

print(topic_word)

              aap  abdomen   abdu  abduct  abducted  abducting  abduction  \
component0  0.001    0.003  0.002   0.004     0.005      0.004      0.018   
component1  0.000   -0.002 -0.002  -0.004    -0.002     -0.004     -0.022   
component2  0.004    0.003 -0.001  -0.002    -0.003     -0.002     -0.009   

            abductor   abgs  ability  ...   ysis  ysitis     ze  zealand  \
component0     0.002  0.001    0.012  ...  0.003   0.002  0.001    0.003   
component1    -0.002  0.000    0.003  ...  0.000  -0.002  0.000   -0.002   
component2    -0.001  0.004    0.013  ... -0.002  -0.001  0.004    0.003   

            zheng   zinc   zone  zoster  zygote     µm  
component0  0.001  0.001  0.006   0.001   0.002  0.005  
component1  0.000  0.000  0.003   0.002  -0.002  0.002  
component2  0.004  0.004  0.021  -0.001  -0.001  0.018  

[3 rows x 5568 columns]


In [23]:
tem_list = [] 
def display_topics(model, feature_names, no_top_words, topic_names=None):
    
    for ix, topic in enumerate(model.components_):
        inner_tem_list = []
       
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
            
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        inner_tem_list.append(", ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
        tem_list.append(inner_tem_list)

In [24]:
result1 = display_topics(lsa, count_vectorizer.get_feature_names_out(), 20)


Topic  0
bone, muscle, ear, otitis, hearing, airway, membrane, bleeding, sur, deformity, hip, obstruction, oxygen, gery, ventilation, cord, brace, attack, breathing, traction

Topic  1
ear, otitis, hearing, nose, bleeding, sinusitis, externa, membrane, obstruction, sinus, abscess, septum, ge, airway, cord, polyp, speech, voice, mouth, packing

Topic  2
ventilation, oxygen, airway, dyspnea, copd, breathing, acidosis, hg, crackle, artery, bronchiectasis, collapse, inspiration, cwp, silicosis, distress, ph, exertion, pleura, empyema


In [25]:
tem_list
final_dic = {}
final_dic["Bone"] = tem_list[0]
final_dic["Ear"] = tem_list[1]
final_dic["Breathing"] = tem_list[2]

In [26]:
final_dic

{'Bone': ['bone, muscle, ear, otitis, hearing, airway, membrane, bleeding, sur, deformity, hip, obstruction, oxygen, gery, ventilation, cord, brace, attack, breathing, traction'],
 'Ear': ['ear, otitis, hearing, nose, bleeding, sinusitis, externa, membrane, obstruction, sinus, abscess, septum, ge, airway, cord, polyp, speech, voice, mouth, packing'],
 'Breathing': ['ventilation, oxygen, airway, dyspnea, copd, breathing, acidosis, hg, crackle, artery, bronchiectasis, collapse, inspiration, cwp, silicosis, distress, ph, exertion, pleura, empyema']}

In [27]:
tem_df = pd.DataFrame.from_dict(final_dic, orient ='index') 
tem_df

Unnamed: 0,0
Bone,"bone, muscle, ear, otitis, hearing, airway, me..."
Ear,"ear, otitis, hearing, nose, bleeding, sinusiti..."
Breathing,"ventilation, oxygen, airway, dyspnea, copd, br..."


In [28]:
# Declare a list that is to be converted into a column
d_name = ['musculoskeletal', 'ear_nose', 'respiratory']
 
# Using 'ch_no' as the column name
# and equating it to the list
tem_df['D_Name'] = d_name

In [29]:
tem_df.columns

Index([0, 'D_Name'], dtype='object')

In [30]:
tem_df = tem_df.rename(columns={0: 'Description'})
tem_df

Unnamed: 0,Description,D_Name
Bone,"bone, muscle, ear, otitis, hearing, airway, me...",musculoskeletal
Ear,"ear, otitis, hearing, nose, bleeding, sinusiti...",ear_nose
Breathing,"ventilation, oxygen, airway, dyspnea, copd, br...",respiratory


In [31]:
tem_df.to_csv('diseases_with_description.csv', index=False)