In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import pickle
from numpy import dot
from numpy.linalg import norm

In [7]:
domain_stop_word = ["patient", "may", "disease", "cause", "treatment", "also", "symptom", "usually", "sign",
                "diagnosis", "result", "pain", "include", "pressure", "lung", "pulmonary", "respiratory",
                "chest", "fluid", "complication", "change", "blood", "infection", "therapy", "prevent",
                "acute", "care", "child", "level", "air", "use", "severe", "help", "used", "exercise",
                "normal", "incidence", "pneumonia","tissue", "show", "chronic", "failure", "cast", "increased",
                "monitor", "hypoxemia", "produce", "edema", "increase", "space", "occurs", "cough", "alveolar", 
                "heart", "pathophysiology", "sputum", "provide", "decreased", "pneumothorax", "test", "special",
                "tube", "condition", "common", "surgery","secretion", "fibrosis", "disorder", "pa", "area", "form",
                "cell", "skin", "drainage", "tb", "year", "commonly", "check", "teach", "rest", "watch", "encourage", 
                "underlying", "consideration", "et", "early", "hour", "family", "need", "effusion", "body", "drug", "support", 
                "rate", "syndrome", "requires", "inflammation", "abg", "side", "infant", "however", "upper", "cor", "pulmonale",
                 "ventilator", "mechanical", "breath", "maintain" , "foot", "day", "bed", "parent", "especially", "fever", "culture",
                'system', 'within', 'factor', 'amount', 'death', 'movement', 'progress', 'volume', 'one', 'stage', 'report',
                'avoid', 'respiration', 'trauma', 'occur', 'atelectasis', 'hand', 'includes', 'weight', 'tendon', 'hypertension', 
                'le', 'time', 'lead', 'damage', 'causing', 'require', 'activity', 'injury', 'risk', 'mm', 'measure', 'examination',
                'nerve', 'stress', 'make', 'al', 'see', 'decrease', 'age', 'hg''case', 'month', 'coughing', 'develops', 'formation', 
                'without', 'site', 'every', 'reduce', 'relieve', 'effect','percussion', 'ordered', 'develop', 'affect', 'loss', 'flow',
                'lesion', 'technique', 'exposure', 'gas', 'finding', 'procedure', 'begin', 'wall', 'immediately', 'type', 'response', 
                'position', 'needed', 'administer', 'control', 'ass', 'increasing', 'although', 'tell', 'output', 'give', 'analysis',
                'history', 'often' ,'week', 'home', 'perform','function', 'typically', 'frequently', 'adult', 'indicate', 'administration',
                'explain', 'using', 'suggest', 'called', 'center', 'head', 'people', 'resulting', 'including', 'period', 'feature'
                   ]

In [6]:
# read data file
file_path = 'diseases_with_description.csv'
df = pd.read_csv(file_path)
print(df.head())

                                         Description  \
0  Heart, chest pain, shortness of breath, fatigu...   
1  Respiratory system, lungs, breathing difficult...   
2  Neurological system, brain, spinal cord, nerve...   
3  Gastrointestinal system, stomach, intestines, ...   
4  Liver, bile ducts, gallbladder, hepatobiliary ...   

                       D_Name  
0              Cardiovascular  
1         Respiratory_Disease  
2         Neurologic_Disorder  
3  Gastrointestinal_Disorders  
4     Hepatobiliary_Disorders  


In [9]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# example_sent = """This is a sample sentence,
# 				showing off the stop words filtration."""

stop_words = set(stopwords.words('english'))

# word_tokens = word_tokenize(example_sent)
# # converts the words in word_tokens to lower case and then checks whether
# #they are present in stop_words or not
# filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
# #with no lower case conversion
# filtered_sentence = []

# for w in word_tokens:
# 	if w not in stop_words:
# 		filtered_sentence.append(w)

# print(word_tokens)
# print(filtered_sentence)

def clean_text_func(text):
    
    """ this function clean & pre-process the data  """

    text = str(text)
    text = text.lower()
    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!?.\/'+]", " ", text)
    text = re.sub(r"\+", " ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ", text)
    text = re.sub(r"\?", " ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"[0-9]", " ", text)
    final_text = ""
    for x in text.split():
        if x not in domain_stop_word and x not in stop_words:
            final_text = final_text + x  +" "
    return final_text

df['Description'] = df['Description'].apply(lambda x: clean_text_func(x))
df.head()

Unnamed: 0,Description,D_Name
0,shortness fatigue palpitations high cholestero...,Cardiovascular
1,lungs breathing difficulties shortness congest...,Respiratory_Disease
2,neurological brain spinal cord nerves nervous ...,Neurologic_Disorder
3,gastrointestinal stomach intestines digestive ...,Gastrointestinal_Disorders
4,liver bile ducts gallbladder hepatobiliary hep...,Hepatobiliary_Disorders


## Words Embedding

In [10]:
cv = CountVectorizer(stop_words="english")
cv_tfidf = TfidfVectorizer(stop_words="english")

X = cv.fit_transform(list(df.loc[:, 'Description' ]))
X_tfidf = cv_tfidf.fit_transform(list(df.loc[:, 'Description' ]))

In [11]:
df_cv = pd.DataFrame(X.toarray() , columns=cv.get_feature_names_out())
df_tfidf = pd.DataFrame(X_tfidf.toarray() , columns=cv_tfidf.get_feature_names_out())

In [12]:
print(df_cv.shape)
cosine = lambda v1 , v2 : dot(v1 , v2) / (norm(v1) * norm(v2))

(14, 849)


## Cosine Similarity

In [13]:
new_text = ["dizziness loss of balance  vomiting tinnitus of hearing in the high frequency range in one ear difficulty focusing your eyes "]
def finder(new_text):
    new_text_cv = cv.transform(new_text).toarray()[0]
    new_text_tfidf = cv_tfidf.transform(new_text).toarray()[0]
    ans = 0
    maxxer = float('-inf')
    for chpter_number in range(int(df.shape[0])):

        # print(f"This is chpter number : {chpter_number} ")
        if cosine( df_cv.iloc[chpter_number]  , new_text_cv ) > maxxer:
            ans = chpter_number
            maxxer = cosine( df_cv.iloc[chpter_number]  , new_text_cv )
        
        # print(f"Cosin cv :    { cosine( df_cv.iloc[chpter_number]  , new_text_cv )} ")
        # print(f"Cosin TFIDF : { cosine( df_tfidf.iloc[chpter_number]  , new_text_tfidf) } ")
    for index in df.index:
        if index == ans:
            print(df['D_Name'][index])
finder(new_text)
finder(["Difficulty sleeping or staying asleep Fever Fluid draining from ear  Loss of balance. Hearing difficulties. Ear pain"])
finder(["Difficulty breathing, fever, shortness in breath"])
finder(["Persistent cough, sore throat, runny nose, fatigue, low-grade fever, chest congestion, difficulty breathing"])
finder(["itching, blisters, rashes"])
finder(["feeling pain in left side of chest"])
finder(["I feel nauseated"])

ENT_Disorders
ENT_Disorders
Respiratory_Disease
ENT_Disorders
Skin_Disorders
Cardiovascular
Cardiovascular


  cosine = lambda v1 , v2 : dot(v1 , v2) / (norm(v1) * norm(v2))
