In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import nltk
import spacy
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Download stopwords and wordnet data from NLTK
nltk.download('stopwords')
nltk.download('wordnet')




# Load spacy's English model
nlp = spacy.load('en_core_web_lg')

# NLTK lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

  from .autonotebook import tqdm as notebook_tqdm
2023-08-15 14:49:36.069612: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package stopwords to /Users/ivan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ivan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def preprocess(text):
    # Tokenization using spaCy
    doc = nlp(text)
    
    # Lemmatization using NLTK's WordNet lemmatizer
    lemmatized = [lemmatizer.lemmatize(token.text) for token in doc]
    
    # Removing stopwords, non-alphabetic words and lowercasing
    clean_text = ' '.join([word.lower() for word in lemmatized if word.lower() not in stop_words and word.isalpha()])
    
    return clean_text

In [16]:
def cleaning_process(df,syms='symptoms',des='description'):
    # Removing duplicates
    df = df.drop_duplicates()
    # Handle missing values - fill NaN with empty string in this case
    df = df.fillna("")
    df['processed_symptoms'] = df[syms].apply(preprocess)
    df['processed_description'] = df[des].apply(preprocess)

    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix_symptoms = tfidf_vectorizer.fit_transform(df['processed_symptoms'])
    tfidf_matrix_description = tfidf_vectorizer.fit_transform(df['processed_description'])
    print(tfidf_matrix_symptoms)
    print(tfidf_matrix_description)
    return df
    
#TESTING USER INPUT
def preprocess_user_input(user_input):
    return preprocess(user_input)

In [3]:

df_mayo=pd.read_csv('Medical Data.csv')
df_mayo.head(1)

Unnamed: 0.1,Unnamed: 0,Diesase_Name,overview,symptoms_elements,symptoms_paragraph
0,0,Atrial fibrillation,"In a typical heart, a tiny group of cells at t...","Feelings of a fast, fluttering or pounding hea...",Symptoms of AFib may include:


In [78]:
df_mayo.shape

(1534, 5)

In [77]:

print(df_diseases['Disease'].unique())

['Drug Reaction' 'Malaria' 'Allergy' 'Hypothyroidism' 'Psoriasis' 'GERD'
 'Chronic cholestasis' 'hepatitis A' 'Osteoarthristis'
 '(vertigo) Paroymsal  Positional Vertigo' 'Hypoglycemia' 'Acne'
 'Diabetes' 'Impetigo' 'Hypertension' 'Peptic ulcer diseae'
 'Dimorphic hemorrhoids(piles)' 'Common Cold' 'Chicken pox'
 'Cervical spondylosis' 'Hyperthyroidism' 'Urinary tract infection'
 'Varicose veins' 'AIDS' 'Paralysis (brain hemorrhage)' 'Typhoid'
 'Hepatitis B' 'Fungal infection' 'Hepatitis C' 'Migraine'
 'Bronchial Asthma' 'Alcoholic hepatitis' 'Jaundice' 'Hepatitis E'
 'Dengue' 'Hepatitis D' 'Heart attack' 'Pneumonia' 'Arthritis'
 'Gastroenteritis' 'Tuberculosis']


In [4]:
pd.set_option('display.max_columns', None)

datasets=['dataset.csv',
'symptom_Description.csv',
'symptom_precaution.csv',
'Symptom-severity.csv']
path='./dataset/'
df_symptoms = pd.read_csv(path+datasets[0])
df_diseases = pd.read_csv(path+datasets[1])
df_precautions = pd.read_csv(path+datasets[2])
df_symps_weights = pd.read_csv(path+datasets[3]) # Weights for symptoms, or labelEncoder, NOt that Relevant for NLP... so far


print(df_symptoms.shape)
df_symptoms=df_symptoms.fillna("")
for col in df_symptoms.drop('Disease', axis=1).columns:
    df_symptoms[col]=df_symptoms[col].str.replace("_"," ")
df_symptoms.head()

(4920, 18)


Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,
1,Fungal infection,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin rash,dischromic patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin rash,nodal skin eruptions,,,,,,,,,,,,,,


In [5]:
# MERGE 3 Datasets in one DF. Maintain repetition of diseases to increase match
## Create a relation between the 3 main datasets, which is Disease and symptom
df_complete = pd.DataFrame()
df_complete = pd.merge(df_symptoms,df_diseases, on='Disease')
df_complete = pd.merge(df_complete,df_precautions, on='Disease')


df_complete.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17,Description,Precaution_1,Precaution_2,Precaution_3,Precaution_4
0,Fungal infection,itching,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,"In humans, fungal infections occur when an inv...",bath twice,use detol or neem in bathing water,keep infected area dry,use clean cloths
1,Fungal infection,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,,"In humans, fungal infections occur when an inv...",bath twice,use detol or neem in bathing water,keep infected area dry,use clean cloths
2,Fungal infection,itching,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,,"In humans, fungal infections occur when an inv...",bath twice,use detol or neem in bathing water,keep infected area dry,use clean cloths
3,Fungal infection,itching,skin rash,dischromic patches,,,,,,,,,,,,,,,"In humans, fungal infections occur when an inv...",bath twice,use detol or neem in bathing water,keep infected area dry,use clean cloths
4,Fungal infection,itching,skin rash,nodal skin eruptions,,,,,,,,,,,,,,,"In humans, fungal infections occur when an inv...",bath twice,use detol or neem in bathing water,keep infected area dry,use clean cloths


In [12]:
# Symptom_17
sym_cols=[]
for i in range(1,18):
    sym_cols.append('Symptom_'+str(i))
    
df_complete['Symptoms_All'] = df_complete[sym_cols].apply(lambda row: ', '.join(row), axis=1)
print(df_complete.shape)
df_complete.head(2)

(4560, 24)


Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17,Description,Precaution_1,Precaution_2,Precaution_3,Precaution_4,Symptoms_All
0,Fungal infection,itching,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,"In humans, fungal infections occur when an inv...",bath twice,use detol or neem in bathing water,keep infected area dry,use clean cloths,"itching, skin rash, nodal skin eruptions, d..."
1,Fungal infection,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,,"In humans, fungal infections occur when an inv...",bath twice,use detol or neem in bathing water,keep infected area dry,use clean cloths,"skin rash, nodal skin eruptions, dischromic..."


In [14]:
columns_of_interest=['Disease','Symptoms_All','Description'] #for training

_df=cleaning_process(df_complete[columns_of_interest],'Symptoms_All','Description')
_df

#  
#  
#  
# Metodo similarity ELAMIGO  


In [36]:

def predict_disease(user_input_text):
    processed_input = preprocess_user_input(user_input_text)
    user_input_doc = nlp(processed_input)

    similarities = []

    for index, row in _df.iterrows():
        # Using processed_symptoms for similarity; you can also use processed_description
        record_doc = nlp(row['processed_symptoms'])
        similarity = user_input_doc.similarity(record_doc)
        similarities.append((index, row['Disease'], similarity))

    # Sort by similarity (from highest to lowest)
    sorted_similarities = sorted(similarities, key=lambda x: x[2], reverse=True)


    top_N = 5
    top_matches = sorted_similarities[:top_N]

    for match in top_matches:
        index, disease, similarity = match
        print(f"Disease: {disease}, Similarity: {similarity:.2f}")
    return top_matches[0][1]


In [37]:
testes={
    "Fungal infection" : "My skin has been itchy and flaky with a reddish rash that doesn't seem to heal.",
"Allergy" : "Whenever I'm around pollen, my eyes water, my nose runs, and I can't stop sneezing.",
"GERD (Gastroesophageal reflux disease)" : "I often experience a burning sensation in my chest, especially after eating, and sometimes food seems to come back up my throat.",
"Chronic cholestasis" : "I've been feeling persistent itching, and my eyes and skin appear more yellow than usual.",
"Drug Reaction" : "After taking that medication, I developed a rash, swelling, and felt dizzy.",
"Peptic ulcer disease" : "There's a gnawing or burning pain in my stomach, especially on an empty stomach or at night.",
"AIDS (Acquired Immune Deficiency Syndrome)" : "I've lost weight without trying, feel constantly fatigued, and keep getting infections that I never used to have.",
"Diabetes" : "I've been extremely thirsty, urinating often, and feeling more tired than usual.",
"Gastroenteritis" : "I can't keep anything down, have diarrhea, and feel abdominal cramps.",
"Bronchial Asthma" : "I often wheeze and have shortness of breath, especially during the night or after exercise.",
"Hypertension (High blood pressure)" : "The doctor said my blood pressure readings are consistently high, even though I don't feel any specific symptoms.",
"Migraine" : "There's a throbbing pain on one side of my head, and I become sensitive to light and sound.",
"Cervical spondylosis" : "My neck feels stiff and painful, and sometimes I feel tingling or numbness in my hands.",
"Paralysis (brain hemorrhage)" : "Half of my body suddenly became weak, and I couldn't move my right arm or leg.",
"Jaundice" : "My skin and eyes have taken on a yellow tint, and my urine is much darker than usual.",
"Malaria" : "I've had recurring bouts of high fever, chills, and sweats for the past few days.",
"Chicken pox" : "I've broken out in itchy blisters all over my body, and I had a fever before the rash appeared.",
"Dengue" : "I'm feeling sudden high fever, severe headaches, and pain behind my eyes, along with joint and muscle pain.",
"Typhoid" : "I've been suffering from a persistent fever, stomach pain, headache, and I noticed a rash on my abdomen.",
"Hepatitis A" : "I've lost my appetite, feel nauseous, and my liver area hurts; plus, I've noticed a yellowing of my eyes.",
"Hepatitis B" : "I've been feeling very tired, my urine is dark, and there's a yellowish tinge to my eyes and skin.",
"Hepatitis C" : "I'm feeling a general sense of fatigue, my stomach is upset, and the doctor said my liver enzymes are elevated.",
"Hepatitis D" : "I've had jaundice, fatigue, and joint pain recently, even though I already had hepatitis B.",
"Hepatitis E" : "I've noticed jaundice, feel fatigued, and have a reduced appetite along with mild fever.",
"Alcoholic hepatitis" : "After years of heavy drinking, I've been feeling nauseous, have abdominal pain, and my eyes and skin are turning yellow.",
"Tuberculosis" : "I've had a persistent cough for weeks, sometimes coughing up blood, accompanied by weight loss, night sweats, and fatigue.",
"Common Cold" : "I've got a runny nose, sneezing, a mild sore throat, and a cough that just started.",
"Pneumonia" : "I've been feeling short of breath, with a high fever and a cough that brings up thick, colored phlegm.",
"Dimorphic hemorrhoids (piles)" : "I've noticed painful swollen veins in my rectal area, and sometimes there's blood when I wipe.",
"Heart attack" : "I suddenly felt a crushing pain in my chest that radiated to my arm and jaw, accompanied by shortness of breath.",
"Varicose veins" : "The veins in my legs have become bulgy, bluish, and often cause a dull ache.",
"Hypothyroidism" : "I've been feeling constantly tired, gaining weight, and my skin has turned dry and cold.",
"Hyperthyroidism" : "I've lost weight without trying, my heart rate has increased, and I feel jittery and hot all the time.",
"Hypoglycemia" : "I suddenly felt shaky, sweaty, and had a pounding heartbeat, and needed to eat something sweet immediately.",
"Osteoarthritis" : "The joints in my hands and knees have become painful and stiff, especially when I wake up.",
"Arthritis" : "My joints are swollen, red, and warm to the touch, and they ache constantly.",
"(Vertigo) Paroxysmal Positional Vertigo" : "Whenever I change the position of my head, I feel like the room is spinning around me.",
"Acne" : "My face, back, and chest have red, pus-filled pimples and blackheads.",
"Urinary tract infection (UTI)" : "I've had a burning sensation when I urinate, and I feel the urge to go more often than usual.",
"Psoriasis" : "There are red, itchy patches covered with silvery scales on my elbows and knees.",
"Impetigo" : "I've developed honey-colored crusted sores on my face, especially around my nose and mouth."
    
}

count=0
for k,v_text in testes.items():
    print("====================")
    _ct= predict_disease(v_text) #HERE YOU CALL  YOUR FUNCTION TO PROCESS v_text
    r= "✅" if _ct == k.strip() else "❌"
    if("✅"==r):
        count+=1
    print(r , end=" ")
    print(f"Predicted: {_ct}, Original: {k},  \n\tSentence: {v_text}")
print(f"Total {count}/{len(testes)}")




Disease: Acne, Similarity: 0.84
Disease: Acne, Similarity: 0.84
Disease: Acne, Similarity: 0.83
Disease: Acne, Similarity: 0.83
Disease: Fungal infection, Similarity: 0.83
❌ Predicted: Acne, Original: Fungal infection,  
	Sentence: My skin has been itchy and flaky with a reddish rash that doesn't seem to heal.
Disease: Allergy, Similarity: 0.74
Disease: Allergy, Similarity: 0.73
Disease: Allergy, Similarity: 0.71
Disease: Hypothyroidism, Similarity: 0.70
Disease: Hypothyroidism, Similarity: 0.70
✅ Predicted: Allergy, Original: Allergy,  
	Sentence: Whenever I'm around pollen, my eyes water, my nose runs, and I can't stop sneezing.
Disease: Urinary tract infection, Similarity: 0.83
Disease: Alcoholic hepatitis, Similarity: 0.83
Disease: Alcoholic hepatitis, Similarity: 0.82
Disease: Drug Reaction, Similarity: 0.82
Disease: Alcoholic hepatitis, Similarity: 0.82
❌ Predicted: Urinary tract infection, Original: GERD (Gastroesophageal reflux disease),  
	Sentence: I often experience a burnin

Disease: Hepatitis E, Similarity: 0.84
Disease: Typhoid, Similarity: 0.84
Disease: Hepatitis E, Similarity: 0.84
Disease: Typhoid, Similarity: 0.83
Disease: Typhoid, Similarity: 0.83
❌ Predicted: Hepatitis E, Original: Hepatitis D,  
	Sentence: I've had jaundice, fatigue, and joint pain recently, even though I already had hepatitis B.
Disease: Typhoid, Similarity: 0.86
Disease: Typhoid, Similarity: 0.86
Disease: Chicken pox, Similarity: 0.85
Disease: Tuberculosis, Similarity: 0.85
Disease: Typhoid, Similarity: 0.85
❌ Predicted: Typhoid, Original: Hepatitis E,  
	Sentence: I've noticed jaundice, feel fatigued, and have a reduced appetite along with mild fever.
Disease: Chronic cholestasis, Similarity: 0.92
Disease: Chronic cholestasis, Similarity: 0.92
Disease: Hepatitis D, Similarity: 0.91
Disease: Hepatitis B, Similarity: 0.91
Disease: Chronic cholestasis, Similarity: 0.91
❌ Predicted: Chronic cholestasis, Original: Alcoholic hepatitis,  
	Sentence: After years of heavy drinking, I've

#  
#  
#  
#  
#  
#  
#  
#  
#  
#  
#  
#  
#  
#  
#  
#  MEthod: similarity Kadabra


In [72]:

def clean_text(text):
    text = text.replace('\n','. ')#.replace('','').replace('','').replace('','')
#     text = text.replace(',',' ').replace('!','').replace('?','')
    return text

def remove_punct(text, lower=False):
    if(type(text)==str):
        text = text.replace('\n','. ')#.replace('','').replace('','').replace('','')
        text = text.replace(',',' ').replace('!','').replace('?',' ')
        text = text.replace('.',' ').replace('#','').replace('$',' ')
        text = text.replace('^',' ').replace('&','and').replace(';',' ')
        text = text.replace('  ',' ')
        return text
    elif(type(text)==list):
        _words=[]
        for w in text:
            if w.isalnum():
                _words.append(w if not lower else w.lower())
#         print('remove_punct',len(_words))
        return _words

def get_sentences(text):
    return sent_tokenize(text)

def get_tokens(text):
    words =[]
    for w in word_tokenize(text):
        if w.isalnum():
            words.append(w)
    return words
    

def remove_stopwords(words):
    just_words=[]
    for word in words:
        if word.lower() not in stopword:
            just_words.append(word)
    return just_words
    
    
def lemmatize(text):
    record_lemmatized = [lemmatizer.lemmatize(token) for token in text]
    return record_lemmatized
    
    
def list_to_string(ls):
    return " ".join(ls)
    
# clean words: boooook
# Remove non stop words of 2 letters

def preprocess_text_ap1(x):
       # tokenize everything, remove stop words, lower all, remove punctutation
    # set tokens each description
    dataset_tokens= get_tokens(x)
    # remove stopwords
    dataset_nostopwords= remove_stopwords(dataset_tokens)
    # remove punctuation
    dataset_main_words= remove_punct(dataset_nostopwords, lower=False)
    # Lower all
    dataset_main_words_lower=[x.lower() for x in dataset_main_words]
    # Back to string
    dataset_clean_text =  list_to_string(dataset_main_words)
    return dataset_clean_text


def find_similar_movie(_synopsis, dataset_docs=dataset_docs):
    summary_raw=_synopsis 
    summary=preprocess_user_input(summary_raw)
    summary_doc = nlp(summary)
    similarity_scores = [summary_doc.similarity(doc) for doc in dataset_docs]
    most_similar_index = similarity_scores.index(max(similarity_scores))
  
    df_match = _df.iloc[most_similar_index]#[df['']]
    # THe best this, is to find the movie in the dataset, return title, year and link
    return df_match #most_similar_record.text


In [57]:
# processed_symptoms	processed_description
import os
import time
import pickle

# MOVIE_DATABASE_ROOT = "./DISEASES"

# dataset_docs=[]
# if not os.path.exists(MOVIE_DATABASE_ROOT):
#     print("This database has not been processed, creating NLP docs...")
#     os.makedirs(MOVIE_DATABASE_ROOT)
#     # To avoid too much disk space
#     tic=time.time()
#     dataset_docs = [nlp(record) for record in _df['processed_description'].tolist()]
#     print(f"{len(dataset_docs)} docs . Took {(time.time()-tic)} secs to process") # 89 docs on 26.4 secs
#     # save dataset_docs in pickle
#     with open(MOVIE_DATABASE_ROOT+'/dataset_docs.pkl', 'wb') as f:
#         pickle.dump(dataset_docs, f)
# else:
#     with open(MOVIE_DATABASE_ROOT+'/dataset_docs.pkl', 'rb') as f:
#         dataset_docs = pickle.load(f)
#     print(f"{len(dataset_docs)} docs . Took {(time.time()-tic)} secs to process")

    
    
# SYMPTOMS
MOVIE_DATABASE_ROOT_SYMS = "./DISEASES_SYMPS"

dataset_docs=[]
if not os.path.exists(MOVIE_DATABASE_ROOT_SYMS):
    print("This database has not been processed, creating NLP docs...")
    os.makedirs(MOVIE_DATABASE_ROOT_SYMS)
    # To avoid too much disk space
    tic=time.time()
    dataset__syms_docs = [nlp(record) for record in _df['processed_symptoms'].tolist()]
    print(f"{len(dataset__syms_docs)} docs . Took {(time.time()-tic)} secs to process") # 89 docs on 26.4 secs
    # save dataset_docs in pickle
    with open(MOVIE_DATABASE_ROOT_SYMS+'/dataset_docs.pkl', 'wb') as f:
        pickle.dump(dataset__syms_docs, f)
else:
    with open(MOVIE_DATABASE_ROOT_SYMS+'/dataset_docs.pkl', 'rb') as f:
        dataset_docs = pickle.load(f)
    print(f"{len(dataset__syms_docs)} docs . Took {(time.time()-tic)} secs to process")


This database has not been processed, creating NLP docs...
283 docs . Took 1.937380075454712 secs to process


In [54]:
find_similar_movie("I have a headache and fever and joint pain with mosquito bites")

Disease                                                            Typhoid
Symptoms_All              chills,  vomiting,  fatigue,  high fever,  na...
Description              An acute illness characterized by fever caused...
processed_symptoms       chill vomiting fatigue high fever nausea const...
processed_description    acute illness characterized fever caused infec...
Name: 1920, dtype: object

In [73]:
count=0
for k,v_text in testes.items():
    print("====================")
    _ct= find_similar_movie(v_text,dataset__syms_docs) #HERE YOU CALL  YOUR FUNCTION TO PROCESS v_text
    r= "✅" if _ct['Disease'] == k.strip() else "❌"
    if("✅"==r):
        count+=1
    print(r , end=" ")
    print(f"Predicted: {_ct['Disease']}, Original: {k},  \n\tSentence: {v_text}")
print(f"Total {count}/{len(testes)}")

#by symptoms no lemma  :Total 13/41
# By description: 8/41
# By symproms, but with lemma

❌ Predicted: Acne, Original: Fungal infection,  
	Sentence: My skin has been itchy and flaky with a reddish rash that doesn't seem to heal.
✅ Predicted: Allergy, Original: Allergy,  
	Sentence: Whenever I'm around pollen, my eyes water, my nose runs, and I can't stop sneezing.
❌ Predicted: Urinary tract infection, Original: GERD (Gastroesophageal reflux disease),  
	Sentence: I often experience a burning sensation in my chest, especially after eating, and sometimes food seems to come back up my throat.
❌ Predicted: Hepatitis C, Original: Chronic cholestasis,  
	Sentence: I've been feeling persistent itching, and my eyes and skin appear more yellow than usual.
❌ Predicted: (vertigo) Paroymsal  Positional Vertigo, Original: Drug Reaction,  
	Sentence: After taking that medication, I developed a rash, swelling, and felt dizzy.
❌ Predicted: Drug Reaction, Original: Peptic ulcer disease,  
	Sentence: There's a gnawing or burning pain in my stomach, especially on an empty stomach or at night

#  
#  
#  
#  
# ELAMIGO 2 -> direct

In [60]:
_df.head(1)

Unnamed: 0,Disease,Symptoms_All,Description,processed_symptoms,processed_description
0,Fungal infection,"itching, skin rash, nodal skin eruptions, d...","In humans, fungal infections occur when an inv...",itching skin rash nodal skin eruption dischrom...,human fungal infection occur invading fungus t...


In [67]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB


# Preprocess the data
# Assuming your dataset has 'Disease' and 'Symptoms' columns
symptom_corpus = _df['Symptoms_All']#.str.split(',')
disease_labels = _df['Disease']

print(type(disease_labels))
print(type(symptom_corpus))
print((symptom_corpus))
# Create a CountVectorizer to convert symptoms to a matrix of token counts
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(symptom_corpus)

# Train a simple Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X, disease_labels)

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
0       itching,  skin rash,  nodal skin eruptions,  d...
1        skin rash,  nodal skin eruptions,  dischromic...
2       itching,  nodal skin eruptions,  dischromic  p...
3       itching,  skin rash,  dischromic  patches, , ,...
4       itching,  skin rash,  nodal skin eruptions, , ...
                              ...                        
4442     high fever,  blister,  red sore around nose, ...
4443     skin rash,  blister,  red sore around nose,  ...
4445     skin rash,  high fever,  red sore around nose...
4446     skin rash,  high fever,  blister,  yellow cru...
4447     skin rash,  high fever,  blister,  red sore a...
Name: Symptoms_All, Length: 283, dtype: object


MultinomialNB()

In [70]:
# Take user input for symptoms
user_input = "I've noticed painful swollen veins in my rectal area, and sometimes there's blood when I wipe."#input("Enter your symptoms (comma-separated): ")
def predict_disease_3(user_input):
    user_symptoms = [s.strip() for s in user_input.split(',')]

    # Convert user input to the same format as the training data
    user_input_vector = vectorizer.transform([' '.join(user_symptoms)])

    # Predict the disease based on user input
    predicted_disease = classifier.predict(user_input_vector)[0]

    # Print the predicted disease
#     print(f"Predicted disease: {predicted_disease}")
    return predicted_disease

In [71]:
count=0
for k,v_text in testes.items():
    print("====================")
    _ct= predict_disease_3(v_text) #HERE YOU CALL  YOUR FUNCTION TO PROCESS v_text
    r= "✅" if _ct == k.strip() else "❌"
    if("✅"==r):
        count+=1
    print(r , end=" ")
    print(f"Predicted: {_ct}, Original: {k},  \n\tSentence: {v_text}")
print(f"Total {count}/{len(testes)}")

#by symptoms  :Total 13/41
# By description: 8/41

Predicted disease: Psoriasis
❌ Predicted: None, Original: Fungal infection,  
	Sentence: My skin has been itchy and flaky with a reddish rash that doesn't seem to heal.
Predicted disease: Common Cold
❌ Predicted: None, Original: Allergy,  
	Sentence: Whenever I'm around pollen, my eyes water, my nose runs, and I can't stop sneezing.
Predicted disease: Cervical spondylosis
❌ Predicted: None, Original: GERD (Gastroesophageal reflux disease),  
	Sentence: I often experience a burning sensation in my chest, especially after eating, and sometimes food seems to come back up my throat.
Predicted disease: Hepatitis B
❌ Predicted: None, Original: Chronic cholestasis,  
	Sentence: I've been feeling persistent itching, and my eyes and skin appear more yellow than usual.
Predicted disease: Hypoglycemia
❌ Predicted: None, Original: Drug Reaction,  
	Sentence: After taking that medication, I developed a rash, swelling, and felt dizzy.
Predicted disease: GERD
❌ Predicted: None, Original: Peptic ulcer