In [169]:
df_tempr = pd.read_csv('disease_with_description.csv')
df_tempr.head()

Unnamed: 0,Description,D_Name
0,"bone, muscle, ear, otitis, hearing, airway, me...",musculoskeletal
1,"ear, otitis, hearing, nose, bleeding, sinusiti...",ear_nose
2,"ventilation, oxygen, airway, dyspnea, copd, br...",respiratory


In [171]:
import pandas as pd

# Load the CSV file
df_tempr = pd.read_csv('disease_with_description.csv')

# Define expanded keywords for each main disease
disease_keywords = {
    "musculoskeletal": "pain, joint pain, muscle pain, stiffness, swelling, tenderness, soreness, aching, inflammation, arthritis, osteoarthritis, rheumatoid arthritis, tendonitis, bursitis, sprain, strain, fracture, osteoporosis, dislocation, degeneration, weakness, immobility, cramps, spasms, fatigue, myalgia, dystrophy, back pain, lower back pain, upper back pain, neck pain, sciatica, scoliosis, fibromyalgia, joint stiffness, cartilage damage, bone fracture, ligament tear, tendon rupture, muscle fatigue, muscle spasm, chronic pain, mobility issues, gout, lupus, nerve pain, disc herniation, frozen shoulder, repetitive strain injury (RSI), degenerative disc disease",
    "ear_nose": "ear pain, hearing loss, tinnitus, vertigo, dizziness, ear infection, fluid buildup, nasal congestion, sinus pain, sinusitis, runny nose, sneezing, allergy, stuffy nose, sore throat, hoarseness, difficulty swallowing, tonsillitis, cough, postnasal drip, otitis media, sinus infection, allergic rhinitis, deviated septum, swollen lymph nodes, throat irritation, laryngitis, nasal polyps, sleep apnea, difficulty breathing through nose, chronic sinusitis, postnasal congestion, adenoiditis, ear pressure, muffled hearing, voice changes, chronic cough, flu symptoms, nasal obstruction, vocal cord strain, swollen tonsils, nasal dryness, Eustachian tube dysfunction",
    "respiratory": "shortness of breath, wheezing, chest tightness, coughing, mucus, choking, asthma, bronchitis, pneumonia, COPD, tuberculosis, lung infection, sore throat, flu, fever, congestion, difficulty breathing, sinus infection, dust, smoke, allergy, hay fever, pollution, inflammation, asthma attack, chronic bronchitis, lung fibrosis, chest discomfort, respiratory distress, nasal drip, persistent cough, pleurisy, pulmonary edema, respiratory failure, respiratory tract infection, lung congestion, wheezy breathing, labored breathing, viral pneumonia, bacterial pneumonia, emphysema, pneumonitis, sore chest, allergic asthma, post-infectious cough, pulmonary embolism, environmental allergies, interstitial lung disease"
}

# Append keywords to the Description column based on D_Name
df_tempr["Description"] = df_tempr.apply(
    lambda row: row["Description"] + ", " + disease_keywords.get(row["D_Name"], ""), axis=1
)

# Save the updated CSV
df_tempr.to_csv('updated_disease_with_description.csv', index=False)

# Display the updated DataFrame
print(df_tempr.head())


                                         Description           D_Name
0  bone, muscle, ear, otitis, hearing, airway, me...  musculoskeletal
1  ear, otitis, hearing, nose, bleeding, sinusiti...         ear_nose
2  ventilation, oxygen, airway, dyspnea, copd, br...      respiratory


In [9]:
import pandas as pd
import numpy as np
import pickle
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

# Load stop words
with open('stop_words.ob', 'rb') as fp:
    domain_stop_word = pickle.load(fp)

# Load Main Disease Dataset
df_main = pd.read_csv('updated_disease_with_description.csv')

# Preprocessing function
def clean_text(text):
    
    """ this function clean & pre-process the data  """

    text = str(text)
    text = text.lower()
    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!?.\/'+]", " ", text)
    text = re.sub(r"\+", " ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ", text)
    text = re.sub(r"\?", " ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"[0-9]", " ", text)
    final_text = ""
    for x in text.split():
        if x not in domain_stop_word:
            final_text = final_text + x  +" "
    return final_text

df_main['cleaned_text'] = df_main['Description'].apply(clean_text)
df_main = df_main.rename(columns={'D_Name': 'main_disease'})
df_main.head()

Unnamed: 0,Description,main_disease,cleaned_text
0,"bone, muscle, ear, otitis, hearing, airway, me...",musculoskeletal,bone muscle ear otitis hearing airway membrane...
1,"ear, otitis, hearing, nose, bleeding, sinusiti...",ear_nose,ear otitis hearing nose bleeding sinusitis ext...
2,"ventilation, oxygen, airway, dyspnea, copd, br...",respiratory,ventilation oxygen airway dyspnea copd breathi...


In [98]:
df_temp = pd.read_excel('musculoskeletal.xlsx')
df_temp.head()

Unnamed: 0,sub_disease,Description
0,CLUBFOOT,Talipes equinovarus varies greatly in severity...
1,MUSCULAR DYSTROPHY,Although all four types of muscular dystrophy ...
2,SEPTIC ARTHRITIS,"Acute septic arthritis begins abruptly, causin..."
3,GOUT,"Gout develops in four stages: asymptomatic, ac..."
4,NEUROGENIC ARTHROPATHY,Neurogenic arthropathy begins insidiously with...


In [159]:

# Train TF-IDF Vectorizer for Main Diseases
vectorizer = TfidfVectorizer()
main_disease_texts = df_main['cleaned_text'].values
vectorizer.fit(main_disease_texts)
main_disease_vectors = vectorizer.transform(main_disease_texts)

# Save Vectorizer
with open("vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)
with open("main_disease_vectors.pkl", "wb") as f:
    pickle.dump(main_disease_vectors, f)

# Train and Save Logistic Regression Models for Each Main Disease
subdisease_models = {}
for main_disease in df_main['main_disease'].unique():
    sub_df = pd.read_excel(f"{main_disease}.xlsx")
    sub_df['cleaned_text'] = sub_df['Description'].apply(clean_text)
    
    X = vectorizer.transform(sub_df['cleaned_text'])
    y = sub_df['sub_disease']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = LogisticRegression()
    model.fit(X_train, y_train)
    
    subdisease_models[main_disease] = model
    
    # Save the trained model
    with open(f"{main_disease}_model.pkl", "wb") as f:
        pickle.dump(model, f)



NotFittedError: The TF-IDF vectorizer is not fitted

In [132]:

def predict_disease(user_input):
    cleaned_input = clean_text(user_input)
    input_vector = vectorizer.transform([cleaned_input])
    
    # Predict Main Disease using Cosine Similarity
    similarity_scores = cosine_similarity(input_vector, main_disease_vectors)
    main_disease_index = np.argmax(similarity_scores)
    main_disease = df_main.iloc[main_disease_index]['main_disease']
    
    # Load the pre-trained sub-disease model
    with open(f"{main_disease}_model.pkl", "rb") as f:
        model = pickle.load(f)
    
    # Predict Sub-Disease
    sub_disease = model.predict(input_vector)[0]
    
    return main_disease, sub_disease
# Example Input
user_symptoms = " "
main_disease, sub_disease = predict_disease(user_symptoms)
print(f"Predicted Main Disease: {main_disease}")
print(f"Predicted Sub-Disease: {sub_disease}")

Predicted Main Disease: musculoskeletal
Predicted Sub-Disease: CARPAL TUNNEL SYNDROME


In [13]:

# **1️⃣ Train TF-IDF for Main Diseases (For Cosine Similarity Matching)**
vectorizer_main = TfidfVectorizer()
main_disease_texts = df_main['cleaned_text'].values
vectorizer_main.fit(main_disease_texts)
main_disease_vectors = vectorizer_main.transform(main_disease_texts)

# **Save Main Disease TF-IDF Model**
with open("vectorizer_main.pkl", "wb") as f:
    pickle.dump(vectorizer_main, f)
with open("main_disease_vectors.pkl", "wb") as f:
    pickle.dump(main_disease_vectors, f)

# **2️⃣ Train Separate TF-IDF + Logistic Regression for Each Sub-Disease Dataset**
subdisease_models = {}

for main_disease in df_main['main_disease'].unique():
    sub_df = pd.read_excel(f"{main_disease}.xlsx")  # Load sub-disease dataset
    sub_df['cleaned_text'] = sub_df['Description'].apply(clean_text)

    # **Train a New TF-IDF Model for This Specific Main Disease**
    vectorizer_sub = TfidfVectorizer()
    X = vectorizer_sub.fit_transform(sub_df['cleaned_text'])  # Training on sub-disease descriptions
    y = sub_df['sub_disease']

    # **Split Data for Training**
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # **Train Logistic Regression Model for This Sub-Disease**
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # **Save the TF-IDF & Model for This Main Disease**
    with open(f"{main_disease}_vectorizer.pkl", "wb") as f:
        pickle.dump(vectorizer_sub, f)
    with open(f"{main_disease}_model.pkl", "wb") as f:
        pickle.dump(model, f)

    subdisease_models[main_disease] = model

print("Training completed successfully!")




Training completed successfully!


In [17]:
# **3️⃣ Prediction Function**
def predict_disease(user_input):
    cleaned_input = clean_text(user_input)

    # **Load Main Disease TF-IDF and Compute Cosine Similarity**
    input_vector = vectorizer_main.transform([cleaned_input])
    similarity_scores = cosine_similarity(input_vector, main_disease_vectors)

    # **Print Cosine Similarity Scores for All Main Diseases**
    print("\nCosine Similarity Scores:")
    for idx, disease in enumerate(df_main['main_disease']):
        print(f"{disease}: {similarity_scores[0][idx]:.4f}")

    # **Find Most Similar Main Disease**
    main_disease_index = np.argmax(similarity_scores)
    main_disease = df_main.iloc[main_disease_index]['main_disease']

    print(f"\nPredicted Main Disease: {main_disease}")

    # **Load Sub-Disease TF-IDF and Model**
    with open(f"{main_disease}_vectorizer.pkl", "rb") as f:
        vectorizer_sub = pickle.load(f)
    with open(f"{main_disease}_model.pkl", "rb") as f:
        model = pickle.load(f)

    # **Predict Sub-Disease**
    input_vector_sub = vectorizer_sub.transform([cleaned_input])
    sub_disease = model.predict(input_vector_sub)[0]

    return main_disease, sub_disease

# **Example Input**
user_symptoms = "I have been experiencing persistent joint pain and muscle stiffness for the past few weeks"
main_disease, sub_disease = predict_disease(user_symptoms)

print(f"\nFinal Predicted Main Disease: {main_disease}")
print(f"Final Predicted Sub-Disease: {sub_disease}")


Cosine Similarity Scores:
musculoskeletal: 0.4045
ear_nose: 0.0000
respiratory: 0.0000

Predicted Main Disease: musculoskeletal

Final Predicted Main Disease: musculoskeletal
Final Predicted Sub-Disease: Acquired Torticollis
