In [1]:
import pandas as pd

In [48]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder,LabelEncoder,MultiLabelBinarizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report,r2_score,f1_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB

In [80]:
df=pd.read_csv("Symptom2Disease_new.csv")

In [81]:
df.head()

Unnamed: 0,label,text
0,Psoriasis,I have been experiencing a skin rash on my arm...
1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,Psoriasis,I have been experiencing joint pain in my fing...
3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,Psoriasis,"My nails have small dents or pits in them, and..."


In [82]:
df["label"]=df["label"].apply(lambda x:[label.strip() for label in str(x).split(",")])

In [83]:
mlb = MultiLabelBinarizer()
encoded_labels = mlb.fit_transform(df["label"])  # returns 2D array

# Add back as multiple columns
label_df = pd.DataFrame(encoded_labels, columns=mlb.classes_)

# Concatenate with the original DataFrame
df = pd.concat([df, label_df], axis=1)

In [84]:
df.head()

Unnamed: 0,label,text,Acne,Acute Lymphoblastic Leukaemia,Arthritis,Bladder Cancer,Bronchial Asthma,Cervical spondylosis,Chicken pox,Chronic Kidney Disease,...,Psoriasis,Tuberculosis,Typhoid,Varicose Veins,allergy,diabetes,drug reaction,gastroesophageal reflux disease,peptic ulcer disease,urinary tract infection
0,[Psoriasis],I have been experiencing a skin rash on my arm...,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,[Psoriasis],"My skin has been peeling, especially on my kne...",0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,[Psoriasis],I have been experiencing joint pain in my fing...,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,[Psoriasis],"There is a silver like dusting on my skin, esp...",0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,[Psoriasis],"My nails have small dents or pits in them, and...",0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [85]:
split=StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)
for train_indices,test_indices in split.split(df,df["label"]):
    train_set=df.loc[train_indices]
    test_set=df.loc[test_indices]
    

In [86]:
x_train=train_set[["text"]]

In [87]:
y_train = train_set[mlb.classes_]

In [88]:
preprocessor=ColumnTransformer(transformers=[
    ("tfid",TfidfVectorizer(
        stop_words="english",
        ngram_range=(1,3),
        max_features=2000),"text")])


In [101]:
model=Pipeline(steps=[
    ("preprocessor",preprocessor),
    ("clf",OneVsRestClassifier(MultinomialNB()))])

In [102]:
model.fit(x_train,y_train)

0,1,2
,steps,"[('preprocessor', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('tfid', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,estimator,MultinomialNB()
,n_jobs,
,verbose,0

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [103]:
x_test=test_set[["text"]]

In [104]:
y_test=test_set[mlb.classes_]

In [105]:
y_predict=model.predict(x_test)

In [106]:
print("accuracy Score:", accuracy_score(y_test, y_predict))

accuracy Score: 0.28662420382165604


In [107]:
symptom_input = input("enter symptoms:")

# Prepare as DataFrame since pipeline expects named column "text"
input_df = pd.DataFrame({"text": [symptom_input]})

# Predict probabilities
probs = model.predict_proba(input_df)[0]

# Get top 3 predicted class indices
top3_indices = probs.argsort()[-3:][::-1]

# Map to disease names
top3_diseases = [mlb.classes_[i] for i in top3_indices]

# Show result
print("Top 3 predicted diseases:", top3_diseases)


enter symptoms: pain in my shoulder below neck


Top 3 predicted diseases: ['Cervical spondylosis', 'Dengue', 'Arthritis']


In [108]:
import joblib

In [109]:
joblib.dump(model,"model.pkl")
joblib.dump(mlb,"label.pkl")

['label.pkl']