In [None]:
# Install the fastText Python package, used below to train the symptom embeddings
!pip install fasttext



In [None]:

import pandas as pd
import numpy as np
import fasttext
import fasttext.util
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Mount Google Drive (Colab only) so the dataset CSV under /content/drive can be read
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:

# Load the dataset from the mounted Google Drive folder.
# NOTE(review): this is an absolute, Colab-specific path; the notebook needs editing to run elsewhere.
data = pd.read_csv('/content/drive/MyDrive/project dp/my project1.csv')

In [None]:
# Display the freshly loaded DataFrame (last expression of the cell is rendered)
data

Unnamed: 0,Age,Gender,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,...,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17,Descriptions,Precaution_1,Precaution_2,Precaution_3,Precaution_4
0,56,Male,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,...,,,,,,"In humans, fungal infections occur when an inv...",bath twice,use detol or neem in bathing water,keep infected area dry,use clean cloths
1,19,Male,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,...,,,,,,"In humans, fungal infections occur when an inv...",bath twice,use detol or neem in bathing water,keep infected area dry,use clean cloths
2,76,Male,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,...,,,,,,"In humans, fungal infections occur when an inv...",bath twice,use detol or neem in bathing water,keep infected area dry,use clean cloths
3,65,Female,Fungal infection,itching,skin_rash,dischromic _patches,,,,,...,,,,,,"In humans, fungal infections occur when an inv...",bath twice,use detol or neem in bathing water,keep infected area dry,use clean cloths
4,25,others,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,...,,,,,,"In humans, fungal infections occur when an inv...",bath twice,use detol or neem in bathing water,keep infected area dry,use clean cloths
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,85,Male,(vertigo) Paroymsal Positional Vertigo,vomiting,headache,nausea,spinning_movements,loss_of_balance,unsteadiness,,...,,,,,,Benign paroxysmal positional vertigo (BPPV) is...,lie down,avoid sudden change in body,avoid abrupt head movment,relax
4916,64,Male,Acne,skin_rash,pus_filled_pimples,blackheads,scurring,,,,...,,,,,,"Acne vulgaris is the formation of comedones, p...",bath twice,avoid fatty spicy food,drink plenty of water,avoid too many products
4917,87,Female,Urinary tract infection,burning_micturition,bladder_discomfort,foul_smell_of urine,continuous_feel_of_urine,,,,...,,,,,,Urinary tract infection: An infection of the k...,drink plenty of water,increase vitamin c intake,drink cranberry juice,take probiotics
4918,62,Female,Psoriasis,skin_rash,joint_pain,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,,...,,,,,,Psoriasis is a common skin disorder that forms...,wash hands with warm soapy water,stop bleeding using pressure,consult doctor,salt baths


In [None]:

# Count missing values per column before any cleaning
data.isna().sum()

Unnamed: 0,0
Age,0
Gender,0
Disease,0
Symptom_1,0
Symptom_2,0
Symptom_3,0
Symptom_4,348
Symptom_5,1206
Symptom_6,1986
Symptom_7,2652


In [None]:




# Collect every column whose name contains 'Symptom_' and replace missing
# entries in those columns with the literal string 'None'.
# NOTE(review): 'None' is later joined into each row's symptom text, so it
# becomes a token in the fastText corpus and in every averaged row embedding.
symptom_columns = [col for col in data.columns if 'Symptom_' in col]
data[symptom_columns] = data[symptom_columns].fillna('None')

In [None]:


# One encoder per categorical column; both are fitted in the next cell and
# reused at prediction time (gender -> code, predicted code -> disease name).
label_encoder_gender = LabelEncoder()
label_encoder_disease = LabelEncoder()

In [None]:


# Replace the Gender and Disease columns with integer codes, in place.
# NOTE(review): because the columns are overwritten, re-running this cell
# refits both encoders on the already-encoded integers.
data['Gender'] = label_encoder_gender.fit_transform(data['Gender'])
data['Disease'] = label_encoder_disease.fit_transform(data['Disease'])



# Join all symptom columns of a row into one space-separated string, write the
# strings one per line to symptoms.txt, and train skip-gram fastText vectors on that file.
symptoms_combined = data[symptom_columns].apply(lambda x: ' '.join(x), axis=1)
symptoms_combined.to_csv('symptoms.txt', index=False, header=False)
fasttext_model = fasttext.train_unsupervised('symptoms.txt', model='skipgram')


def get_symptom_embeddings(symptoms, model=None):
    """Return the mean fastText vector of the whitespace-separated tokens in `symptoms`.

    Args:
        symptoms: String of symptom tokens separated by whitespace.
        model: Optional object exposing `get_word_vector(word)` and
            `get_dimension()`. Defaults to the notebook-level `fasttext_model`.
            Accepting it positionally keeps two-argument calls such as
            `get_symptom_embeddings(text, fasttext_model)` working.

    Returns:
        1-D numpy array holding the element-wise mean of the token vectors, or
        a zero vector of the model's dimension when `symptoms` has no tokens.
    """
    if model is None:
        model = fasttext_model
    tokens = symptoms.split()
    if not tokens:
        # np.mean([]) would return NaN (with a RuntimeWarning) and poison the feature matrix.
        return np.zeros(model.get_dimension(), dtype=np.float32)
    return np.mean([model.get_word_vector(token) for token in tokens], axis=0)

# Embed each row's combined symptom string; the column holds one vector per row.
data['symptom_embeddings'] = symptoms_combined.apply(get_symptom_embeddings)


# Expand the per-row vectors into one numeric column per embedding dimension
# (columns get default integer labels).
embeddings_df = pd.DataFrame(data['symptom_embeddings'].tolist())

# Features: age, encoded gender and the embedding columns; target: encoded disease.
X = pd.concat([data[['Age', 'Gender']], embeddings_df], axis=1)
y = data['Disease']

In [None]:
# Display the DataFrame again: Gender/Disease are now integer codes and
# a `symptom_embeddings` column has been added
data


Unnamed: 0,Age,Gender,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,...,Symptom_14,Symptom_15,Symptom_16,Symptom_17,Descriptions,Precaution_1,Precaution_2,Precaution_3,Precaution_4,symptom_embeddings
0,56,1,15,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,...,,,,,"In humans, fungal infections occur when an inv...",bath twice,use detol or neem in bathing water,keep infected area dry,use clean cloths,"[0.24136704, 0.027693, 0.004422776, -0.3363623..."
1,19,1,15,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,...,,,,,"In humans, fungal infections occur when an inv...",bath twice,use detol or neem in bathing water,keep infected area dry,use clean cloths,"[0.24509433, 0.026489656, -0.00063481764, -0.3..."
2,76,1,15,itching,nodal_skin_eruptions,dischromic _patches,,,,,...,,,,,"In humans, fungal infections occur when an inv...",bath twice,use detol or neem in bathing water,keep infected area dry,use clean cloths,"[0.24603206, 0.026659928, 0.00036365166, -0.33..."
3,65,0,15,itching,skin_rash,dischromic _patches,,,,,...,,,,,"In humans, fungal infections occur when an inv...",bath twice,use detol or neem in bathing water,keep infected area dry,use clean cloths,"[0.24813598, 0.02495547, 7.8532845e-05, -0.342..."
4,25,2,15,itching,skin_rash,nodal_skin_eruptions,,,,,...,,,,,"In humans, fungal infections occur when an inv...",bath twice,use detol or neem in bathing water,keep infected area dry,use clean cloths,"[0.25457203, 0.023010833, -0.0035274795, -0.34..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,85,1,0,vomiting,headache,nausea,spinning_movements,loss_of_balance,unsteadiness,,...,,,,,Benign paroxysmal positional vertigo (BPPV) is...,lie down,avoid sudden change in body,avoid abrupt head movment,relax,"[0.21891272, 0.03993668, 0.030840095, -0.31782..."
4916,64,1,2,skin_rash,pus_filled_pimples,blackheads,scurring,,,,...,,,,,"Acne vulgaris is the formation of comedones, p...",bath twice,avoid fatty spicy food,drink plenty of water,avoid too many products,"[0.23905195, 0.027693361, 0.0021379883, -0.330..."
4917,87,0,38,burning_micturition,bladder_discomfort,foul_smell_of urine,continuous_feel_of_urine,,,,...,,,,,Urinary tract infection: An infection of the k...,drink plenty of water,increase vitamin c intake,drink cranberry juice,take probiotics,"[0.2365719, 0.029592752, 0.008654752, -0.33458..."
4918,62,0,35,skin_rash,joint_pain,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,,...,,,,,Psoriasis is a common skin disorder that forms...,wash hands with warm soapy water,stop bleeding using pressure,consult doctor,salt baths,"[0.23360597, 0.03182902, 0.0105657615, -0.3316..."


In [None]:
# Re-check missing values after the symptom columns were filled with 'None'
data.isna().sum()

Unnamed: 0,0
Age,0
Gender,0
Disease,0
Symptom_1,0
Symptom_2,0
Symptom_3,0
Symptom_4,0
Symptom_5,0
Symptom_6,0
Symptom_7,0


In [None]:

# The embedding columns carry integer labels; give every column a string name
# so the feature names are of a single type.
X.columns = X.columns.astype(str)

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline model: a 100-tree random forest, fitted on the training split
# (fit returns the estimator itself) and scored on the held-out rows.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:

import pickle

# File name -> fitted object, for everything the prediction step needs.
saved_artifacts = {
    'disease_model.pkl': model,
    'label_encoder_gender.pkl': label_encoder_gender,
    'label_encoder_disease.pkl': label_encoder_disease,
}

# 1) Pickle each object to the working directory.
for artifact_path, artifact in saved_artifacts.items():
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

# 2) Trigger a browser download of each file (Colab only).
from google.colab import files
for artifact_path in saved_artifacts:
    files.download(artifact_path)

# 3) Read the files back to confirm they load.
with open('disease_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
with open('label_encoder_gender.pkl', 'rb') as f:
    loaded_label_encoder_gender = pickle.load(f)
with open('label_encoder_disease.pkl', 'rb') as f:
    loaded_label_encoder_disease = pickle.load(f)



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:


# Score the baseline random forest's held-out predictions (`y_pred`) against `y_test`
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        18
           1       1.00      1.00      1.00        30
           2       1.00      1.00      1.00        24
           3       1.00      1.00      1.00        25
           4       1.00      1.00      1.00        24
           5       1.00      1.00      1.00        23
           6       1.00      1.00      1.00        33
           7       1.00      1.00      1.00        23
           8       1.00      1.00      1.00        21
           9       1.00      1.00      1.00        15
          10       1.00      1.00      1.00        23
          11       1.00      1.00      1.00        26
          12       1.00      1.00      1.00        21
          13       1.00      1.00      1.00        29
          14       1.00      1.00      1.00        24
          15       1.00      1.00      1.00        19
          16       1.00      1.00      1.00

In [None]:

filename_gender = 'label_encoder_gender.pkl'
filename_disease = 'label_encoder_disease.pkl'

# Write through context managers so each file is flushed and closed as soon as
# it is written; a bare open() inside pickle.dump leaves the handle open.
with open(filename_gender, 'wb') as f:
    pickle.dump(label_encoder_gender, f)
with open(filename_disease, 'wb') as f:
    pickle.dump(label_encoder_disease, f)


In [None]:


# Read both encoders back; the context managers close the file handles
# instead of leaving them open as a bare open() would.
with open(filename_gender, 'rb') as f:
    loaded_label_encoder_gender = pickle.load(f)
with open(filename_disease, 'rb') as f:
    loaded_label_encoder_disease = pickle.load(f)

In [None]:

# --- Interactive prediction for a single patient ---
age = int(input("Enter age: "))
gender = input("Enter gender (Male/Female): ").strip()
symptoms_input = input("Enter symptoms (comma-separated): ")

# Fail with a readable message when the gender was never seen during fitting.
if gender not in label_encoder_gender.classes_:
    raise ValueError(
        f"Unknown gender {gender!r}; expected one of {list(label_encoder_gender.classes_)}"
    )
gender_encoded = label_encoder_gender.transform([gender])[0]

# Split on commas, trim whitespace and drop empty entries.
symptoms_list = [symptom.strip() for symptom in symptoms_input.split(',') if symptom.strip()]
if not symptoms_list:
    raise ValueError("Enter at least one symptom")

# Training rows were built from every symptom column with missing slots filled
# by the literal 'None', so pad the input the same way. Without the padding the
# averaged embedding is computed over a different token mix than the one the
# classifier was trained on.
padding = ['None'] * max(0, len(symptom_columns) - len(symptoms_list))
# Cell-local name: `symptoms_combined` is the training Series and must not be overwritten.
input_symptom_text = ' '.join(symptoms_list + padding)

# Average the token vectors inline so this cell does not depend on which
# definition of get_symptom_embeddings (one- or two-argument) is currently live.
token_vectors = [fasttext_model.get_word_vector(token) for token in input_symptom_text.split()]
symptom_embeddings = np.mean(token_vectors, axis=0)

# One-row frame with the column names the model was fitted on, so scikit-learn
# can match features by name instead of warning about an unnamed array.
input_features = pd.DataFrame(
    [np.concatenate(([age, gender_encoded], symptom_embeddings))],
    columns=model.feature_names_in_,
)

prediction = model.predict(input_features)[0]

# Map the predicted class code back to the disease name.
predicted_disease = label_encoder_disease.inverse_transform([prediction])[0]

print("Predicted Disease:", predicted_disease)


Enter age: 22
Enter gender (Male/Female): Male
Enter symptoms (comma-separated): body pain etr dryness,stomach pain
Predicted Disease: Hepatitis E




In [None]:


def get_symptom_embeddings(symptoms, fasttext_model=None):
    """Return the mean fastText vector of the whitespace-separated tokens in `symptoms`.

    Args:
        symptoms: String of symptom tokens separated by whitespace.
        fasttext_model: Optional object exposing `get_word_vector(word)` and
            `get_dimension()`. When omitted, the notebook-level
            `fasttext_model` is used, so one-argument callers keep working
            after this definition replaces the earlier one-argument version.

    Returns:
        1-D numpy array holding the element-wise mean of the token vectors, or
        a zero vector of the model's dimension when `symptoms` has no tokens.
    """
    if fasttext_model is None:
        # The parameter shadows the notebook-level model, so look that one up explicitly.
        fasttext_model = globals()['fasttext_model']
    tokens = symptoms.split()
    if not tokens:
        # np.mean([]) would return NaN (with a RuntimeWarning).
        return np.zeros(fasttext_model.get_dimension(), dtype=np.float32)
    return np.mean([fasttext_model.get_word_vector(token) for token in tokens], axis=0)


In [None]:


import pickle


# (file name, fitted object) pairs: the classifier and both label encoders.
to_save = [
    ('disease_model.pkl', model),
    ('label_encoder_gender.pkl', label_encoder_gender),
    ('label_encoder_disease.pkl', label_encoder_disease),
]

# Pickle every object to the working directory.
for file_name, fitted in to_save:
    with open(file_name, 'wb') as f:
        pickle.dump(fitted, f)

# Offer each file as a browser download (Colab only).
from google.colab import files
for file_name, fitted in to_save:
    files.download(file_name)

# Load the three files back into the `loaded_*` names.
with open('disease_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
with open('label_encoder_gender.pkl', 'rb') as f:
    loaded_label_encoder_gender = pickle.load(f)
with open('label_encoder_disease.pkl', 'rb') as f:
    loaded_label_encoder_disease = pickle.load(f)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pickle

# NOTE(review): pickle stores a plain function by reference (module and name),
# not its code, so this file only loads in a session where
# `get_symptom_embeddings` is already defined under the same name.
with open('get_symptom_embeddings.pkl', 'wb') as f:
    pickle.dump(get_symptom_embeddings, f)


# Persist the trained fastText model to fasttext_model.bin
fasttext_model.save_model('fasttext_model.bin')


In [None]:
import fasttext

# Reload the fastText model from fasttext_model.bin, replacing the in-memory `fasttext_model`
fasttext_model = fasttext.load_model('fasttext_model.bin')

In [None]:
import pickle

# Rebind `get_symptom_embeddings` to whatever the pickle file resolves to.
# NOTE(review): functions are pickled by reference, so this needs the function
# to already be defined in the session; confirm it is intended.
with open('get_symptom_embeddings.pkl', 'rb') as f:
    get_symptom_embeddings = pickle.load(f)

In [None]:
import pickle
import os


# Resolve the three artifact file names against the current working directory.
model_path, gender_encoder_path, disease_encoder_path = (
    os.path.abspath(file_name)
    for file_name in (
        'disease_model.pkl',
        'label_encoder_gender.pkl',
        'label_encoder_disease.pkl',
    )
)

# Unpickle the classifier first, then the gender and disease encoders.
with open(model_path, 'rb') as f:
    loaded_model = pickle.load(f)

with open(gender_encoder_path, 'rb') as f:
    loaded_label_encoder_gender = pickle.load(f)

with open(disease_encoder_path, 'rb') as f:
    loaded_label_encoder_disease = pickle.load(f)

In [None]:
# Install CatBoost, needed by the CatBoostClassifier grid search further down
!pip install catboost



In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib


# Re-create the 80/20 split with the same seed used for the baseline model.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Multi-class XGBoost; the class count is taken from the encoded target.
xgb = XGBClassifier(
    objective='multi:softmax',
    num_class=len(y.unique()),
    random_state=42,
)

# Two candidate values per hyperparameter.
param_grid = dict(
    n_estimators=[10, 30],
    max_depth=[2, 4],
    learning_rate=[0.001, 0.01],
    subsample=[0.6, 0.8],
    colsample_bytree=[0.6, 0.8],
    gamma=[0, 0.1],
    reg_alpha=[0, 0.1],
    reg_lambda=[1, 1.5],
)

# Exhaustive search over the grid with 5-fold cross-validation on the training split.
grid_search_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search_xgb.fit(X_train, y_train)

# Keep the refitted winner and persist it.
best_xgb = grid_search_xgb.best_estimator_
joblib.dump(best_xgb, 'XGBoost_best_model.pkl')

# Score the winner on the held-out split.
predictions = best_xgb.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"XGBoost Accuracy: {accuracy:.4f}")
print(f"Classification Report:\n{classification_report(y_test, predictions)}")


XGBoost Accuracy: 0.9949
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        18
           1       1.00      0.93      0.97        30
           2       0.92      1.00      0.96        24
           3       1.00      1.00      1.00        25
           4       1.00      1.00      1.00        24
           5       1.00      1.00      1.00        23
           6       1.00      1.00      1.00        33
           7       1.00      1.00      1.00        23
           8       1.00      1.00      1.00        21
           9       1.00      1.00      1.00        15
          10       1.00      1.00      1.00        23
          11       1.00      1.00      1.00        26
          12       1.00      1.00      1.00        21
          13       1.00      1.00      1.00        29
          14       1.00      1.00      1.00        24
          15       1.00      1.00      1.00        19
          16       1.00      1.00

In [None]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import joblib


# LightGBM classifier and its search grid.
# Relies on X_train / X_test / y_train / y_test from an earlier split cell.
lgbm = LGBMClassifier(random_state=42)
param_grid = {
    'n_estimators': [50, 100],
    'num_leaves': [31, 64],
    'learning_rate': [0.01, 0.1],
}

# 3-fold grid search on the training split, scored by accuracy.
grid_search_lgbm = GridSearchCV(
    estimator=lgbm,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
grid_search_lgbm.fit(X_train, y_train)

# Persist the refitted best estimator.
best_lgbm = grid_search_lgbm.best_estimator_
joblib.dump(best_lgbm, 'LightGBM_best_model.pkl')

# Held-out evaluation.
predictions = best_lgbm.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"LightGBM Accuracy: {accuracy:.4f}")
print(f"Classification Report:\n{classification_report(y_test, predictions)}")


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



NameError: name 'X_train' is not defined

In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import joblib


# Silent CatBoost classifier and its search grid.
catboost = CatBoostClassifier(verbose=0, random_state=42)
param_grid = {
    'iterations': [50, 100],
    'depth': [3, 6],
    'learning_rate': [0.01, 0.1],
}

# 3-fold grid search on the training split, scored by accuracy.
grid_search_cat = GridSearchCV(
    estimator=catboost,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
grid_search_cat.fit(X_train, y_train)

# Persist the refitted best estimator.
best_cat = grid_search_cat.best_estimator_
joblib.dump(best_cat, 'CatBoost_best_model.pkl')

# Held-out evaluation.
predictions = best_cat.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"CatBoost Accuracy: {accuracy:.4f}")
print(f"Classification Report:\n{classification_report(y_test, predictions)}")


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import joblib


# Random forest and its search grid (tree count and depth limit).
rf = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10],
}

# 3-fold grid search on the training split, scored by accuracy.
grid_search_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
grid_search_rf.fit(X_train, y_train)

# Persist the refitted best estimator.
best_rf = grid_search_rf.best_estimator_
joblib.dump(best_rf, 'RandomForest_best_model.pkl')

# Held-out evaluation.
predictions = best_rf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Random Forest Accuracy: {accuracy:.4f}")
print(f"Classification Report:\n{classification_report(y_test, predictions)}")


In [None]:
import pickle


filename = 'random_forest_model.pkl'
# NOTE(review): this saves `model` (the baseline forest), not the tuned `best_rf` — confirm intent.
# The context manager closes the file handle, which a bare open() would leave open.
with open(filename, 'wb') as f:
    pickle.dump(model, f)

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import joblib


# SVMs are not scale-invariant: Age is in the tens while the embedding values
# are well below 1, so unscaled Age would dominate the kernel distances.
# Scaling inside the pipeline means every CV fold fits the scaler on its own
# training part only, and the saved model carries its scaler with it.
svm = make_pipeline(StandardScaler(), SVC(probability=True, random_state=42))
# make_pipeline names the step after the lowercased class, hence the 'svc__' prefix.
param_grid = {'svc__C': [0.1, 1.0, 10.0], 'svc__kernel': ['linear', 'rbf']}

# 3-fold grid search on the training split, scored by accuracy.
grid_search_svm = GridSearchCV(svm, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search_svm.fit(X_train, y_train)


# The best estimator is the whole pipeline (scaler + SVC); persist it as one object.
best_svm = grid_search_svm.best_estimator_
joblib.dump(best_svm, 'SVM_best_model.pkl')


# Held-out evaluation; the pipeline scales X_test before predicting.
predictions = best_svm.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"SVM Accuracy: {accuracy:.4f}")
print(f"Classification Report:\n{classification_report(y_test, predictions)}")


In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Gradient boosting classifier and the grid to search.
gb = GradientBoostingClassifier(random_state=42)
param_grid = {
    'n_estimators': [50, 100],
    'learning_rate': [0.01, 0.1],
}

# 3-fold grid search on the training split, scored by accuracy.
grid_search_gb = GridSearchCV(
    estimator=gb,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
grid_search_gb.fit(X_train, y_train)

# Persist the refitted best estimator.
best_gb = grid_search_gb.best_estimator_
joblib.dump(best_gb, 'GradientBoosting_best_model.pkl')

# Held-out evaluation.
predictions = best_gb.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Gradient Boosting Accuracy: {accuracy:.4f}")
print(f"Classification Report:\n{classification_report(y_test, predictions)}")


In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import joblib

# Model and parameter grid.
# MLPs are sensitive to feature scale: Age is in the tens while the embedding
# values are well below 1. Standardise inside a pipeline so each CV fold fits
# the scaler on its own training part and the saved model includes the scaler.
mlp = make_pipeline(StandardScaler(), MLPClassifier(max_iter=500, random_state=42))
# make_pipeline names the step after the lowercased class, hence 'mlpclassifier__'.
param_grid = {
    'mlpclassifier__hidden_layer_sizes': [(100,), (50, 50)],
    'mlpclassifier__alpha': [0.0001, 0.01],
}

# Hyperparameter tuning: 3-fold grid search scored by accuracy
grid_search_mlp = GridSearchCV(mlp, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search_mlp.fit(X_train, y_train)

# Save best model (the whole scaler + MLP pipeline)
best_mlp = grid_search_mlp.best_estimator_
joblib.dump(best_mlp, 'MLPClassifier_best_model.pkl')

# Evaluation on the held-out split; the pipeline scales X_test before predicting
predictions = best_mlp.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Neural Network Accuracy: {accuracy:.4f}")
print(f"Classification Report:\n{classification_report(y_test, predictions)}")


In [None]:
import pickle

# Load the model from the pickle file; the context manager closes the handle,
# which a bare open() inside pickle.load would leave open.
with open('random_forest_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)