In [1]:
import pandas as pd
import numpy as np
# from sentence_transformers import SentenceTransformer


In [2]:
# Read the diseases-and-symptoms CSV into the notebook's working DataFrame.
df = pd.read_csv(filepath_or_buffer='archive/dataset_Diseases_and_Symptoms.csv')


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Number of rows available for each disease label.
class_counts = df["diseases"].value_counts()

# A stratified split needs at least 2 samples of every class, so labels that
# occur only once are dropped before splitting.
valid_classes = class_counts[class_counts >= 2].index
df_filtered = df[df["diseases"].isin(valid_classes)]

# Features are every column except the label; the label is the disease name.
X = df_filtered.drop("diseases", axis=1)
y = df_filtered["diseases"]

# Ordered feature (symptom) names. Later cells build the 0/1 symptom vector
# against `all_symptoms`, so it is defined here from the training columns
# instead of relying on leftover kernel state.
all_symptoms = X.columns.tolist()

# Hold out 20% of the rows, preserving class proportions in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Random forest with a fixed seed so repeated runs give the same model.
clf = RandomForestClassifier(
    n_estimators=150,
    max_depth=25,
    max_features='log2',
    random_state=42
)

# Train the model
clf.fit(X_train, y_train)

# Make predictions on the held-out rows
y_pred = clf.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
# zero_division=0 reports 0.0 for classes that never get predicted (the same
# value the default produces) without the repeated UndefinedMetricWarning.
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred, zero_division=0),
)


Accuracy: 0.8140768638885514

Confusion Matrix:
 [[28  0  0 ...  0  0  0]
 [ 0 78  0 ...  0  0  0]
 [ 0  0 46 ...  0  0  0]
 ...
 [ 0  0  0 ...  3  0  0]
 [ 0  0  0 ...  0 87  0]
 [ 0  0  0 ...  0  0  2]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Classification Report:
                                                           precision    recall  f1-score   support

                               abdominal aortic aneurysm       1.00      1.00      1.00        28
                                        abdominal hernia       0.90      0.96      0.93        81
                                         abscess of nose       0.96      0.79      0.87        58
                                     abscess of the lung       1.00      0.75      0.86         4
                                  abscess of the pharynx       0.92      0.68      0.78        68
                                    acanthosis nigricans       1.00      0.83      0.91         6
                                               acariasis       1.00      0.71      0.83         7
                                               achalasia       1.00      0.29      0.45        17
                                                    acne       0.86      0.73      0.79     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# EVERYTHING FROM HERE DOWN IS THE EMBEDDING MODEL CODE

In [None]:
import joblib

# Serialize the fitted classifier `clf` to disk; the call returns the list of
# written file names, which the notebook displays as the cell output.
joblib.dump(value=clf, filename='clf-imbalanced-v2.pkl')

['clf-imbalanced-v2.pkl']

In [43]:
# Imported in this cell because the import at the top of the notebook is
# commented out; without it this cell raises NameError on a fresh kernel.
from sentence_transformers import SentenceTransformer

# Sentence-embedding model shared by every call to get_embedding().
embedModel = SentenceTransformer("pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb")


def get_embedding(text):
    """Embed ``text`` with the module-level ``embedModel``.

    Args:
        text: The text to embed; it is passed unchanged to
            ``embedModel.encode``.

    Returns:
        Whatever ``embedModel.encode(text)`` returns. Callers in this
        notebook treat it as a NumPy array (they call ``.astype`` on it).
    """
    return embedModel.encode(text)



In [45]:
import faiss

In [None]:
# Embed every entry of `all_symptoms` one by one and stack the results into a
# single float32 array (the later FAISS cell feeds this array to the index).
symptom_vectors = np.array(
    list(map(get_embedding, all_symptoms))
).astype('float32')


In [None]:
# Build a flat L2 FAISS index whose dimensionality matches the embeddings,
# then load all symptom vectors into it.
dim = symptom_vectors.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(symptom_vectors)

# Position in the index -> original symptom text, so search hits can be
# translated back into symptom names.
symptom_map = dict(enumerate(all_symptoms))

In [47]:
def find_similar_symptoms(user_input, top_k=5, max_distance=None):
    """Return the indexed symptoms closest to ``user_input``.

    The query text is embedded with ``get_embedding`` and searched against
    the module-level ``index``; hits are mapped back to symptom text through
    ``symptom_map``.

    Args:
        user_input: Free-text description to look up.
        top_k: Maximum number of neighbours requested from the index.
        max_distance: Optional upper bound on the distance reported by the
            index. Hits with a larger distance are dropped. ``None`` (the
            default) keeps every valid hit, which is the previous behaviour.

    Returns:
        A list of ``(symptom, distance)`` tuples in the order returned by
        the index search.
    """
    query_vec = get_embedding(user_input).astype('float32').reshape(1, -1)
    distances, indices = index.search(query_vec, top_k)

    matches = []
    for distance, idx in zip(distances[0], indices[0]):
        # The index pads with -1 when it has fewer than top_k entries.
        if idx == -1:
            continue
        if max_distance is not None and distance > max_distance:
            continue
        matches.append((symptom_map[idx], distance))
    return matches


In [48]:
# Example lookup: find the indexed symptoms nearest to a free-text query.
user_input = "anxiety"
results = find_similar_symptoms(user_input)

# Show each match with its distance, formatted to four decimals.
print("I found these matching symptoms:")
for matched_symptom, matched_distance in results:
    print(f"- {matched_symptom} (distance: {matched_distance:.4f})")

I found these matching symptoms:
- anxiety and nervousness (distance: 92.2544)
- emotional symptoms (distance: 128.6182)
- fears and phobias (distance: 155.5813)
- depression (distance: 163.0554)
- excessive anger (distance: 190.3128)


In [6]:
# Keep only the first element (the symptom name) of every entry in `results`.
user_symptoms_input = [match[0] for match in results]

# Show the extracted names.
print(user_symptoms_input)

['nasal congestion', 'headache', 'dizziness', 'symptoms of the face', 'nausea']


In [7]:
# Binary vector over `all_symptoms` (same order): 1 where the symptom was
# selected by the user, 0 otherwise.
user_symptom_vector = [int(name in user_symptoms_input) for name in all_symptoms]

In [3]:
# Feature table: every column of `df` except the 'diseases' label.
X = df.drop(columns='diseases')
X.head()

Unnamed: 0,anxiety and nervousness,depression,shortness of breath,depressive or psychotic symptoms,sharp chest pain,dizziness,insomnia,abnormal involuntary movements,chest tightness,palpitations,...,stuttering or stammering,problems with orgasm,nose deformity,lump over jaw,sore in nose,hip weakness,back swelling,ankle stiffness or tightness,ankle weakness,neck weakness
0,1,0,1,1,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,1,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,1,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Target labels: the 'diseases' column of `df`.
y = df.loc[:, 'diseases']
y.head()

0    panic disorder
1    panic disorder
2    panic disorder
3    panic disorder
4    panic disorder
Name: diseases, dtype: object

In [6]:
# Hand-picked symptoms used as the example input for the classifier.
user_symptoms_input = [
    'shortness of breath',
    'chest tightness',
    'sharp chest pain',
    'difficulty breathing',
    'coughing up sputum',
    'wheezing',
    'nasal congestion',
]
# 1 where the entry of `all_symptoms` was selected, 0 otherwise (same order).
user_symptom_vector = [int(name in user_symptoms_input) for name in all_symptoms]
# Alternative kept for reference: wrap the vector in a one-row DataFrame.
# input_df = pd.DataFrame([user_symptom_vector], columns=all_symptoms)


In [8]:
# NOTE(review): `rf_2nd` is not defined in any visible cell; presumably it is
# a fitted classifier with the same feature order as `all_symptoms`. Confirm.
# Class probabilities for the single symptom vector.
probs = rf_2nd.predict_proba([user_symptom_vector])[0]

# Positions of the three highest probabilities, most likely first.
top_indices = np.argsort(probs)[::-1][:3]

# Pair each of those classes with its probability and display the result.
top_diagnoses = [(rf_2nd.classes_[rank], probs[rank]) for rank in top_indices]
top_diagnoses



[('asthma', np.float64(0.1033131320711424)),
 ('chronic obstructive pulmonary disease (copd)',
  np.float64(0.04797807042724246)),
 ('acute bronchitis', np.float64(0.02294493393490376))]