In [1]:
!mkdir data

In [12]:
import numpy as np

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

file_path = "/content/data/medical data.csv"
data = pd.read_csv(file_path)

data['Symptoms'] = data['Symptoms'].fillna('')
data['Causes'] = data['Causes'].fillna('')

data['Features'] = data['Symptoms'] + " " + data['Causes']

tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_features = tfidf_vectorizer.fit_transform(data['Features']).toarray()


label_encoder_disease = LabelEncoder()
label_encoder_medicine = LabelEncoder()
y_disease = label_encoder_disease.fit_transform(data['Disease'])
y_medicine = label_encoder_medicine.fit_transform(data['Medicine'])

X_train, X_test, y_disease_train, y_disease_test = train_test_split(X_features, y_disease, test_size=0.2, random_state=42)
_, _, y_medicine_train, y_medicine_test = train_test_split(X_features, y_medicine, test_size=0.2, random_state=42)

knn_disease = KNeighborsClassifier(n_neighbors=5)
knn_disease.fit(X_train, y_disease_train)
disease_preds_knn = knn_disease.predict(X_test)

svm_disease = SVC(random_state=42)
svm_disease.fit(X_train, y_disease_train)
disease_preds_svm = svm_disease.predict(X_test)

rf_disease = RandomForestClassifier(random_state=42)
rf_disease.fit(X_train, y_disease_train)
disease_preds_rf = rf_disease.predict(X_test)

knn_medicine = KNeighborsClassifier(n_neighbors=5)
knn_medicine.fit(X_train, y_medicine_train)
medicine_preds_knn = knn_medicine.predict(X_test)

svm_medicine = SVC(random_state=42)
svm_medicine.fit(X_train, y_medicine_train)
medicine_preds_svm = svm_medicine.predict(X_test)

rf_medicine = RandomForestClassifier(random_state=42)
rf_medicine.fit(X_train, y_medicine_train)
medicine_preds_rf = rf_medicine.predict(X_test)

disease_accuracy_knn = accuracy_score(y_disease_test, disease_preds_knn) * 100
disease_accuracy_svm = accuracy_score(y_disease_test, disease_preds_svm) * 100
disease_accuracy_rf = accuracy_score(y_disease_test, disease_preds_rf) * 100

medicine_accuracy_knn = accuracy_score(y_medicine_test, medicine_preds_knn) * 100
medicine_accuracy_svm = accuracy_score(y_medicine_test, medicine_preds_svm) * 100
medicine_accuracy_rf = accuracy_score(y_medicine_test, medicine_preds_rf) * 100

print(f"Disease Prediction Accuracy (KNN): {disease_accuracy_knn:.2f}%")
print(f"Disease Prediction Accuracy (SVM): {disease_accuracy_svm:.2f}%")
print(f"Disease Prediction Accuracy (Random Forest): {disease_accuracy_rf:.2f}%")

print(f"Medicine Prediction Accuracy (KNN): {medicine_accuracy_knn:.2f}%")
print(f"Medicine Prediction Accuracy (SVM): {medicine_accuracy_svm:.2f}%")
print(f"Medicine Prediction Accuracy (Random Forest): {medicine_accuracy_rf:.2f}%")

sample_input = ["Fever, Cough Viral Infection"]
sample_features = tfidf_vectorizer.transform(sample_input).toarray()

predicted_disease_rf = label_encoder_disease.inverse_transform(rf_disease.predict(sample_features))
predicted_medicine_rf = label_encoder_medicine.inverse_transform(rf_medicine.predict(sample_features))

print(f"Predicted Disease (Random Forest): {predicted_disease_rf[0]}")
print(f"Predicted Medicine (Random Forest): {predicted_medicine_rf[0]}")


Disease Prediction Accuracy (KNN): 77.59%
Disease Prediction Accuracy (SVM): 87.93%
Disease Prediction Accuracy (Random Forest): 89.66%
Medicine Prediction Accuracy (KNN): 75.86%
Medicine Prediction Accuracy (SVM): 93.10%
Medicine Prediction Accuracy (Random Forest): 89.66%
Predicted Disease (Random Forest): Common Cold
Predicted Medicine (Random Forest): Antivirals, Rest
