In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, roc_curve, auc
import warnings
import ast
warnings.filterwarnings("ignore")

print("Loading modified datasets...")
try:

    train_df_modified = pd.read_csv('/content/Training_modified.csv')
    description_df_modified = pd.read_csv('/content/description_modified.csv')
    medications_df_modified = pd.read_csv('/content/medications_modified.csv')
    diets_df_modified = pd.read_csv('/content/diets_modified.csv')
    precautions_df_modified = pd.read_csv('/content/precautions_df_modified.csv')


    medications_df_modified['Medication'] = medications_df_modified['Medication'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    diets_df_modified['Diet'] = diets_df_modified['Diet'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

except FileNotFoundError as e:
    print(f"Error loading modified dataset: {e}")
    print("\nPlease make sure the modified CSV files (Training_modified.csv, description_modified.csv, etc.)")
    print("are uploaded to the /content/ directory in your Colab environment.")
    exit()

try:
    practical_socioeconomic_df = pd.read_csv('/content/practical_socioeconomic_modified.csv')
    print("practical_socioeconomic_modified.csv loaded successfully.")
except FileNotFoundError:
    print("practical_socioeconomic_modified.csv not found. Creating a sample DataFrame with sample data.")
    # Create a sample DataFrame with the specified columns and some sample data
    practical_socioeconomic_df = pd.DataFrame({
        'Disease': [
            'Fungal infection',
            'Allergy',
            'GERD',
            'Chronic cholestasis',
            'Drug Reaction',
            'Peptic ulcer disease',
            'AIDS',
            'Diabetes',
            'Gastroenteritis',
            'Bronchial Asthma',
            'Hypertension',
            'Migraine',
            'Cervical spondylosis',
            'Paralysis (brain hemorrhage)',
            'Jaundice',
            'Malaria',
            'Chicken pox',
            'Dengue',
            'Typhoid',
            'hepatitis A',
            'Hepatitis B',
            'Hepatitis C',
            'Hepatitis D',
            'Hepatitis E',
            'Alcoholic hepatitis',
            'Tuberculosis',
            'Common Cold',
            'Pneumonia',
            'Dimorphic hemmorhoids(piles)',
            'Heart attack',
            'Varicose veins',
            'Hypothyroidism',
            'Hyperthyroidism',
            'Hypoglycemia',
            'Osteoarthristis',
            'Arthritis',
            '(vertigo) Paroymsal Positional Vertigo',
            'Acne',
            'Urinary tract infection',
            'Psoriasis',
            'Impetigo'
        ],
        'Cost & insurance coverage': [
            'Generally low cost, often covered by insurance.',
            'Varies depending on severity and treatment, often covered.',
            'Medication costs can vary, usually covered.',
            'Treatment can be expensive, coverage varies.',
            'Treatment costs vary, usually covered.',
            'Medication costs vary, often covered.',
            'Treatment is expensive, often government-assisted.',
            'Ongoing costs for medication and monitoring, usually covered.',
            'Treatment is usually low cost and covered.',
            'Medication costs vary, usually covered.',
            'Ongoing medication costs, usually covered.',
            'Medication costs vary, often covered.',
            'Treatment costs vary depending on therapy, often covered.',
            'Treatment and rehabilitation are expensive, coverage varies.',
            'Treatment costs vary, often covered.',
            'Treatment costs vary depending on medication, often covered.',
            'Treatment is generally low cost and covered.',
            'Treatment costs vary depending on severity, often covered.',
            'Antibiotics are low cost, often covered.',
            'Treatment costs vary, often covered.',
            'Treatment is expensive, coverage varies.',
            'Treatment is expensive, coverage varies.',
            'Treatment is expensive, coverage varies.',
            'Treatment costs vary, coverage varies.',
            'Treatment costs vary, coverage varies.',
            'Treatment costs vary, often government-assisted.',
            'Treatment is low cost and often over-the-counter.',
            'Treatment costs vary depending on severity, often covered.',
            'Treatment costs vary, often covered.',
            'Treatment and rehabilitation are expensive, coverage varies.',
            'Treatment costs vary, often covered.',
            'Ongoing medication costs, usually covered.',
            'Ongoing medication costs, usually covered.',
            'Treatment is low cost and often involves diet.',
            'Treatment costs vary depending on therapy, often covered.',
            'Treatment costs vary depending on therapy, often covered.',
            'Treatment costs vary depending on therapy, often covered.',
            'Treatment costs vary, often covered.',
            'Antibiotics are low cost, often covered.',
            'Treatment costs vary, coverage varies.',
            'Antibiotics are low cost, often covered.'
        ],
        'Availability': [
            'Widely available in pharmacies.',
            'Widely available in pharmacies.',
            'Widely available in pharmacies.',
            'Specialized treatments may require larger pharmacies or hospitals.',
            'Widely available in pharmacies.',
            'Widely available in pharmacies.',
            'Available through specialized clinics.',
            'Widely available in pharmacies.',
            'Widely available in pharmacies.',
            'Widely available in pharmacies.',
            'Widely available in pharmacies.',
            'Widely available in pharmacies.',
            'Widely available in pharmacies.',
            'Requires hospital and rehabilitation facilities.',
            'Available in most pharmacies.',
            'Available in most pharmacies.',
            'Widely available.',
            'Available in most pharmacies.',
            'Widely available in pharmacies.',
            'Available in most pharmacies.',
            'Available through specialized clinics.',
            'Available through specialized clinics.',
            'Available through specialized clinics.',
            'Available through specialized clinics.',
            'Available in most pharmacies.',
            'Available through specialized clinics and government programs.',
            'Widely available in pharmacies and stores.',
            'Available in most pharmacies.',
            'Widely available in pharmacies.',
            'Requires hospital facilities.',
            'Widely available in pharmacies.',
            'Widely available in pharmacies.',
            'Widely available in pharmacies.',
            'Widely available.',
            'Widely available in pharmacies and therapy centers.',
            'Widely available in pharmacies and therapy centers.',
            'Requires specialized therapy.',
            'Widely available in pharmacies.',
            'Widely available in pharmacies.',
            'Specialized treatments may require larger pharmacies.',
            'Widely available in pharmacies.'
        ],
        'Patient preferences': [
            'Topical treatments preferred for mild cases.',
            'Preference for non-drowsy antihistamines.',
            'Preference for once-daily medication.',
            'May require dietary restrictions and lifestyle changes.',
            'Preference for oral medications over injections.',
            'Preference for medication over surgery.',
            'Preference for less frequent dosing.',
            'Preference for oral medications over insulin.',
            'Preference for oral rehydration solutions.',
            'Preference for inhalers over oral medication.',
            'Preference for single-pill combinations.',
            'Preference for non-narcotic pain relievers.',
            'Preference for physical therapy over surgery.',
            'Requires extensive rehabilitation.',
            'May require dietary changes.',
            'Preference for oral medication.',
            'Preference for topical treatments for itching.',
            'May require fluid intake preferences.',
            'Preference for oral antibiotics.',
            'May require dietary changes.',
            'Preference for less frequent injections.',
            'Preference for oral medications.',
            'Preference for less frequent injections.',
            'May require dietary changes.',
            'Preference for oral medication.',
            'Preference for less frequent dosing.',
            'Preference for over-the-counter remedies.',
            'Preference for oral antibiotics.',
            'Preference for topical treatments.',
            'Requires lifestyle changes and possibly surgery.',
            'Preference for compression stockings.',
            'Preference for oral medication.',
            'Preference for oral medication.',
            'Preference for dietary changes.',
            'Preference for physical therapy and pain management.',
            'Preference for physical therapy and pain management.',
            'Preference for physical therapy.',
            'Preference for topical treatments.',
            'Preference for oral antibiotics.',
            'Preference for topical treatments.',
            'Preference for topical antibiotics.'
        ],
        'Ease of use': [
            'Generally easy to use.',
            'Generally easy to use.',
            'Generally easy to use.',
            'May require complex treatment regimens.',
            'Generally easy to use.',
            'Generally easy to use.',
            'May involve complex regimens and side effects.',
            'Requires regular monitoring and adherence.',
            'Generally easy to use.',
            'Requires proper inhaler technique.',
            'Generally easy to use.',
            'Generally easy to use.',
            'May require regular therapy sessions.',
            'Requires intensive care and rehabilitation.',
            'Generally easy to use.',
            'Generally easy to use.',
            'Generally easy to use.',
            'Generally easy to use.',
            'Generally easy to use.',
            'Generally easy to use.',
            'May require injections.',
            'Generally easy to use.',
            'May require injections.',
            'Generally easy to use.',
            'Generally easy to use.',
            'Requires strict adherence to medication.',
            'Generally easy to use.',
            'Generally easy to use.',
            'Generally easy to use.',
            'Requires significant lifestyle changes.',
            'Generally easy to use.',
            'Requires regular monitoring.',
            'Requires regular monitoring.',
            'Generally easy to use.',
            'May require regular therapy sessions.',
            'May require regular therapy sessions.',
            'Requires specialized therapy.',
            'Generally easy to use.',
            'Generally easy to use.',
            'May require consistent application of treatments.',
            'Generally easy to use.'
        ]
    })
    print("Sample practical_socioeconomic_df created with sample data.")


#Data Preparation and Splitting (using modified training data)
X_modified = train_df_modified.drop('prognosis', axis=1)
y_modified = train_df_modified['prognosis']
all_symptoms_modified = X_modified.columns.tolist()

# Encode the target variable (disease names) from the modified data
le_modified = LabelEncoder()
y_encoded_modified = le_modified.fit_transform(y_modified)
n_classes_modified = len(le_modified.classes_)

#Split the modified data into training and testing sets (80/20 split)
X_train_modified, X_test_modified, y_train_modified, y_test_modified = train_test_split(
    X_modified, y_encoded_modified, test_size=0.2, random_state=42
)

#3. Model Training (on modified training data)

model_modified = RandomForestClassifier(random_state=42, min_samples_split=20, min_samples_leaf=10)
model_modified.fit(X_train_modified, y_train_modified)

#4. Model Evaluation (on the single modified test split)
print("\n----------------------------------------------------")
print("  📈 Model Evaluation Metrics (Modified Test Split) 📈  ")
print("----------------------------------------------------")

y_pred_modified = model_modified.predict(X_test_modified)
y_pred_proba_modified = model_modified.predict_proba(X_test_modified)

# Calculate metrics
accuracy_modified = accuracy_score(y_test_modified, y_pred_modified)
precision_modified = precision_score(y_test_modified, y_pred_modified, average='macro')
recall_modified = recall_score(y_test_modified, y_pred_modified, average='macro')
f1_modified = f1_score(y_test_modified, y_pred_modified, average='macro')

print(f"Accuracy: {accuracy_modified:.4f}")
print(f"Precision (Macro): {precision_modified:.4f}")
print(f"Recall (Macro): {recall_modified:.4f}")
print(f"F1-Score (Macro): {f1_modified:.4f}")
print("\n--- Classification Report (Modified Data) ---\n")
print(classification_report(y_test_modified, y_pred_modified, target_names=le_modified.classes_))
print("----------------------------------------------------")


#5. Cross-Validation (on full modified data) ---
print("\n----------------------------------------------------")
print("     📊 Cross-Validation Results (Modified Data) 📊    ")
print("----------------------------------------------------")

# Use StratifiedKFold for cross-validation on the data
cv_modified = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) # Using 5 folds

print("Performing 5-fold cross-validation on modified data...")

# Calculate cross-validation scores (accuracy)
cv_scores_modified = cross_val_score(model_modified, X_modified, y_encoded_modified, cv=cv_modified, scoring='accuracy')

print("\nCross-validation Accuracy Scores (Modified Data):", cv_scores_modified)
print(f"Mean Cross-validation Accuracy (Modified Data): {np.mean(cv_scores_modified):.4f}")
print(f"Standard Deviation of Accuracy (Modified Data): {np.std(cv_scores_modified):.4f}")
print("----------------------------------------------------")


#6. Visualization of Evaluation Metrics
# Bar Chart for key metrics
metrics_modified = {'Accuracy': accuracy_modified, 'Precision': precision_modified, 'Recall': recall_modified, 'F1-Score': f1_modified}
fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(metrics_modified.keys(), metrics_modified.values(), color=['#4C72B0', '#55A868', '#C44E52', '#8172B2'])
ax.set_ylim(0, 1.1)
ax.set_ylabel('Score')
ax.set_title('Model Performance Metrics (Modified Test Split)')
# Add values on top of bars
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2.0, yval, f'{yval:.2f}', va='bottom', ha='center')
plt.tight_layout()
plt.savefig('evaluation_metrics_modified.png')
print("\n✅ Evaluation metrics bar chart saved as 'evaluation_metrics_modified.png'")


# AUC-ROC Curve
y_test_binarized_modified = label_binarize(y_test_modified, classes=range(n_classes_modified))

# Compute ROC curve and ROC area for each class
fpr_modified = dict()
tpr_modified = dict()
roc_auc_modified = dict()
for i in range(n_classes_modified):
    fpr_modified[i], tpr_modified[i], _ = roc_curve(y_test_binarized_modified[:, i], y_pred_proba_modified[:, i])
    roc_auc_modified[i] = auc(fpr_modified[i], tpr_modified[i])

# Plot all ROC curves
plt.figure(figsize=(10, 8))
colors = plt.cm.get_cmap('tab20', n_classes_modified) # Get the colormap
for i, color in zip(range(n_classes_modified), colors.colors):
    plt.plot(fpr_modified[i], tpr_modified[i], color=color, lw=2,
             label='ROC curve of {0} (area = {1:0.2f})'
             ''.format(le_modified.classes_[i], roc_auc_modified[i]))

plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multi-class Receiver Operating Characteristic (ROC) Curve (Modified Test Split)')
plt.legend(loc="lower right", fontsize='small')
plt.tight_layout()
plt.savefig('roc_curve_modified.png')
print("✅ AUC-ROC curve plot saved as 'roc_curve_modified.png'")


#7. Recommendation Functions

def get_recommendations_from_disease_modified(disease):
    """
    Given a disease name, fetches its description, medications, diet, precautions,
    and practical/socioeconomic parameters from modified dataframes.
    """
    recommendations = {}
    try:
        recommendations['Description'] = description_df_modified[description_df_modified['Disease'] == disease]['Description'].values[0]
    except IndexError:
        recommendations['Description'] = "No description available."

    try:
      #Medications and Diet columns in modified dataframes are now lists
      meds = medications_df_modified[medications_df_modified['Disease'] == disease]['Medication'].values
      recommendations['Medications'] = meds[0] if len(meds) > 0 else ["No specific medication listed."]
    except IndexError:
      recommendations['Medications'] = ["No specific medication listed."]

    try:
      # Medications and Diet columns in modified dataframes are now lists
      diet = diets_df_modified[diets_df_modified['Disease'] == disease]['Diet'].values
      recommendations['Diet'] = diet[0] if len(diet) > 0 else ["No specific diet information available."]
    except IndexError:
      recommendations['Diet'] = ["No specific diet information available."]


    prec_cols = ['Precaution_1', 'Precaution_2', 'Precaution_3', 'Precaution_4']
    try:
      precs = precautions_df_modified[precautions_df_modified['Disease'] == disease][prec_cols].values.flatten().tolist()
      recommendations['Precautions'] = [p for p in precs if pd.notna(p)] if any(pd.notna(p) for p in precs) else ["Follow general health advice."]
    except IndexError:
      recommendations['Precautions'] = ["Follow general health advice."]

    # Fetch practical and socio-economic parameters
    try:
        practical_socioeconomic_info = practical_socioeconomic_df[practical_socioeconomic_df['Disease'] == disease].to_dict('records')
        if practical_socioeconomic_info:
            recommendations['Practical & Socio-economic Parameters'] = practical_socioeconomic_info[0]
            del recommendations['Practical & Socio-economic Parameters']['Disease'] # Remove the disease column
        else:
            recommendations['Practical & Socio-economic Parameters'] = {"Information not available": "No specific practical or socio-economic information listed."}
    except IndexError:
        recommendations['Practical & Socio-economic Parameters'] = {"Information not available": "No specific practical or socio-economic information listed."}


    return recommendations


def get_recommendations_from_symptoms_modified(symptoms_list):
    """
    Predicts the disease from symptoms using the model trained on modified data,
    and then fetches recommendations from modified dataframes.
    """
    input_vector = pd.DataFrame([0] * len(all_symptoms_modified), index=all_symptoms_modified).T
    recognized_symptoms_count = 0
    for symptom in symptoms_list:
        formatted_symptom = symptom.strip().replace(" ", "_")
        if formatted_symptom in input_vector.columns:
            input_vector[formatted_symptom] = 1
            recognized_symptoms_count += 1
        else:
            print(f"Warning: Symptom '{symptom}' not recognized in the modified dataset.")

    if recognized_symptoms_count == 0:
        return None, None # Return None to indicate failure

    # Predict the disease using the model trained on modified data
    prediction_encoded = model_modified.predict(input_vector)[0]
    predicted_disease = le_modified.inverse_transform([prediction_encoded])[0]

    # Get recommendations for the predicted disease from modified dataframes
    recommendations = get_recommendations_from_disease_modified(predicted_disease)

    return predicted_disease, recommendations


# --- 8. Interactive User Interface (using modified functions) ---
def main_modified():
    """
    Main function to run the interactive recommendation system using modified data and model.
    """
    print("\n----------------------------------------------------")
    print(" 🩺 Medicine Recommendation System (Modified Data) 🩺")
    print("----------------------------------------------------")

    while True:
        print("\nChoose an option:")
        print("1. Get recommendation based on symptoms")
        print("2. Get details for a specific disease")
        print("Type 'quit' to exit.")

        choice = input("Your choice: ").strip().lower()

        if choice == 'quit':
            print("\nThank you for using the modified system. Stay healthy!")
            break

        elif choice == '1':
            symptoms_input = input("\nEnter your symptoms, separated by commas (e.g., itching,skin_rash): ").strip()
            symptoms = [s.strip() for s in symptoms_input.split(',') if s.strip()]

            if not symptoms:
                 print("\nError: Please enter at least one symptom.")
                 continue

            disease, recs = get_recommendations_from_symptoms_modified(symptoms)

            if disease and recs:
                print("\n----------------------------------------------------")
                print(f"Predicted Disease: {disease}")
                print("----------------------------------------------------")
                print(f"\nDescription:\n{recs['Description']}\n")
                print(f"Recommended Medications:\n- " + "\n- ".join(recs['Medications']) + "\n")
                print(f"Recommended Diet:\n- " + "\n- ".join(recs['Diet']) + "\n")
                print(f"Precautions to Take:\n- " + "\n- ".join(recs['Precautions']) + "\n")
                print("Practical & Socio-economic Parameters:")
                for key, value in recs['Practical & Socio-economic Parameters'].items():
                    print(f"- {key}: {value}")
                print("----------------------------------------------------")
            else:
                print("\nCould not make a prediction. Please enter valid symptoms from the dataset.")

        elif choice == '2':
            disease_input = input("Enter the disease name (e.g., Fungal infection): ").strip()

            if not disease_input:
                print("\nError: Please enter a disease name.")
                continue

            # Check if disease exists in our modified data
            if disease_input not in description_df_modified['Disease'].unique():
                print(f"\nSorry, we don't have information for '{disease_input}' in the modified dataset. Please check the spelling.")
                continue

            recs = get_recommendations_from_disease_modified(disease_input)
            print("\n----------------------------------------------------")
            print(f"Details for Disease: {disease_input}")
            print("----------------------------------------------------")
            print(f"\nDescription:\n{recs['Description']}\n")
            print(f"Recommended Medications:\n- " + "\n- ".join(recs['Medications']) + "\n")
            print(f"Recommended Diet:\n- " + "\n- ".join(recs['Diet']) + "\n")
            print(f"Precautions to Take:\n- " + "\n- ".join(recs['Precautions']) + "\n")
            print("Practical & Socio-economic Parameters:")
                # Ensure practical_socioeconomic_info is a dictionary before iterating
            if isinstance(recs['Practical & Socio-economic Parameters'], dict):
                for key, value in recs['Practical & Socio-economic Parameters'].items():
                    print(f"- {key}: {value}")
            else:
                print("- Information not available.")
            print("----------------------------------------------------")


        else:
            print("\nInvalid choice. Please enter 1, 2, or 'quit'.")


if __name__ == "__main__":
    main_modified()

Loading modified datasets...
Error loading modified dataset: [Errno 2] No such file or directory: '/content/Training_modified.csv'

Please make sure the modified CSV files (Training_modified.csv, description_modified.csv, etc.)
are uploaded to the /content/ directory in your Colab environment.
practical_socioeconomic_modified.csv not found. Creating a sample DataFrame with sample data.
Sample practical_socioeconomic_df created with sample data.


NameError: name 'train_df_modified' is not defined