In [43]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [44]:
# --- Step 1: Data Preprocessing ---
def load_and_preprocess(file_path):
    """Read a disease CSV and attach a normalised copy of its symptom text.

    The file must contain the columns ``symptoms`` and ``disease_name``.
    Rows where either of those is missing are dropped, and a new
    ``cleaned_symptoms`` column is added holding the lower-cased symptom
    text with every character other than ``a-z`` and whitespace removed.

    Args:
        file_path: Path (or anything ``pd.read_csv`` accepts) of the CSV.

    Returns:
        The prepared ``DataFrame``, or ``None`` when the required columns
        are absent or any exception is raised while loading/processing.
        In both failure cases a message is printed instead of raising.
    """
    required = ('symptoms', 'disease_name')

    def _normalise(raw):
        # Lower-case first so the character class only has to allow a-z.
        lowered = str(raw).lower()
        # NOTE(review): disallowed characters are deleted, not replaced by a
        # space, so "fever,cough" collapses into the single token
        # "fevercough". predict_disease applies the same rule to queries, so
        # any change here must be mirrored there.
        return re.sub(r'[^a-z\s]', '', lowered)

    try:
        frame = pd.read_csv(file_path)

        if any(column not in frame.columns for column in required):
            print("Error: Dataset missing 'symptoms' or 'disease_name' columns.")
            return None

        # Keep only rows that have both a symptom description and a label.
        frame = frame.dropna(subset=list(required))
        frame['cleaned_symptoms'] = frame['symptoms'].apply(_normalise)

        print(f"Successfully loaded {len(frame)} rows.")
        return frame
    except Exception as err:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Error loading file: {err}")
        return None

In [45]:
# --- Step 2 & 3: Model Building ---
def build_model(df):
    """Fit a TF-IDF vectorizer and a 1-nearest-neighbour index on the symptoms.

    Args:
        df: DataFrame with a ``cleaned_symptoms`` text column, as produced by
            ``load_and_preprocess``. ``None`` or an empty frame is accepted
            and reported as an error.

    Returns:
        ``(vectorizer, model)`` on success, where ``vectorizer`` is the fitted
        ``TfidfVectorizer`` (English stop words, at most 2000 features) and
        ``model`` is a ``NearestNeighbors`` index (cosine metric, one
        neighbour) fitted on the TF-IDF matrix. Row *i* of that matrix is
        row *i* (by position) of ``df``.
        ``(None, None)`` when the model cannot be built; a message is printed
        in that case instead of raising.
    """
    if df is None or df.empty:
        print("Error: Cannot build model because the dataframe is empty or None.")
        return None, None

    # A frame that did not go through load_and_preprocess would otherwise
    # fail with a bare KeyError below.
    if 'cleaned_symptoms' not in df.columns:
        print("Error: Cannot build model because the 'cleaned_symptoms' column is missing.")
        return None, None

    # TF-IDF Vectorization
    vectorizer = TfidfVectorizer(stop_words='english', max_features=2000)
    try:
        X = vectorizer.fit_transform(df['cleaned_symptoms'])
    except ValueError as e:
        # scikit-learn raises ValueError when no usable terms remain (for
        # example, every document is empty or consists only of stop words).
        print(f"Error: Could not vectorize symptoms: {e}")
        return None, None

    # Nearest Neighbors Model
    model = NearestNeighbors(n_neighbors=1, metric='cosine')
    model.fit(X)

    return vectorizer, model

In [46]:
# --- Step 4: Prediction Logic ---
def predict_disease(input_symptoms, df, vectorizer, model, min_confidence=0.1):
    """Return the closest known disease for a free-text symptom description.

    The query is cleaned with the same rule ``load_and_preprocess`` applies to
    the training text (lower-case, drop everything except ``a-z`` and
    whitespace), vectorized, and matched against its nearest neighbour.

    Args:
        input_symptoms: Free-text description of the symptoms.
        df: The DataFrame the model was fitted on; neighbour indices are
            resolved against it by position.
        vectorizer: Fitted vectorizer exposing ``transform``.
        model: Fitted neighbour index exposing ``kneighbors``.
        min_confidence: Matches whose confidence is below this value are
            reported as "no match". Defaults to 0.1.

    Returns:
        A ``(disease_name, confidence, treatment)`` tuple, where
        ``confidence`` is ``1 - distance`` of the nearest neighbour.
        ``treatment`` is ``"N/A"`` when ``df`` has no ``treatment`` column or
        the matched row's treatment is missing.
        ``("Model not initialized.", 0, "")`` when ``vectorizer`` or ``model``
        is ``None``; ``("No matching disease found.", 0, "N/A")`` when the
        confidence is below ``min_confidence``.
    """
    if vectorizer is None or model is None:
        return "Model not initialized.", 0, ""

    # Must stay in sync with the cleaning applied to the training text.
    cleaned_input = str(input_symptoms).lower()
    cleaned_input = re.sub(r'[^a-z\s]', '', cleaned_input)
    input_vec = vectorizer.transform([cleaned_input])

    distances, indices = model.kneighbors(input_vec)
    # Convert the neighbour's distance into a similarity-style score.
    confidence = 1 - distances[0][0]

    if confidence < min_confidence:
        return "No matching disease found.", 0, "N/A"

    row = df.iloc[indices[0][0]]

    # 'treatment' is neither validated nor NaN-filtered at load time, so it
    # may be absent from the frame or missing for this particular row.
    treatment = row.get('treatment')
    if treatment is None or pd.isna(treatment):
        treatment = "N/A"

    return row['disease_name'], confidence, treatment

In [47]:
# --- EXECUTION ---
# 1. Load and clean the dataset. df_clean is None if loading failed, which
#    the following cell checks before building the model.
df_clean = load_and_preprocess(
    file_path='/content/last_final_final_disease_data.csv',
)

Successfully loaded 295 rows.


In [48]:
# 2. Build the model, but only when the data was loaded successfully.
if df_clean is None:
    print("Failed to initialize df_clean. Please check the file path.")
else:
    tfidf_vec, knn_model = build_model(df_clean)

    # 3. Smoke-test the pipeline with a sample free-text query.
    test_symptoms = "I have a high fever, cough, and chest pain"
    disease, score, treat = predict_disease(
        input_symptoms=test_symptoms,
        df=df_clean,
        vectorizer=tfidf_vec,
        model=knn_model,
    )

    # One print call, one line per item: same output as four separate prints.
    print(
        "-" * 30,
        f"Predicted Disease: {disease}",
        f"Confidence: {score:.2%}",
        f"Suggested Treatment: {treat}",
        sep="\n",
    )

------------------------------
Predicted Disease: Cough
Confidence: 37.52%
Suggested Treatment: Most cases of cough are symptomatically relieved with the help of cold & cough medicines. Some of the common treatment measures for cough include:
