In [None]:
import pandas as pd
import random
import os

# Name pools for the synthetic patients. First names are split by gender so the
# generator can pick one that matches the sampled Gender value; the surname
# pool is shared. Element order matters for reproducibility: under a fixed RNG
# seed, random.choice selects by position.
male_names = [
    "Amit", "Ravi", "Vikas", "Deepak", "Manoj", "Arjun", "Sanjay", "Prakash", "Vinod", "Rohit",
    "Rajesh", "Suresh", "Mahesh", "Ramesh", "Karan", "Harsh", "Nitin", "Yash", "Pankaj", "Sumit",
    "Ankit", "Tarun", "Gaurav", "Kapil", "Abhishek", "Sachin", "Rahul", "Sameer", "Aditya", "Siddharth",
    "Lokesh", "Bhavesh", "Akhil", "Parth", "Tushar", "Varun", "Rajat", "Dev", "Aarav", "Krishna",
    "Shubham", "Ashish", "Dinesh", "Uday", "Alok", "Bharat", "Kunal", "Suraj", "Vivek", "Lalit"
]
# First names used when the sampled gender is "Female".
female_names = [
    "Rekha", "Kavita", "Pooja", "Neha", "Anjali", "Priya", "Sunita", "Meena", "Divya", "Sapna",
    "Sneha", "Ritu", "Nisha", "Anita", "Simran", "Aarti", "Bhavna", "Komal", "Preeti", "Rashmi",
    "Swati", "Payal", "Sheetal", "Shilpa", "Monika", "Geeta", "Radhika", "Jyoti", "Seema", "Archana",
    "Tina", "Laxmi", "Usha", "Suman", "Kiran", "Pallavi", "Megha", "Riya", "Manisha", "Varsha",
    "Kirti", "Sarita", "Sonia", "Chitra", "Damini", "Alisha", "Hemlata", "Tanvi", "Ishita", "Harshita"
]
# Surnames, sampled independently of gender and first name.
last_names = [
    "Sharma", "Verma", "Gupta", "Yadav", "Rao", "Singh", "Agarwal", "Choudhary", "Patel", "Nair",
    "Iyer", "Mehta", "Tripathi", "Joshi", "Bansal", "Malhotra", "Kohli", "Thakur", "Mishra", "Dubey",
    "Goyal", "Rawat", "Reddy", "Kapoor", "Chopra", "Vijay", "Bhatia", "Pandey", "Saxena", "Tiwari",
    "Kulkarni", "Deshmukh", "Pawar", "Sethi", "Mathur", "Srivastava", "Das", "Roy", "Banerjee", "Chatterjee",
    "Naidu", "Menon", "Dutta", "Sen", "Mukherjee", "Kumar", "Bhatt", "Pathak", "Jha", "Khan"
]


# Value pools for the categorical columns. The generator draws one entry per
# patient from each list with random.choice, so every entry is equally likely.
# NOTE(review): several pools contain the literal string "None". Recent pandas
# versions appear to treat "None" as a missing-value marker in read_csv by
# default, so it may come back as NaN after a CSV round trip - confirm against
# the installed pandas version.

# Smoking_Status column.
smoking_status_options = [
    "Non-smoker", "Ex-smoker", "Light Smoker", "Moderate Smoker", "Heavy Smoker",
    "Social Smoker", "Occasional Smoker", "Trying to Quit", "Vaper (Nicotine)", "Dual User (Cigarette + Vaper)"
]
# Alcohol_Consumption column.
alcohol_options = [
    "None", "Occasional", "Moderate", "Frequent", "Heavy",
    "Social Drinker", "Binge Drinker", "Weekend Drinker", "Rare Drinker", "Alcohol Dependent"
]
# Exercise_Level column.
exercise_options = [
    "None", "Light", "Moderate", "Regular", "Intense",
    "Yoga Only", "Walking Daily", "Gym 2-3 times/week", "Athlete", "Sedentary"
]

# Existing_Conditions column. Multi-condition entries are single
# comma-separated strings, not lists.
conditions_options = [
    "None", "Hypertension", "Diabetes", "Heart Disease", "Kidney Disease",
    "Hypertension, Diabetes", "Hypertension, Heart Disease", "Diabetes, Kidney Disease",
    "Hypertension, Kidney Disease", "Heart Disease, Kidney Disease",
    "Asthma", "Obesity", "COPD", "Thyroid Disorder", "Cancer History",
    "Hypertension, Obesity", "Diabetes, Obesity", "Hypertension, Diabetes, Obesity",
    "Autoimmune Disease", "Arthritis"
]
# Medications column. Sampled independently of Existing_Conditions, so a
# patient's medication need not match their condition.
medications_options = [
    "None", "BP Meds", "Insulin", "Aspirin", "BP Meds, Insulin",
    "Cholesterol Meds", "Blood Thinners", "Diuretics", "Beta Blockers", "ACE Inhibitors",
    "Metformin", "Statins", "Painkillers (NSAIDs)", "Thyroid Meds", "Antidepressants",
    "Hypertension Combo Therapy", "Diabetes Combo Therapy", "Heart Disease Drugs",
    "Kidney Disease Drugs", "Anticoagulants"
]

# Recent_Complaints column.
complaints_options = [
    "None", "Chest Pain", "Dizziness", "Fatigue", "Headache",
    "Shortness of Breath", "Palpitations", "Swelling in Legs", "Blurred Vision", "Nausea",
    "Chest Tightness", "Fainting", "Sudden Weakness", "Numbness in Limbs", "Back Pain",
    "Joint Pain", "Abdominal Pain", "Insomnia", "Frequent Urination", "Rapid Weight Loss"
]

# Generate NUM_RECORDS synthetic patient records. Each record is a list of
# field values in the same order as the DataFrame columns defined below. Every
# field is drawn independently of the others.
NUM_RECORDS = 30000
RANDOM_SEED = 42

# Seed the global RNG so "Restart & Run All" regenerates the same dataset.
random.seed(RANDOM_SEED)

# Zero-pad IDs to the width of the largest record number (never fewer than the
# original 3 digits) so all Patient_IDs have equal length and sort correctly.
id_width = max(3, len(str(NUM_RECORDS)))

records = []
for i in range(1, NUM_RECORDS + 1):
    gender = random.choice(["Male", "Female"])
    # The first name comes from the pool that matches the sampled gender.
    first_name = random.choice(male_names if gender == "Male" else female_names)
    last_name = random.choice(last_names)
    patient_id = f"SHC{i:0{id_width}d}"
    age = random.randint(25, 75)
    family_history = random.choice(["Yes", "No"])
    bp_systolic = random.randint(110, 180)
    bp_diastolic = random.randint(70, 110)
    blood_sugar = random.randint(80, 250)
    cholesterol = random.randint(140, 300)
    bmi = round(random.uniform(18.5, 35.0), 1)
    smoking_status = random.choice(smoking_status_options)
    alcohol_consumption = random.choice(alcohol_options)
    exercise_level = random.choice(exercise_options)
    existing_conditions = random.choice(conditions_options)
    medications = random.choice(medications_options)
    heart_rate = random.randint(60, 100)
    ecg_abnormality = random.choice(["Yes", "No"])
    recent_complaints = random.choice(complaints_options)
    hospital_visits = random.randint(0, 5)

    records.append([
        patient_id, first_name, last_name, age, gender, family_history, bp_systolic, bp_diastolic,
        blood_sugar, cholesterol, bmi, smoking_status, alcohol_consumption, exercise_level,
        existing_conditions, medications, heart_rate, ecg_abnormality, recent_complaints, hospital_visits
    ])

# Column labels, listed in the same order as the values inside each record.
columns = [
    # identity
    "Patient_ID", "First_Name", "Last_Name",
    # demographics / history
    "Age", "Gender", "Family_History",
    # vitals and labs
    "BP_Systolic", "BP_Diastolic", "Blood_Sugar", "Cholesterol", "BMI",
    # lifestyle
    "Smoking_Status", "Alcohol_Consumption", "Exercise_Level",
    # clinical context
    "Existing_Conditions", "Medications", "Heart_Rate", "ECG_Abnormality",
    "Recent_Complaints", "Hospital_Visits",
]
df = pd.DataFrame(data=records, columns=columns)

# Output location: a bare file name, i.e. the current working directory.
csv_path = "patient_health_data.csv"

# Only create a parent directory when the path actually names one; for a bare
# file name dirname() is empty and there is nothing to create.
target_dir = os.path.dirname(csv_path)
if target_dir != "":
    os.makedirs(target_dir, exist_ok=True)

# Write without the index so the CSV holds exactly the columns above.
df.to_csv(path_or_buf=csv_path, index=False)

# Last expression of the cell: display where the file was written.
csv_path


'patient_health_data.csv'

In [None]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/3.6 MB[0m [31m8.8 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━[0m [32m2.7/3.6 MB[0m [31m40.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightgbm
Successfully installed lightgbm-4.6.0


In [None]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import random

# Load Dataset
# keep_default_na=False: several categorical columns use the literal string
# "None" as a real category. pandas' default NA markers include "None"
# (pandas >= 2.0), which would silently turn that category into NaN and make
# label_encoders[col].transform(['None']) fail later on.
df = pd.read_csv('patient_health_data.csv', keep_default_na=False)

# Encoding categorical features: each column is replaced in place by integer
# codes, and the fitted encoder is kept so new inputs can be encoded the same way.
label_encoders = {}
categorical_columns = ['Gender', 'Family_History', 'Smoking_Status', 'Alcohol_Consumption', 'Exercise_Level', 'Existing_Conditions', 'Medications', 'ECG_Abnormality', 'Recent_Complaints']

for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Feature matrix: everything except the identifier / name columns. It is taken
# before any target column is added to df, so targets cannot leak into X.
X = df.drop(columns=['Patient_ID', 'First_Name', 'Last_Name'])

# Rule-based risk label with random label noise, used to build dummy targets.
def generate_target_risk_with_noise(row):
    """Return 'Low', 'Medium' or 'High' for one patient row.

    The clean label follows fixed thresholds on systolic blood pressure,
    cholesterol and heart rate. With probability 0.1 the clean label is
    discarded and a label is drawn uniformly from the three classes instead
    (which may happen to equal the clean one).

    Parameters:
        row: mapping-like object supporting ``row['BP_Systolic']``,
            ``row['Cholesterol']`` and ``row['Heart_Rate']``.

    Returns:
        str: one of 'Low', 'Medium', 'High'.
    """

    def _rule_based_label():
        # Any single strongly elevated reading is enough for 'High'.
        if row['BP_Systolic'] > 150 or row['Cholesterol'] > 240 or row['Heart_Rate'] > 95:
            return 'High'
        # Moderately elevated blood pressure or cholesterol gives 'Medium'.
        if row['BP_Systolic'] > 130 or row['Cholesterol'] > 200:
            return 'Medium'
        return 'Low'

    clean_label = _rule_based_label()

    # One RNG draw decides whether this label is replaced (10% of the time).
    label_is_replaced = random.random() < 0.1
    if not label_is_replaced:
        return clean_label
    return random.choice(['Low', 'Medium', 'High'])

# Creating dummy targets based on simple rules with noise.
# All four targets use the same rule; they differ only through the independent
# noise draws made for each column.
risk_targets = ['Heart_Attack_Risk', 'Stroke_Risk', 'Diabetic_Complications_Risk', 'Hypertension_Crisis_Risk']
for risk in risk_targets:
    df[risk] = df.apply(generate_target_risk_with_noise, axis=1)

# LightGBM settings shared by every model (loop-invariant, so defined once).
params = {
    'objective': 'multiclass',
    'num_class': 3,
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'max_depth': 6,
    'num_leaves': 31,
    'verbose': -1
}

# Training a model for each risk.
# models maps target name -> (trained Booster, LabelEncoder for that target).
models = {}
for risk in risk_targets:
    y = df[risk]
    le_target = LabelEncoder()
    y_encoded = le_target.fit_transform(y)

    # Same 80/20 hold-out split as before, so the reported test rows are unchanged.
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

    # Carve a validation set out of the training portion (60/20/20 overall).
    # Early stopping must not look at the test set: choosing the number of
    # boosting rounds on the rows that are then reported makes the
    # classification report optimistic.
    X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

    train_data = lgb.Dataset(X_fit, label=y_fit)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    model = lgb.train(
        params,
        train_data,
        valid_sets=[val_data],
        num_boost_round=100,
        callbacks=[lgb.early_stopping(stopping_rounds=10)],  # stop once validation loss stalls
    )

    models[risk] = (model, le_target)

    # predict() returns one probability per class for each row; take the most likely class.
    y_pred = model.predict(X_test)
    y_pred_classes = y_pred.argmax(axis=1)

    print(f"\nClassification Report for {risk}:")
    print(classification_report(y_test, y_pred_classes, target_names=le_target.classes_))

print("Training Complete.")

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[63]	valid_0's multi_logloss: 0.290026

Classification Report for Heart_Attack_Risk:
              precision    recall  f1-score   support

        High       0.94      0.98      0.96      3915
         Low       0.94      0.76      0.84       723
      Medium       0.93      0.90      0.91      1362

    accuracy                           0.94      6000
   macro avg       0.93      0.88      0.90      6000
weighted avg       0.93      0.94      0.93      6000

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[66]	valid_0's multi_logloss: 0.283176

Classification Report for Stroke_Risk:
              precision    recall  f1-score   support

        High       0.94      0.98      0.96      3908
         Low       0.93      0.80      0.86       679
      Medium       0.94      0.88      0.91      1413

    accuracy                           0.94  

In [None]:
import pandas as pd
#ENTRY 1

# Example patient, written with human-readable category labels. Categorical
# fields are encoded below with the encoders fitted during training.
raw_patient = {
    'Age': 55,
    'Gender': 'Male',
    'Family_History': 'Yes',
    'BP_Systolic': 160,
    'BP_Diastolic': 95,
    'Blood_Sugar': 180,
    'Cholesterol': 250,
    'BMI': 28,
    'Smoking_Status': 'Heavy Smoker',
    'Alcohol_Consumption': 'Occasional',
    'Exercise_Level': 'Regular',
    'Existing_Conditions': 'Hypertension',
    'Medications': 'BP Meds',
    'Heart_Rate': 90,
    'ECG_Abnormality': 'Yes',
    'Recent_Complaints': 'Chest Pain',
    'Hospital_Visits': 0,
}

# One-row frame: columns that have a fitted LabelEncoder are converted to their
# integer code, all other values are passed through unchanged.
custom_input = pd.DataFrame({
    column: [label_encoders[column].transform([value])[0] if column in label_encoders else value]
    for column, value in raw_patient.items()
})

for risk in ['Heart_Attack_Risk', 'Stroke_Risk', 'Diabetic_Complications_Risk', 'Hypertension_Crisis_Risk']:
    model, le_target = models[risk]  # Trained model and LabelEncoder for this target

    # predict() does not match DataFrame columns by name by default, so a
    # reordered or missing column would silently feed the wrong feature.
    # Selecting by the model's own feature names fixes the order and raises a
    # KeyError that names any missing feature.
    model_input = custom_input[model.feature_name()]

    prediction_probs = model.predict(model_input)  # Probabilities for each class
    predicted_class_index = prediction_probs[0].argmax()  # Index of the highest probability
    predicted_class_label = le_target.inverse_transform([predicted_class_index])[0]  # Back to 'Low/Medium/High'

    print(f"{risk}: {predicted_class_label}")

Heart_Attack_Risk: High
Stroke_Risk: High
Diabetic_Complications_Risk: High
Hypertension_Crisis_Risk: High


In [None]:
import pandas as pd
import random
# MAKING ANOMALOUS Dataset

# Load Dataset
# keep_default_na=False: the literal category "None" is among pandas' default
# NA markers (pandas >= 2.0). Without this it would be read as NaN and then
# written to the anomalous CSV as an empty cell.
df = pd.read_csv('patient_health_data.csv', keep_default_na=False)

# Function to inject anomalies into the dataset
def inject_anomalies(df, anomaly_fraction=0.03, seed=None):
    """Return a copy of ``df`` in which a random subset of rows is corrupted.

    Each selected row receives one of four corruptions, chosen with equal
    probability: extreme vitals, an arbitrary BMI, very high systolic pressure
    combined with a blood-pressure medication, or an active exercise level
    combined with a very high BMI. ``df`` itself is not modified.

    Parameters:
        df: DataFrame containing the columns 'BP_Systolic', 'BP_Diastolic',
            'Heart_Rate', 'BMI', 'Medications' and 'Exercise_Level'.
        anomaly_fraction: share of rows to corrupt, between 0 and 1. The count
            is ``int(len(df) * anomaly_fraction)``.
        seed: optional seed for a private RNG. When None (the default) the
            global ``random`` module is used, as before.

    Returns:
        DataFrame with the same index and columns as ``df``.

    Raises:
        ValueError: if ``anomaly_fraction`` is outside [0, 1].
    """
    if not 0 <= anomaly_fraction <= 1:
        raise ValueError("anomaly_fraction must be between 0 and 1")

    rng = random if seed is None else random.Random(seed)

    df_anomaly = df.copy()

    # Rows are sampled by position, but .loc addresses rows by label. On a frame
    # whose index is not 0..n-1, assigning through .loc with a missing label
    # silently appends a new row. Work on a positional index and restore the
    # caller's index before returning.
    original_index = df_anomaly.index
    df_anomaly.index = pd.RangeIndex(len(df_anomaly))

    num_anomalies = int(len(df) * anomaly_fraction)
    anomaly_indices = rng.sample(range(len(df)), num_anomalies)

    for idx in anomaly_indices:
        choice = rng.random()
        if choice < 0.25:  # Extreme vitals
            df_anomaly.loc[idx, 'BP_Systolic'] = rng.randint(220, 300)
            df_anomaly.loc[idx, 'BP_Diastolic'] = rng.randint(130, 180)
            df_anomaly.loc[idx, 'Heart_Rate'] = rng.randint(180, 250)
        elif choice < 0.5:  # Illogical BMI
            df_anomaly.loc[idx, 'BMI'] = rng.uniform(5, 60)
        elif choice < 0.75:  # Medication & BP mismatch
            df_anomaly.loc[idx, 'BP_Systolic'] = rng.randint(180, 250)
            df_anomaly.loc[idx, 'Medications'] = rng.choice(['BP Meds', 'BP Meds, Insulin'])
        else:  # Contradictory Lifestyle vs. Obesity
            df_anomaly.loc[idx, 'Exercise_Level'] = rng.choice(['Moderate', 'Regular'])
            df_anomaly.loc[idx, 'BMI'] = rng.uniform(40, 60)

    df_anomaly.index = original_index
    return df_anomaly

# Corrupt 3% of the rows of the clean dataset; df itself is left untouched
# because the injection works on a copy.
df_anomalous = inject_anomalies(df, 0.03)

# Persist the corrupted copy (index omitted) for the detection cells below.
df_anomalous.to_csv(path_or_buf='anomalous.csv', index=False)

print("Anomalous dataset saved as 'anomalous.csv'")


Anomalous dataset saved as 'anomalous.csv'


In [None]:
import pandas as pd
import numpy as np
from scipy.stats import multivariate_normal

# Load Anomalous Dataset
# keep_default_na=False keeps the literal category "None" as text instead of
# letting pandas read it as NaN.
df_anomalous = pd.read_csv('anomalous.csv', keep_default_na=False)

# Drop non-numeric fields
X_gaussian = df_anomalous.select_dtypes(include=['float64', 'int64']).copy()

# Fit Gaussian Distribution (one multivariate normal over all numeric columns)
mean = X_gaussian.mean(axis=0)
cov = np.cov(X_gaussian, rowvar=False)

# Calculate probability density for each data point
pdf_values = multivariate_normal(mean=mean, cov=cov).pdf(X_gaussian)

# Set epsilon threshold (adjust as needed based on data distribution)
epsilon = np.percentile(pdf_values, 3)  # lowest-density 3% are flagged

# Mark anomalies
df_anomalous['Anomaly'] = np.where(pdf_values < epsilon, 'Anomaly', 'Normal')

# Save the results with anomalies marked
df_anomalous.to_csv('anomalous_with_predictions.csv', index=False)

# Display some detected anomalies
anomalies_detected = df_anomalous[df_anomalous['Anomaly'] == 'Anomaly']
print(anomalies_detected.head())

# Ground truth: anomalous.csv does not record which rows were corrupted, so
# recover them by comparing every numeric column with the clean source file.
# Assumes anomalous.csv was produced from patient_health_data.csv with the rows
# in the same order.
df_clean = pd.read_csv('patient_health_data.csv', keep_default_na=False)
if len(df_clean) != len(df_anomalous):
    raise ValueError("patient_health_data.csv and anomalous.csv have different row counts")

feature_cols = list(X_gaussian.columns)
# isclose rather than == so harmless float round-tripping through CSV is not
# mistaken for an injected change.
is_injected = ~np.isclose(
    X_gaussian.to_numpy(dtype=float),
    df_clean[feature_cols].to_numpy(dtype=float),
).all(axis=1)
is_flagged = (df_anomalous['Anomaly'] == 'Anomaly').to_numpy()

actual_anomalies = int(is_injected.sum())
detected_anomalies = len(anomalies_detected)
true_positives = int((is_flagged & is_injected).sum())

print(f"Expected Anomalies: {actual_anomalies}")
print(f"Detected Anomalies: {detected_anomalies}")

# precision = TP / flagged rows, recall = TP / injected rows.
# The previous formulas used the raw detected count in place of TP, which
# reduced "precision" to detected / total rows.
precision = true_positives / detected_anomalies if detected_anomalies else 0.0
recall = true_positives / actual_anomalies if actual_anomalies else 0.0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1_score:.2f}")

print("Gaussian anomaly detection complete. Results saved as 'anomalous_with_predictions.csv'")


    Patient_ID First_Name  Last_Name  Age  Gender Family_History  BP_Systolic  \
24      SHC025      Seema  Choudhary   58  Female             No          228   
128     SHC129     Pankaj     Tiwari   30    Male             No          171   
159     SHC160       Usha      Dutta   65  Female             No          126   
176     SHC177     Shilpa       Nair   59  Female             No          254   
320     SHC321    Pallavi     Bansal   38  Female             No          138   

     BP_Diastolic  Blood_Sugar  Cholesterol  ...    Smoking_Status  \
24             91          171          247  ...  Vaper (Nicotine)   
128           104           80          264  ...   Moderate Smoker   
159           101          181          184  ...     Social Smoker   
176           141          150          150  ...     Social Smoker   
320           105          162          158  ...  Vaper (Nicotine)   

    Alcohol_Consumption      Exercise_Level     Existing_Conditions  \
24           Occasion

In [None]:
import pandas as pd
from sklearn.ensemble import IsolationForest

# Load Anomalous Dataset
# keep_default_na=False keeps the literal category "None" as text instead of
# letting pandas read it as NaN.
df_anomalous = pd.read_csv('anomalous.csv', keep_default_na=False)

import numpy as np
# Kept from the original cell: seeds NumPy's global RNG. Nothing below draws
# from it any more, because the labels are now derived rather than randomised.
np.random.seed(42)
num_rows = len(df_anomalous)

# Drop non-numeric fields for Isolation Forest.
# This must happen BEFORE the label column is added: True_Anomaly is numeric,
# so adding it first would hand the ground truth to the model as a feature.
X_iforest = df_anomalous.select_dtypes(include=['float64', 'int64']).copy()

# Ground truth: the rows that actually differ from the clean source file in any
# numeric column. (Previously the labels were assigned at random and had no
# relation to the injected anomalies.) Assumes anomalous.csv was produced from
# patient_health_data.csv with the rows in the same order.
df_clean = pd.read_csv('patient_health_data.csv', keep_default_na=False)
if len(df_clean) != num_rows:
    raise ValueError("patient_health_data.csv and anomalous.csv have different row counts")

feature_cols = list(X_iforest.columns)
# isclose rather than == so float round-tripping through CSV is not counted as a change.
rows_unchanged = np.isclose(
    X_iforest.to_numpy(dtype=float),
    df_clean[feature_cols].to_numpy(dtype=float),
).all(axis=1)
true_anomalies = (~rows_unchanged).astype(float)  # 1.0 = injected, 0.0 = untouched
num_anomalies = int(true_anomalies.sum())

# Add True_Anomaly column to the dataframe
df_anomalous['True_Anomaly'] = true_anomalies

# Train Isolation Forest
# NOTE(review): contamination=0.0382 looks hand-tuned; the injection rate used
# when building anomalous.csv is 0.03 - confirm the intended value.
model_iforest = IsolationForest(n_estimators=100, contamination=0.0382, random_state=42)
model_iforest.fit(X_iforest)

# Predict Anomalies (1 = inlier, -1 = outlier)
anomaly_preds = model_iforest.predict(X_iforest)

# Add Anomaly Column to DataFrame
df_anomalous['Anomaly'] = anomaly_preds

# Map Anomalies to Human-readable Labels
df_anomalous['Anomaly'] = df_anomalous['Anomaly'].map({1: 'Normal', -1: 'Anomaly'})

# Save the results with anomalies marked
df_anomalous.to_csv('anomalous_with_predictions.csv', index=False)

# Display some detected anomalies
anomalies_detected = df_anomalous[df_anomalous['Anomaly'] == 'Anomaly']
print(anomalies_detected.head())

# Evaluate accuracy using True_Anomaly for comparison
detected_anomalies = len(anomalies_detected)

# Confusion counts
is_flagged = df_anomalous['Anomaly'] == 'Anomaly'
is_injected = df_anomalous['True_Anomaly'] == 1
true_positives = int((is_flagged & is_injected).sum())
false_positives = int((is_flagged & ~is_injected).sum())
false_negatives = int((~is_flagged & is_injected).sum())

print(f"Detected Anomalies: {detected_anomalies}")
print(f"True Positives: {true_positives}")
print(f"False Positives: {false_positives}")
print(f"False Negatives: {false_negatives}")

# Precision, Recall, and F1 Score
# Both ratios are built on true positives; the previous version used the total
# detected count as the numerator, which overstates both metrics.
precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) else 0.0
recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) else 0.0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1_score:.2f}")

print("Isolation Forest anomaly detection complete. Results saved as 'anomalous_with_predictions.csv'")


    Patient_ID First_Name   Last_Name  Age  Gender Family_History  \
34      SHC035      Anita      Chopra   61  Female             No   
173     SHC174      Tanvi      Bansal   65  Female             No   
176     SHC177     Shilpa        Nair   59  Female             No   
216     SHC217      Megha  Srivastava   27  Female            Yes   
217     SHC218      Divya       Vijay   66  Female            Yes   

     BP_Systolic  BP_Diastolic  Blood_Sugar  Cholesterol  ...  \
34           140           102          118          246  ...   
173          110           103           90          296  ...   
176          254           141          150          150  ...   
216          169            89          195          251  ...   
217          135            91          192          224  ...   

     Alcohol_Consumption      Exercise_Level              Existing_Conditions  \
34       Weekend Drinker           Sedentary         Diabetes, Kidney Disease   
173           Occasional  Gym 2-