<a href="https://colab.research.google.com/github/kushiraj18/DATA-ANALYSIS-USING-PYTHON/blob/main/2203A52030_DAUP_LAB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from scipy.stats import norm, ttest_ind

# Load dataset
data = pd.read_csv('/content/diabetes_data_upload.csv')

# Preprocessing (Encoding categorical variables if necessary)
data = pd.get_dummies(data, drop_first=True)

# Split dataset into features and labels
X = data.drop(columns=['class_Positive'])
y = data['class_Positive']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train models
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Gradient Boosting': GradientBoostingClassifier()
}

results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    fpr = fp / (fp + tn)
    fnr = fn / (fn + tp)
    results[name] = {'Accuracy': acc, 'Precision': prec, 'Recall': rec, 'FPR': fpr, 'FNR': fnr}

# Convert results to DataFrame
results_df = pd.DataFrame(results).T
print(results_df)

# Identify model with lowest FNR
best_model_fnr = results_df['FNR'].idxmin()
print(f'Model with lowest FNR (Type II Error): {best_model_fnr}')

# Z-Test: Mean age of correctly classified vs. misclassified diabetic patients
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)
misclassified = X_test[y_test != y_pred_log]
correctly_classified = X_test[y_test == y_pred_log]

age_misclassified = misclassified[:, 0]
age_correct = correctly_classified[:, 0]

z_stat, p_value = ttest_ind(age_correct, age_misclassified, equal_var=False)
print(f'Z-Test for mean age difference: Z-Statistic = {z_stat}, P-Value = {p_value}')

# Check if FPR for Random Forest is > 20%
rf_fpr = results_df.loc['Random Forest', 'FPR']
if rf_fpr > 0.2:
    pop_mean = 0.2
    n = len(y_test)
    se = np.sqrt((pop_mean * (1 - pop_mean)) / n)
    z_stat_fpr = (rf_fpr - pop_mean) / se
    p_value_fpr = 2 * (1 - norm.cdf(abs(z_stat_fpr)))
    print(f'Z-Test for FPR > 20%: Z-Statistic = {z_stat_fpr}, P-Value = {p_value_fpr}')

# Compare FNRs of SVM, KNN, and Logistic Regression
fnr_svm = results_df.loc['SVM', 'FNR']
fnr_knn = results_df.loc['KNN', 'FNR']
n_svm = y_test.shape[0]
n_knn = y_test.shape[0]
se_fnr = np.sqrt((fnr_svm * (1 - fnr_svm) / n_svm) + (fnr_knn * (1 - fnr_knn) / n_knn))
z_stat_fnr = (fnr_svm - fnr_knn) / se_fnr
p_value_fnr = 2 * (1 - norm.cdf(abs(z_stat_fnr)))
print(f'Z-Test for FNR (SVM vs. KNN): Z-Statistic = {z_stat_fnr}, P-Value = {p_value_fnr}')

                     Accuracy  Precision    Recall    FPR       FNR
Logistic Regression  0.942308   0.983333  0.921875  0.025  0.078125
Decision Tree        0.980769   1.000000  0.968750  0.000  0.031250
Random Forest        0.980769   0.984375  0.984375  0.025  0.015625
SVM                  0.980769   0.984375  0.984375  0.025  0.015625
KNN                  0.932692   0.983051  0.906250  0.025  0.093750
Gradient Boosting    0.990385   1.000000  0.984375  0.000  0.015625
Model with lowest FNR (Type II Error): Random Forest
Z-Test for mean age difference: Z-Statistic = 1.4810358680954627, P-Value = 0.18603116427333352
Z-Test for FNR (SVM vs. KNN): Z-Statistic = -2.5151608424234704, P-Value = 0.011897809031118811
