In [None]:
# Q1. Import the dataset and examine the variables

# To begin, we will load the dataset and explore it using descriptive statistics and visualizations. This helps us understand the distribution of the data and relationships between the variables.

# Code Example:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the CSV into a DataFrame
df = pd.read_csv('diabetes.csv')

# Preview the leading rows
print(df.head())

# Per-column summary statistics (count, mean, std, quartiles, min/max)
print(df.describe())

# One histogram per column to inspect each distribution
df.hist(figsize=(12, 10))
plt.tight_layout()
plt.show()

# Pairwise correlations between columns, annotated to two decimals
corr = df.corr()
sns.heatmap(data=corr, cmap='coolwarm', annot=True, fmt='.2f')
plt.show()

# Class balance of the target column (Outcome)
sns.countplot(data=df, x='Outcome')
plt.show()


In [None]:
# Q2. Preprocess the data
# Before training the model, we need to preprocess the data:

# Handle missing values: Some columns may have missing values that need to be filled or removed.
# Remove outliers: Outliers may affect model performance.
# Transform categorical variables: In this dataset, the "Outcome" variable is the target, but we may need to handle other categorical features if present (none in this case).

# Check for missing values
# NOTE(review): if this is the Pima Indians diabetes data, missing measurements
# are presumably encoded as 0 rather than NaN, in which case this check reports
# nothing — confirm against the data source.
print(df.isnull().sum())

# A row without a label cannot be used for supervised learning, and mean-filling
# the target would turn it into a fractional class value, so drop such rows.
df = df.dropna(subset=['Outcome'])

# Impute missing feature values with the column mean (target excluded).
# NOTE(review): the means are computed on the full dataset, i.e. before any
# train/test split, so test rows influence the imputed values.
feature_means = df.drop(columns='Outcome').mean(numeric_only=True)
df = df.fillna(feature_means)

# Check for outliers using boxplots
sns.boxplot(x=df['Glucose'])
plt.show()

# Remove outliers (example: keep rows strictly below the 95th percentile of Glucose).
# The cut-off is derived from the current frame, so re-running this cell trims
# the data again; .copy() detaches the result from the filtered-out parent frame.
glucose_cutoff = df['Glucose'].quantile(0.95)
df = df[df['Glucose'] < glucose_cutoff].copy()

# Check the data types
print(df.dtypes)

# All variables are expected to be numeric here — verify in the dtypes output above


In [None]:
# Q3. Split the dataset into training and test sets
# We will split the dataset into a training set and a test set to train and evaluate the model. Using a random seed ensures reproducibility.

from sklearn.model_selection import train_test_split

# Features (every column except the label) and target
X = df.drop(columns='Outcome')
y = df['Outcome']

# Hold out 20% of the rows for testing. stratify=y keeps the Outcome class
# proportions the same in both partitions, so the test metrics are not skewed
# by an unlucky split; random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
# Q4. Train a decision tree model
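# We will use a decision tree algorithm in the spirit of ID3 or C4.5. Note that scikit-learn’s DecisionTreeClassifier implements an optimized version of CART rather than ID3/C4.5; setting criterion='entropy' makes it choose splits by information gain, as those algorithms do. We will perform cross-validation to optimize the model's hyperparameters.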

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Base estimator; the fixed seed keeps tie-breaking between splits reproducible
dt = DecisionTreeClassifier(random_state=42)

# Candidate hyperparameter values explored by the grid search
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[3, 5, 7, 10, None],
    min_samples_leaf=[1, 2, 4],
    min_samples_split=[2, 5, 10],
)

# Exhaustive 5-fold cross-validated search over the grid, using all CPU cores
grid_search = GridSearchCV(dt, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Winning hyperparameter combination
print(grid_search.best_params_)

# With the default refit=True the search has already retrained the best
# configuration on the whole training set; just pick that fitted model up.
best_dt = grid_search.best_estimator_


In [None]:
# Q5. Evaluate the performance of the decision tree
# After training the model, we evaluate its performance on the test set using metrics like accuracy, precision, recall, and F1 score. We will also use a confusion matrix and ROC curve to visualize the results.

# Code Example:

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, roc_curve

# Make predictions on the test set: hard class labels for the threshold-based
# metrics, and positive-class probabilities for the ranking metrics (ROC / AUC)
y_pred = best_dt.predict(X_test)
y_proba = best_dt.predict_proba(X_test)[:, 1]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# AUC must be computed from scores, not hard labels; otherwise it collapses to
# a single operating point and disagrees with the ROC curve plotted below.
roc_auc = roc_auc_score(y_test, y_proba)

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'ROC AUC: {roc_auc:.2f}')

# Confusion Matrix (rows are the true classes, columns the predicted classes)
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Non-Diabetic', 'Diabetic'], yticklabels=['Non-Diabetic', 'Diabetic'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# ROC Curve, built from the same probabilities as the AUC above
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--', label='Chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')  # without this the curve labels are never displayed
plt.show()


In [None]:
# Q6. Interpret the decision tree
# After training the decision tree, we can visualize the tree to understand the splits, branches, and leaves. We can also determine the most important features.

# Code Example:

from sklearn.tree import plot_tree

# plot_tree validates feature_names as a plain list in some scikit-learn
# releases and rejects a pandas Index, so convert the column labels up front.
feature_names = list(X.columns)

# Visualize the trained decision tree
plt.figure(figsize=(15, 10))
plot_tree(best_dt, filled=True, feature_names=feature_names, class_names=['Non-Diabetic', 'Diabetic'], rounded=True)
plt.show()

# Feature importance, listed from most to least influential
importances = best_dt.feature_importances_
feature_importance = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
print(feature_importance.sort_values(by='Importance', ascending=False))


In [None]:
# Q7. Validate the decision tree model
# Finally, we can validate the model by applying it to new data or testing its robustness to changes in the dataset or environment. Sensitivity analysis can also be used to check how the model responds to small changes in input data.

# Code Example:

# Sensitivity analysis: build a perturbed copy of the test features in which
# every Glucose value is shifted up by 5; X_test itself is left untouched
X_test_modified = X_test.assign(Glucose=X_test['Glucose'] + 5)

# Predict on the perturbed inputs and score against the unchanged labels
y_pred_modified = best_dt.predict(X_test_modified)
accuracy_modified = accuracy_score(y_test, y_pred_modified)
print(f'Accuracy on modified test set: {accuracy_modified:.2f}')
