In [None]:
#Q1.

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the diabetes dataset from the CSV file
data = pd.read_csv("diabetes.csv")

# Text previews: the leading rows first, then the per-column summary statistics
for preview in (data.head(), data.describe()):
    print(preview)

# Grid of pairwise feature plots, colored by Outcome, with KDE curves on the diagonal
sns.pairplot(data=data, hue="Outcome", diag_kind="kde")
plt.show()

# Annotated heatmap of the correlations between every pair of columns
correlation_matrix = data.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", ax=heatmap_ax)
plt.show()

In [None]:
#Q2.

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Reload the dataset from the CSV file
data = pd.read_csv("diabetes.csv")

# In these measurement columns, swap every 0 for NaN
columns_to_replace = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
zero_free = data[columns_to_replace].replace(to_replace=0, value=np.nan)
data[columns_to_replace] = zero_free

# One box per feature column, drawn side by side, to expose outliers
boxplot_columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]
plt.figure(figsize=(12, 6))
sns.boxplot(data=data[boxplot_columns], palette="Set2")
plt.xticks(rotation=45)  # tilt the column names so they do not overlap
plt.show()



In [None]:
#Q3.

from sklearn.model_selection import train_test_split

# Target (y) is the 'Outcome' column; features (X) are every other column
y = data['Outcome']
X = data.drop(columns=['Outcome'])

# Hold out 30% of the rows for testing and keep 70% for training.
# A fixed random_state pins the random seed, so the same split is produced on every run.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
#Q4.
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Decision tree whose maximum depth is tuned below; random_state=42 keeps runs reproducible
clf = DecisionTreeClassifier(random_state=42)

# Replace NaN values with the column mean
imputer = SimpleImputer(strategy='mean')

# Chain imputation and classification so cross_val_score fits both steps together on each fold
pipeline = Pipeline(steps=[('imputer', imputer), ('classifier', clf)])

# Candidate values for the tree's maximum depth (1 through 10)
param_grid = {
    'classifier__max_depth': range(1, 11)
}

# Mean 10-fold cross-validated accuracy on the training set, one entry per candidate depth
cv_scores = []

for max_depth in param_grid['classifier__max_depth']:
    pipeline.set_params(classifier__max_depth=max_depth)
    scores = cross_val_score(pipeline, X_train, y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())

# Depth with the highest mean accuracy; index() returns the first maximum,
# so a tie goes to the shallower tree
best_max_depth = param_grid['classifier__max_depth'][cv_scores.index(max(cv_scores))]

# Apply the selected depth to the classifier step inside the pipeline.
# The search loop leaves the pipeline at the last depth it tried, and rebinding `clf`
# to a new estimator would not change the step the pipeline holds, so the chosen
# value has to be set on the pipeline itself before the final fit.
pipeline.set_params(classifier__max_depth=best_max_depth)

# Fit the imputer and the depth-tuned tree on the full training set
pipeline.fit(X_train, y_train)

# `pipeline` is now trained with the selected max_depth and with missing values imputed.


In [None]:
#Q5.

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt

# Class predictions, plus column 1 of predict_proba, for the test rows
y_pred = pipeline.predict(X_test)
y_probs = pipeline.predict_proba(X_test)[:, 1]

# Scores derived from the class predictions
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# ROC curve coordinates and area under the curve, derived from the probabilities
fpr, tpr, thresholds = roc_curve(y_test, y_probs)
roc_auc = roc_auc_score(y_test, y_probs)

# --- Figure 1: confusion matrix (y axis = true class, x axis = predicted class) ---
classes = ['Non-Diabetic', 'Diabetic']
tick_marks = range(len(classes))

cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
cm_image = cm_ax.imshow(conf_matrix, interpolation='nearest', cmap=plt.get_cmap('Blues'))
cm_ax.set_title('Confusion Matrix')
cm_fig.colorbar(cm_image, ax=cm_ax)

cm_ax.set_xticks(tick_marks)
cm_ax.set_xticklabels(classes)
cm_ax.set_yticks(tick_marks)
cm_ax.set_yticklabels(classes)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('True')

# Write each count inside its cell: white text on the darker cells, black on the lighter ones
half_of_max = conf_matrix.max() / 2
for i in tick_marks:
    for j in tick_marks:
        count = conf_matrix[i, j]
        text_color = 'white' if count > half_of_max else 'black'
        cm_ax.text(j, i, str(count), horizontalalignment='center', color=text_color)

plt.show()

# --- Figure 2: ROC curve with the diagonal reference line ---
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve')
roc_ax.legend(loc='lower right')
plt.show()

# Report the scalar metrics, rounded to two decimals
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

#Q6.

Interpreting a decision tree involves examining the splits, branches, and leaves to understand how the model makes predictions. In this notebook, the decision tree is trained to predict whether a patient has diabetes based on clinical variables. To interpret the decision tree, we need to look at the splits and the most important variables along with their thresholds. The following points describe how to interpret the decision tree:

    Root Node: The root node represents the starting point of the decision tree. The variable and threshold used in the first split are the ones that best separate the classes across the entire training set, so the root feature is usually one of the most informative predictors.

    Splits and Branches: As you traverse down the tree, you encounter splits and branches. Each split represents a decision point based on a specific variable, and each branch corresponds to one of the possible outcomes of that decision. The splitting criterion can be Gini impurity, entropy, or another metric, and the algorithm selects the variable and threshold that best separate the data into different classes.

    Leaves: The terminal nodes of the tree are called leaves. Each leaf node is associated with a class label (0 for non-diabetic, 1 for diabetic). When a data point reaches a leaf, the model predicts the class label associated with that leaf.

    Thresholds: For each split, you can find the threshold value that separates the data. The threshold represents a critical value of the variable used in the split. For example, if the first split occurs on the "Glucose" variable with a threshold of 140, it means that patients with glucose levels greater than 140 will follow one branch, while those with levels less than or equal to 140 will follow the other branch.

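    Important Variables: To identify the most important variables, you can look at the top-level splits and consider the features that appear early in the decision tree. These are typically the variables that have the most significant impact on the prediction. For a quantitative ranking, scikit-learn exposes impurity-based importance scores through the fitted classifier's feature_importances_ attribute.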

    Domain Knowledge and Common Sense: To interpret the patterns and trends, it's essential to use your domain knowledge and common sense. For example, if the first split is on the "Glucose" level, it's reasonable to expect that high glucose levels are a strong predictor of diabetes. Similarly, if the second split is on "BMI," it suggests that the body mass index is another critical factor in diabetes prediction.

    Pruning: Decision trees can become quite complex, and not all splits may be meaningful. Pruning techniques can be used to simplify the tree and remove unnecessary branches, making the interpretation more straightforward.

It's essential to remember that decision trees are interpretable models, and understanding the splits, branches, and leaves can provide valuable insights into how the model is making predictions based on the given features.

#Q7.

Validating a decision tree model and testing its robustness to changes in the dataset or environment is a crucial step in the model development process. Here are some techniques, including sensitivity analysis and scenario testing, to explore uncertainty and risks:

    Holdout Testing: Split your data into a training set and a holdout test set. Train the decision tree model on the training set and evaluate its performance on the holdout test set. This provides an estimate of how well the model generalizes to new, unseen data.

    Cross-Validation: Use cross-validation techniques, such as k-fold cross-validation, to assess the model's performance more robustly. Cross-validation helps ensure that the model's performance is consistent across different subsets of the data.

    Sensitivity Analysis:
        Variable Importance: Analyze the importance of variables in the decision tree. Some libraries offer feature importance scores that can help you identify which features are most influential in making predictions.
        Threshold Sensitivity: Evaluate how changes in the threshold values for splits affect the model's performance. You can increase or decrease the thresholds used in the decision tree splits to assess its sensitivity to these changes.

    Scenario Testing:
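        Outliers: Test the model's robustness to outliers in the data. Introduce extreme values for certain variables and observe how the model responds. Tree splits depend on the ordering of feature values rather than their magnitude, so extreme values matter mainly when they shift a split threshold or lead the tree to isolate a few unusual points in their own leaves.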
        Missing Data: Introduce missing values in the dataset and assess how the model handles missing data. You may need to preprocess the data to handle missing values, such as imputation or removal.
        Feature Changes: Change the distribution of feature values to simulate different scenarios. For example, modify the distribution of glucose levels or age to see how the model's predictions change.
        Concept Drift: If the environment or data distribution is subject to change over time, periodically retest the model's performance on new data to detect and adapt to concept drift.

    Robustness Testing:
        Perturbations: Introduce small random perturbations to the input features to test the model's stability. This can help identify whether the model is overly sensitive to noise in the data.
        Model Variants: Compare the performance of the decision tree model to other models, such as random forests, gradient boosting, or support vector machines. Different models may have varying degrees of robustness.

    Threshold Tuning: Experiment with changing the decision thresholds to balance precision and recall. This can be especially important in healthcare applications where the cost of false positives and false negatives may differ.

    External Validation: If possible, obtain external datasets for validation. This can help ensure that the model performs consistently across different data sources and populations.

    Business Impact Analysis: Consider the real-world impact of model predictions. For a healthcare application, assess how the model's predictions may affect patient care, costs, and outcomes.

    Documentation and Monitoring: Continuously monitor the model's performance in production and document any observed deviations or issues. Implement feedback loops to retrain the model as needed.

    Regulatory Compliance: Ensure that the model complies with any relevant regulatory requirements, especially in healthcare where patient data and decisions can have significant legal implications.

By applying these techniques and conducting thorough validation and testing, you can better understand the performance and robustness of your decision tree model and mitigate potential risks and uncertainties.