In [14]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import matplotlib.pyplot as plt

In [15]:
# Step 1: Load the Pima Indians Diabetes Dataset
column_names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 
               'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']

# Load the dataset
df = pd.read_csv('pima-indians-diabetes.csv', names=column_names)

print(f"Dataset shape: {df.shape}")
print("\nFirst 5 rows:")
print(df.head())
print("\nDataset info:")
print(df.info())
print("\nBasic statistics:")
print(df.describe())

Dataset shape: (768, 9)

First 5 rows:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 

In [16]:
# Step 2: Enlist the column names in your dataset 
print("Column names in the dataset:")
input_features = df.columns[:-1].tolist()  
target_column = df.columns[-1]

print("Input Features (8 features):")
for i, feature in enumerate(input_features, 1):
    print(f"{i}. {feature}")

print(f"\nTarget Variable: {target_column}")
print(f"\nTotal Input Features: {len(input_features)}")
print(f"Target Values Distribution:")
print(df['Outcome'].value_counts())
print(f"\nClass Balance:")
print(df['Outcome'].value_counts(normalize=True))

Column names in the dataset:
Input Features (8 features):
1. Pregnancies
2. Glucose
3. BloodPressure
4. SkinThickness
5. Insulin
6. BMI
7. DiabetesPedigreeFunction
8. Age

Target Variable: Outcome

Total Input Features: 8
Target Values Distribution:
Outcome
0    500
1    268
Name: count, dtype: int64

Class Balance:
Outcome
0    0.651042
1    0.348958
Name: proportion, dtype: float64


In [17]:
# Step 3: Extract 4 input features from the dataset
selected_features = ['Glucose', 'BMI', 'Age', 'Pregnancies']

print("Selected 4 input features:")
for i, feature in enumerate(selected_features, 1):
    print(f"{i}. {feature}")

X_selected = df[selected_features]
print(f"\nShape of selected features: {X_selected.shape}")
print("\nFirst 5 rows of selected features:")
print(X_selected.head())
print("\nStatistics for selected features:")
print(X_selected.describe())

Selected 4 input features:
1. Glucose
2. BMI
3. Age
4. Pregnancies

Shape of selected features: (768, 4)

First 5 rows of selected features:
   Glucose   BMI  Age  Pregnancies
0      148  33.6   50            6
1       85  26.6   31            1
2      183  23.3   32            8
3       89  28.1   21            1
4      137  43.1   33            0

Statistics for selected features:
          Glucose         BMI         Age  Pregnancies
count  768.000000  768.000000  768.000000   768.000000
mean   120.894531   31.992578   33.240885     3.845052
std     31.972618    7.884160   11.760232     3.369578
min      0.000000    0.000000   21.000000     0.000000
25%     99.000000   27.300000   24.000000     1.000000
50%    117.000000   32.000000   29.000000     3.000000
75%    140.250000   36.600000   41.000000     6.000000
max    199.000000   67.100000   81.000000    17.000000


In [18]:
# Step 4: Establish X and Y Matrix

X = X_selected.values  
Y = df['Outcome'].values  

print("X and Y matrices established:")
print(f"X shape (Features): {X.shape}")
print(f"Y shape (Target): {Y.shape}")
print(f"\nX (first 5 rows):")
print(X[:5])
print(f"\nY (first 10 values): {Y[:10]}")
print(f"\nTarget distribution:")
unique, counts = np.unique(Y, return_counts=True)
for val, count in zip(unique, counts):
    print(f"Class {val}: {count} samples ({count/len(Y)*100:.1f}%)")

X and Y matrices established:
X shape (Features): (768, 4)
Y shape (Target): (768,)

X (first 5 rows):
[[148.   33.6  50.    6. ]
 [ 85.   26.6  31.    1. ]
 [183.   23.3  32.    8. ]
 [ 89.   28.1  21.    1. ]
 [137.   43.1  33.    0. ]]

Y (first 10 values): [1 0 1 0 1 0 1 0 1 1]

Target distribution:
Class 0: 500 samples (65.1%)
Class 1: 268 samples (34.9%)


In [7]:
# Step 5: Perform 80:20 Data Split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)

print("Data split completed (80% train, 20% test)")
print(f"\nTraining set:")
print(f"X_train shape: {X_train.shape}")
print(f"Y_train shape: {Y_train.shape}")

print(f"\nTesting set:")
print(f"X_test shape: {X_test.shape}")
print(f"Y_test shape: {Y_test.shape}")

# Check class distribution in train and test sets
print(f"\nClass distribution in training set:")
unique_train, counts_train = np.unique(Y_train, return_counts=True)
for val, count in zip(unique_train, counts_train):
    print(f"Class {val}: {count} samples ({count/len(Y_train)*100:.1f}%)")

print(f"\nClass distribution in test set:")
unique_test, counts_test = np.unique(Y_test, return_counts=True)
for val, count in zip(unique_test, counts_test):
    print(f"Class {val}: {count} samples ({count/len(Y_test)*100:.1f}%)")

Data split completed (80% train, 20% test)

Training set:
X_train shape: (614, 4)
Y_train shape: (614,)

Testing set:
X_test shape: (154, 4)
Y_test shape: (154,)

Class distribution in training set:
Class 0: 400 samples (65.1%)
Class 1: 214 samples (34.9%)

Class distribution in test set:
Class 0: 100 samples (64.9%)
Class 1: 54 samples (35.1%)


In [19]:
# Step 7: Create a Sequential Neural Network with 4 layers

model = Sequential()

# Input layer (12 neurons, input_dim=4)
model.add(Dense(12, input_dim=4, activation='relu'))

# 1st Hidden Layer (8 neurons)
model.add(Dense(8, activation='relu'))

# 2nd Hidden Layer (4 neurons) 
model.add(Dense(4, activation='relu'))

# Output Layer (1 neuron, sigmoid activation for binary classification)
model.add(Dense(1, activation='sigmoid'))

print("Sequential Neural Network created successfully!")
print("\nModel Architecture:")
model.summary()

Sequential Neural Network created successfully!

Model Architecture:


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [20]:
# Step 8: Compile the sequential model
model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy'])

print("Model compiled successfully!")
print("\nCompilation Details:")
print(f"Optimizer: adam")
print(f"Loss function: binary_crossentropy")
print(f"Metrics: accuracy")

Model compiled successfully!

Compilation Details:
Optimizer: adam
Loss function: binary_crossentropy
Metrics: accuracy


In [21]:
# Step 9: Fit the model 
print("Training the model...")
print("Parameters: epochs=50, batch_size=10")
print("This may take a moment...")

history = model.fit(X_train, Y_train, 
                    epochs=50, 
                    batch_size=10, 
                    validation_data=(X_test, Y_test),
                    verbose=1)

print("\nModel training completed!")

Training the model...
Parameters: epochs=50, batch_size=10
This may take a moment...
Epoch 1/50
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6064 - loss: 4.9452 - val_accuracy: 0.5779 - val_loss: 0.6348
Epoch 2/50
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6064 - loss: 4.9452 - val_accuracy: 0.5779 - val_loss: 0.6348
Epoch 2/50
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6015 - loss: 0.6756 - val_accuracy: 0.6429 - val_loss: 0.6347
Epoch 3/50
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6015 - loss: 0.6756 - val_accuracy: 0.6429 - val_loss: 0.6347
Epoch 3/50
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6585 - loss: 0.6141 - val_accuracy: 0.5974 - val_loss: 0.6528
Epoch 4/50
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6585 - loss: 0.61

In [23]:
# Step 10: Evaluate the model displaying the accuracy
print("=== MODEL EVALUATION ===")

test_loss, test_accuracy = model.evaluate(X_test, Y_test, verbose=0)

print(f"\nTest Results:")
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")

train_loss, train_accuracy = model.evaluate(X_train, Y_train, verbose=0)

print(f"\nTraining Results:")
print(f"Training Loss: {train_loss:.4f}")
print(f"Training Accuracy: {train_accuracy:.4f} ({train_accuracy*100:.2f}%)")


=== MODEL EVALUATION ===

Test Results:
Test Loss: 0.5773
Test Accuracy: 0.6688 (66.88%)

Training Results:
Training Loss: 0.5777
Training Accuracy: 0.7166 (71.66%)


In [12]:
# Step 12: Make predictions utilizing the data in the lecture presentation
print("=== MAKING PREDICTIONS ===")

# Make predictions on the test set
predictions_prob = model.predict(X_test, verbose=0)
predictions_binary = (predictions_prob > 0.5).astype(int)

print(f"Predictions made on {len(X_test)} test samples")
print(f"\nFirst 10 predictions:")
print("Sample | Actual | Predicted | Probability | Correct?")
print("-" * 50)

for i in range(min(10, len(X_test))):
    actual = int(Y_test[i])
    predicted = int(predictions_binary[i][0])
    probability = float(predictions_prob[i][0])
    correct = "✓" if actual == predicted else "✗"
    print(f"{i+1:6d} | {actual:6d} | {predicted:9d} | {probability:11.3f} | {correct:7s}")

# Calculate additional metrics
from sklearn.metrics import classification_report, confusion_matrix
print(f"\n=== DETAILED CLASSIFICATION REPORT ===")
print(classification_report(Y_test, predictions_binary))

print(f"\n=== CONFUSION MATRIX ===")
cm = confusion_matrix(Y_test, predictions_binary)
print("Predicted:  0   1")
print(f"Actual 0: [{cm[0,0]:3d} {cm[0,1]:3d}]")
print(f"Actual 1: [{cm[1,0]:3d} {cm[1,1]:3d}]")

print(f"\n=== PREDICTION SUMMARY ===")
correct_predictions = np.sum(Y_test == predictions_binary.flatten())
total_predictions = len(Y_test)
accuracy = correct_predictions / total_predictions
print(f"Correct predictions: {correct_predictions}/{total_predictions}")
print(f"Final accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

=== MAKING PREDICTIONS ===
Predictions made on 154 test samples

First 10 predictions:
Sample | Actual | Predicted | Probability | Correct?
--------------------------------------------------
     1 |      0 |         0 |       0.467 | ✓      
     2 |      0 |         0 |       0.068 | ✓      
     3 |      0 |         0 |       0.207 | ✓      
     4 |      1 |         0 |       0.434 | ✗      
     5 |      0 |         0 |       0.467 | ✓      
     6 |      0 |         0 |       0.224 | ✓      
     7 |      1 |         0 |       0.243 | ✗      
     8 |      1 |         0 |       0.467 | ✗      
     9 |      0 |         0 |       0.221 | ✓      
    10 |      0 |         0 |       0.438 | ✓      

=== DETAILED CLASSIFICATION REPORT ===
              precision    recall  f1-score   support

           0       0.65      0.99      0.78       100
           1       0.00      0.00      0.00        54

    accuracy                           0.64       154
   macro avg       0.32      0.