<a href="https://colab.research.google.com/github/hasnaynm/Machine_Learning_Assignment/blob/main/pima_indians.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Loading Data & Preprocessing**

---


In [None]:
import numpy as np #handles numerical operations
import pandas as pd #handles data frames
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf


from sklearn.tree import DecisionTreeClassifier, plot_tree #for decidision tree
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split #splitting the dataset
from sklearn.preprocessing import StandardScaler #scaling features
from sklearn.metrics import accuracy_score, classification_report, f1_score, confusion_matrix #evaluation metrics
from sklearn.ensemble import RandomForestClassifier #random forest model
from sklearn.svm import SVC #support vector machine model
from tensorflow.keras.models import Sequential #building the neural network
from tensorflow.keras.layers import Dense, Flatten, Conv1D, Conv2D, Dropout #adding layers to the neural network


data = pd.read_csv('diabetes.csv')  # load the diabetes CSV into a DataFrame; the path is relative to the notebook's working directory



In [None]:
data.head()  # preview the first rows to sanity-check that the file loaded as expected

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
data.info()  # column names, non-null counts and dtypes for every column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numerical features.
# NOTE(review): the reported minimum is 0 for Glucose, BloodPressure, SkinThickness, Insulin and BMI;
# these zeros presumably stand in for missing measurements - confirm and consider imputing before modelling.
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [None]:
# Separate the label from the predictors.
y = data['Outcome']  # target: the Outcome column (the dependent variable)
X = data.drop(columns='Outcome')  # features: every column except Outcome

In [None]:
# Splits the data into 80% train and 20% test (test_size=0.2); random_state=42 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Standardise the features to zero mean and unit variance.
# The scaler learns its mean / standard deviation from the training split only, and the
# same transformation is then applied to both splits.
# NOTE(review): the Decision tree section re-splits X and rebinds X_train / X_test,
# so these scaled arrays are not what the tree-based models end up training on.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
#EDA from RF model
# Build a dedicated forest for this exploratory plot. The classifier used for evaluation is
# only created further down in the Random Forest section, so referencing `rf_model` here
# raised a NameError when the notebook was run top to bottom on a fresh kernel.
rf_eda_model = RandomForestClassifier(n_estimators=50, max_depth=4, random_state=42)
rf_eda_model.fit(X, y)  # exploratory fit on the full dataset; this model is not used for evaluation

importances = rf_eda_model.feature_importances_  # one importance score per feature column
indices = np.argsort(importances)[::-1]  # feature positions ordered from most to least important
feature_names = X.columns

plt.figure(figsize=(10, 6))
sns.barplot(x=importances[indices], y=feature_names[indices], palette='viridis')  # horizontal bars, most important first
plt.title('Feature Importance (Random Forest)')
plt.xlabel('Importance Score')
plt.ylabel('Features')
plt.show()

NameError: name 'rf_model' is not defined

# **Decision Tree**

In [None]:
# Re-split the raw (unscaled) features into 80% training and 20% testing.
# NOTE(review): this rebinds X_train / X_test, replacing the standardised arrays produced
# earlier, and uses random_state=30 rather than the 42 used in the first split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

In [None]:
# Shallow decision tree: Gini impurity as the split criterion, depth capped at 3, fixed seed.
dt_model = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=42)

In [None]:
dt_model.fit(X_train, y_train)  # train the tree on the training split

In [None]:
y_pred = dt_model.predict(X_test)  # predicted class labels for the held-out test split

In [None]:
# Visualise the fitted decision tree.
plt.figure(figsize=(12, 8))
plot_tree(
    dt_model,
    # Pass a plain list: some scikit-learn releases validate feature_names as a list
    # and reject a pandas Index.
    feature_names=list(X.columns),
    class_names=['No Diabetes', 'Diabetes'],
    filled=True,   # colour each node by its majority class
    rounded=True
)
plt.title('Decision Tree Visualization')
plt.show()

In [None]:
# Score the decision tree's predictions against the held-out test labels.
dt_accuracy = accuracy_score(y_test, y_pred)
dt_f1 = f1_score(y_test, y_pred)
dt_report = classification_report(y_test, y_pred)  # per-class precision, recall and F1

print("Accuracy:", dt_accuracy)
print("Decision Tree F1 Score:", dt_f1)
print(dt_report)

# **Random Forest Classifier**

---



In [None]:
#initialising classifier
# 50 trees in the forest (n_estimators=50), each limited to depth 4; random_state=42 is the seed for its randomness
rf_model = RandomForestClassifier(n_estimators=50, max_depth=4, random_state=42)

In [None]:
rf_model.fit(X_train, y_train)  # train the random forest on the training split

In [None]:
rf_preds = rf_model.predict(X_test)  # predicted class labels for the held-out test split

In [None]:
# Score the random forest's predictions against the held-out test labels.
rf_accuracy = accuracy_score(y_test, rf_preds)
rf_f1 = f1_score(y_test, rf_preds)
rf_report = classification_report(y_test, rf_preds)  # per-class precision, recall and F1

print("Random Forest Accuracy:", rf_accuracy)
print("Random Forest F1 Score:", rf_f1)
print(rf_report)

# **Convolutional Neural Network**

In [None]:
# Fetch the Pima Indians Diabetes dataset from OpenML as pandas objects.
# NOTE: this rebinds X and y, which earlier sections built from the local CSV.
pima = fetch_openml(name="diabetes", version=1, as_frame=True)
X = pima.data    # feature table
y = pima.target  # class labels


In [None]:
# Convert target labels to binary (0 = Negative, 1 = Positive)
# Derive the labels from pima.target rather than from y itself so the cell is idempotent:
# comparing an already-converted integer y with the string label would silently
# turn every label into 0 if the cell were run a second time.
y = (pima.target == 'tested_positive').astype(int)

In [None]:
# Reshape data to fit CNN input (samples, height, width, channels)
# Start from pima.data rather than from X itself so the cell can be re-run safely:
# once X has become a NumPy array it no longer has a .values attribute.
X = pima.data.to_numpy().reshape(-1, 8, 1, 1)  # 8 features laid out as an 8x1 single-channel "image"

In [None]:
# Split the dataset: 70% training / 30% testing (test_size=0.3).
# stratify=y keeps the class proportions the same in both splits; random_state=42 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


In [None]:
# Standardise the dataset.
# StandardScaler works on 2-D input, so each split is flattened to (samples, features),
# scaled, and then restored to its original 4-D shape.
train_shape, test_shape = X_train.shape, X_test.shape

scaler = StandardScaler()
X_train_flat = scaler.fit_transform(X_train.reshape(train_shape[0], -1))  # statistics come from the training split only
X_test_flat = scaler.transform(X_test.reshape(test_shape[0], -1))         # reuse the training statistics

X_train = X_train_flat.reshape(train_shape)
X_test = X_test_flat.reshape(test_shape)

In [None]:
# Build a simple CNN model
# Seed the Python, NumPy and TensorFlow random generators so that weight initialisation,
# dropout and batch shuffling are repeatable from run to run.
tf.keras.utils.set_random_seed(42)

model = Sequential([
    # Explicit Input layer: passing input_shape to the first layer is deprecated in Keras 3
    # and emits a UserWarning.
    tf.keras.Input(shape=(8, 1, 1)),
    Conv2D(16, kernel_size=(1, 1), activation='relu'),  # 16 filters with a 1x1 kernel
    Flatten(),
    Dense(32, activation='relu'),  # First hidden layer
    Dropout(0.3),  # drops 30% of activations during training
    Dense(16, activation='relu'),  # Second hidden layer
    Dense(1, activation='sigmoid')  # Output layer for binary classification
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Compile the model: Adam optimiser, binary cross-entropy loss (pairs with the single
# sigmoid output unit), and accuracy reported as the training metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [None]:
# Train the model for 50 epochs in mini-batches of 8, holding out 20% of the training
# data (validation_split=0.2) for validation; `history` keeps the object returned by fit.
history = model.fit(X_train, y_train, validation_split=0.2, epochs=50, batch_size=8, verbose=1)


In [None]:
# Evaluate the model on the held-out test split.
probabilities = model.predict(X_test)  # sigmoid outputs, one per test sample
y_pred = (probabilities > 0.5).astype(int).flatten()  # threshold at 0.5 and flatten to a 1-D label vector
f1_cnn = f1_score(y_test, y_pred)

In [None]:
# Report the CNN's F1 score, followed by the per-class precision / recall / F1 table.
cnn_report = classification_report(y_test, y_pred)

print(f"CNN F1 Score (Pima Indians): {f1_cnn:.2f}")

# Classification Report
print("\nClassification Report for CNN:")
print(cnn_report)
