Importing the Dependencies

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

Data Collection and Analysis

PIMA Diabetes Dataset

In [None]:
# Read the PIMA diabetes CSV into a pandas DataFrame.
# NOTE(review): '/content/...' looks like a Colab-specific absolute path - confirm before running elsewhere.
diabetes_dataset = pd.read_csv(filepath_or_buffer='/content/diabetes.csv')

In [None]:
# Show every column name available in the raw dataset before selecting features.
diabetes_dataset.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [None]:
# Keep only the four predictors used for modelling, plus the 'Outcome' label column.
diabetes_data = diabetes_dataset.loc[
    :, ['Glucose', 'BMI', 'Age', 'DiabetesPedigreeFunction', 'Outcome']
]

In [None]:
# Display the first 5 rows of the reduced dataset as a quick sanity check
# that the expected five columns were selected.
diabetes_data.head()

Unnamed: 0,Glucose,BMI,Age,DiabetesPedigreeFunction,Outcome
0,148,33.6,50,0.627,1
1,85,26.6,31,0.351,0
2,183,23.3,32,0.672,1
3,89,28.1,21,0.167,0
4,137,43.1,33,2.288,1


In [None]:
# Number of rows and columns in the reduced dataset, as a (rows, columns) tuple.
diabetes_data.shape

(768, 5)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
# NOTE(review): the recorded output shows a minimum of 0.0 for Glucose and BMI -
# presumably placeholders for missing measurements; confirm and consider imputing.
diabetes_data.describe()

Unnamed: 0,Glucose,BMI,Age,DiabetesPedigreeFunction,Outcome
count,768.0,768.0,768.0,768.0,768.0
mean,120.894531,31.992578,33.240885,0.471876,0.348958
std,31.972618,7.88416,11.760232,0.331329,0.476951
min,0.0,0.0,21.0,0.078,0.0
25%,99.0,27.3,24.0,0.24375,0.0
50%,117.0,32.0,29.0,0.3725,0.0
75%,140.25,36.6,41.0,0.62625,1.0
max,199.0,67.1,81.0,2.42,1.0


In [None]:
# Class balance of the label column: how many rows fall into each Outcome value.
diabetes_data['Outcome'].value_counts()

Unnamed: 0_level_0,count
Outcome,Unnamed: 1_level_1
0,500
1,268


0 --> Non-Diabetic

1 --> Diabetic

In [None]:
# Mean of every feature per Outcome class, to compare the two groups side by side.
diabetes_data.groupby('Outcome').mean()

Unnamed: 0_level_0,Glucose,BMI,Age,DiabetesPedigreeFunction
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,109.98,30.3042,31.19,0.429734
1,141.257463,35.142537,37.067164,0.5505


In [None]:
# Split the frame into the label vector (Y) and the feature matrix (X).
Y = diabetes_data['Outcome']
X = diabetes_data.drop(columns=['Outcome'])

In [None]:
# Plain-text dump of the feature matrix; pandas truncates the middle rows in the output.
print(X)

     Glucose   BMI  Age  DiabetesPedigreeFunction
0        148  33.6   50                     0.627
1         85  26.6   31                     0.351
2        183  23.3   32                     0.672
3         89  28.1   21                     0.167
4        137  43.1   33                     2.288
..       ...   ...  ...                       ...
763      101  32.9   63                     0.171
764      122  36.8   27                     0.340
765      121  26.2   30                     0.245
766      126  30.1   47                     0.349
767       93  30.4   23                     0.315

[768 rows x 4 columns]


In [None]:
# Plain-text dump of the label Series (0 / 1 values of the 'Outcome' column).
print(Y)

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


Train Test Split

In [None]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [None]:
# Shapes of the full, training and test feature matrices, to verify the split sizes.
print(X.shape, X_train.shape, X_test.shape)

(768, 4) (614, 4) (154, 4)


Training the Model

In [None]:
# prompt: train with XGBoost model

!pip install xgboost

import xgboost as xgb

#Create the XGBoost model
classifier = xgb.XGBClassifier()

#Train the model
classifier.fit(X_train, Y_train)

#Model Evaluation
#Accuracy Score
# accuracy score on the training data
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print('Accuracy score of the training data : ', training_data_accuracy)

# accuracy score on the test data
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print('Accuracy score of the test data : ', test_data_accuracy)


Accuracy score of the training data :  1.0
Accuracy score of the test data :  0.7467532467532467


In [None]:
# Support Vector Classifier with a linear kernel.
# From this cell on, `classifier` refers to the SVM (it replaces whatever
# model was previously bound to this name).
classifier = svm.SVC(kernel='linear')

In [None]:
# Train the Support Vector Machine classifier on the training split.
classifier.fit(X_train, Y_train)

In [None]:
# prompt: Random Forest

from sklearn.ensemble import RandomForestClassifier

# Create the Random Forest model; the fixed random_state makes the fitted
# forest reproducible between runs.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42) # You can adjust n_estimators

# Train the model on the training split
rf_classifier.fit(X_train, Y_train)

# Model Evaluation
# accuracy_score expects (y_true, y_pred): ground-truth labels first.

# accuracy score on the training data
X_train_prediction = rf_classifier.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('Accuracy score of the training data (Random Forest): ', training_data_accuracy)

# accuracy score on the test data
# NOTE(review): the recorded run shows 1.0 on train vs ~0.77 on test, which
# suggests overfitting; consider limiting tree depth or tuning.
X_test_prediction = rf_classifier.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Accuracy score of the test data (Random Forest): ', test_data_accuracy)


Accuracy score of the training data (Random Forest):  1.0
Accuracy score of the test data (Random Forest):  0.7662337662337663


Model Evaluation

Accuracy Score

In [None]:
# accuracy score on the training data
# When the cells are run top to bottom, `classifier` is the linear SVM here.
X_train_prediction = classifier.predict(X_train)
# accuracy_score expects (y_true, y_pred): ground-truth labels first.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
# Report the training accuracy computed in the previous cell.
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.7785016286644951


In [None]:
# accuracy score on the test data
# When the cells are run top to bottom, `classifier` is the linear SVM here.
X_test_prediction = classifier.predict(X_test)
# accuracy_score expects (y_true, y_pred): ground-truth labels first.
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
# Report the held-out test accuracy computed in the previous cell.
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.7597402597402597


Making a Predictive System

In [None]:
# Feature values for one patient, in the same order as the training columns:
# Glucose, BMI, Age, DiabetesPedigreeFunction.
input_data = (50, 30, 45, 0.587) # Example values, replace with relevant patient data

# Wrap the values in a one-row DataFrame that carries the training column
# names. The model was fitted on a DataFrame, so passing a bare numpy array
# triggers scikit-learn's "X does not have valid feature names" warning and
# offers no protection against a wrong feature order.
input_frame = pd.DataFrame([input_data], columns=X.columns)

prediction = classifier.predict(input_frame)
print(prediction)

# Label encoding: 0 --> Non-Diabetic, 1 --> Diabetic
if prediction[0] == 0:
  print('The person is not diabetic')
else:
  print('The person is diabetic')

[0]
The person is not diabetic




Saving the trained model

In [None]:
import pickle

In [None]:
# Serialise the fitted model currently bound to `classifier` to disk.
filename = 'diabetes_model.sav'
# The context manager guarantees the file is flushed and closed even if
# pickling fails; a bare open() inside the call leaves the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [None]:
# prompt: save all models with pkl

# `classifier` was re-bound to the linear SVM after the XGBoost cell ran, so
# pickling `classifier` under the XGBoost filename would store the SVM twice.
# Refit a dedicated XGBoost model so xgb_model.pkl really holds an XGBoost model.
xgb_classifier = xgb.XGBClassifier()
xgb_classifier.fit(X_train, Y_train)

# Each file is written inside a context manager so the handle is always closed.
filename_xgb = 'xgb_model.pkl'
with open(filename_xgb, 'wb') as model_file:
    pickle.dump(xgb_classifier, model_file)

# `classifier` is the linear SVM at this point in a top-to-bottom run.
filename_svm = 'svm_model.pkl'
with open(filename_svm, 'wb') as model_file:
    pickle.dump(classifier, model_file)

filename_rf = 'rf_model.pkl'
with open(filename_rf, 'wb') as model_file:
    pickle.dump(rf_classifier, model_file)


In [None]:
# Persist the same fitted model a second time, this time in joblib format.

import joblib

# Target path for the joblib copy of the model bound to `classifier`.
filename = 'diabetes_model.joblib'
joblib.dump(value=classifier, filename=filename)


In [None]:
# Load the saved model back from disk.
# The context manager closes the file handle as soon as unpickling finishes.
# NOTE: pickle.load can execute arbitrary code - only load files you created yourself.
with open('diabetes_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# The saved model was trained on four features only, in this order:
# Glucose, BMI, Age, DiabetesPedigreeFunction.
# The full eight-value PIMA record (5,166,72,19,175,25.8,0.587,51) therefore
# cannot be passed as-is (predict would fail on the feature-count mismatch);
# only the four trained features of that record are used.
input_data = (166, 25.8, 51, 0.587)

# One-row DataFrame carrying the training column names, so the feature order
# is explicit and scikit-learn does not warn about missing feature names.
input_frame = pd.DataFrame([input_data], columns=X.columns)

prediction = loaded_model.predict(input_frame)
print(prediction)

# Label encoding: 0 --> Non-Diabetic, 1 --> Diabetic
if prediction[0] == 0:
  print('The person is not diabetic')
else:
  print('The person is diabetic')

In [None]:
# List the feature names the models were trained on, one per line.
# Iterating a DataFrame directly yields its column labels.
for feature_name in X:
  print(feature_name)