In [None]:
# Diabetes Prediction

![](https://m.economictimes.com/thumb/msid-61997607,width-1200,height-900,resizemode-4,imgsize-175312/suffering-from-diabetes-five-apps-to-help-you-manage-your-lifestyle-better.jpg)
[Img Source](https://economictimes.indiatimes.com/magazines/panache/are-you-a-diabetes-patient-five-apps-that-can-help-manage-your-lifestyle-better/articleshow/61997607.cms)

The Pima Indians Diabetes Database is a widely used dataset in medical research and, specifically, in the study of diabetes. The dataset contains information about female patients of Pima Indian heritage who are 21 years of age or older. The data was collected by the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of this notebook is to use the Pima Indians Diabetes Database to build a predictive model for diabetes using logistic regression. We will use popular Python data science packages: pandas, numpy, seaborn, matplotlib and scikit-learn. The logistic regression algorithm from scikit-learn will be used to build our classification model. Our goal is to use the information in the dataset to accurately predict whether a patient has diabetes or not.

---

# Imports

In [None]:
#importing necessary libraries into our code
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the diabetes CSV into a DataFrame and preview the first rows
# to confirm that it loaded as expected.
DataPath = '../input/diabetes-dataset/diabetes2.csv'
data = pd.read_csv(filepath_or_buffer=DataPath)
data.head(n=5)

# EDA

In [None]:
# Per-column flag: True when the column contains at least one NaN/missing entry.
data.isna().any()

In [None]:
# (number of rows, number of columns) of the loaded frame.
data.shape

In [None]:
# Count of distinct values in each column.
data.nunique()

In [None]:
# Pairwise correlation between the columns (pandas' default method).
data.corr()

In [None]:
# Print a per-column overview of the frame (dtypes and non-null counts).
data.info()

In [None]:
# Summary statistics for the columns, useful for spotting odd ranges or scales.
data.describe()

# Data Processing

In [None]:
# Separate the predictors from the label:
#   x - independent variables (the columns listed in `features`)
#   y - dependent variable (the `Outcome` column)
features = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]
x = data[features]
y = data['Outcome']

In [None]:
#splitting x and y into training and test sets: 10% of the rows are held out
#for testing, and random_state is fixed so the same split is produced on re-run
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split (x,y,test_size = 0.10, random_state = 0)

# Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression

#instantiate the model; only max_iter is overridden (set to 1000), every
#other parameter is left at its default
# NOTE(review): presumably raised so the solver converges on the unscaled
# features — confirm
reg = LogisticRegression (max_iter = 1000)

In [None]:
#fit the model on the training portion only (x_train / y_train); the test
#rows are kept aside for evaluation
reg.fit(x_train, y_train)

In [None]:
#predict a label for every row of the held-out test features and
#display the predictions
y_pred = reg.predict(x_test)
y_pred

In [None]:
# Display the held-out feature rows that the predictions were made for.
x_test

In [None]:
from sklearn import metrics
# Count how often each true test label (y_test) was predicted as each
# label (y_pred), then display the resulting matrix.
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

In [None]:
#plotting confusion matrix
class_names = [0,1]
fig, ax = plt.subplots()

#create heatmap on the explicit axes; the class labels are handed to seaborn
#directly because heatmap() resets any ticks that were configured beforehand
sns.heatmap(
    pd.DataFrame(cnf_matrix),
    annot=True,
    cmap="YlGnBu",
    fmt='g',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)

#x-axis label goes on top, so the title is lifted (y=1.1) to make room for it
ax.xaxis.set_label_position("top")
ax.set_title('Confusion Matrix', y=1.1)
ax.set_ylabel('Actual Label')
ax.set_xlabel('Predicted Label')

#lay out last so the title and axis labels are taken into account
fig.tight_layout()
plt.show()


In [None]:
# Share of test-set predictions that match the true labels.
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy : ", accuracy)

In [None]:
import pickle
import os  # unused in this cell; left in place in case other cells rely on it

#saving the model
Model_Path = 'logistic_reg.sav'

#write through a context manager so the file handle is flushed and closed
#even if pickling raises; a bare open() leaves the handle to the garbage collector
with open(Model_Path, 'wb') as model_file:
    pickle.dump(reg, model_file)


In [None]:
#user defined data set: a single patient, values listed in the same order as
#the `features` columns (original author's note: not diabetic)
# NOTE(review): Glucose=0, BloodPressure=33.7 and BMI=74 look like they may be
# in the wrong positions — confirm against the intended patient record
#kept under its own name so the `data` DataFrame loaded earlier is not
#overwritten, which would break re-running the earlier cells
new_patient = [[5, 0, 33.7, 50, 150, 74, 0.5, 53]]

#create the pandas DataFrame, reusing the training column list so the column
#names and order match what the model was fitted on
df = pd.DataFrame(new_patient, columns=features)

#predict on new data
ourmodelprediction = reg.predict(df)
ourmodelprediction

---
# Thank You
---