## 1. Import libraries

In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## 2. Get the data

In [3]:
# Load the diabetes dataset from a CSV file located in the working directory.
df = pd.read_csv(filepath_or_buffer='diabetes.csv')

In [4]:
# Preview the first five rows to sanity-check the columns and values.
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,2,138,62,35,0,33.6,0.127,47,1
1,0,84,82,31,125,38.2,0.233,23,0
2,0,145,0,0,0,44.2,0.63,31,1
3,0,135,68,42,250,42.3,0.365,24,1
4,1,139,62,41,480,40.7,0.536,21,0


In [5]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(2000, 9)

In [6]:
# Non-null count of every column, broken down by Outcome class.
df.groupby('Outcome').count()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1316,1316,1316,1316,1316,1316,1316,1316
1,684,684,684,684,684,684,684,684


In [7]:
# Number of rows in each Outcome class (class balance).
df.groupby(by='Outcome').size()

Outcome
0    1316
1     684
dtype: int64

## 3. Pre-processing the data

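### a. Missing data (checking whether any values are null)

Note: `isna()` only detects explicit nulls. Zero entries in columns such as `BloodPressure`, `SkinThickness`, and `Insulin` (visible in the `df.head()` output above) may be placeholders for missing measurements and are not counted by this check.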

In [8]:
# Per-column count of null entries.
# NOTE(review): this only detects explicit nulls; zero placeholders
# (e.g. BloodPressure == 0) are not counted here.
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

### b. Duplicated data (check for and remove duplicate rows)

In [9]:
# Count rows that exactly repeat an earlier row (first occurrence is not flagged).
df.duplicated(keep='first').sum()

1256

In [10]:
# Keep only the first occurrence of each distinct row.
# The surviving rows retain their original index labels (no reset).
df = df.drop_duplicates(keep='first')

In [11]:
# Confirm that no duplicate rows remain after de-duplication.
df.duplicated().sum()

0

In [11]:
# Class distribution after duplicates were removed.
df.groupby('Outcome').size()

Outcome
0    491
1    253
dtype: int64

## 4. Splitting the data

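* Store the feature columns in `X` and the target column (`Outcome`) in `y`. Note that `SkinThickness` is not included among the selected features.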

In [12]:
# Columns used as model inputs.
# NOTE(review): SkinThickness is present in the data but is not selected
# here -- confirm that leaving it out is intentional.
feature_columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]
X = df[feature_columns]
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age
0,2,138,62,0,33.6,0.127,47
1,0,84,82,125,38.2,0.233,23
2,0,145,0,0,44.2,0.63,31
3,0,135,68,250,42.3,0.365,24
4,1,139,62,480,40.7,0.536,21


In [13]:
# Target vector: the binary Outcome column.
y = df.loc[:, 'Outcome']
y.head()

0    1
1    0
2    1
3    1
4    0
Name: Outcome, dtype: int64

In [14]:
# Features and target must have the same number of rows.
X.shape, y.shape

((744, 7), (744,))

In [15]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
# NOTE(review): the classes are imbalanced (491 vs 253 rows); consider
# passing stratify=y so both splits keep the same class ratio.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)

In [16]:
# Row/column counts of the training and test feature matrices.
train_X.shape, test_X.shape

((595, 7), (149, 7))

## 5. Select Algorithm

* Logistic Regression

## 6. Build the classifier model

In [17]:
# NOTE(review): LogisticRegression is already imported in the
# "Import libraries" cell; this repeated import is redundant.
from sklearn.linear_model import LogisticRegression

In [18]:
# Logistic-regression classifier using the L-BFGS solver with an
# explicit cap of 1000 optimisation iterations.
model = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
)

In [19]:
# Display the estimator's full hyper-parameter configuration.
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## 7. Train the model

In [20]:
# Fit the classifier on the training split only.
model.fit(X=train_X, y=train_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## 8. Test the model

In [22]:
# Predict the class label for every row of the held-out test features.
pred_y = model.predict(X=test_X)
pred_y

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

## 9. Performance of the model

In [23]:
# Mean accuracy on the test split, expressed as a percentage.
100 * model.score(test_X, test_y)

73.15436241610739

In [24]:
# NOTE(review): accuracy_score is already imported in the
# "Import libraries" cell; this repeated import is redundant.
from sklearn.metrics import accuracy_score

In [25]:
# Same accuracy as model.score above, computed from the stored predictions.
accuracy_score(y_true=test_y, y_pred=pred_y) * 100

73.15436241610739

## 10. Result (first 10 predictions vs. actual labels)

In [26]:
# First ten predicted labels, for comparison with the true labels below.
pred_y[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0], dtype=int64)

In [27]:
# First ten true labels (selected by position; the index shows original row labels).
test_y.iloc[:10]

557    0
173    0
685    0
539    1
422    0
730    1
180    0
119    0
676    1
683    1
Name: Outcome, dtype: int64