# Importing libraries

In [13]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Loading Dataset

In [14]:
# Read the raw diabetes CSV (path is relative to the notebook's working directory).
raw_data_path = "data/raw_data/diabetes.csv"
diabetes_df = pd.read_csv(raw_data_path)

In [15]:
# Preview the loaded frame (pandas abbreviates the middle rows when rendering).
diabetes_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [16]:
# Separate the label column from the remaining feature columns.
target_column = "Outcome"
x = diabetes_df.drop(columns=[target_column])
y = diabetes_df[target_column]

In [17]:
# Hold out 25% of the rows for testing.
# A fixed random_state makes the split (and therefore every downstream metric and the saved
# model) reproducible across kernel restarts; stratify=y keeps the Outcome class ratio the
# same in the training and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42, stratify=y
)

# Pipeline

In [18]:
# Two-stage estimator: standardize the features, then fit a logistic-regression classifier.
# The step names matter: the hyperparameter grid addresses parameters as "<step>__<param>".
pipeline_steps = [
    ("scaler", StandardScaler()),
    ("classifier", LogisticRegression()),
]
pipeline = Pipeline(steps=pipeline_steps)

# Hyperparameter optimization

In [20]:
# Candidate values to try for the `C` parameter of the pipeline's "classifier" step.
param_grid = dict(classifier__C=[0.1, 0.5, 1.0, 10.0])

# Training

In [21]:
# Wrap the pipeline in a grid search over param_grid, evaluated with 5 cross-validation folds.
grid_search_logistic_regression = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
)

In [22]:
# Run the cross-validated grid search using only the training split.
grid_search_logistic_regression.fit(X_train, y_train)

# Test

In [23]:
# Predict class labels for the held-out test features with the fitted search object.
y_predictions = grid_search_logistic_regression.predict(X_test)

In [24]:
# Show the predicted labels for the test split.
y_predictions

array([0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0])

# Metrics

In [25]:
# Score the test-set predictions against the true labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_predictions)

In [26]:
# Show the fraction of test samples that were predicted correctly.
accuracy

0.7447916666666666

# Packaging

## Write

In [28]:
# Relative path where the fitted search object is written with joblib.
filename = 'models/logistic_regression_model_v01.pkl'

In [29]:
# Create the target directory first: joblib.dump does not create missing parent directories,
# so on a fresh checkout without a models/ folder the dump would fail with FileNotFoundError.
Path(filename).parent.mkdir(parents=True, exist_ok=True)

# Persist the whole fitted GridSearchCV object; the call returns the list of written files.
joblib.dump(grid_search_logistic_regression, filename)

['models/logistic_regression_model_v01.pkl']

## Read

In [31]:
# List the feature columns used for training.
x.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')

In [35]:
import random

# One hand-written sample; the keys are the training feature columns, in the same order.
new_value = dict(
    Pregnancies=6,
    Glucose=50,
    BloodPressure=72,
    SkinThickness=35,
    Insulin=0,
    BMI=33.6,
    DiabetesPedigreeFunction=0.627,
    Age=50,
)

# The values are scalars, so an explicit one-element index is needed to build a one-row frame.
df_new_data = pd.DataFrame(new_value, index=[0])


In [36]:
# Reload the artifact from the same `filename` it was written to, so the path is defined in
# one place and the load cannot drift from the dump when the file name changes.
# NOTE: joblib files are pickle-based; only load artifacts that come from a trusted source.
model = joblib.load(filename)

In [37]:
# Predict the class label for the single new sample with the reloaded model.
model.predict(df_new_data)

array([0])