In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Load the training split and summarise its columns.
train_df = pd.read_csv('train.csv')
# Only a cell's last expression is displayed, so a bare `train_df.head()` placed
# before this line produced no output and has been dropped.
train_df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,614.0,614.0,612.0,613.0,613.0,607.0,611.0,614.0,614.0
mean,3.742671,120.855049,69.383987,20.402936,81.323002,31.973476,0.46693,32.907166,0.346906
std,3.313264,32.035057,18.534648,15.446274,116.29473,7.757084,0.33377,11.503437,0.476373
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,100.0,64.0,0.0,0.0,27.1,0.2405,24.0,0.0
50%,3.0,117.0,72.0,23.0,42.0,32.0,0.371,29.0,0.0
75%,6.0,139.0,80.0,32.0,129.0,36.45,0.6125,40.0,1.0
max,17.0,199.0,122.0,63.0,846.0,67.1,2.42,81.0,1.0


In [3]:
# Load the test split and preview its first rows.
test_df = pd.read_csv('test.csv')
test_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,98,58.0,33.0,190.0,34.0,0.43,43,0
1,2,112,75.0,32.0,0.0,35.7,0.148,21,0
2,2,108,64.0,0.0,0.0,30.8,0.158,21,0
3,8,107,80.0,0.0,0.0,24.6,0.856,34,0
4,7,136,90.0,0.0,0.0,29.9,0.21,50,0


In [4]:
# Number of NaN entries in each column of the test split.
test_df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               2
SkinThickness               0
Insulin                     1
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [5]:
# Number of NaN entries in each column of the training split.
train_df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               2
SkinThickness               1
Insulin                     1
BMI                         7
DiabetesPedigreeFunction    3
Age                         0
Outcome                     0
dtype: int64

In [6]:
def _split_xy(frame):
    """Return (features, target) arrays: every column but the last, and the last column."""
    features = frame.iloc[:, :-1].values
    target = frame.iloc[:, -1].values
    return features, target


X_train, y_train = _split_xy(train_df)
X_test, y_test = _split_xy(test_df)

In [7]:
def missing_value(X, y, fit_on=None):
    """Fill NaN entries in selected feature columns of ``X``, in place.

    Column positions follow the notebook's feature layout (the CSV columns
    without the trailing ``Outcome`` column):

    * 2 BloodPressure            -> column mean
    * 3 SkinThickness            -> most frequent value
    * 4 Insulin                  -> most frequent value
    * 5 BMI                      -> column mean
    * 6 DiabetesPedigreeFunction -> most frequent value

    Parameters
    ----------
    X : numpy.ndarray
        2-D numeric feature array. It is modified in place and also returned.
    y : array-like
        Target values; passed through untouched.
    fit_on : numpy.ndarray, optional
        2-D array with the same column layout as ``X`` from which the fill
        values are learned. When omitted, they are learned from ``X`` itself
        (the original behaviour). Pass the training features when imputing the
        test split so both splits receive the same fill values and no test-set
        statistics leak into preprocessing.

    Returns
    -------
    tuple
        ``(X, y)`` with the NaNs in the columns listed above replaced.

    Notes
    -----
    NOTE(review): only NaN is treated as missing. The training summary shows a
    minimum of 0 for Glucose, BloodPressure and BMI; if those zeros stand for
    missing readings they are left untouched here - confirm.
    """
    # (column index, SimpleImputer strategy), kept in the original processing order.
    column_strategies = (
        (2, 'mean'),           # BloodPressure
        (3, 'most_frequent'),  # SkinThickness
        (4, 'most_frequent'),  # Insulin
        (5, 'mean'),           # BMI
        (6, 'most_frequent'),  # DiabetesPedigreeFunction
    )
    reference = X if fit_on is None else fit_on

    for column, strategy in column_strategies:
        # Keyword form is required: SimpleImputer's constructor arguments are
        # keyword-only in current scikit-learn releases.
        imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)
        # SimpleImputer expects 2-D input, hence the reshape to a single column.
        imputer.fit(reference[:, column].reshape(-1, 1))
        X[:, column] = imputer.transform(X[:, column].reshape(-1, 1)).ravel()

    return X, y

In [8]:
# Fill NaNs in both splits. Called this way, each call learns its fill values from
# the array it is given, so the test split is filled from test-split statistics.
X_train, y_train = missing_value(X_train, y_train)
X_test, y_test  = missing_value(X_test, y_test)

In [9]:
# Spot-check the first training row after imputation.
print(X_train[0])

[ 2.    84.     0.     0.     0.     0.     0.304 21.   ]


In [10]:
# Sanity check: wrap the imputed array in a DataFrame and count remaining nulls per column.
train = pd.DataFrame.from_records(X_train)
train.isnull().sum()

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
dtype: int64

In [11]:
def normalize(X, y, fit_on=None):
    """Min-max scale column 1 (Glucose in the notebook's feature layout), in place.

    Only column 1 is rescaled; every other column of ``X`` is returned as is.

    Parameters
    ----------
    X : numpy.ndarray
        2-D numeric feature array. It is modified in place and also returned.
    y : array-like
        Target values; passed through untouched.
    fit_on : numpy.ndarray, optional
        2-D array with the same column layout as ``X`` from which the
        minimum/maximum are learned. When omitted, they are learned from ``X``
        itself (the original behaviour). Pass the training features when
        scaling the test split so both splits share one scale. Because ``X``
        is scaled in place, ``fit_on`` must still hold unscaled values - e.g.
        scale the test split before the training split, or keep a copy of the
        raw training array.

    Returns
    -------
    tuple
        ``(X, y)`` with column 1 rescaled.
    """
    scaled_column = 1
    reference = X if fit_on is None else fit_on

    minmax_scaler = MinMaxScaler()
    # MinMaxScaler expects 2-D input, hence the reshape to a single column.
    minmax_scaler.fit(reference[:, scaled_column].reshape(-1, 1))
    X[:, scaled_column] = minmax_scaler.transform(
        X[:, scaled_column].reshape(-1, 1)
    ).ravel()
    return X, y

In [12]:
# Scale both splits. Called this way, each call learns the scaling range from the
# array it is given, so train and test are scaled with their own min/max.
X_train, y_train = normalize(X_train, y_train)
X_test, y_test = normalize(X_test, y_test)

In [13]:
# Report the shape of every split, then show the first training row after scaling.
print(*(split.shape for split in (X_train, y_train, X_test, y_test)))
print(X_train[0])

(614, 8) (614,) (154, 8) (154,)
[ 2.          0.42211055  0.          0.          0.          0.
  0.304      21.        ]


In [14]:
# Fit a logistic-regression classifier on the preprocessed training split.
# max_iter=1000 overrides scikit-learn's default of 100 - presumably so the
# solver converges on the mostly unscaled features; confirm.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
# Predict class labels for the test split.
y_pred = clf.predict(X_test)

In [16]:
# Score the test-split predictions against the true labels.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy_score:", test_accuracy)

Accuracy_score: 0.7857142857142857


In [17]:
# Inspect the fitted coefficients (one per feature column of X_train).
clf.coef_

array([[ 6.18782095e-02,  4.20932311e+00, -1.21169211e-02,
        -1.83638782e-03, -5.39908878e-04,  1.05677655e-01,
         6.45325668e-01,  4.18005231e-02]])

In [18]:
# Inspect the fitted intercept term.
clf.intercept_

array([-7.76519197])

In [20]:
# Bundle the preprocessed splits under integer keys:
# 1 -> X_train, 2 -> y_train, 3 -> X_test, 4 -> y_test.
data = {1:X_train, 2: y_train, 3:X_test, 4: y_test}

In [23]:
# Persist the preprocessed splits to data.pkl in the working directory.
# `pickle` is imported in the notebook's top import cell rather than here.
# Unpickling can execute arbitrary code, so only load this file from a trusted source.
with open('data.pkl', 'wb') as f:
    pickle.dump(data, f)