In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [2]:
# Read the diabetes dataset from the working directory and preview its first rows.
dt = pd.read_csv(filepath_or_buffer="diabetes_dataset.csv")
dt.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
dt.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [4]:
# Column dtypes; the object-typed columns are the ones one-hot encoded further down.
dt.dtypes

gender                  object
age                    float64
hypertension             int64
heart_disease            int64
smoking_history         object
bmi                    float64
HbA1c_level            float64
blood_glucose_level      int64
diabetes                 int64
dtype: object

# Data Information

In [5]:
# Distinct values present in the heart_disease column.
dt["heart_disease"].unique()

array([1, 0], dtype=int64)

In [6]:
# Distinct values present in the hypertension column.
dt["hypertension"].unique()

array([0, 1], dtype=int64)

In [7]:
# Distinct values present in the diabetes column (used later as the target).
dt["diabetes"].unique()

array([0, 1], dtype=int64)

In [8]:
# Distinct categories recorded in the smoking_history column.
dt["smoking_history"].unique()

array(['never', 'No Info', 'current', 'former', 'ever', 'not current'],
      dtype=object)

In [9]:
# Distinct categories recorded in the gender column.
dt["gender"].unique()

array(['Female', 'Male', 'Other'], dtype=object)

In [10]:
# Number of missing values per column.
dt.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

# One-Hot Encoding

In [11]:
# One-hot encode the object-typed columns (gender, smoking_history);
# the numeric columns pass through unchanged.
encode = pd.get_dummies(data=dt)
encode

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Female,gender_Male,gender_Other,smoking_history_No Info,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,80.0,0,1,25.19,6.6,140,0,1,0,0,0,0,0,0,1,0
1,54.0,0,0,27.32,6.6,80,0,1,0,0,1,0,0,0,0,0
2,28.0,0,0,27.32,5.7,158,0,0,1,0,0,0,0,0,1,0
3,36.0,0,0,23.45,5.0,155,0,1,0,0,0,1,0,0,0,0
4,76.0,1,1,20.14,4.8,155,0,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,80.0,0,0,27.32,6.2,90,0,1,0,0,1,0,0,0,0,0
99996,2.0,0,0,17.37,6.5,100,0,1,0,0,1,0,0,0,0,0
99997,66.0,0,0,27.83,5.7,155,0,0,1,0,0,0,0,1,0,0
99998,24.0,0,0,35.42,4.0,100,0,1,0,0,0,0,0,0,1,0


In [12]:
# Confirm that every column is numeric after one-hot encoding.
encode.dtypes

age                            float64
hypertension                     int64
heart_disease                    int64
bmi                            float64
HbA1c_level                    float64
blood_glucose_level              int64
diabetes                         int64
gender_Female                    uint8
gender_Male                      uint8
gender_Other                     uint8
smoking_history_No Info          uint8
smoking_history_current          uint8
smoking_history_ever             uint8
smoking_history_former           uint8
smoking_history_never            uint8
smoking_history_not current      uint8
dtype: object

# Normalize Data

In [13]:
from sklearn import preprocessing

# Min-max scale every column of the encoded frame, then rebuild a DataFrame
# with the original column names.
# NOTE(review): the scaler is fit on all rows (and on the `diabetes` target
# column too) before the train/test split further down, so test-set min/max
# values influence the training features. Consider fitting on the training
# split only and leaving the target out of the scaler.
scaler = preprocessing.MinMaxScaler()
d = scaler.fit(encode).transform(encode)
data_normalisasi = pd.DataFrame(data=d, columns=encode.columns)
data_normalisasi.head()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Female,gender_Male,gender_Other,smoking_history_No Info,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,1.0,0.0,1.0,0.177171,0.563636,0.272727,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.674675,0.0,0.0,0.202031,0.563636,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.349349,0.0,0.0,0.202031,0.4,0.354545,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.449449,0.0,0.0,0.156863,0.272727,0.340909,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.94995,1.0,1.0,0.118231,0.236364,0.340909,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


# Train/Test Split

In [14]:
from sklearn.model_selection import train_test_split

# The earlier error was here: the target and the feature columns have to be
# chosen explicitly. Go through this step carefully.
# Target: the `diabetes` column. Features: every other column.
X = data_normalisasi.drop(columns=['diabetes'])
y = data_normalisasi['diabetes']

# random_state pins the shuffle so the split (and every score computed from
# it) is identical on each Restart & Run All.
# stratify=y keeps the proportion of positive cases the same in the training
# and test splits; the positive class is a small minority in this dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [15]:
# Sanity check: rows and feature columns in the training split.
X_train.shape

(80000, 15)

In [16]:
# Sanity check: the training target must have one label per training row.
y_train.shape

(80000,)

In [17]:
# Sanity check: rows and feature columns in the held-out test split.
X_test.shape

(20000, 15)

# SVM

In [18]:
# Discarded experiment: flattening X_train into a single column is not needed,
# because the classifier below is fitted on the 2-D X_train frame directly.
#z = np.array(X_train).reshape(-1, 1)
#z

In [19]:
from sklearn import svm
# Support-vector classifier created with no arguments, i.e. with the
# library's default hyperparameters.
model_svm = svm.SVC()

In [20]:
# Train the classifier on the training split.
model_svm.fit(X_train, y_train)

In [21]:
# Score the fitted model on the held-out test split.
# NOTE(review): the mean of `diabetes` in the describe() output is 0.085, so a
# model that always predicts "no diabetes" would already be right on roughly
# 91.5% of rows. Read this score against that baseline and consider adding
# per-class metrics (precision/recall) for the positive class.
model_svm.score(X_test,y_test)

0.96185