# Install Package

In [1]:
# Optional one-time setup: uncomment to install the Kaggle CLI that the download cell calls.
# %pip install kaggle

# Download dataset

In [2]:
# Fetch the dataset archive with the Kaggle CLI. As the recorded output shows, an
# existing, more recently modified local copy is skipped unless --force is passed.
!kaggle datasets download -d prosperchuks/health-dataset

health-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


# Buat folder lalu ekstrak zip hasil download

In [3]:
# Extract the downloaded archive into ./health-dataset so the CSV can be loaded
# on a fresh checkout. The step is skipped when the folder already exists, which
# keeps the cell safe to re-run, and it uses the standard library instead of
# `mkdir`/`unzip` so it also works where those shell tools are unavailable.
import zipfile
from pathlib import Path

archive_path = Path("health-dataset.zip")
extract_dir = Path("health-dataset")

if not extract_dir.is_dir():
    with zipfile.ZipFile(archive_path) as archive:
        archive.extractall(extract_dir)

# Show what was extracted (replaces the former `ls health-dataset`).
sorted(entry.name for entry in extract_dir.iterdir())

# Import package yang diperlukan

In [4]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load datasets

In [5]:
# Read the hypertension table from the extracted dataset folder into a DataFrame.
csv_path = "health-dataset/hypertension_data.csv"
df = pd.read_csv(csv_path)

In [6]:
# Preview the first two rows of the loaded table.
df.head(n=2)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,57.0,1.0,3,145,233,1,0,150,0,2.3,0,0,1,1
1,64.0,0.0,2,130,250,0,1,187,0,3.5,0,0,2,1


In [7]:
# Preview two rows whose label is 0 (the negative class).
is_negative = df["target"].eq(0)
df.loc[is_negative].head(2)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
165,62.0,0.0,0,160,286,0,0,108,1,1.5,1,3,2,0
166,59.0,0.0,0,120,229,0,0,129,1,2.6,1,2,3,0


In [8]:
# Column dtypes and non-null counts. In the recorded output only `sex` has
# missing values (26058 non-null out of 26083 rows).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26083 entries, 0 to 26082
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       26083 non-null  float64
 1   sex       26058 non-null  float64
 2   cp        26083 non-null  int64  
 3   trestbps  26083 non-null  int64  
 4   chol      26083 non-null  int64  
 5   fbs       26083 non-null  int64  
 6   restecg   26083 non-null  int64  
 7   thalach   26083 non-null  int64  
 8   exang     26083 non-null  int64  
 9   oldpeak   26083 non-null  float64
 10  slope     26083 non-null  int64  
 11  ca        26083 non-null  int64  
 12  thal      26083 non-null  int64  
 13  target    26083 non-null  int64  
dtypes: float64(3), int64(11)
memory usage: 2.8 MB


# Hapus baris yang memiliki nilai NaN atau tak hingga (inf)

In [9]:
# Treat +/-infinity as missing, drop every row that has any missing value,
# and renumber the remaining rows from 0.
df = (
    df
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
    .reset_index(drop=True)
)

In [10]:
# Count the remaining missing values per column (all zeros after cleaning).
df.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [11]:
# Separate the label column from the feature columns.
label_column = "target"
Y = df[label_column]
X = df.drop(columns=[label_column])

In [12]:
# Hold out 20% of the rows for testing; stratifying on Y keeps the class ratio
# the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
    stratify=Y,
)

In [13]:
# Compare the shape of the full feature matrix with that of the training split.
(X.shape, X_train.shape)

((26058, 13), (20846, 13))

# Membuat model

In [23]:
# Logistic-regression classifier with the iteration limit set to 1000
# (presumably so the solver can converge on the unscaled features — TODO confirm).
model = LogisticRegression(max_iter=1000)

In [24]:
# Train the classifier on the training split.
model.fit(X=X_train, y=Y_train)

In [25]:
# Predict on the training split and score the predictions against the true labels.
# accuracy_score is documented as (y_true, y_pred), so the ground truth goes first;
# keeping that order matters as soon as another metric is swapped in here.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [27]:
# Report the accuracy measured on the training split.
training_label = "Akurasi data training : "
print(training_label, training_data_accuracy)

Akurasi data training :  0.8608845821740382


In [28]:
# Predict on the held-out split and score the predictions against the true labels.
# accuracy_score is documented as (y_true, y_pred), so the ground truth goes first;
# keeping that order matters as soon as another metric is swapped in here.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [29]:
# Report the accuracy measured on the held-out test split.
test_label = "Akurasi data test : "
print(test_label, test_data_accuracy)

Akurasi data test :  0.8610897927858787


# Buat data prediksi

In [30]:
# Feature values for a single patient, in the same order as the columns of X.
# Positive example: the values of dataset row 0 (target == 1).
input_data = np.array([57.0, 1.0, 3, 145, 233, 1, 0, 150, 0, 2.3, 0, 0, 1])
# Negative example: the values of dataset row 165 (target == 0).
# input_data = np.array([62.0, 0.0, 0, 160, 286, 0, 0, 108, 1, 1.5, 1, 3, 2])

# The model expects a 2-D input with one row per sample.
input_data_reshaped = input_data.reshape(1, -1)

# The model was fitted on a DataFrame, so predict on a DataFrame with the same
# column names. This avoids scikit-learn's "X does not have valid feature names"
# warning and raises immediately if the number of values does not match X.
input_frame = pd.DataFrame(input_data_reshaped, columns=X.columns)
prediction = model.predict(input_frame)
print(prediction)

# Label 0 means no hypertension; any other label is reported as hypertension.
if prediction[0] == 0:
    print("Pasien tidak terkena darah tinggi")
else:
    print("Pasien terkena penyakit darah tinggii")

[1]
Pasien terkena penyakit darah tinggii




# Simpan Model

In [31]:
import pickle

In [32]:
# Serialize the fitted classifier to disk. The `with` block guarantees the file
# handle is flushed and closed even if pickling raises.
# NOTE: only unpickle this file from a trusted source; loading a pickle can
# execute arbitrary code.
filename = 'hypertension.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)