In [1]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import pandas as pd

def preprocess_data(data, target_column, save_path, file_path):
  """Split `data`, fit a preprocessing pipeline on the train split, and persist it.

  Numeric feature columns (int64/float64) are mean-imputed and standardized;
  object-dtype feature columns are imputed with the most frequent value and
  one-hot encoded. Feature columns of any other dtype are placed in neither
  group, so ColumnTransformer's default ``remainder='drop'`` leaves them out
  of the transformed output.

  Args:
    data: pandas DataFrame holding the features and the target column.
    target_column: name of the label column in `data`.
    save_path: destination of the fitted ColumnTransformer (written with joblib).
    file_path: destination of a header-only CSV listing the feature column
      names (every column of `data` except the target, in original order).

  Returns:
    Tuple ``(X_train_preprocessed, X_test_preprocessed, y_train, y_test)``.
    The preprocessor is fitted on the train split only and then applied to
    the test split.

  Raises:
    KeyError: if `target_column` is not a column of `data`.
  """
  # Imported locally: the notebook only imports joblib in a later cell, so a
  # module-level `dump` is not guaranteed to exist when this function runs.
  from joblib import dump

  X = data.drop(target_column, axis=1)
  y = data[target_column]

  # Selecting from X (not data) keeps the target out of both column groups.
  num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
  cat_cols = X.select_dtypes(include=['object']).columns.tolist()

  # An empty frame written with index=False yields a CSV that contains only
  # the header row, i.e. the feature names in their original order.
  df_header = pd.DataFrame(columns=X.columns)
  df_header.to_csv(file_path, index=False)
  print(f"Nama kolom berhasil disimpan ke: {file_path}")

  numeric_transformer = Pipeline(steps=[
      ('imputer', SimpleImputer(strategy='mean')),
      ('scaler', StandardScaler())
  ])

  categorical_transformer = Pipeline(steps=[
      ('imputer', SimpleImputer(strategy='most_frequent')),
      ('onehot', OneHotEncoder(handle_unknown='ignore'))
  ])

  # 'num' is listed first, so the scaled numeric columns lead the output,
  # followed by the one-hot columns.
  preprocessor = ColumnTransformer(
      transformers=[
          ('num', numeric_transformer, num_cols),
          ('cat', categorical_transformer, cat_cols)
      ])

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  # Fit on the train split only; the test split is transformed with the
  # statistics learned from training data.
  X_train_preprocessed = preprocessor.fit_transform(X_train)
  X_test_preprocessed = preprocessor.transform(X_test)

  dump(preprocessor, save_path)

  return X_train_preprocessed, X_test_preprocessed, y_train, y_test

In [2]:
import gdown
import zipfile
from joblib import dump
# Google Drive file id of the zipped dataset and the local filename to save it under.
file_id = '18aRJrZT6mX1g9zcvAK_rhT5_VeCyzber'
output = 'heart-disease.zip'
# Download the archive into the working directory (quiet=False shows progress).
gdown.download(f'https://drive.google.com/uc?id={file_id}', output, quiet=False)

# Unpack every member of the archive into the 'heart-disease' directory.
with zipfile.ZipFile(output, 'r') as zip_ref:
    zip_ref.extractall('heart-disease')

# Load the extracted CSV, print its dtype/non-null summary, and show the first rows.
data = pd.read_csv('heart-disease/heart.csv')
data.info()
data.head()

Downloading...
From: https://drive.google.com/uc?id=18aRJrZT6mX1g9zcvAK_rhT5_VeCyzber
To: /content/heart-disease.zip
100%|██████████| 3.44k/3.44k [00:00<00:00, 2.44MB/s]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
# Split and preprocess `data` with 'target' as the label. As side effects the
# fitted preprocessor is written to 'preprocessor.joblib' and the feature
# header to 'data.csv'; both files are read again further down.
X_train, X_test, y_train, y_test = preprocess_data(data, 'target', 'preprocessor.joblib', 'data.csv')

Nama kolom berhasil disimpan ke: data.csv


In [4]:
from joblib import dump, load

def inference(new_data, load_path):
  """Transform new samples with a previously saved preprocessor.

  Args:
    new_data: samples to transform, in whatever form the saved
      preprocessor's ``transform`` accepts.
    load_path: path of the joblib file holding the fitted preprocessor.

  Returns:
    Whatever the loaded preprocessor's ``transform`` returns for `new_data`.
  """
  # joblib.load unpickles the file, so only point this at trusted artifacts.
  fitted_preprocessor = load(load_path)
  print(f"Preprocessor berhasil di load dari: {load_path}")
  return fitted_preprocessor.transform(new_data)

In [5]:
def inverse_transform_data(transformed_data, load_path, new_data_columns):
  """Undo the standard scaling of the numeric part of preprocessed data.

  Only the numeric block is recovered; one-hot encoded columns are not
  inverted. The numeric block is taken from the leading columns of
  `transformed_data`, which is where the 'num' transformer's output sits
  when it is listed first in the ColumnTransformer (as preprocess_data does).

  Args:
    transformed_data: 2-D output of the saved preprocessor's ``transform``
      (dense array or SciPy sparse matrix).
    load_path: path of the joblib file holding the fitted preprocessor.
    new_data_columns: raw feature names. Used only as a positional fallback
      when the fitted preprocessor does not expose its numeric columns as a
      list of names.

  Returns:
    pandas DataFrame with one column per numeric feature, holding the values
    on their original (unscaled) scale.
  """
  # joblib.load unpickles the file, so only point this at trusted artifacts.
  preprocessor = load(load_path)
  scaler = preprocessor.named_transformers_['num']['scaler']
  n_numeric = len(scaler.mean_)

  # Take the numeric column names from the fitted transformer rather than
  # from the first n raw column names: the raw frame may interleave numeric
  # and categorical columns, while the transformed output always groups the
  # numeric ones together.
  fitted_num_cols = next(
      (columns for name, _, columns in preprocessor.transformers_ if name == 'num'),
      None,
  )
  if isinstance(fitted_num_cols, (list, tuple)) and all(isinstance(c, str) for c in fitted_num_cols):
    num_cols = list(fitted_num_cols)
  else:
    num_cols = list(new_data_columns[:n_numeric])

  transformed_numeric_data = transformed_data[:, :n_numeric]
  # A ColumnTransformer with a one-hot branch can return a sparse matrix;
  # a mean-centering scaler cannot be inverted on sparse input, so densify
  # just the numeric slice.
  if hasattr(transformed_numeric_data, 'toarray'):
    transformed_numeric_data = transformed_numeric_data.toarray()

  original_numeric_data = scaler.inverse_transform(transformed_numeric_data)

  return pd.DataFrame(original_numeric_data, columns=num_cols)

In [6]:
import numpy as np

# Artifacts written earlier by preprocess_data: the fitted preprocessor and
# the header-only CSV whose column names give the expected feature order.
pipeline_path = 'preprocessor.joblib'
col = pd.read_csv('data.csv').columns.tolist()

# One new sample; values are matched to the names in `col` by position.
new_data = [80, 0, 3, 128, 270, 0, 1, 150, 0, 3.0, 2, 1, 3 ]

# The list mixes ints with one float, so NumPy stores every value as a float.
new_data = np.array(new_data)

# Wrap the sample as a single-row frame with the saved column names.
new_data = pd.DataFrame([new_data], columns=col)

transformed_data = inference(new_data, pipeline_path)
print(transformed_data)

# NOTE(review): this repeats the call above with the same arguments, so the
# preprocessor is loaded and its message printed a second time — looks redundant.
transformed_data = inference(new_data, pipeline_path)

# Map the scaled numeric values back to their original scale.
inversed_data = inverse_transform_data(transformed_data, pipeline_path, col)

# NOTE(review): the first label below reads "data after preprocessing (training)",
# but what is printed is the raw new sample — the label looks misleading.
print("Data setelah preprocessing (training):")
print(new_data)
print("\nData baru setelah transform:")
print(transformed_data)
print("\nData baru setelah di-inverse transform:")
print(inversed_data)

Preprocessor berhasil di load dari: preprocessor.joblib
[[ 2.78017693 -1.38407465  1.96807914 -0.14049642  0.43952674 -0.38330071
   0.8431327  -0.00518702 -0.67663234  1.80567466  0.95390513  0.32186034
   1.17848036]]
Preprocessor berhasil di load dari: preprocessor.joblib
Data setelah preprocessing (training):
    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  80.0  0.0  3.0     128.0  270.0  0.0      1.0    150.0    0.0      3.0   

   slope   ca  thal  
0    2.0  1.0   3.0  

Data baru setelah transform:
[[ 2.78017693 -1.38407465  1.96807914 -0.14049642  0.43952674 -0.38330071
   0.8431327  -0.00518702 -0.67663234  1.80567466  0.95390513  0.32186034
   1.17848036]]

Data baru setelah di-inverse transform:
    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  80.0  0.0  3.0     128.0  270.0  0.0      1.0    150.0    0.0      3.0   

   slope   ca  thal  
0    2.0  1.0   3.0  
