In [1]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE, SMOTENC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [3]:
# Build a partially oversampled copy of the stroke dataset:
#   1. load the CSV and mean-impute missing `bmi` values,
#   2. integer-encode the categorical columns,
#   3. split 50/50 and oversample the minority class in the first half,
#   4. re-join both halves, decode the categories back to their labels,
#      turn the imputed `bmi` values back into NaN, and write a new CSV.

# Columns that hold category codes rather than measurements. They must never
# be interpolated by the oversampler, otherwise synthetic rows get codes such
# as 0.37 that cannot be decoded back to a label.
# NOTE(review): if the CSV contains further 0/1 indicator columns or a row
# identifier, they are still treated as continuous here -- confirm and extend.
CATEGORICAL_COLS = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

dataset = pd.read_csv('healthcare-dataset-stroke-data.csv')

# HANDLE NAN
# Keep the imputation value so it can be recognised again before saving, and
# assign the filled column back: an in-place replace on `dataset.bmi` acts on
# a temporary Series and is a no-op under pandas Copy-on-Write.
bmi_mean = dataset['bmi'].mean()
dataset['bmi'] = dataset['bmi'].fillna(bmi_mean)

# BINARY ENCODING
# NOTE(review): every value other than the one tested for is encoded as 0 and
# will be decoded as 'Female' / 'Rural' / 'No' below -- confirm that each of
# these columns really has only two distinct values.
dataset['gender'] = (dataset['gender'] == 'Male').astype(int)
dataset["Residence_type"] = (dataset["Residence_type"] == "Urban").astype(int)
dataset["ever_married"] = (dataset["ever_married"] == "Yes").astype(int)

# LABEL ENCODING (integer codes assigned in sorted order of the class labels)
label_encoder_smoke = LabelEncoder()
label_encoder_work = LabelEncoder()
dataset['smoking_status'] = label_encoder_smoke.fit_transform(dataset['smoking_status'])
dataset['work_type'] = label_encoder_work.fit_transform(dataset['work_type'])

# Separate the target from the features; `dataset` keeps only the features.
y = dataset['stroke']
dataset = dataset.drop(columns=['stroke'])
x = dataset

# SPLIT
# NOTE(review): the split is not stratified on `y`; confirm this is intended
# for an imbalanced target.
x_train, x_tmp, y_train, y_tmp = train_test_split(x, y, train_size=0.5, random_state=0)

# SMOTE
# SMOTENC interpolates only the continuous columns and picks an existing
# category for the categorical ones. Positional indices are used because
# they are accepted by every imbalanced-learn release that ships SMOTENC.
categorical_idx = [x_train.columns.get_loc(col) for col in CATEGORICAL_COLS]
sm = SMOTENC(categorical_features=categorical_idx, random_state=2)
x_train_res, y_train_res = sm.fit_resample(x_train, y_train)

# Recombine the resampled half with the untouched half. The resampled frame
# and the held-out frame can share index labels, so a fresh index is built to
# keep features and target aligned purely by position.
x_combined = pd.concat([x_train_res, x_tmp], axis=0, ignore_index=True)
y_combined = pd.concat([y_train_res, y_tmp], axis=0, ignore_index=True)

# Join features and target into a single DataFrame.
dataset_combined = pd.concat([x_combined, y_combined], axis=1)

# Reverse mappings from integer codes back to the original labels.
gender_mapping = {1: 'Male', 0: 'Female'}
residence_type_mapping = {1: 'Urban', 0: 'Rural'}
ever_married_mapping = {1: 'Yes', 0: 'No'}
# Derived from the fitted encoders rather than typed by hand, so the decode
# tables cannot drift from the codes that were actually assigned.
smoking_status_mapping = dict(enumerate(label_encoder_smoke.classes_))
work_type_mapping = dict(enumerate(label_encoder_work.classes_))

decoders = {
    'gender': gender_mapping,
    'Residence_type': residence_type_mapping,
    'smoking_status': smoking_status_mapping,
    'ever_married': ever_married_mapping,
    'work_type': work_type_mapping,
}
for column, mapping in decoders.items():
    # Cast first: the resampler may hand the codes back as floats (1.0).
    dataset_combined[column] = dataset_combined[column].astype(int).map(mapping)

# A code without a label would surface here as NaN instead of reaching the CSV.
assert not dataset_combined[CATEGORICAL_COLS].isna().any().any(), \
    "categorical decoding produced missing values"

# Turn the mean-imputed `bmi` entries back into missing values. Only rows whose
# value is exactly the imputation value are reset.
# NOTE(review): synthetic rows generated from an imputed neighbour presumably
# carry an interpolated bmi and therefore stay filled -- confirm this is wanted.
dataset_combined['bmi'] = dataset_combined['bmi'].mask(dataset_combined['bmi'] == bmi_mean)

# Save the combined DataFrame to a new CSV file.
dataset_combined.to_csv("combined_dataset.csv", index=False)