**Data reading and preparation**

Let's import the necessary Python packages and read the fiktiv_szemely2.csv data file! Discard the id column from the loaded data!

In [None]:
from sklearn import preprocessing as skp
import numpy as np
import pandas as pd

# Load the semicolon-separated person table; the first line of the file holds the column names.
szem = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/1_KorszDM_I/1_DSalapok/fiktiv_szemely2.csv',
    sep=';',
    header=0,
)
# The id column is not used in any later step, so it is removed right away.
szem = szem.drop(columns=['id'])
szem

Since the string-type values in the smoking column would cause problems during preprocessing, we recode it into a single dummy column that stores whether the person smokes (0: no; 1: yes).

In [None]:
# Recode the string-valued 'smoking' column into indicator (dummy) columns.
# dtype=int makes the indicators explicit 0/1 integers; recent pandas versions
# would otherwise return True/False booleans.
szem = pd.get_dummies(szem, columns=['smoking'], drop_first=False, dtype=int)
# Drop the "No" indicator so that only the remaining indicator column is kept
# (presumably a single "Yes" column - assumes the data holds only Yes/No values).
szem = szem.drop(columns=['smoking_No'])
szem

In [None]:
# Keep an independent copy of the recoded table before any imputation is applied to szem,
# so that a second imputation method can later be tried on the same starting data.
szem2 = szem.copy(deep=True)

2. Filling empty data

Let's check where the continuous-valued attributes are missing data! Let's print the indices of the affected rows and then the rows themselves!

In [None]:
# Index labels of the rows in which each continuous attribute is missing.
missing_weight_idx = szem.index[szem['weight'].isna()].tolist()
missing_height_idx = szem.index[szem['height'].isna()].tolist()
missing_age_idx = szem.index[szem['age'].isna()].tolist()

print ('Weight missing: ', missing_weight_idx)
print ('Height missing: ', missing_height_idx)
print ('Age missing: ', missing_age_idx)

print('\n')
# The lists above contain index *labels*, so the rows are selected with the
# label-based .loc; the position-based .iloc only agrees with it while the
# frame still has its default 0..n-1 index.
print(szem.loc[missing_weight_idx], '\n')
print(szem.loc[missing_height_idx], '\n')
print(szem.loc[missing_age_idx], '\n')

Filling in the missing body weight values with the average value:

In [None]:
from sklearn.impute import SimpleImputer

# Replace the missing body weights with the mean of the observed weights.
imp_weight = SimpleImputer(missing_values=np.nan, strategy='mean')
# fit_transform returns an (n, 1) array; ravel() flattens it back into a single column.
szem['weight'] = imp_weight.fit_transform(szem[['weight']]).ravel()

# Show the rows that were filled in. missing_weight_idx holds index labels,
# so the label-based .loc is used instead of the position-based .iloc.
szem.loc[missing_weight_idx]

Filling in the missing body heights with the median value:

In [None]:
# Replace the missing body heights with the median of the observed heights.
imp_height = SimpleImputer(missing_values=np.nan, strategy='median')
# fit_transform returns an (n, 1) array; ravel() flattens it back into a single column.
szem['height'] = imp_height.fit_transform(szem[['height']]).ravel()

# Show the rows that were filled in. missing_height_idx holds index labels,
# so the label-based .loc is used instead of the position-based .iloc.
szem.loc[missing_height_idx]

Imputing missing ages based on a model using IterativeImputer

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Model-based imputation of the missing 'age' values with IterativeImputer.
# The imputer predicts each incomplete feature from the *other* features, so it
# must be fitted on the whole table: given only the 'age' column it has no
# predictors and simply returns the initial (mean) imputation.
# Assumes every column of szem is numeric at this point - the later scaling
# cells make the same assumption.
imputer = IterativeImputer(max_iter=10, random_state=0)
szem_iter = pd.DataFrame(imputer.fit_transform(szem), columns=szem.columns, index=szem.index)
# Only 'age' is written back; the other columns keep their current values.
szem['age'] = szem_iter['age']

# List the rows whose age was originally missing (the list holds index labels, hence .loc)
szem.loc[missing_age_idx]

Since all missing values in szem have already been replaced, let's work on the szem2 dataframe (the copy made before imputation) and fill in the missing ages using KNNImputer!

In [None]:
from sklearn.impute import KNNImputer

# Create the KNNImputer and fill in the 'age' column of szem2.
# Neighbours are found by comparing the *other* attributes, so the imputer is
# fitted on the whole table: with 'age' as the only feature, a row with a
# missing age has no computable distance to any other row and the imputer
# falls back to the column mean.
# Missing values in the other columns are tolerated by the NaN-aware distance
# that KNNImputer uses by default.
imp_age2 = KNNImputer(n_neighbors=5)
szem2_knn = pd.DataFrame(imp_age2.fit_transform(szem2), columns=szem2.columns, index=szem2.index)
# Only 'age' is written back, so the other columns of szem2 stay untouched.
szem2['age'] = szem2_knn['age']

# List the rows whose age was originally missing (the list holds index labels, hence .loc)
szem2.loc[missing_age_idx]

**3. Feature scaling**

In [None]:
# Min-Max scaling: every column is rescaled linearly into the [0, 1] range.
scaler = skp.MinMaxScaler(feature_range=(0, 1))
# The scaler returns a plain array, so the column names are re-attached.
szem_scaled = pd.DataFrame(scaler.fit_transform(szem), columns=szem.columns)

print("Original dataset:\n", szem)
print("\n Min-Max scaled dataset:\n", szem_scaled)

Standardization (z-score normalization)

In [None]:
# Reload the raw file and repeat the basic preparation (drop id, recode smoking),
# so this cell works on the original, un-imputed values.
szem = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/1_KorszDM_I/1_DSalapok/fiktiv_szemely2.csv',
    sep=';',
    header=0,
)
szem = szem.drop(columns=['id'])
szem = pd.get_dummies(szem, columns=['smoking'], drop_first=False).drop(columns=['smoking_No'])

# Z-score normalization: the scaler returns a plain array, so the column names are re-attached.
z_scores = skp.StandardScaler().fit_transform(szem)
szem_standard = pd.DataFrame(z_scores, columns=szem.columns)

print("Original dataset:\n", szem)
print("\n Z-score normal dataset: \n", szem_standard)

L-norm

In [None]:
# Reload the raw file and repeat the basic preparation (drop id, recode smoking).
szem = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/1_KorszDM_I/1_DSalapok/fiktiv_szemely2.csv',
    sep=';',
    header=0,
)
szem = szem.drop(columns=['id'])
szem = pd.get_dummies(szem, columns=['smoking'], drop_first=False).drop(columns=['smoking_No'])


from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Fill in every missing value of the table with IterativeImputer, then wrap the
# resulting array in a DataFrame that carries the original column names.
imp = IterativeImputer(max_iter=10)
szem_imputed = pd.DataFrame(imp.fit_transform(szem), columns=szem.columns)
# info() shows the non-null count per column, i.e. whether anything is still missing.
szem_imputed.info()

# L1 normalization: normalize() is called with its default axis, which in
# scikit-learn rescales each row (sample) so that its absolute values sum to 1.
l1_values = skp.normalize(szem_imputed, norm="l1")
szem_imputed_l1 = pd.DataFrame(l1_values, columns=szem.columns)

print("Original dataset:\n", szem_imputed)
print("\n L1 normal dataset:\n", szem_imputed_l1)

# L2 normalization: same call, but each row is rescaled to unit Euclidean length.
l2_values = skp.normalize(szem_imputed, norm="l2")
szem_imputed_l2 = pd.DataFrame(l2_values, columns=szem.columns)

print("Original dataset:\n", szem_imputed)
print("\n L2 normal dataset:\n", szem_imputed_l2)

Binarization

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Reload the raw file and repeat the basic preparation (drop id, recode smoking).
szem = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/1_KorszDM_I/1_DSalapok/fiktiv_szemely2.csv', sep=';', header=0)
szem = szem.drop(columns=['id'])
szem = pd.get_dummies(szem, columns=['smoking'], drop_first=False)
szem = szem.drop(columns=['smoking_No'])

# Fill in every missing value with IterativeImputer (returns a plain NumPy array).
imp = IterativeImputer(max_iter=20)
szem_ = imp.fit_transform(szem)
# The binarization code below reads szem_imputed['height']; publish this
# section's result under that name so it no longer depends on a frame left
# over from the normalization section.
szem_imputed = pd.DataFrame(szem_, columns=szem.columns, index=szem.index)



In [None]:
from sklearn.preprocessing import Binarizer

# Binarize the imputed heights: values above 180 become 1, everything else 0.
binarizer = Binarizer(threshold=180)

# Binarizer expects a 2-D array, so the heights are passed as a single row.
height = szem_imputed['height'].to_numpy()
height_binarized = binarizer.fit_transform(height.reshape(1, -1))

# Flatten the (1, n) result and store it as an integer column next to the raw data.
szem['height_binarized'] = height_binarized.ravel().astype(int)

szem

In [None]:
# Alternative without Binarizer: a plain comparison yields the same 0/1 flag.
# The imputed heights are used here as well, exactly as in the Binarizer
# version; the raw szem['height'] column still contains NaN, and NaN > 180
# evaluates to False, which would silently turn missing heights into 0.
szem['height_binarized_v2'] = (szem_imputed['height'] > 180).astype(int).to_numpy()
szem