<div style="line-height:1.2;">

<h1 style="color:#BF66F2; margin-bottom: 0.5em;">Data preprocessing 1</h1>

<h4 style="margin-top: 0.3em; margin-bottom: 1.5em;"> Scale, Encode, and Impute features with sklearn.

<div style="line-height:1.4; margin-bottom: 1em;">
    <h3 style="color: lightblue; display: inline; margin-right: 0.5em;">Keywords:</h3> 
    ColumnTransformer + CountVectorizer + pandas astype(float) + numpy.log1p + 
</div>

<div style="line-height:1.4; margin-top: 1em;">
    <h3 style="color: red; display: inline; margin-right: 0.5em;">Notes:</h3> For several other example about Sklearn Preprocessing look at:  "../../Machine_Learning_guide/ml_common_techniques_sklearn.ipynb"
</div>

</div>

In [14]:
import numpy as np
import pandas as pd

from sklearn.base import TransformerMixin
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, Normalizer, Binarizer, OneHotEncoder, \
    LabelEncoder, PolynomialFeatures, FunctionTransformer, PowerTransformer, \
    QuantileTransformer, KBinsDiscretizer, MultiLabelBinarizer, OrdinalEncoder

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [2]:
""" Read csv """
df_hug = pd.read_csv("./data_csv/hug_acquisitions_22.csv", header=None, skiprows=None, sep=';')
df_hug.head()
df_hug.columns = ['front_left_pressure_voltage', 'front_right_pressure_voltage', 'right_pressure_voltage', 'back_pressure_voltage', 'left_pressure_voltage', 
                'gyroscope_1_voltage', 'gyroscope_2_voltage']

df_hug.iloc[:, :5] = df_hug.iloc[:, :5].astype(float)

col_dtype = df_hug['gyroscope_1_voltage'].dtype

print(col_dtype)
df_hug.head()

int64


Unnamed: 0,front_left_pressure_voltage,front_right_pressure_voltage,right_pressure_voltage,back_pressure_voltage,left_pressure_voltage,gyroscope_1_voltage,gyroscope_2_voltage
0,0.42,0.43,0.38,0.28,0.3,35,-136
1,0.44,0.44,0.41,0.3,0.63,49,-107
2,0.48,0.46,0.47,0.31,0.36,-62,-206
3,0.51,0.47,0.52,0.31,0.47,-30,-220
4,0.5,0.47,0.51,0.31,0.39,52,-153


In [3]:
random_numbers = np.random.randint(low=0, high=101, size=len(df_hug)) #array of 1370 random integers between 0 and 100
df_temp = pd.DataFrame({'numerical': random_numbers})
print(df_temp.head(10))

   numerical
0         66
1         93
2         29
3         67
4         60
5         83
6         73
7         72
8         25
9         30


In [4]:
X = pd.DataFrame()

X['pressure_mean'] = df_hug[['front_left_pressure_voltage', 'front_right_pressure_voltage', 
                            'right_pressure_voltage', 'back_pressure_voltage', 'left_pressure_voltage']].mean(axis=1)

X['gyroscope_sum'] = df_hug['gyroscope_1_voltage'] + df_hug['gyroscope_2_voltage']

# Create a new "category" column based on a numerical column
X['category'] = pd.cut(df_temp['numerical'], bins=[0, 25, 50, 75, 100], labels=['low', 'medium', 'high', 'very high'])

print("df_hug.columns ==> {}".format(df_hug.columns))
print()
print("X.columns ==> {}".format(X.columns))
print(X.head(10))

df_hug.columns ==> Index(['front_left_pressure_voltage', 'front_right_pressure_voltage',
       'right_pressure_voltage', 'back_pressure_voltage',
       'left_pressure_voltage', 'gyroscope_1_voltage', 'gyroscope_2_voltage'],
      dtype='object')

X.columns ==> Index(['pressure_mean', 'gyroscope_sum', 'category'], dtype='object')
   pressure_mean  gyroscope_sum   category
0          0.362           -101       high
1          0.444            -58  very high
2          0.416           -268     medium
3          0.456           -250       high
4          0.436           -101       high
5          0.430            -98  very high
6          0.432            -89       high
7          0.432            -95       high
8          0.428            -91        low
9          0.424            -99     medium


<h2 style="color:#E74C3C"> Scalers: </h2>

In [5]:
# Standardization (Z-score scaling)
numeric_transformer = StandardScaler()
# One-Hot Encoding (for categorical features)
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

numeric_features = ['pressure_mean', 'gyroscope_sum']
categorical_features = ['category']

preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features), 
                                            ('cat', categorical_transformer, categorical_features)])
X_preprocessed = preprocessor.fit_transform(X)
X_preprocessed[:5]

array([[ 0.86837867, -0.17697585,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ],
       [ 3.10523228,  1.11877944,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ],
       [ 2.34142861, -5.20932778,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ],
       [ 3.43257671, -4.66691859,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ],
       [ 2.88700266, -0.17697585,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ]])

In [6]:
# Min-Max Scaling
min_max_scaler = MinMaxScaler()
X_minmax = min_max_scaler.fit_transform(X_preprocessed)
X_minmax[:3]

array([[0.5877193 , 0.4698609 , 1.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.94736842, 0.53632148, 0.        , 0.        , 0.        ,
        1.        , 0.        ],
       [0.8245614 , 0.21174652, 0.        , 0.        , 1.        ,
        0.        , 0.        ]])

In [7]:
# Robust Scaling (resistant to outliers)
robust_scaler = RobustScaler()
X_robust = robust_scaler.fit_transform(X_preprocessed)
X_robust[:3]

array([[  0.66086957,  -0.66666667,   1.        ,   0.        ,
          0.        ,   0.        ,   0.        ],
       [  2.08695652,   4.11111111,   0.        ,   0.        ,
          0.        ,   1.        ,   0.        ],
       [  1.6       , -19.22222222,   0.        ,   0.        ,
          1.        ,   0.        ,   0.        ]])

In [8]:
# Normalization (scaling each sample to have unit norm)
normalizer = Normalizer()
X_normalized = normalizer.fit_transform(X_preprocessed)
X_normalized[:3]

array([[ 0.64989193, -0.13244818,  0.74839693,  0.        ,  0.        ,
         0.        ,  0.        ],
       [ 0.90038378,  0.32439791,  0.        ,  0.        ,  0.        ,
         0.28995698,  0.        ],
       [ 0.40381835, -0.89843531,  0.        ,  0.        ,  0.17246665,
         0.        ,  0.        ]])

In [9]:
# Binarization (thresholding)
binarizer = Binarizer(threshold=2.5)
X_binarized = binarizer.fit_transform(X_preprocessed)
X_binarized[:3]

array([[0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0.]])

In [10]:
# One-Hot Encoding (for categorical features)
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X_preprocessed[:, -1].reshape(-1, 1))
X_encoded[:3]

<3x2 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [11]:
# Label Encoding (for target variable)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(X['category'])
y_encoded[:3]

array([0, 3, 2])

In [12]:
# Polynomial Features (e.g., adding quadratic terms)
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X_preprocessed[:, :2])
X_poly[:3]

array([[  1.        ,   0.86837867,  -0.17697585,   0.75408151,
         -0.15368205,   0.03132045],
       [  1.        ,   3.10523228,   1.11877944,   9.64246753,
          3.47407005,   1.25166744],
       [  1.        ,   2.34142861,  -5.20932778,   5.48228794,
        -12.19726911,  27.13709596]])

In [13]:
# Custom Transformation using FunctionTransformer
custom_transformer = FunctionTransformer(func=np.log1p)
X_log_transformed = custom_transformer.transform(X_preprocessed)
X_log_transformed[:3]

array([[ 0.62507103, -0.19476973,  0.69314718,  0.        ,  0.        ,
         0.        ,  0.        ],
       [ 1.41226233,  0.75084019,  0.        ,  0.        ,  0.        ,
         0.69314718,  0.        ],
       [ 1.20639844,         nan,  0.        ,  0.        ,  0.69314718,
         0.        ,  0.        ]])

In [34]:
# Power Transformation (e.g., Box-Cox transformation)
power_transformer = PowerTransformer(method='yeo-johnson')
X_power_transformed = power_transformer.fit_transform(X_preprocessed[:, :2])
X_power_transformed[:3]

array([[ 0.87701866, -0.16200766],
       [ 2.86906994,  1.11117151],
       [ 2.2070467 , -5.61495872]])

In [35]:
# Discretization into bins
bin_discretizer = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
X_binned = bin_discretizer.fit_transform(X_preprocessed[:, :2])
X_binned[:3]

array([[1., 1.],
       [2., 1.],
       [2., 0.]])

In [36]:
# Ordinal Encoding (for ordinal categorical features)
ordinal_encoder = OrdinalEncoder()
X_ordinal_encoded = ordinal_encoder.fit_transform(X_preprocessed[:, -1].reshape(-1, 1))
X_ordinal_encoded[:3]

array([[0.],
       [0.],
       [0.]])

<h2 style="color:#E74C3C"> Handling missing values: </h2>

In [15]:
s_imputer_1 = SimpleImputer(strategy='mean')
s_imputer_2 = SimpleImputer(strategy='median')
s_imputer_3 = SimpleImputer(strategy='most_frequent')
X_imputed_1 = s_imputer_1.fit_transform(X_preprocessed)
X_imputed_2 = s_imputer_2.fit_transform(X_preprocessed)
X_imputed_3 = s_imputer_3.fit_transform(X_preprocessed)

print(X_imputed_1[:3])
print("\n", X_imputed_2[:3])
print("\n", X_imputed_3[:3])

[[ 0.86837867 -0.17697585  1.          0.          0.          0.
   0.        ]
 [ 3.10523228  1.11877944  0.          0.          0.          1.
   0.        ]
 [ 2.34142861 -5.20932778  0.          0.          1.          0.
   0.        ]]

 [[ 0.86837867 -0.17697585  1.          0.          0.          0.
   0.        ]
 [ 3.10523228  1.11877944  0.          0.          0.          1.
   0.        ]
 [ 2.34142861 -5.20932778  0.          0.          1.          0.
   0.        ]]

 [[ 0.86837867 -0.17697585  1.          0.          0.          0.
   0.        ]
 [ 3.10523228  1.11877944  0.          0.          0.          1.
   0.        ]
 [ 2.34142861 -5.20932778  0.          0.          1.          0.
   0.        ]]


In [17]:
imputer = KNNImputer(n_neighbors=5)
X_imputed_4 = s_imputer_3.fit_transform(X_preprocessed)
X_imputed_4[:3]

array([[ 0.86837867, -0.17697585,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ],
       [ 3.10523228,  1.11877944,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ],
       [ 2.34142861, -5.20932778,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ]])

#### => Custom imputer

In [18]:
class CustomImputer(TransformerMixin):
    """ Custom imputer for handling missing values in a pandas DataFrame.

    Parameters:
        - The imputation strategy to use ('mean', 'median', 'custom', etc.) [str]
        - Imputation is performed within subgroups based on this column. [str, optional]

    Attributes:
        - The imputed values based on the specified strategy [pandas Series]
    """

    def __init__(self, strategy='mean', subgroup_column=None):
        self.strategy = strategy
        self.subgroup_column = subgroup_column

    def fit(self, X, y=None):
        """ Fit the imputer to the input data """
        if self.strategy == 'mean':
            self.impute_values = np.nanmean(X, axis=0)
        elif self.strategy == 'median':
            self.impute_values = np.nanmedian(X, axis=0)
        elif self.strategy == 'custom':
            # Custom strategy: Impute with the maximum value in each column
            self.impute_values = np.nanmax(X, axis=0)
        return self

    def transform(self, X):
        """ Impute missing values in the input DataFrame """
        X_imputed = X.copy()
        missing_values = np.isnan(X)
        X_imputed[missing_values] = np.take(self.impute_values, np.where(missing_values)[1])
        return X_imputed


In [19]:
imputer_5 = CustomImputer(strategy='mean', subgroup_column='Category')
X_imputed_5 = imputer_5.fit_transform(X_preprocessed)
X_imputed_5[:3]

array([[ 0.86837867, -0.17697585,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ],
       [ 3.10523228,  1.11877944,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ],
       [ 2.34142861, -5.20932778,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ]])

<h2 style="color:#E74C3C"> Explore the dataset </h2>

In [20]:
""" Print the whole dataset """
df_names = pd.read_csv('./data_csv/names.csv') 
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df_names)

         index             name sex
0            0             Mary   F
1            1             Anna   F
2            2             Emma   F
3            3        Elizabeth   F
4            4           Minnie   F
5            5         Margaret   F
6            6              Ida   F
7            7            Alice   F
8            8           Bertha   F
9            9            Sarah   F
10          10            Annie   F
11          11            Clara   F
12          12             Ella   F
13          13         Florence   F
14          14             Cora   F
15          15           Martha   F
16          16            Laura   F
17          17           Nellie   F
18          18            Grace   F
19          19           Carrie   F
20          20            Maude   F
21          21            Mabel   F
22          22           Bessie   F
23          23           Jennie   F
24          24         Gertrude   F
25          25            Julia   F
26          26           Hat

In [21]:
print(df_names.size)
print(df_names.columns)
print(df_names.dtypes)
print(df_names.isnull().isnull().sum())

285075
Index(['index', 'name', 'sex'], dtype='object')
index     int64
name     object
sex      object
dtype: object
index    0
name     0
sex      0
dtype: int64


In [22]:
print(df_names[df_names.sex == 'F'].size)
print(df_names[df_names.sex == 'M'].size)

181800
103275


In [23]:
""" Replace all F and M with 0 and 1 respectively """
df_names1 = df_names.sex.replace({'F':0,'M':1})
print(df_names1.head())
print(df_names.size)

0    0
1    0
2    0
3    0
4    0
Name: sex, dtype: int64
285075


In [24]:
# Remove duplicates
df_names.sex.unique() 
df_names.size

285075

<div style="line-height:0.5">
<h2 style="color:#E74C3C"> CountVectorizer: </h2>
Convert a collection of text documents to a matrix of token counts, producing a sparse representation of the counts 
<div>

In [25]:
""" 
N.B.
X has shape (95025, 95025) => cannot be printed !
"""
Xfeatures = df_names['name']
cv = CountVectorizer()
X = cv.fit_transform(Xfeatures)

In [26]:
cv.get_feature_names_out()

array(['aaban', 'aabha', 'aabid', ..., 'zyyanna', 'zyyon', 'zzyzx'],
      dtype=object)