In [1]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import FunctionTransformer

In [2]:
def build_default_preprocessor(
    numerical_features,
    categorical_features,
    numerical_impute_strategy='median',
    categorical_impute_strategy='most_frequent'
):
    """Assemble an unfitted ColumnTransformer for mixed numeric/categorical data.

    Parameters
    ----------
    numerical_features
        Column selector routed to the 'num' branch
        (SimpleImputer -> StandardScaler).
    categorical_features
        Column selector routed to the 'cat' branch
        (SimpleImputer -> OneHotEncoder(handle_unknown='ignore')).
    numerical_impute_strategy
        ``strategy`` passed to the numerical SimpleImputer.
    categorical_impute_strategy
        ``strategy`` passed to the categorical SimpleImputer.

    Returns
    -------
    ColumnTransformer
        Transformers are named 'num' and 'cat'; the pipeline steps are named
        'imputer', 'scaler' and 'onehot'.
    """
    # fill_value='missing' is always handed to the categorical imputer;
    # NOTE(review): per scikit-learn docs it should only matter when the
    # strategy is 'constant' -- confirm for the installed version.
    cat_imputer = SimpleImputer(
        strategy=categorical_impute_strategy,
        fill_value='missing',
    )
    num_steps = [
        ('imputer', SimpleImputer(strategy=numerical_impute_strategy)),
        ('scaler', StandardScaler()),
    ]
    cat_steps = [
        ('imputer', cat_imputer),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
    return ColumnTransformer(transformers=[
        ('num', Pipeline(steps=num_steps), numerical_features),
        ('cat', Pipeline(steps=cat_steps), categorical_features),
    ])

In [3]:
# Small mixed-type frame (strings, ints, floats with NaN, bool, dates) used to
# exercise the preprocessing helpers.
df_test = pd.DataFrame({
    # Categorical features
    'color': ['red', 'blue', 'green', 'red', 'yellow'],
    'size': ['S', 'M', 'L', 'XL', 'M'],
    'category': ['A', 'B', 'A', 'C', 'B'],

    # Numerical features
    'price': [100, 200, 150, 300, 250],
    'quantity': [5, 3, 8, 2, 4],

    # Missing values
    'rating': [4.5, np.nan, 3.8, 4.2, np.nan],

    # Boolean
    'in_stock': [True, False, True, True, False],

    # Dates
    'date': pd.date_range('2024-01-01', periods=5)
})

print("\nDataFrame Shape:", df_test.shape)
print("\nDataFrame Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() emitted a stray "None" line after the report.
df_test.info()
print("\nFirst few rows:")
print(df_test.head())
print("\nData Types:")
print(df_test.dtypes)


DataFrame Shape: (5, 8)

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   color     5 non-null      object        
 1   size      5 non-null      object        
 2   category  5 non-null      object        
 3   price     5 non-null      int64         
 4   quantity  5 non-null      int64         
 5   rating    3 non-null      float64       
 6   in_stock  5 non-null      bool          
 7   date      5 non-null      datetime64[ns]
dtypes: bool(1), datetime64[ns](1), float64(1), int64(2), object(3)
memory usage: 413.0+ bytes
None

First few rows:
    color size category  price  quantity  rating  in_stock       date
0     red    S        A    100         5     4.5      True 2024-01-01
1    blue    M        B    200         3     NaN     False 2024-01-02
2   green    L        A    150         8     3.8      True 2024-01-03
3

In [5]:
# Column groups consumed by the preprocessing pipeline.
categorical_features = ['color', 'size', 'category']
numerical_features = ['price', 'quantity', 'rating']  # numerical columns
# Model inputs: numerical columns first, then categorical ones.
columns = [*numerical_features, *categorical_features]
print(columns)

['price', 'quantity', 'rating', 'color', 'size', 'category']


In [6]:
# Preview the first rows of the raw test frame (all eight columns).
df_test.head()

Unnamed: 0,color,size,category,price,quantity,rating,in_stock,date
0,red,S,A,100,5,4.5,True,2024-01-01
1,blue,M,B,200,3,,False,2024-01-02
2,green,L,A,150,8,3.8,True,2024-01-03
3,red,XL,C,300,2,4.2,True,2024-01-04
4,yellow,M,B,250,4,,False,2024-01-05


In [10]:
# Target vector: the boolean in_stock flag cast to integers, as an ndarray.
in_stock_as_int = df_test['in_stock'].astype(int)
y = in_stock_as_int.values
print(type(y))
print(y[:10])

<class 'numpy.ndarray'>
[1 0 1 1 0]


In [12]:
# Restrict the frame to the model input columns, in the order given by
# `columns` (numerical first, then categorical).
df = df_test[columns]
df.head()

Unnamed: 0,price,quantity,rating,color,size,category
0,100,5,4.5,red,S,A
1,200,3,,blue,M,B
2,150,8,3.8,green,L,A
3,300,2,4.2,red,XL,C
4,250,4,,yellow,M,B


In [13]:
# Build the preprocessor; both imputation strategies are spelled out
# explicitly rather than relying on the function defaults.
preprocessor = build_default_preprocessor(
    numerical_features,
    categorical_features,
    categorical_impute_strategy='most_frequent',
    numerical_impute_strategy='median',
)

In [14]:
# Display the preprocessor object returned by build_default_preprocessor.
preprocessor

In [15]:
# Fit the preprocessor on df and transform it in a single call.
x1 = preprocessor.fit_transform(df)

In [16]:
# First five transformed rows: the scaled numerical columns come first,
# followed by the one-hot encoded categorical columns.
x1[:5]

array([[-1.41421356,  0.29138576,  1.43684242,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  1.        ,  0.        ,  0.        ],
       [ 0.        , -0.6799001 ,  0.08980265,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ],
       [-0.70710678,  1.74831455, -1.70625037,  0.        ,  1.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ],
       [ 1.41421356, -1.16554303,  0.08980265,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  1.        ],
       [ 0.70710678, -0.19425717,  0.08980265,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  0.  

In [17]:
def fit_transform_dataframe(
    df,
    pipeline,
    numerical_features,
    categorical_features
):
    """Fit ``pipeline`` on ``df`` and return the result as a labelled DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; its index is reused for the returned frame.
    pipeline
        Fitted in place by this call. Must expose ``fit_transform`` and, after
        fitting, ``named_transformers_['cat'].named_steps['onehot']`` with a
        ``get_feature_names_out`` method (the layout produced by
        ``build_default_preprocessor``).
    numerical_features
        Names of the numerical columns, in the order the pipeline emits them.
        Any iterable (list, tuple, Index, ...) is accepted.
    categorical_features
        Input names of the categorical columns, forwarded to
        ``get_feature_names_out``.

    Returns
    -------
    pd.DataFrame
        Dense frame whose columns are the numerical names followed by the
        one-hot feature names.
    """
    transformed = pipeline.fit_transform(df)
    # The transformer may hand back a scipy sparse matrix when the one-hot
    # block dominates the output; pd.DataFrame cannot attach column labels to
    # that directly, so densify first. Dense ndarrays have no `toarray`.
    if hasattr(transformed, 'toarray'):
        transformed = transformed.toarray()

    onehot = pipeline.named_transformers_['cat'].named_steps['onehot']
    categorical_names = onehot.get_feature_names_out(categorical_features)
    # list(...) on both sides so tuples / arrays / Index objects concatenate
    # as sequences instead of failing or being added element-wise.
    all_features = list(numerical_features) + list(categorical_names)
    return pd.DataFrame(transformed, columns=all_features, index=df.index)

In [18]:
# Re-fit the preprocessor and get the transformed data back with column labels.
x2 = fit_transform_dataframe(
    df=df,
    pipeline=preprocessor,
    numerical_features=numerical_features,
    categorical_features=categorical_features,
)

In [19]:
# Labelled view of the transformed data: numerical columns keep their names,
# one-hot columns are named <feature>_<category>.
x2.head()

Unnamed: 0,price,quantity,rating,color_blue,color_green,color_red,color_yellow,size_L,size_M,size_S,size_XL,category_A,category_B,category_C
0,-1.414214,0.291386,1.436842,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0.0,-0.6799,0.089803,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,-0.707107,1.748315,-1.70625,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.414214,-1.165543,0.089803,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.707107,-0.194257,0.089803,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [21]:
# Drop the labels again: plain ndarray view of the transformed frame.
x3 = x2.to_numpy()

In [23]:
# First five rows of the array extracted from x2, for comparison with x1[:5].
x3[:5]

array([[-1.41421356,  0.29138576,  1.43684242,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  1.        ,  0.        ,  0.        ],
       [ 0.        , -0.6799001 ,  0.08980265,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ],
       [-0.70710678,  1.74831455, -1.70625037,  0.        ,  1.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ],
       [ 1.41421356, -1.16554303,  0.08980265,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  1.        ],
       [ 0.70710678, -0.19425717,  0.08980265,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  0.  

In [24]:
# Re-display the raw frame before repeating the experiment on plain arrays.
df_test.head()

Unnamed: 0,color,size,category,price,quantity,rating,in_stock,date
0,red,S,A,100,5,4.5,True,2024-01-01
1,blue,M,B,200,3,,False,2024-01-02
2,green,L,A,150,8,3.8,True,2024-01-03
3,red,XL,C,300,2,4.2,True,2024-01-04
4,yellow,M,B,250,4,,False,2024-01-05


In [25]:
# Model input columns as a DataFrame; show the first two rows.
x4 = df_test[columns]
print(x4.head(2))

   price  quantity  rating color size category
0    100         5     4.5   red    S        A
1    200         3     NaN  blue    M        B


In [27]:
# Same data as a raw ndarray (mixed numbers and strings); show two rows.
x5 = x4.to_numpy()
print(x5[:2])

[[100 5 4.5 'red' 'S' 'A']
 [200 3 nan 'blue' 'M' 'B']]


In [28]:
# Second preprocessor that addresses columns by position instead of by name,
# for use on the plain array x5: 0-2 are numerical, 3-5 categorical.
preprocessor1 = build_default_preprocessor([0, 1, 2], [3, 4, 5])

In [29]:
# Fit and apply the position-based preprocessor to the raw array.
x6 = preprocessor1.fit_transform(x5)

In [30]:
# First two rows; compare with the first two rows of x1 shown earlier.
x6[:2]

array([[-1.41421356,  0.29138576,  1.43684242,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  1.        ,  0.        ,  0.        ],
       [ 0.        , -0.6799001 ,  0.08980265,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ]])

In [31]:
# Untransformed model inputs, shown for side-by-side comparison with x2.
x7 = df_test[columns]
x7.head()

Unnamed: 0,price,quantity,rating,color,size,category
0,100,5,4.5,red,S,A
1,200,3,,blue,M,B
2,150,8,3.8,green,L,A
3,300,2,4.2,red,XL,C
4,250,4,,yellow,M,B


In [32]:
# Transformed counterpart of the rows displayed in the previous cell.
x2.head()

Unnamed: 0,price,quantity,rating,color_blue,color_green,color_red,color_yellow,size_L,size_M,size_S,size_XL,category_A,category_B,category_C
0,-1.414214,0.291386,1.436842,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0.0,-0.6799,0.089803,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,-0.707107,1.748315,-1.70625,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.414214,-1.165543,0.089803,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.707107,-0.194257,0.089803,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [33]:
def to_dict(df: pd.DataFrame, columns: list[str]) -> list[dict]:
    """Return the selected ``columns`` of ``df`` as one dictionary per row.

    Each dictionary maps column name to that row's value. Values are passed
    through unchanged; no missing-value handling is done here.
    """
    selected = df[columns]
    records = selected.to_dict(orient='records')
    return records

In [34]:
# One dictionary per row of df, restricted to the model input columns.
dict_x = to_dict(df, columns)

In [35]:
# Inspect the records; the missing ratings are carried over as nan.
dict_x

[{'price': 100,
  'quantity': 5,
  'rating': 4.5,
  'color': 'red',
  'size': 'S',
  'category': 'A'},
 {'price': 200,
  'quantity': 3,
  'rating': nan,
  'color': 'blue',
  'size': 'M',
  'category': 'B'},
 {'price': 150,
  'quantity': 8,
  'rating': 3.8,
  'color': 'green',
  'size': 'L',
  'category': 'A'},
 {'price': 300,
  'quantity': 2,
  'rating': 4.2,
  'color': 'red',
  'size': 'XL',
  'category': 'C'},
 {'price': 250,
  'quantity': 4,
  'rating': nan,
  'color': 'yellow',
  'size': 'M',
  'category': 'B'}]

In [37]:
from sklearn.feature_extraction import DictVectorizer

def get_dict_vectorizer():
    """Create a new, unfitted DictVectorizer configured with ``sparse=False``."""
    vectorizer = DictVectorizer(sparse=False)
    return vectorizer

In [38]:
# Fresh, unfitted vectorizer from the factory defined above.
vect = get_dict_vectorizer()

In [40]:
# Fit the vectorizer on the row dictionaries and encode them in one call.
# NOTE: as the output below shows, numeric values are neither scaled nor
# imputed here -- the nan ratings from dict_x end up in x8 unchanged.
x8 = vect.fit_transform(dict_x)

In [41]:
# Encoded matrix: one row per record, one column per vectorizer feature.
x8

array([[  1. ,   0. ,   0. ,   0. ,   0. ,   1. ,   0. , 100. ,   5. ,
          4.5,   0. ,   0. ,   1. ,   0. ],
       [  0. ,   1. ,   0. ,   1. ,   0. ,   0. ,   0. , 200. ,   3. ,
          nan,   0. ,   1. ,   0. ,   0. ],
       [  1. ,   0. ,   0. ,   0. ,   1. ,   0. ,   0. , 150. ,   8. ,
          3.8,   1. ,   0. ,   0. ,   0. ],
       [  0. ,   0. ,   1. ,   0. ,   0. ,   1. ,   0. , 300. ,   2. ,
          4.2,   0. ,   0. ,   0. ,   1. ],
       [  0. ,   1. ,   0. ,   0. ,   0. ,   0. ,   1. , 250. ,   4. ,
          nan,   0. ,   1. ,   0. ,   0. ]])

In [42]:
# Column labels for x8; string-valued features appear as "<name>=<value>",
# numeric features keep their plain name.
vect.feature_names_

['category=A',
 'category=B',
 'category=C',
 'color=blue',
 'color=green',
 'color=red',
 'color=yellow',
 'price',
 'quantity',
 'rating',
 'size=L',
 'size=M',
 'size=S',
 'size=XL']