## Preprocessing pipelines

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector, ColumnTransformer
import pandas as pd
import os
import sys
import importlib
from column_drop import *

ModuleNotFoundError: No module named 'column_drop'

## Preprocessing pipelines

In this part we build the pipelines that perform the following preprocessing steps:
* dropping irrelevant columns
* imputing missing values
* encoding categorical features
* scaling the features to the [0, 1] range


In [1]:
# Column-dropping stage: three project-defined transformers applied in sequence.
drop_cols_pipeline = Pipeline(steps=[
    ('missing_drop', DropMissing()),
    ('drop_low_variance', DropLowVarianceCategorical()),
    ('drop_cardinality', DropHighCardinality()),
])

# Numeric columns: fill missing values with the column median.
numeric_pipeline = Pipeline(steps=[
    ('numeric_imputer', SimpleImputer(strategy='median')),
])

# Categorical columns: label encoding is used because the many unique category
# values would otherwise greatly increase dimensionality; the encoded columns
# are then imputed with their most frequent value.
categorical_pipeline = Pipeline(steps=[
    ('encoding', CustomLabelEncoder()),
    ('categorical_imputer', SimpleImputer(strategy='most_frequent')),
])

# Route object-dtype columns to the categorical pipeline and numeric-dtype
# columns to the numeric pipeline; columns matched by neither selector are
# dropped. The two branches are fitted in parallel (n_jobs=-1).
transformer = ColumnTransformer(
    transformers=[
        ('cat_pipe', categorical_pipeline, make_column_selector(dtype_include='object')),
        ('num_pipe', numeric_pipeline, make_column_selector(dtype_include='number')),
    ],
    remainder='drop',
    n_jobs=-1,
)

# Full preprocessing chain: drop columns -> per-dtype transforms -> min-max scaling.
preprocessing_pipeline = Pipeline(steps=[
    ('drop', drop_cols_pipeline),
    ('column_transform', transformer),
    ('scale', MinMaxScaler()),
])

NameError: name 'Pipeline' is not defined

### Test:

In [169]:
# Load the training CSV (X3_train.csv); the path is relative to the current working directory.
train_df3 = pd.read_csv("../data/train/X3_train.csv")

In [170]:
# Show the loaded frame as the cell output to eyeball its columns and missing values.
train_df3

Unnamed: 0,Date,Rating,Header,Status,Description,Aircraft,Type Of Traveller,Seat Type,Route,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Ground Service,Value For Money,Wifi & Connectivity
0,15th November 2021,1,"""penny pinching by BA in club class""",Not Verified,"Not Verified | Outbound, seated in row 10, I ...",A320neo,Couple Leisure,Business Class,London Heathrow to Tenerife South,November 2021,3.0,3.0,1.0,,2.0,1,
1,11th May 2022,4,"""no cabin divider in the aircraft""",Trip Verified,✅ Trip Verified | Very low standards for Club...,A320,Couple Leisure,Business Class,Athens to London,May 2022,2.0,4.0,3.0,,3.0,2,2.0
2,25th March 2022,4,"""The food was awful""",Not Verified,Not Verified | The food was awful. An over ni...,,Couple Leisure,Premium Economy,Dubai to Heathrow,March 2022,3.0,2.0,1.0,3.0,4.0,2,
3,11th June 2022,1,"""This airline is a complete disaster""",Trip Verified,✅ Trip Verified | Worst airline experience ev...,,Family Leisure,Business Class,Dallas Fort-Worth to Toulouse via London,June 2022,1.0,1.0,1.0,1.0,1.0,1,1.0
4,20th October 2019,1,"""The seat was uncomfortable""",Trip Verified,✅ Trip Verified | My flight from London to Si...,Boeing 777,Couple Leisure,Premium Economy,London to Singapore,October 2019,1.0,1.0,1.0,,3.0,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,13th September 2023,1,"""the worst airline service""",Trip Verified,✅ Trip Verified | My daughter and I were deni...,,Business,Business Class,Madrid to Vancouver via London,April 2023,3.0,3.0,,,1.0,1,
696,2nd March 2023,10,"""Would happily fly them again""",Trip Verified,✅ Trip Verified | Would happily fly them agai...,Boeing 777 / A320,Solo Leisure,Economy Class,New York to Istanbul via London,March 2023,5.0,5.0,5.0,5.0,5.0,5,
697,7th August 2019,1,"""time for some heads to roll""",Trip Verified,✅ Trip Verified | Palma to London Heathrow. H...,A320,Couple Leisure,Business Class,Palma to London Heathrow,August 2019,1.0,1.0,1.0,1.0,1.0,1,1.0
698,19th May 2022,1,"""They actually give me a voucher""",Trip Verified,✅ Trip Verified | BA says that tickets are fl...,,Business,Economy Class,Columbus to Nairobi via Chicago / London,May 2022,,,,,,1,


In [171]:
# Fit every stage of the preprocessing pipeline on the training frame and transform it in one call.
transformed = preprocessing_pipeline.fit_transform(train_df3)
# Wrap the result in a DataFrame for display; no column names are passed, so columns get integer labels.
pd.DataFrame(transformed)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.0,0.230769,0.25,0.000000,0.777778,0.00,0.50,0.50,0.0,0.5,0.25,0.00
1,1.0,0.076923,0.25,0.000000,0.698413,0.25,0.25,0.75,0.5,0.5,0.50,0.25
2,0.0,1.000000,0.25,1.000000,0.619048,0.25,0.50,0.25,0.0,0.5,0.75,0.25
3,1.0,1.000000,0.50,0.000000,0.555556,0.00,0.00,0.00,0.0,0.0,0.00,0.00
4,1.0,0.676923,0.25,1.000000,0.841270,0.00,0.00,0.00,0.0,0.5,0.50,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...
695,1.0,1.000000,0.00,0.000000,0.063492,0.00,0.50,0.50,0.5,0.5,0.00,0.00
696,1.0,0.707692,0.75,0.333333,0.634921,1.00,1.00,1.00,1.0,1.0,1.00,1.00
697,1.0,0.076923,0.25,0.000000,0.079365,0.00,0.00,0.00,0.0,0.0,0.00,0.00
698,1.0,1.000000,0.00,0.333333,0.698413,0.00,0.50,0.50,0.5,0.5,0.25,0.00
