In [2]:
import numpy as np
import pandas as pd
from joblib import dump
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Read the raw CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='data.csv')

# Display the column index so the available fields are visible in the cell output
df.columns

Index(['session_id', 'request_id', 'event_time', 'event', 'source', 'origin',
       'journey_type', 'destination', 'departure_date', 'return_date',
       'out_elapsed_flight_time', 'in_elapsed_flight_time', 'total_price',
       'total_markup_amount', 'out_flight_numbers', 'adt_numbers',
       'cnn_numbers', 'inf_numbers', 'mkp_source'],
      dtype='object')

In [3]:
# Assign attributes
# Columns excluded from the model inputs.
dropped_columns = ['session_id',
                   'request_id',
                   'event_time',
                   'event',
                   'origin',
                   'destination',
                   'out_flight_numbers',
                   'departure_date',
                   'return_date',
                   'mkp_source']

# `df` is overwritten here, so without errors='ignore' a second run of this cell
# raises KeyError because the columns have already been removed.
df = df.drop(columns=dropped_columns, errors='ignore')

# Feature groups consumed by the ColumnTransformer in the pipeline cell.
numeric_features = ['in_elapsed_flight_time',
                    'out_elapsed_flight_time',
                    'total_price',
                    'total_markup_amount',
                    'adt_numbers',
                    'cnn_numbers',
                    'inf_numbers']
categorical_features = ['source']

# Preprocess the data: 'journey_type' is the target, every remaining column is a feature
X = df.drop('journey_type', axis=1)
y = df['journey_type']

# Split the data into train and test sets (80/20, fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Numeric branch: fill missing values with the column mean, then standardize
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# Categorical branch: fill missing values with the literal 'missing', then one-hot
# encode; categories unseen during fit are ignored at transform time
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_steps)

# Route each feature group to its branch
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(column_routes)

# Preprocessing-only pipeline (no estimator yet)
model = Pipeline([('preprocessor', preprocessor)])

# Fit the preprocessing steps on the training split
model.fit(X_train, y_train)

# Transform the first three test rows as a sanity check.
# Note: despite its name, y_pred holds transformed features here, not predictions.
y_pred = model.transform(X_test.iloc[:3])
y_pred

array([[-0.56492606, -0.56534693, -0.55429221, -0.7716228 ,  0.23538556,
        -0.28507866, -0.0877058 ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ],
       [ 0.46520748,  0.38337198,  1.12360337,  2.46485304,  0.23538556,
        -0.28507866, -0.0877058 ,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ],
       [-0.84858602, -0.85283751, -0.18529602,  0.74637029,  0.23538556,
        -0.28507866, -0.0877058 ,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ]])

In [20]:
# Preprocessing + training
# random_state fixes the forest's internal randomness so the reported accuracy is
# reproducible across runs; 42 matches the seed used for the train/test split.
model = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier(random_state=42))])

# Fit the pipeline to the training data
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print(f'Test Accuracy: {accuracy_score(y_test, y_pred):.2f}')

Test Accuracy: 0.94


In [6]:
# Get processed data column names
fitted_preprocessor = model.named_steps["preprocessor"]
onehot = fitted_preprocessor.named_transformers_["cat"].named_steps["onehot"]

# Numerical columns do not change; categorical columns are expanded by the one-hot encoder.
# get_feature_names was removed in scikit-learn 1.2 in favour of get_feature_names_out;
# the old name is used only on releases that predate the replacement.
if hasattr(onehot, "get_feature_names_out"):
    new_cat_cols = onehot.get_feature_names_out(categorical_features)
else:
    new_cat_cols = onehot.get_feature_names(categorical_features)

# Concatenate in the same order the ColumnTransformer lists its branches: numeric, then categorical
all_cols = np.concatenate([numeric_features, new_cat_cols])

# Recompute the transformed sample here instead of reusing y_pred: when the notebook is
# run top to bottom, y_pred holds the classifier's 1-D label predictions at this point,
# which cannot be laid out under these feature columns.
processed_sample = fitted_preprocessor.transform(X_test.head(3))
if hasattr(processed_sample, "toarray"):
    # The transformer may return a sparse matrix; DataFrame needs a dense array
    processed_sample = processed_sample.toarray()

# Create processed pandas dataframe
pd.DataFrame(processed_sample, columns=all_cols)