# ML Pipeline using scikit-learn

In [0]:
from pyspark.sql.functions import col

# Read the training CSV: the first row holds the column names and Spark
# infers each column's type from the data.
data_path = "/Volumes/levkiwi_lakehouse/ml_sandbox/data/train.csv"
train_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(data_path)
)

# Target Spark type per column: PassengerId is kept as a string, the other
# three columns are cast to integers.
column_casts = {
    "PassengerId": "string",
    "VIP": "int",
    "CryoSleep": "int",
    "Transported": "int",
}
for column_name, spark_type in column_casts.items():
    train_df = train_df.withColumn(column_name, col(column_name).cast(spark_type))

display(train_df)

## Pandas & scikit-learn pipeline

In [0]:
import pandas as pd
# Materialise the Spark DataFrame as a pandas DataFrame for scikit-learn.
train = train_df.toPandas()

# Preview the first five rows.
train.head(n=5)

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Step 1: Define transformers for different column types.

# Numeric columns: replace missing values with the column mean learned at fit time.
numerical_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
    ]
)

# Categorical columns: one-hot encode.
# handle_unknown="ignore" makes a category that was not seen during fit encode
# as all zeros at transform/predict time instead of raising a ValueError, so
# the fitted pipeline can be applied to new data (e.g. the test set).
categorical_cols = ['HomePlanet', 'Destination', 'VIP', 'CryoSleep']
categorical_transformer = Pipeline(
    steps=[
        ('encoder', OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Step 2: Create a ColumnTransformer that applies each transformer to its columns.
# remainder='drop' discards every column that is not listed above.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ],
    remainder='drop',
)

# Step 3: Assemble the preprocessing pipeline.
preprocessing_pipeline = Pipeline([
    ('preprocessor', preprocessor)
])

# Fit the preprocessing on the training frame and transform it in one pass.
X_preprocessed = preprocessing_pipeline.fit_transform(train)

# Display the fitted pipeline.
preprocessing_pipeline

In [0]:
# Rebuild a labelled pandas DataFrame from the transformed array.

# Walk down to the fitted one-hot encoder inside the 'cat' branch.
fitted_preprocessor = preprocessing_pipeline.named_steps['preprocessor']
fitted_encoder = fitted_preprocessor.named_transformers_['cat'].named_steps['encoder']
onehot_encoder_feature_names = list(fitted_encoder.get_feature_names_out())

# The ColumnTransformer lists the 'num' branch before the 'cat' branch, so the
# numeric column names come first, followed by the one-hot feature names.
column_order = [*numerical_cols, *onehot_encoder_feature_names]

# Show the preprocessed data with named columns and the original row index.
pd.DataFrame(X_preprocessed, columns=column_order, index=train.index)

## Decision Tree Classifier 

We extend the preprocessing pipeline with a decision tree classifier that predicts the `Transported` target variable.

In [0]:
from sklearn.tree import DecisionTreeClassifier

# Split the training frame into features (every column except the target) and target.
X = train.drop(columns=['Transported'])
y = train['Transported']

# Hyperparameters passed to the DecisionTreeClassifier.
hyperparams = dict(
    criterion='entropy',     # split-quality measure
    max_depth=3,             # cap the tree depth to limit overfitting
    min_samples_split=20,    # minimum samples required to split an internal node
    min_samples_leaf=10,     # minimum samples required in a leaf node
    random_state=42,         # fixed seed for reproducible results
)

# Chain the preprocessing and the classifier into one estimator.
tree_classifier = DecisionTreeClassifier(**hyperparams)
model_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', tree_classifier),
    ]
)

# Train the whole pipeline on the training data.
model_pipeline.fit(X, y)

# Display the fitted pipeline.
model_pipeline

In [0]:
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Pull the fitted classifier out of the model pipeline.
decision_tree_model = model_pipeline.named_steps['classifier']

# Draw the tree on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(
    decision_tree_model,
    filled=True,
    rounded=True,
    class_names=['Not Transported', 'Transported'],
    # 'column_order' must match the order of the features the classifier was trained on.
    feature_names=column_order,
    ax=ax,
)
plt.show()

## Loading test dataset

In [0]:
# Read the test CSV the same way as the training CSV.
data_path = "/Volumes/levkiwi_lakehouse/ml_sandbox/data/test.csv"
test_df = spark.read.csv(data_path, header=True, inferSchema=True)

# Apply the same casts that are applied to train_df so the feature columns
# reach the fitted pipeline with the same types it was trained on.
# 'Transported' is not cast here: it is the target being predicted.
test_df = test_df.withColumn("PassengerId", col("PassengerId").cast("string")) \
                 .withColumn("VIP", col("VIP").cast("int")) \
                 .withColumn("CryoSleep", col("CryoSleep").cast("int"))

# Convert to pandas for scikit-learn.
test = test_df.toPandas()

display(test)

In [0]:
# The pipeline's ColumnTransformer selects the feature columns it needs and
# drops the rest, so the full test frame can be passed in as-is.
X_test = test

y_pred = model_pipeline.predict(X_test)

# Build the submission keyed by PassengerId rather than by row position, and
# convert the 0/1 predictions to booleans. Using PassengerId as a named index
# means that writing the frame with its index produces a
# "PassengerId,Transported" CSV.
kaggle_submission = pd.DataFrame(
    {'Transported': y_pred.astype(bool)},
    index=pd.Index(X_test['PassengerId'], name='PassengerId'),
)
kaggle_submission

In [0]:
# Write the submission to the lakehouse volume; index=True writes the
# DataFrame index as the first CSV column.
submission_path = "/Volumes/levkiwi_lakehouse/ml_sandbox/data/simple_decision_tree.csv"
kaggle_submission.to_csv(submission_path, index=True)