In [6]:
import os
from pathlib import Path

import pandas as pd

# Directory holding the competition CSV files. The original machine-specific
# location is kept as the default so existing runs behave exactly as before;
# set the SPACESHIP_TITANIC_DATA_DIR environment variable to run the notebook
# on another machine without editing this cell.
DATA_DIR = Path(
    os.environ.get(
        "SPACESHIP_TITANIC_DATA_DIR",
        r"C:\Users\hi\Desktop\Github Repositories\spaceship-titanic-ml-pipeline\Data",
    )
)

# Load both splits from the same directory.
test = pd.read_csv(DATA_DIR / "test.csv")
train = pd.read_csv(DATA_DIR / "train.csv")

In [7]:
# Peek at the first few Cabin values to see their format.
train['Cabin'].head()

0    B/0/P
1    F/0/S
2    A/0/S
3    A/0/S
4    F/1/S
Name: Cabin, dtype: object

In [8]:
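# Disabled experiment: split Cabin ("zone/level/side", e.g. "B/0/P") into three columns.
# NOTE(review): Cabin contains missing values (8494 of 8693 non-null in train.info()),
# so calling x.split on a NaN entry would raise; a NaN-safe alternative would be
# train.Cabin.str.split('/', expand=True).
# train['Cabin_zone'] = train.Cabin.apply(lambda x: x.split('/')[0])
# train['Cabin_level'] = train.Cabin.apply(lambda x: x.split('/')[1])
# train['Cabin_side'] = train.Cabin.apply(lambda x: x.split('/')[2])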

In [9]:
# Print dtypes and non-null counts per column to see which features have gaps.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [10]:
# Partition the training columns by dtype: float64 features are treated as
# numeric, object-typed features as categorical candidates.
numerical_col = train.select_dtypes(include='float64').columns
categoric_col = train.select_dtypes(include='object').columns

In [11]:
# using only low cardinality columns
# Keep only the categorical columns with fewer than 10 distinct values in
# train; higher-cardinality columns are left out of the feature set.
req_cat_col = [
    name for name in categoric_col
    if train[name].nunique() < 10
]
        

In [12]:
# From here on, categoric_col refers to the filtered low-cardinality list
# (the same list object as req_cat_col), replacing the earlier full selection.
categoric_col = req_cat_col

In [13]:
# Final feature list: numeric columns first, then the retained categoricals.
req_col = [*numerical_col, *categoric_col]

# Independent copies of the selected features for each split.
x_train = train.loc[:, req_col].copy()
x_test = test.loc[:, req_col].copy()

In [14]:
categoric_col

['HomePlanet', 'CryoSleep', 'Destination', 'VIP']

In [15]:
# Target vector: the boolean Transported flag cast to integers.
y = train.loc[:, 'Transported'].astype(int)

In [16]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
# Numeric preprocessing: constant-value imputation followed by standardisation.
# NOTE(review): no fill_value is passed to the imputer; per the scikit-learn
# docs the constant should then default to 0 for numeric input — confirm that
# 0 is the intended filler for every numeric column (e.g. Age).
numerical_transformation = Pipeline([
    ('imputer', SimpleImputer(strategy='constant')),
    ('scaler', StandardScaler()),
])

# Categorical preprocessing: fill gaps with the most frequent value, then
# one-hot encode. handle_unknown='ignore' is set on the encoder so categories
# not seen during fit do not raise at transform time.
categoric_transformation = Pipeline([
    ('imputing', SimpleImputer(strategy='most_frequent')),
    ('encoding', OneHotEncoder(handle_unknown='ignore')),
])

In [17]:
from sklearn.compose import ColumnTransformer
# Route each column group through its own preprocessing pipeline.
preprocessing = ColumnTransformer([
    ('num', numerical_transformation, numerical_col),
    ('cat', categoric_transformation, categoric_col),
])

In [18]:
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier


In [19]:
# (name, estimator) pairs used as the first-level learners of the stack.
base_models = [
    (
        'rf',
        RandomForestClassifier(
            random_state=42,
            max_depth=10,
            n_estimators=1000,
        ),
    ),
    ('xgb', XGBClassifier(eval_metric='logloss', random_state=42)),
    ('lr', LogisticRegression(max_iter=1000)),
]

In [20]:
# Stacked ensemble over base_models, with a logistic regression as the final
# estimator; cv=5 and n_jobs=-1 are passed to the stacking classifier.
meta_model = StackingClassifier(
    base_models,
    final_estimator=LogisticRegression(max_iter=1000),
    n_jobs=-1,
    cv=5,
)

In [21]:
# End-to-end estimator: preprocessing followed by the stacked classifier, so
# raw feature frames can be passed straight to fit/predict.
model = Pipeline([
    ('preprocessing', preprocessing),
    ('clf', meta_model),
])

In [22]:
# Fit the whole pipeline on the training features and target.
model.fit(X=x_train, y=y)

In [24]:
# Predict labels for the test features with the fitted pipeline.
y_pred = model.predict(X=x_test)

In [26]:
# Assemble the submission frame in one expression: passenger ids from the test
# split alongside the predictions, with Transported converted to booleans.
stacking_submission = (
    pd.DataFrame({'PassengerId': test['PassengerId'], 'Transported': y_pred})
    .assign(Transported=lambda frame: frame['Transported'].astype(bool))
)

In [28]:
# Write the submission to the working directory, omitting the index column.
stacking_submission.to_csv(path_or_buf='stacking_submission.csv', index=False)