# Set Up the Kaggle Dependencies

In [None]:
# Install the Kaggle CLI with the %pip magic so the package is installed into
# the environment of the running kernel (a plain !pip may target another one)
%pip install kaggle

In [2]:
# Create ~/.kaggle, the directory the following cells copy kaggle.json into;
# -p makes the command succeed when the directory already exists (safe re-runs)
!mkdir -p ~/.kaggle

In [3]:
# Copy kaggle.json from the working directory to ~/.kaggle/kaggle.json
# (presumably the Kaggle API credentials file; the ~/.kaggle directory must
# already exist for the copy to succeed)
!cp kaggle.json ~/.kaggle/kaggle.json

In [6]:
# Mode 600: only the file owner may read or write kaggle.json
!chmod 600 ~/.kaggle/kaggle.json

In [8]:
# Download the spaceship-titanic competition archive (spaceship-titanic.zip)
# into the current working directory using the Kaggle CLI
!kaggle competitions download -c spaceship-titanic

Downloading spaceship-titanic.zip to /content
100% 299k/299k [00:00<00:00, 560kB/s]
100% 299k/299k [00:00<00:00, 560kB/s]


In [9]:
# Extract the CSV files; -o overwrites previously extracted files without
# prompting, so re-running this cell cannot stall on an interactive question
!unzip -o spaceship-titanic.zip

Archive:  spaceship-titanic.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


# Load in and Do Fast EDA

In [None]:
# Install ydata-profiling with the %pip magic so the package is installed into
# the environment of the running kernel (a plain !pip may target another one)
%pip install ydata-profiling

In [13]:
import pandas as pd
from ydata_profiling import ProfileReport

In [15]:
# Load the training data (train.csv, extracted from the competition archive)
df=pd.read_csv('train.csv')

In [16]:
# Preview the first rows of the raw training data
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [19]:
# Column dtypes and non-null counts: shows which columns have missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [20]:
# Create the ydata-profiling report object for the raw training DataFrame
profile=ProfileReport(df, title='Profiling Report')

In [21]:
# Render the profiling report inline in the notebook output
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

# Preprocessing

In [68]:
def split_cabin(x):
  """Break a cabin value into its '/'-separated components.

  The value is converted with ``str`` first, so non-string inputs such as NaN
  are accepted. If the text yields fewer than three components, the
  placeholder ``['Missing', 'Missing', 'Missing']`` is returned; otherwise the
  full list of components is returned (it may hold more than three items).
  """
  parts = str(x).split('/')
  if len(parts) >= 3:
    return parts
  return ['Missing'] * 3

In [69]:
# Create a preprocessing function to transform our dataset
def preprocessing(df):
  """Clean a Spaceship Titanic passenger DataFrame in place.

  Steps, all applied directly to ``df``:
    * ``HomePlanet``, ``CryoSleep``, ``Destination`` and ``VIP``: missing
      values become the literal category ``'Missing'``.
    * ``Cabin``: split with ``split_cabin``; the first component is stored as
      ``Deck`` and the third as ``Side``, then ``Cabin`` itself is dropped.
    * ``Age``: missing values are replaced by the mean age of ``df`` itself.
    * ``RoomService``, ``FoodCourt``, ``ShoppingMall``, ``Spa``, ``VRDeck``:
      missing values become 0.
    * ``Name`` is dropped because of its high cardinality.

  Args:
    df: DataFrame holding the columns listed above. It is modified in place.

  Returns:
    None.
  """
  # Assign each filled column back instead of calling fillna(inplace=True) on
  # df[col]: that form mutates an intermediate object, which pandas warns
  # about and which leaves df unchanged once Copy-on-Write is active.
  for col in ('HomePlanet', 'CryoSleep', 'Destination', 'VIP'):
    df[col] = df[col].fillna('Missing')

  # Cabin preprocessing - extract deck and side; the split result is kept in a
  # local Series rather than a temporary column on df
  cabin_parts = df['Cabin'].apply(split_cabin)
  df['Deck'] = cabin_parts.apply(lambda parts: parts[0])
  df['Side'] = cabin_parts.apply(lambda parts: parts[2])

  # Age - impute with the mean of the frame being processed
  df['Age'] = df['Age'].fillna(df['Age'].mean())

  # Monetary spending columns - a missing amount is treated as no spending
  for col in ('RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'):
    df[col] = df[col].fillna(0)

  # Cabin is replaced by Deck/Side; Name is dropped due to high cardinality
  df.drop(columns=['Cabin', 'Name'], inplace=True)

In [70]:
# Work on a copy: preprocessing() modifies its argument in place, and the raw
# training frame df should stay untouched
abt = df.copy()

In [71]:
# Clean abt in place (the function returns nothing, so no output is shown)
preprocessing(abt)

In [74]:
# Check the result: no missing values should remain, Cabin and Name are gone,
# and Deck / Side have been added
abt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8693 non-null   object 
 2   CryoSleep     8693 non-null   object 
 3   Destination   8693 non-null   object 
 4   Age           8693 non-null   float64
 5   VIP           8693 non-null   object 
 6   RoomService   8693 non-null   float64
 7   FoodCourt     8693 non-null   float64
 8   ShoppingMall  8693 non-null   float64
 9   Spa           8693 non-null   float64
 10  VRDeck        8693 non-null   float64
 11  Transported   8693 non-null   bool   
 12  Deck          8693 non-null   object 
 13  Side          8693 non-null   object 
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


# Modelling
 - Feature and Target values - X, y
 - One hot encode any categorical features
 - Train, holdout split
 - Train on a bunch of algos

In [75]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [76]:
# Target: the Transported column
y = abt['Transported']
# Features: every column except the target and the PassengerId identifier,
# with the categorical columns one hot encoded
X = pd.get_dummies(abt.drop(columns=['Transported', 'PassengerId']))

In [78]:
# Create training and testing partitions: 30% of the rows are held out for
# evaluation, and the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)

# Set Up ML Pipelines

In [79]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier

In [80]:
# One pipeline per algorithm: standardise the features, then fit the classifier
pipelines = {
    name: make_pipeline(StandardScaler(), classifier)
    for name, classifier in (
        ('rf', RandomForestClassifier(random_state=1234)),
        ('gb', GradientBoostingClassifier(random_state=1234)),
    )
}

In [81]:
# List the estimator's hyperparameter names with their default values
# (useful when choosing which parameters to put in a search grid)
GradientBoostingClassifier().get_params()

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'log_loss',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [82]:
# Hyperparameter grid per algorithm, keyed like `pipelines`. Parameter names
# follow the '<pipeline step name>__<parameter>' pattern.
grid = {
    algo: {f'{step}__n_estimators': [100, 200, 300]}
    for algo, step in (
        ('rf', 'randomforestclassifier'),
        ('gb', 'gradientboostingclassifier'),
    )
}

In [83]:
# Inspect the (name, Pipeline) pairs that the training loop iterates over
pipelines.items()

dict_items([('rf', Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestclassifier',
                 RandomForestClassifier(random_state=1234))])), ('gb', Pipeline(steps=[('standardscaler', StandardScaler()),
                ('gradientboostingclassifier',
                 GradientBoostingClassifier(random_state=1234))]))])

In [84]:
# Fitted grid searches, keyed by algorithm name
fit_models = {}
for algo, pipeline in pipelines.items():
  print(f'Training the {algo} model.')
  # 10-fold cross-validated search over this algorithm's grid, using all cores
  search = GridSearchCV(pipeline, grid[algo], n_jobs=-1, cv=10)
  # fit() returns the fitted search object itself, which is what gets stored
  fit_models[algo] = search.fit(X_train, y_train)

Training the rf model.
Training the gb model.


# Evaluate Performance on Test Partition
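- Score each tuned model on the hold-out partition (`X_test`, `y_test`) that was split off from `train.csv`; `test.csv` is only used for the submission predictions further down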

In [66]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [85]:
# Score every fitted model on the hold-out partition
for algo, model in fit_models.items():
  yhat = model.predict(X_test)
  # Same predictions go through each metric, in the order they are reported
  accuracy, precision, recall = (
      metric(y_test, yhat)
      for metric in (accuracy_score, precision_score, recall_score)
  )
  print(f'Metrics for {algo}: accuracy-> {accuracy} precision-> {precision} recall-> {recall}')

Metrics for rf: accuracy-> 0.7910276073619632 precision-> 0.8050365556458164 recall-> 0.7646604938271605
Metrics for gb: accuracy-> 0.8075153374233128 precision-> 0.7827635327635327 recall-> 0.8479938271604939


# Predict on Test Data

In [88]:
# Read in the test.csv dataset
test_df = pd.read_csv('test.csv')
# Work on a copy so test_df keeps PassengerId for building the submission
abt_test = test_df.copy()
# Apply the same in-place cleaning that was used for the training data
preprocessing(abt_test)
# One hot encode, then align to the training feature columns (X): categories
# absent from test.csv become all-zero columns, categories never seen in
# training are dropped, and the column order matches what the models were
# fitted on
abt_test = (
    pd.get_dummies(abt_test.drop(columns='PassengerId'))
    .reindex(columns=X.columns, fill_value=0)
)

In [92]:
# Predict on the competition test features with the fitted 'gb' grid search
yhat_test = fit_models['gb'].predict(abt_test)

In [93]:
# Build the submission column by column. Constructing it from a list of rows
# and transposing would coerce both columns to object dtype; this form keeps
# the dtype of each input.
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Transported': yhat_test,
})

In [96]:
# Sanity-check the first rows of the submission before writing it out
submission.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


# Submit to Kaggle

In [101]:
# Write the submission file; index=False keeps the row index out of the CSV
submission.to_csv('kaggle_submission.csv', index=False)

In [102]:
# Upload kaggle_submission.csv to the spaceship-titanic competition via the
# Kaggle CLI; -m attaches the submission message
!kaggle competitions submit -c spaceship-titanic -m 'initial gb model' -f 'kaggle_submission.csv'

100% 56.2k/56.2k [00:01<00:00, 37.1kB/s]
Successfully submitted to Spaceship Titanic