In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns 
import matplotlib.pyplot as plt 
from IPython.display import display
import re
# Set Up The Model 
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.metrics import classification_report


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the data directory and print the path of every file found in it
for root, _subdirs, names in os.walk('data/'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

data/test.csv
data/train.csv
data/sample_submission.csv


# About the data: 

`PassengerId` - A unique Id for each passenger. Each Id takes the form `gggg_pp`, where `gggg` identifies the group the passenger is travelling with and `pp` is their number within the group. People in a group are often family members, but not always.

`HomePlanet` - The planet the passenger departed from, typically their planet of permanent residence.

`CryoSleep` - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.

`Cabin` - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.

`Destination` - The planet the passenger will be debarking to.

`Age` - The age of the passenger.

`VIP` - Whether the passenger has paid for special VIP service during the voyage.

`RoomService`, `FoodCourt`, `ShoppingMall`, `Spa`, `VRDeck` - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.

`Name` - The first and last names of the passenger.

`Transported` - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

In [2]:
# Read both splits and set the training ids and labels aside before the target is dropped
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
y = train.loc[:, ['PassengerId', 'Transported']].copy()

# Tag every row with its origin so the combined frame can be split again later
train['data_type'] = 'train'
test['data_type'] = 'test'

# Stack the train features (without the target) on top of the test rows
train_features = train.drop(columns='Transported')
full = pd.concat([train_features, test], ignore_index=True)
full.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,data_type
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,train
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,train
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,train
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,train
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,train


# Feature Engineering

1. Create Columns for Group and Person Number as defined in the Variable Explanations. 
2. Create Columns for Cabin in a similar format. 
3. Create Total_Spent Column 

In [3]:
# Lets Explore the Passenger ID Column, It comes in the form gggg_pp, where gggg is the passenger group and pp is the person in that group
def split_group_person(passenger_id):
    """Split a ``gggg_pp`` passenger id into its group and person parts.

    Returns a ``(group, person)`` tuple holding the first two
    underscore-separated fields of ``passenger_id`` as strings.
    """
    pieces = passenger_id.split('_')
    return pieces[0], pieces[1]

# For the Cabin Column 
def splitter(text):
    """Return every run of word characters found in ``text``, in order.

    For a cabin string of the form ``deck/num/side`` this yields its three
    components, e.g. ``'B/0/P'`` -> ``['B', '0', 'P']``.
    """
    # Raw string: '\w' inside a plain literal is an invalid escape sequence
    return re.findall(r'\w+', text)

def seperate(df, key, into, split_method):
    """Split one text column of ``df`` into several new columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to split.
    key : str
        Name of the column to split; it is not present in the result.
    into : list of str
        Names of the new columns, in field order.
    split_method : callable
        Called with each string value of ``df[key]``; returns its fields.

    Returns
    -------
    pandas.DataFrame
        The other columns of ``df`` (in sorted order, because
        ``Index.difference`` sorts) followed by the ``into`` columns. Values
        that are not strings give NaN in every new column; values that yield
        fewer fields than ``into`` are padded with NaN.

    Raises
    ------
    ValueError
        If a value splits into more fields than there are names in ``into``.
    """
    into = list(into)

    def split_row(value):
        # Missing cells (NaN/None) are not strings and produce an all-NaN row
        if not isinstance(value, str):
            return [np.nan] * len(into)
        fields = list(split_method(value))
        if len(fields) > len(into):
            raise ValueError(
                f"{value!r} splits into {len(fields)} fields but only "
                f"{len(into)} column names were given: {into}"
            )
        return fields + [np.nan] * (len(into) - len(fields))

    # Columns that are carried over unchanged
    kept = df[df.columns.difference([key])].copy()

    # Build the new columns in one go instead of creating one Series per row
    rows = [split_row(value) for value in df[key]]
    pieces = pd.DataFrame(rows, index=df.index, columns=into)
    return pd.concat([kept, pieces], axis=1)


# Break Cabin into deck/num/side, then PassengerId into group/person
cabin_parts = ['deck', 'num', 'side']
id_parts = ['group', 'person']
full_with_groups = seperate(full, 'Cabin', into=cabin_parts, split_method=splitter)
full_with_groups = seperate(full_with_groups, 'PassengerId', id_parts, split_group_person)

# TODO: the Total_Spent column from the plan above is not created yet.
# Name is not used as a feature (the passenger is already identified by PassengerId), so drop it
full_with_groups = full_with_groups.drop(columns='Name')

full_with_groups.head()

Unnamed: 0,Age,CryoSleep,Destination,FoodCourt,HomePlanet,RoomService,ShoppingMall,Spa,VIP,VRDeck,data_type,deck,num,side,group,person
0,39.0,False,TRAPPIST-1e,0.0,Europa,0.0,0.0,0.0,False,0.0,train,B,0,P,1,1
1,24.0,False,TRAPPIST-1e,9.0,Earth,109.0,25.0,549.0,False,44.0,train,F,0,S,2,1
2,58.0,False,TRAPPIST-1e,3576.0,Europa,43.0,0.0,6715.0,True,49.0,train,A,0,S,3,1
3,33.0,False,TRAPPIST-1e,1283.0,Europa,0.0,371.0,3329.0,False,193.0,train,A,0,S,3,2
4,16.0,False,TRAPPIST-1e,70.0,Earth,303.0,151.0,565.0,False,2.0,train,F,1,S,4,1


In [4]:
# Partition the columns by dtype: object columns count as categorical, everything else as numerical
cats = [col for col in full_with_groups.columns if full_with_groups[col].dtype == 'object']
nums = [col for col in full_with_groups.columns if not (full_with_groups[col].dtype == 'object')]

# Data Cleaning and Exploration


In [5]:
# Keep an independent copy of the engineered frame, taken before any missing values are filled
full_with_groups_OG = full_with_groups.copy()

def count_nan(full_with_groups, group_sizes=range(1, 9)):
    """Display the number of missing values per column, broken down by group size.

    Parameters
    ----------
    full_with_groups : pandas.DataFrame
        Frame with a ``group`` column holding each passenger's group id.
    group_sizes : iterable of int, optional
        Group sizes to report on; defaults to 1 through 8.

    Prints a header and displays a table with one row per column of the frame
    and one column per group size. Each cell is the number of NaN values in
    that column among passengers whose group has that many members.
    Returns None.
    """
    # Number of passengers in every group, indexed by group id
    members_per_group = full_with_groups['group'].value_counts()

    sizes = list(group_sizes)
    missing_by_size = []
    for size in sizes:
        # Select the group ids by label; integer positions are not valid keys
        # for a Series that is indexed by group id.
        group_ids = members_per_group.index[members_per_group == size]
        rows = full_with_groups[full_with_groups['group'].isin(group_ids)]
        missing_by_size.append(rows.isna().sum())

    print('----Missing values Per Group Counts----')
    # keys= labels each column with its group size (names= alone is ignored)
    missing_vals = pd.concat(missing_by_size, axis=1, keys=sizes, names=['group_size'])
    display(missing_vals)
count_nan(full_with_groups)

----Missing values Per Group Counts----


Unnamed: 0,0,1,2,3,4,5,6,7
Age,142,57,33,12,8,6,6,6
CryoSleep,154,63,35,24,12,8,10,4
Destination,154,59,27,9,10,3,11,1
FoodCourt,160,53,38,12,8,6,10,2
HomePlanet,155,60,29,17,13,7,6,1
RoomService,158,53,22,15,7,1,3,4
ShoppingMall,163,64,37,14,8,9,8,3
Spa,160,52,31,17,5,6,9,4
VIP,170,54,35,9,16,4,6,2
VRDeck,151,52,24,14,10,7,8,2


# Next Look at Destination *OR* HomePlanet with Missing Values 

In [6]:
route_df = full_with_groups.copy()
# Rows where the destination or the home planet (or both) is unknown
criteria_or = route_df['Destination'].isna() | route_df['HomePlanet'].isna()


def _most_common_lookup(frame, by, target):
    """Map every `by` value to the non-null `target` value it occurs with most often."""
    pair_counts = frame.groupby(by)[target].value_counts()
    # dict() keeps the LAST value seen for a repeated key, so the (by, target)
    # pairs are ordered from rarest to most frequent; sorting the other way
    # round would make the least common value win.
    ordered_pairs = pair_counts.sort_values(ascending=True, kind='stable').index.to_list()
    return dict(ordered_pairs)


# Most common destination / home planet among the members of each travel group
group_destination = _most_common_lookup(route_df, 'group', 'Destination')
group_home = _most_common_lookup(route_df, 'group', 'HomePlanet')

# Most common destination per home planet, and home planet per destination
home_destination = _most_common_lookup(route_df, 'HomePlanet', 'Destination')
destination_home = _most_common_lookup(route_df, 'Destination', 'HomePlanet')

# First pass: fill the gaps from the passenger's own travel group.
route_df.loc[criteria_or, 'Destination'] = route_df.loc[criteria_or, 'Destination'].fillna(route_df.loc[criteria_or, 'group'].map(group_destination))
route_df.loc[criteria_or, 'HomePlanet'] = route_df.loc[criteria_or, 'HomePlanet'].fillna(route_df.loc[criteria_or, 'group'].map(group_home))

# Second pass: fill what is left from the other end of the route.
route_df.loc[criteria_or, 'Destination'] = route_df.loc[criteria_or, 'Destination'].fillna(route_df.loc[criteria_or, 'HomePlanet'].map(home_destination))
route_df.loc[criteria_or, 'HomePlanet'] = route_df.loc[criteria_or, 'HomePlanet'].fillna(route_df.loc[criteria_or, 'Destination'].map(destination_home))

# Re-count the missing values to see how many route gaps remain.
count_nan(route_df)

----Missing values Per Group Counts----


Unnamed: 0,0,1,2,3,4,5,6,7
Age,142,57,33,12,8,6,6,6
CryoSleep,154,63,35,24,12,8,10,4
Destination,3,0,0,0,0,0,0,0
FoodCourt,160,53,38,12,8,6,10,2
HomePlanet,3,0,0,0,0,0,0,0
RoomService,158,53,22,15,7,1,3,4
ShoppingMall,163,64,37,14,8,9,8,3
Spa,160,52,31,17,5,6,9,4
VIP,170,54,35,9,16,4,6,2
VRDeck,151,52,24,14,10,7,8,2


In [7]:
def fill_cat(df):
    """Fill the missing values of the categorical columns.

    Works on a copy of ``df`` and fills, in this order:

    * ``Destination`` and ``HomePlanet`` from the neighbouring row after
      sorting by ``Destination``;
    * ``deck``, ``num`` and ``side`` from the neighbouring row after sorting
      by ``group``;
    * ``VIP`` with its most frequent value;
    * ``CryoSleep`` from the neighbouring row after sorting by ``Destination``.

    "Neighbouring row" means the previous row, or the next one when a gap sits
    at the very top of the sorted frame.

    Returns
    -------
    pandas.DataFrame
        The filled copy, in the row order of ``df``.

    Raises
    ------
    ValueError
        If any of the columns listed above still contains missing values.
    """
    filled_cols = ['Destination', 'HomePlanet', 'deck', 'num', 'side', 'VIP', 'CryoSleep']

    # Sorting by Destination puts rows of the same route next to each other
    na_df = df.sort_values('Destination').copy()
    route_cols = ['Destination', 'HomePlanet']
    na_df[route_cols] = na_df[route_cols].ffill().bfill()

    # Sorting by group puts travel companions next to each other
    na_df = na_df.sort_values('group')
    cabin_cols = ['deck', 'num', 'side']
    na_df[cabin_cols] = na_df[cabin_cols].ffill().bfill()

    # VIP: use the most frequent value (nothing to use if the column is all NaN)
    vip_counts = na_df['VIP'].value_counts()
    if not vip_counts.empty:
        na_df['VIP'] = na_df['VIP'].fillna(vip_counts.index[0])

    # CryoSleep: borrow from the neighbouring passenger on the same route
    na_df = na_df.sort_values('Destination')
    na_df['CryoSleep'] = na_df['CryoSleep'].ffill().bfill()

    # Fail loudly instead of silently returning None when something is left over
    leftover = na_df[filled_cols].isna().sum()
    leftover = leftover[leftover > 0]
    if not leftover.empty:
        raise ValueError(f"fill_cat could not fill: {leftover.to_dict()}")

    # Undo the sorting so rows still line up with anything indexed like `df`
    return na_df.sort_index()

count_nan(fill_cat(route_df))


----Missing values Per Group Counts----


Unnamed: 0,0,1,2,3,4,5,6,7
Age,142,57,33,12,8,6,6,6
CryoSleep,0,0,0,0,0,0,0,0
Destination,0,0,0,0,0,0,0,0
FoodCourt,160,53,38,12,8,6,10,2
HomePlanet,0,0,0,0,0,0,0,0
RoomService,158,53,22,15,7,1,3,4
ShoppingMall,163,64,37,14,8,9,8,3
Spa,160,52,31,17,5,6,9,4
VIP,0,0,0,0,0,0,0,0
VRDeck,151,52,24,14,10,7,8,2


In [8]:
# Numerical Values
#tmp =  sns.boxplot(x = 'CryoSleep', y = 'FoodCourt', data = na_df)

def fill_nums(df):
    """Fill the missing values of the numerical columns.

    Works on a copy of ``df``:

    * missing ``Age`` values get the median age of passengers with the same
      ``Destination`` (or the overall median when that is unavailable); ages
      that are already known are left untouched;
    * missing ``RoomService``, ``ShoppingMall``, ``VRDeck``, ``Spa`` and
      ``FoodCourt`` values get the most frequent value of their column.

    Returns
    -------
    pandas.DataFrame
        The filled copy.

    Raises
    ------
    ValueError
        If any of those columns still contains missing values.
    """
    spend_cols = ['RoomService', 'ShoppingMall', 'VRDeck', 'Spa', 'FoodCourt']
    na_df = df.copy()

    # Median age per destination, aligned row by row with the frame
    age_by_destination = na_df.groupby('Destination')['Age'].transform('median')
    overall_median = na_df['Age'].median()
    # Only the gaps are filled; assigning the median to every row would wipe
    # out the ages that are actually known.
    na_df['Age'] = na_df['Age'].fillna(age_by_destination).fillna(overall_median)

    # Spending columns: use the most frequent amount of each column
    for col in spend_cols:
        counts = na_df[col].value_counts()
        if not counts.empty:
            na_df[col] = na_df[col].fillna(counts.index[0])

    # Fail loudly instead of silently returning None when something is left over
    leftover = na_df[['Age'] + spend_cols].isna().sum()
    leftover = leftover[leftover > 0]
    if not leftover.empty:
        raise ValueError(f"fill_nums could not fill: {leftover.to_dict()}")
    return na_df

full_model = fill_nums(fill_cat(route_df))

In [9]:
# Show the columns of the imputed frame
full_model.columns

Index(['Age', 'CryoSleep', 'Destination', 'FoodCourt', 'HomePlanet',
       'RoomService', 'ShoppingMall', 'Spa', 'VIP', 'VRDeck', 'data_type',
       'deck', 'num', 'side', 'group', 'person'],
      dtype='object')

# Convert all Categorical Variables to Numerical Factors 

In [21]:
def Number_converter(full_model):
    """Encode the combined frame numerically and split it into scaled train/test arrays.

    Parameters
    ----------
    full_model : pandas.DataFrame
        Combined train and test rows with a ``data_type`` column holding
        ``'train'`` or ``'test'``. The frame is not modified.

    Returns
    -------
    X_train, X_test : array-like
        Output of ``StandardScaler.transform`` for the train and test rows.
    X_names : list of str
        Feature names, in the same order as the columns of ``X_train`` and
        ``X_test`` (``data_type`` is not a feature and is not listed).
    """
    # Work on a copy so the caller's frame keeps its original values
    encoded = full_model.copy()

    # Replace each category by an integer code, numbered by first appearance.
    # Every column gets as many codes as it has distinct values.
    for col in ['HomePlanet', 'Destination', 'deck', 'side']:
        levels = encoded[col].unique()
        encoded[col] = encoded[col].map(dict(zip(levels, range(len(levels)))))

    for col in ['CryoSleep', 'VIP']:
        encoded[col] = encoded[col].astype(bool)
    for col in ['RoomService', 'ShoppingMall', 'VRDeck']:
        encoded[col] = encoded[col].astype(float)
    encoded['data_type'] = encoded['data_type'].astype(str)

    is_train = encoded['data_type'] == 'train'
    is_test = encoded['data_type'] == 'test'

    # data_type only marks the split, so it is removed before the names are
    # recorded; otherwise the names would not line up with the feature columns.
    features = encoded.drop(columns=['data_type'])
    X_names = features.columns.tolist()

    # NOTE(review): the scaler is fit on train and test rows together, and any
    # remaining text columns (num/group/person) presumably rely on the scaler
    # coercing them to float - confirm both are intended.
    scaler = preprocessing.StandardScaler().fit(features)
    X_train = scaler.transform(features[is_train])
    X_test = scaler.transform(features[is_test])
    return X_train, X_test, X_names

In [27]:
# Logit Regression
def log_model(X_train, y_train, X_test, X_names, test_ids=None):
    """Fit a logistic regression, report on it and predict the test rows.

    Displays the fitted intercept and coefficients (labelled with ``X_names``)
    and prints a classification report computed on the training data.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels.
    X_test : array-like
        Features of the rows to predict.
    X_names : list of str
        Feature names used to label the coefficients.
    test_ids : array-like, optional
        PassengerId of every row of ``X_test``, in the same order. Defaults to
        the ``PassengerId`` column of the notebook-level ``test`` frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``PassengerId`` and ``Transported``, one row per test row.

    Raises
    ------
    ValueError
        If ``test_ids`` and the predictions differ in length.
    """
    # Fit the model
    clf = LogisticRegression(max_iter = 100).fit(X_train, y_train)

    # Intercept and coefficients as a one-row table
    coef_df = pd.DataFrame(dict(zip(X_names, list(clf.coef_[0]))), index = [0])
    coef_df.insert(0, 'Intercept', clf.intercept_)
    display(coef_df)

    # Accuracy statistics on the training data
    print(classification_report(y_train, clf.predict(X_train)))

    # The predictions belong to the TEST passengers, so they are paired with
    # the test ids rather than with the training ids.
    if test_ids is None:
        test_ids = test['PassengerId']
    return pd.DataFrame({
        'PassengerId': np.asarray(test_ids),
        'Transported': clf.predict(X_test),
    })

In [29]:
def grad_booster_classifier(X_train, y_train, X_test, X_names, test_ids=None):
    """Fit a gradient boosting classifier, cross-validate it and predict the test rows.

    Prints the mean and standard deviation of the cross-validated accuracy on
    the training data.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels.
    X_test : array-like
        Features of the rows to predict.
    X_names : list of str
        Feature names; accepted for symmetry with ``log_model`` and not used.
    test_ids : array-like, optional
        PassengerId of every row of ``X_test``, in the same order. Defaults to
        the ``PassengerId`` column of the notebook-level ``test`` frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``PassengerId`` and ``Transported``, one row per test row.

    Raises
    ------
    ValueError
        If ``test_ids`` and the predictions differ in length.
    """
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold

    # Define and fit the model; the seed keeps repeated runs comparable
    model = GradientBoostingClassifier(random_state = 1)
    model.fit(X_train, y_train)

    # Evaluation scheme: 3 stratified folds, repeated twice
    cv = RepeatedStratifiedKFold(n_splits = 3, n_repeats = 2, random_state = 1)

    # error_score = 0 scores a fold whose fit fails as 0 instead of raising
    n_scores = cross_val_score(model, X_train, y_train, scoring = 'accuracy', cv = cv, n_jobs = -1, error_score = 0)

    # Print Performance
    print(f'Mean Accuracy:{np.mean(n_scores)}, Standard Deviation: {np.std(n_scores)}')

    # The predictions belong to the TEST passengers, so they are paired with
    # the test ids rather than with the training ids.
    yhat = model.predict(X_test)
    if test_ids is None:
        test_ids = test['PassengerId']
    return pd.DataFrame({
        'PassengerId': np.asarray(test_ids),
        'Transported': yhat,
    })

In [30]:
# Define Data
# NOTE(review): the models are trained on a plain forward fill of
# full_with_groups; the imputed frame `full_model` built above is not used here.
X_train, X_test, X_names = Number_converter(full_with_groups.ffill())
y_train = y['Transported']
# Ids of the rows the predictions are made for: these come from the test split
test_ids = test['PassengerId']
# Log Model 
out_df_log_model = log_model(X_train, y_train, X_test, X_names)
# Gradient Boosting Model
print('\n*Gradient Boosting Model*')
grad_out_df = grad_booster_classifier(X_train, y_train, X_test, X_names)