In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
import matplotlib.pyplot as plt

from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn import metrics as metrics
from sklearn.model_selection import train_test_split, ParameterGrid


from sklearn.dummy import DummyClassifier

# Import dataframes

In [2]:
df_train = pd.read_csv('data/train.csv', header=0) # -> treino
df_test = pd.read_csv('data/test.csv', header=0) # -> teste

# Feature engineering and data preparation
    - Not considering NaN still

In [3]:
# Define the WIP DataFrame for backup
df_full = pd.concat([df_train.drop(columns=['Transported']), df_test]).reset_index(drop=True)


wip_df = df_full.copy()

In [4]:
display(wip_df.head())
display(wip_df.info())
wip_df.describe()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   12970 non-null  object 
 1   HomePlanet    12682 non-null  object 
 2   CryoSleep     12660 non-null  object 
 3   Cabin         12671 non-null  object 
 4   Destination   12696 non-null  object 
 5   Age           12700 non-null  float64
 6   VIP           12674 non-null  object 
 7   RoomService   12707 non-null  float64
 8   FoodCourt     12681 non-null  float64
 9   ShoppingMall  12664 non-null  float64
 10  Spa           12686 non-null  float64
 11  VRDeck        12702 non-null  float64
 12  Name          12676 non-null  object 
dtypes: float64(6), object(7)
memory usage: 1.3+ MB


None

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,12700.0,12707.0,12681.0,12664.0,12686.0,12702.0
mean,28.771969,222.897852,451.961675,174.906033,308.476904,306.789482
std,14.387261,647.596664,1584.370747,590.55869,1130.279641,1180.097223
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,49.0,77.0,29.0,57.0,42.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


- Working with PassengerId
    - Separating Passenger groups within passengers

In [5]:
def get_passenger_group(passengerid: str) -> str:
    if re.search('_', passengerid):
        return passengerid[0:4]
    
    else:
        return 'check'
    
def set_passenger_groupsize(passengergroup: int, grouped_df: pd.DataFrame) -> str:
    if grouped_df['PassengerId'][passengergroup] == 1:
        return 'Alone'
    
    elif grouped_df['PassengerId'][passengergroup] == 2:
        return 'Pair'
    
    elif grouped_df['PassengerId'][passengergroup] >= 3:
        return 'Family'

In [6]:
wip_df['PassengerGroup'] = wip_df.apply(lambda x: get_passenger_group(x['PassengerId']), axis=1)
# wip_df.loc[wip_df['PassengerGroup'] == 'check'] <- no check found

In [7]:
grouped_df = wip_df[['PassengerGroup','PassengerId']].groupby('PassengerGroup').count()
wip_df['Riding'] = wip_df.apply(lambda x: set_passenger_groupsize(x['PassengerGroup'], grouped_df), axis=1)

In [8]:
wip_df.sample(10)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,PassengerGroup,Riding
9724,2160_02,Mars,True,F/428/P,55 Cancri e,0.0,False,0.0,0.0,0.0,0.0,0.0,Alex Anate,2160,Family
3093,3332_02,Mars,True,D/107/S,TRAPPIST-1e,37.0,False,0.0,0.0,0.0,0.0,0.0,Plat Flate,3332,Family
10205,3251_01,Earth,False,F/680/P,TRAPPIST-1e,28.0,False,0.0,0.0,1.0,0.0,739.0,Jeroy Coxterez,3251,Alone
5568,5930_01,Europa,True,B/227/S,55 Cancri e,24.0,False,0.0,0.0,0.0,0.0,0.0,Muonea Geakerat,5930,Family
8621,9197_02,Europa,False,C/308/P,,41.0,True,0.0,7964.0,0.0,3238.0,5839.0,Aludram Platch,9197,Family
3048,3293_01,Mars,False,F/623/S,TRAPPIST-1e,20.0,False,915.0,12.0,25.0,205.0,311.0,Wings Part,3293,Alone
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,2,Alone
12876,9070_01,Earth,False,,TRAPPIST-1e,20.0,False,323.0,0.0,542.0,0.0,0.0,Oraryn Flynney,9070,Pair
900,0970_01,Earth,True,G/142/P,TRAPPIST-1e,19.0,False,0.0,0.0,0.0,0.0,0.0,Tamie Blace,970,Alone
10960,4935_01,,False,G/796/P,TRAPPIST-1e,21.0,False,97.0,41.0,0.0,436.0,52.0,Brada Whitneyes,4935,Family


- Working with RoomService, FoodCourt, ShoppingMall, Spa and VRDeck bills
    - Resolving the problem with NaNs
    - Filling all other NaN with 0

In [9]:
columns = [
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck'
]

wip_df['TotalBill'] = wip_df[columns].sum(axis=1)
wip_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,PassengerGroup,Riding,TotalBill
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,1,Alone,0.0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,2,Alone,736.0
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,3,Pair,10383.0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,3,Pair,5176.0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,4,Alone,1091.0


In [10]:
wip_df[columns] = wip_df[columns].fillna(0)

- Working with Destination Objects
    - TRAPPIST-1e
    - 55 Cancri e
    - PSO J318.5-22

In [11]:
wip_df['Destination'] = wip_df['Destination'].fillna('X')
wip_df['Destination'] = pd.Categorical(wip_df['Destination'])

In [12]:
wip_df['Destination'].value_counts(dropna=False)

TRAPPIST-1e      8871
55 Cancri e      2641
PSO J318.5-22    1184
X                 274
Name: Destination, dtype: int64

- Working with Cabins
    - Splitting into Deck/Side

In [13]:
wip_df['Cabin'] = wip_df['Cabin'].fillna('X/0/X')
wip_df[['Cabin_Deck','Cabin_Number','Cabin_Side']] = wip_df['Cabin'].str.split('/', expand=True)

In [14]:
wip_df['Cabin_Deck'] = pd.Categorical(wip_df['Cabin_Deck'])
wip_df['Cabin_Side'] = pd.Categorical(wip_df['Cabin_Side'])

- Working with CryoSleep
    - Transforming column to bool ignoring NaN

In [15]:
wip_df['CryoSleep'] = wip_df['CryoSleep'].astype(bool)

- Working with VIP
    - Transforming column to bool ignoring NaN

In [16]:
wip_df['VIP'] = wip_df['VIP'].astype(bool)

- Working with HomePlanet
    - Europa, Mars, Earth and Empty HomePlanets

In [17]:
wip_df['HomePlanet'] = wip_df['HomePlanet'].fillna('X')
wip_df['HomePlanet'] = pd.Categorical(wip_df['HomePlanet'])

- Working with Age
    - Filling NaN with median

In [18]:
wip_df['Age'] = wip_df.Age.fillna(wip_df.Age.median())

In [19]:
wip_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   PassengerId     12970 non-null  object  
 1   HomePlanet      12970 non-null  category
 2   CryoSleep       12970 non-null  bool    
 3   Cabin           12970 non-null  object  
 4   Destination     12970 non-null  category
 5   Age             12970 non-null  float64 
 6   VIP             12970 non-null  bool    
 7   RoomService     12970 non-null  float64 
 8   FoodCourt       12970 non-null  float64 
 9   ShoppingMall    12970 non-null  float64 
 10  Spa             12970 non-null  float64 
 11  VRDeck          12970 non-null  float64 
 12  Name            12676 non-null  object  
 13  PassengerGroup  12970 non-null  object  
 14  Riding          12970 non-null  object  
 15  TotalBill       12970 non-null  float64 
 16  Cabin_Deck      12970 non-null  category
 17  Cabin_Number

- Getting the dummies from cols

In [20]:
dummy_cols = ['HomePlanet','Destination','Riding','Cabin_Deck','Cabin_Side']

dummies = pd.get_dummies(wip_df[['PassengerId','HomePlanet','Destination','Riding','Cabin_Deck','Cabin_Side']], dummy_na=True, columns=dummy_cols)

In [21]:
# Adding dummies and saving final DataFrame
df_full = wip_df.drop(columns=
    dummy_cols + 
    [
        'Cabin',
        'Name',
        'PassengerGroup',
        'Cabin_Number'
    ]
)

In [22]:
df_full = df_full.merge(dummies, on='PassengerId', how='inner')

# Model selection

In [23]:
# Returning the preprocessed DataFrame to its original values
df_train = df_train[['PassengerId','Transported']].merge(df_full, on='PassengerId', how='inner').copy()
df_test = df_test[['PassengerId']].merge(df_full, on='PassengerId', how='inner').copy()

df_train = df_train.drop(columns='PassengerId')

In [24]:
x = df_train.drop(columns='Transported')
y = df_train['Transported']

In [25]:
# Testing if dataset is valid to training
dm = DummyClassifier(random_state=28)
dm.fit(x, y)

dm_predictions = dm.predict(x)
metrics.accuracy_score(y, dm_predictions)

0.5036236051995858

- Train, Test, Split

In [26]:
train_x, val_x, train_y, val_y = train_test_split(x, y, train_size = 0.6, random_state=28)

- Checking Accuracy from different models

In [27]:
models = [
    LinearSVC(random_state=28),
    GaussianNB(),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=28),
    SVC(random_state=28),
    AdaBoostClassifier(random_state=28)
]

for model in models:
    model.fit(train_x, train_y)

    model_preds = model.predict(val_x)
    print(f'Model: {model}, ACC: {metrics.accuracy_score(val_y, model_preds)}')



Model: LinearSVC(random_state=28), ACC: 0.7380678550891316
Model: GaussianNB(), ACC: 0.7366302472685451
Model: KNeighborsClassifier(), ACC: 0.7708453133985049
Model: RandomForestClassifier(random_state=28), ACC: 0.7880966072455434
Model: SVC(random_state=28), ACC: 0.7886716503737781
Model: AdaBoostClassifier(random_state=28), ACC: 0.7826336975273146


- Checking with different tuning params

In [28]:
# SVC Cru
# RandomForestClassifier Cru
# KNeighborsClassifer

param_grid_kn = ParameterGrid({
    'n_neighbors': [3,5,7,10],
    'weights': ['uniform','distance'],
    'algorithm': ['auto','ball_tree','kd_tree','brute'],
    'leaf_size': [15,30,45,60],
    'p': [1,2]
})

param_grid_rf = ParameterGrid({
    'n_estimators' : [50, 75, 100, 150],
    'criterion': ['gini','entropy','log_loss']
})

param_grid_svc = ParameterGrid({
    'C': list(np.linspace(0.1,1,5)),
    'kernel': ['linear','poly','rbf','sigmoid']
})

In [29]:
scores_list = []
count = 0
for params in param_grid_kn:
    count += 1
    
    model = KNeighborsClassifier(**params)

    model.fit(train_x, train_y)

    val_preds = model.predict(val_x)
    train_preds = model.predict(train_x)

    model.fit(x, y)
    full_preds = model.predict(x)

    scores_list.append([model, metrics.accuracy_score(val_y, val_preds), metrics.accuracy_score(train_y, train_preds), metrics.accuracy_score(y, full_preds)])

    if count % 100 == 0:
        print(f'Tested {count} models for {model}')

df_kn = pd.DataFrame(scores_list)

Tested 100 models for KNeighborsClassifier(algorithm='ball_tree', leaf_size=45, n_neighbors=3,
                     weights='distance')
Tested 200 models for KNeighborsClassifier(algorithm='brute', leaf_size=15, weights='distance')


In [30]:
scores_list = []
count = 0
for params in param_grid_rf:
    count += 1
    
    model = RandomForestClassifier(random_state=28, **params)

    model.fit(train_x, train_y)

    val_preds = model.predict(val_x)
    train_preds = model.predict(train_x)

    model.fit(x, y)
    full_preds = model.predict(x)

    scores_list.append([model, metrics.accuracy_score(val_y, val_preds), metrics.accuracy_score(train_y, train_preds), metrics.accuracy_score(y, full_preds)])

    if count % 100 == 0:
        print(f'Tested {count} models for {model}')

df_rf = pd.DataFrame(scores_list)

In [31]:
# This one takes A LOT to run

# scores_list = {}
# count = 0
# for params in param_grid_svc:
#     count += 1
    
#     model = SVC(random_state=28, **params)

#     model.fit(train_x, train_y)

#     model_preds = model.predict(val_x)
#     scores_list[model] = metrics.accuracy_score(val_y, model_preds)

#     if count % 100 == 0:
#         print(f'Tested {count} models for {model}')

# df_svc = pd.DataFrame.from_dict(scores_list, orient='index').reset_index().rename(columns={0: 'ACC', 'index': 'Model'})

df_svc = pd.read_csv('model_results/svc_results.csv')

In [32]:
df_rf.rename(columns={
    0: 'Model',
    1: 'val_ACC',
    2: 'train_ACC'
}, inplace=True)

In [33]:
df_kn.to_csv('model_results/kn_results.csv', index=False)
df_rf.to_csv('model_results/rf_results.csv', index=False)

- KN Escolhido, KNeighborsClassifier(algorithm='ball_tree', leaf_size=15, n_neighbors=10, p=1, weights='distance')
- KN 3, KNeighborsClassifier(algorithm='brute', n_neighbors=10, p=1, weights='distance'), fitted with full, KN 4 fitted with train
- RandomForestClassifier(criterion='entropy', n_estimators=75, random_state=28)
- RandomForestClassifier(criterion='log_loss', n_estimators=75, random_state=28)
- SVC(C=0.1, kernel='linear', random_state=28)

In [34]:
estimator = RandomForestClassifier(criterion='log_loss', n_estimators=75, random_state=28)
model = AdaBoostClassifier(estimator=estimator, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=28)

In [35]:
estimators = [
    RandomForestClassifier(criterion='entropy', n_estimators=75, random_state=28),
    RandomForestClassifier(criterion='log_loss', n_estimators=75, random_state=28)
]

grid_ada = ParameterGrid({
    'n_estimators': [50,75,100,125,150],
    'learning_rate': [0.1,0.25,0.5,0.75,1],
    'algorithm': ['SAMME','SAMME.R']
})

scores_list = []
count = 0
for estimator in estimators:
    for params in grid_ada:
        count += 1

        model = AdaBoostClassifier(estimator=estimator, n_estimators=params['n_estimators'], learning_rate=params['learning_rate'], algorithm=params['algorithm'])

        model.fit(train_x, train_y)

        val_preds_ada = model.predict(val_x)
        train_preds_ada = model.predict(train_x)

        scores_list.append([model, metrics.accuracy_score(val_y, val_preds_ada), metrics.accuracy_score(train_y, train_preds_ada)])

        if count % 5 == 0:
            print(f'Tested {count} models for {model}')

Tested 5 models for AdaBoostClassifier(algorithm='SAMME',
                   estimator=RandomForestClassifier(criterion='entropy',
                                                    n_estimators=75,
                                                    random_state=28),
                   learning_rate=0.1, n_estimators=150)
Tested 10 models for AdaBoostClassifier(algorithm='SAMME',
                   estimator=RandomForestClassifier(criterion='entropy',
                                                    n_estimators=75,
                                                    random_state=28),
                   learning_rate=0.25, n_estimators=150)
Tested 15 models for AdaBoostClassifier(algorithm='SAMME',
                   estimator=RandomForestClassifier(criterion='entropy',
                                                    n_estimators=75,
                                                    random_state=28),
                   learning_rate=0.5, n_estimators=150)
Tested 20 models fo

KeyboardInterrupt: 

In [36]:
df_ada = pd.DataFrame(scores_list)
df_ada.to_csv('model_results/ada_results.csv')

In [None]:
model.fit(train_x,train_y)

preds_ada = model.predict(val_x)

metrics.accuracy_score(val_y, preds_ada)

0.7725704427832087

In [None]:
model = RandomForestClassifier(criterion='entropy', n_estimators=75, random_state=28)
model.fit(x, y)

preds_kn = model.predict(df_test[df_test.columns[1::]])

In [None]:
preds = pd.DataFrame(preds_kn).rename(columns={0: 'Transported'})

In [None]:
df_sub = pd.concat([df_test['PassengerId'], preds], axis=1)

In [None]:
df_sub.reset_index(drop=True).to_csv('space_titanic_results_rf.csv', index=False)