In [105]:
import pandas as pd
import numpy as np
import pickle
import random
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from lightgbm import LGBMClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

In [20]:
# Load the raw test set; the path is relative to the notebook's working directory.
test = pd.read_csv('Data/test.csv')

In [21]:
# Column dtypes and non-null counts of the raw test set.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4190 non-null   object 
 2   CryoSleep     4184 non-null   object 
 3   Cabin         4177 non-null   object 
 4   Destination   4185 non-null   object 
 5   Age           4186 non-null   float64
 6   VIP           4184 non-null   object 
 7   RoomService   4195 non-null   float64
 8   FoodCourt     4171 non-null   float64
 9   ShoppingMall  4179 non-null   float64
 10  Spa           4176 non-null   float64
 11  VRDeck        4197 non-null   float64
 12  Name          4183 non-null   object 
dtypes: float64(6), object(7)
memory usage: 434.5+ KB


In [22]:
# Preview the first rows of the raw test set.
test.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [23]:
# Split PassengerId (e.g. "0013_01") on "_" into a travel-group number and a
# within-group id, and put the two new columns in front of the remaining ones.
passenger_parts = test['PassengerId'].str.split('_', expand=True)
passenger_parts = passenger_parts.rename(columns={0: 'Group', 1: 'Id'})
test = pd.concat([passenger_parts, test.drop(columns='PassengerId')], axis=1)

In [24]:
# Preview the frame after PassengerId was split into Group and Id.
test.head()

Unnamed: 0,Group,Id,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,13,1,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,18,1,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,19,1,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,21,1,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,23,1,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [27]:
# Split Cabin (e.g. "G/3/S") on "/" into Deck, Room and Side and put the
# three new columns in front of the remaining ones.
cabin_parts = test['Cabin'].str.split('/', expand=True)
cabin_parts = cabin_parts.rename(columns={0: 'Deck', 1: 'Room', 2: 'Side'})
test = pd.concat([cabin_parts, test.drop(columns='Cabin')], axis=1)

In [28]:
# Preview the frame after Cabin was split into Deck, Room and Side.
test.head()

Unnamed: 0,Deck,Room,Side,Group,Id,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,G,3,S,13,1,Earth,True,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,F,4,S,18,1,Earth,False,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,C,0,S,19,1,Europa,True,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,C,1,S,21,1,Europa,False,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,F,5,S,23,1,Earth,False,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [36]:
# Turn the cabin side into a flag: 'S' (starboard) -> True, 'P' (port) -> False.
# Any other value, including missing ones, is left as it is by `replace`.
encodeDict = dict(S=True, P=False)
test = (
    test
    .assign(Side=test['Side'].replace(encodeDict))
    .rename(columns={'Side': 'Starboard'})
)
test.head()

Unnamed: 0,Deck,Room,Starboard,Group,Id,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,G,3,True,13,1,Earth,True,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,F,4,True,18,1,Earth,False,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,C,0,True,19,1,Europa,True,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,C,1,True,21,1,Europa,False,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,F,5,True,23,1,Earth,False,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [39]:
# Split Name into FirstName and LastName and put them in front of the
# remaining columns.
# n=1 limits the split to the first space so that a name with more than two
# tokens cannot create extra integer-named columns (2, 3, ...), which would
# break the later encoding/scaling steps; everything after the first space is
# kept as the last name.
name_parts = (
    test['Name']
    .str.split(' ', n=1, expand=True)
    .rename(columns={0: 'FirstName', 1: 'LastName'})
)
test = pd.concat([name_parts, test.drop(columns='Name')], axis=1)

In [40]:
# Preview the frame after Name was split into FirstName and LastName.
test.head()

Unnamed: 0,FirstName,LastName,Deck,Room,Starboard,Group,Id,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,Nelly,Carsoning,G,3,True,13,1,Earth,True,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0
1,Lerome,Peckers,F,4,True,18,1,Earth,False,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0
2,Sabih,Unhearfus,C,0,True,19,1,Europa,True,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0
3,Meratz,Caltilter,C,1,True,21,1,Europa,False,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0
4,Brence,Harperez,F,5,True,23,1,Earth,False,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0


In [42]:
# Re-check dtypes and non-null counts now that the compound columns are split.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   FirstName     4183 non-null   object 
 1   LastName      4183 non-null   object 
 2   Deck          4177 non-null   object 
 3   Room          4177 non-null   object 
 4   Starboard     4177 non-null   object 
 5   Group         4277 non-null   object 
 6   Id            4277 non-null   object 
 7   HomePlanet    4190 non-null   object 
 8   CryoSleep     4184 non-null   object 
 9   Destination   4185 non-null   object 
 10  Age           4186 non-null   float64
 11  VIP           4184 non-null   object 
 12  RoomService   4195 non-null   float64
 13  FoodCourt     4171 non-null   float64
 14  ShoppingMall  4179 non-null   float64
 15  Spa           4176 non-null   float64
 16  VRDeck        4197 non-null   float64
dtypes: float64(6), object(11)
memory usage: 568.2+ KB


In [45]:
# Cast the numeric-looking columns to pandas' nullable integer dtype so that
# missing values can stay missing.
convertCols = ['Room', 'Group', 'Id', 'Age']
test = test.astype({column: 'Int64' for column in convertCols})

In [47]:
# Cast the True/False columns to pandas' nullable boolean dtype so that
# missing values can stay missing.
boolCols = ['Starboard', 'CryoSleep', 'VIP']
test = test.astype({column: 'boolean' for column in boolCols})

In [48]:
# Preview the frame after the integer and boolean dtype conversions.
test.head()

Unnamed: 0,FirstName,LastName,Deck,Room,Starboard,Group,Id,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,Nelly,Carsoning,G,3,True,13,1,Earth,True,TRAPPIST-1e,27,False,0.0,0.0,0.0,0.0,0.0
1,Lerome,Peckers,F,4,True,18,1,Earth,False,TRAPPIST-1e,19,False,0.0,9.0,0.0,2823.0,0.0
2,Sabih,Unhearfus,C,0,True,19,1,Europa,True,55 Cancri e,31,False,0.0,0.0,0.0,0.0,0.0
3,Meratz,Caltilter,C,1,True,21,1,Europa,False,TRAPPIST-1e,38,False,0.0,6652.0,0.0,181.0,585.0
4,Brence,Harperez,F,5,True,23,1,Earth,False,TRAPPIST-1e,20,False,10.0,0.0,635.0,0.0,0.0


In [49]:
# Count missing values per column before any imputation.
test.isna().sum()

FirstName        94
LastName         94
Deck            100
Room            100
Starboard       100
Group             0
Id                0
HomePlanet       87
CryoSleep        93
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
dtype: int64

#### Europan Conclusions
+ 43.9% chance of being in cryosleep
+ 6.3% chance of being a VIP
+ Destination
    - 56.8% chance of going to TRAPPIST-1e
    - 42.3% chance of going to 55 Cancri e
    - 0.9% chance of going to PSO J318.5-22
+ Age - There are almost no young Europans (< 13 years old)
+ On decks A, B, C, D, E, and T, with a slightly higher chance of being on the starboard side

#### Martian Conclusions
+ 36% chance of not being in cryosleep
+ 3.8% chance of being a VIP
+ Destination
    - 85% chance of going to TRAPPIST-1e
    - 11% chance of going to 55 Cancri e
    - 3% chance of going to PSO J318.5-22
+ Many more 30+ year olds than among Terrans
+ Only on decks F, E, and D

#### Terran Conclusions
+ 31% chance of being in cryosleep
+ No Terran in the list is a VIP
+ Destination
    - 68.9% are going to TRAPPIST-1e
    - 15.8% are going to PSO J318.5-22
    - 15.3% are going to 55 Cancri e
+ Most are between 20 and 30 years old; very few are above 63
+ By far most Terrans spend 0 on amenities; moderate spending is spread evenly across all amenities
+ Only on decks G, F, and E

In [50]:
# Fill unknown home planets from the deck: decks A, B, C and T are treated as
# Europa-only and deck G as Earth-only.
home_unknown = test['HomePlanet'].isna()
planet_decks = {'Europa': ['A', 'B', 'C', 'T'], 'Earth': ['G']}
for planet, decks in planet_decks.items():
    test.loc[home_unknown & test['Deck'].isin(decks), 'HomePlanet'] = planet

In [51]:
# Missing values per column after the deck-based HomePlanet fill.
test.isna().sum()

FirstName        94
LastName         94
Deck            100
Room            100
Starboard       100
Group             0
Id                0
HomePlanet       51
CryoSleep        93
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
dtype: int64

In [52]:
# A VIP on deck F with an unknown home planet is assigned to Mars.
home_unknown = test['HomePlanet'].isna()
vip_on_deck_f = (test['VIP'] == True) & (test['Deck'] == 'F')
test.loc[home_unknown & vip_on_deck_f, 'HomePlanet'] = 'Mars'

In [53]:
# Missing values per column after the deck-F VIP rule (the recorded output
# shows HomePlanet unchanged at 51, i.e. the rule matched no rows).
test.isna().sum()

FirstName        94
LastName         94
Deck            100
Room            100
Starboard       100
Group             0
Id                0
HomePlanet       51
CryoSleep        93
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
dtype: int64

In [55]:
# Passengers sharing a last name are given the same home planet: every row
# with an unknown HomePlanet takes the HomePlanet of the first row (in frame
# order) that has the same LastName and a known planet. Rows without a last
# name, or whose last name has no known planet, stay unknown.
rows_with_planet = test.loc[test['HomePlanet'].notna()]
planet_by_last_name = rows_with_planet.groupby('LastName')['HomePlanet'].first()
test['HomePlanet'] = test['HomePlanet'].fillna(test['LastName'].map(planet_by_last_name))

In [56]:
# Missing values per column after the shared-last-name HomePlanet fill.
test.isna().sum()

FirstName        94
LastName         94
Deck            100
Room            100
Starboard       100
Group             0
Id                0
HomePlanet       16
CryoSleep        93
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
dtype: int64

In [57]:
# Show every member of multi-passenger groups that contain at least one
# passenger with an unknown home planet, so the planet can be read off the
# travelling companions.
groupList = test.loc[test['HomePlanet'].isna(), 'Group'].tolist()
in_affected_group = test['Group'].isin(groupList)
travels_with_others = test['Group'].duplicated(keep=False)
test.loc[in_affected_group & travels_with_others]

Unnamed: 0,FirstName,LastName,Deck,Room,Starboard,Group,Id,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
2554,Weres,Cola,F,1066,True,5588,1,Mars,False,TRAPPIST-1e,24,False,18.0,25.0,0.0,0.0,1256.0
2555,Cobix,Cola,F,1066,True,5588,2,Mars,True,TRAPPIST-1e,47,False,0.0,0.0,0.0,0.0,0.0
2556,Cocors,Cola,F,1066,True,5588,3,Mars,True,TRAPPIST-1e,16,False,0.0,0.0,0.0,0.0,0.0
2557,Libers,Dan,F,1066,True,5588,4,,False,TRAPPIST-1e,59,False,4.0,10.0,1143.0,0.0,0.0
2558,Fes,Cola,F,1066,True,5588,5,Mars,True,TRAPPIST-1e,31,False,0.0,0.0,0.0,0.0,0.0
2951,,,G,1052,False,6499,1,Earth,False,TRAPPIST-1e,11,False,0.0,0.0,0.0,0.0,0.0
2952,,,F,1346,False,6499,2,,False,TRAPPIST-1e,13,False,2195.0,0.0,21.0,0.0,0.0
2953,Benry,Barnolderg,G,1052,False,6499,3,Earth,False,TRAPPIST-1e,2,False,0.0,0.0,0.0,0.0,0.0
2954,Berta,Barnolderg,G,1052,False,6499,4,Earth,False,PSO J318.5-22,12,False,0.0,0.0,0.0,0.0,0.0
2955,Verly,Barnolderg,F,1346,False,6499,5,Earth,False,TRAPPIST-1e,27,False,0.0,0.0,0.0,761.0,141.0


In [59]:
# Manually assign home planets to passengers whose travel group reveals it.
# Rows 2557 and 2952 match the groups listed in the previous cell's output
# (group 5588 is otherwise all Mars, group 6499 otherwise all Earth).
# NOTE(review): rows 2989, 3431 and 3456 are not visible in that output;
# presumably they were chosen the same way - confirm.
# The column position is looked up by name instead of the hard-coded 7 so the
# assignment still hits HomePlanet if the column order ever changes.
home_col = test.columns.get_loc('HomePlanet')
group_based_planets = {
    2557: 'Mars',
    2952: 'Earth',
    2989: 'Earth',
    3431: 'Mars',
    3456: 'Mars',
}
for row_pos, planet in group_based_planets.items():
    test.iloc[row_pos, home_col] = planet


In [60]:
# Missing values per column after the manual group-based HomePlanet fills.
test.isna().sum()

FirstName        94
LastName         94
Deck            100
Room            100
Starboard       100
Group             0
Id                0
HomePlanet       11
CryoSleep        93
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
dtype: int64

In [61]:
# List the passengers whose HomePlanet is still unknown.
test.loc[test['HomePlanet'].isna()]

Unnamed: 0,FirstName,LastName,Deck,Room,Starboard,Group,Id,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
276,Arkaban,Spriney,E,40,True,616,1,,False,TRAPPIST-1e,31,False,0.0,428.0,,1154.0,1025.0
340,Skunch,Ponie,F,135,True,730,1,,False,TRAPPIST-1e,44,False,14.0,0.0,1817.0,2.0,171.0
464,Andan,Fryan,F,180,True,967,1,,False,TRAPPIST-1e,19,False,1.0,0.0,0.0,0.0,632.0
481,Mara,Ments,F,192,True,1021,1,,False,TRAPPIST-1e,23,False,0.0,458.0,151.0,256.0,0.0
1890,Ankabih,Coneveseng,D,136,False,4052,1,,False,TRAPPIST-1e,26,False,6.0,5903.0,0.0,240.0,72.0
2043,Cupers,Watie,F,832,True,4414,1,,True,55 Cancri e,32,False,0.0,0.0,0.0,0.0,0.0
2365,Bonya,Goodson,F,1048,False,5139,1,,False,55 Cancri e,18,False,231.0,508.0,594.0,0.0,585.0
2471,Pies,Gelle,F,1028,True,5373,1,,False,TRAPPIST-1e,27,False,465.0,0.0,76.0,2.0,883.0
3220,Zedares,Maltorted,E,471,True,7065,1,,True,TRAPPIST-1e,28,False,0.0,0.0,0.0,0.0,
3858,,,F,1730,False,8435,1,,,TRAPPIST-1e,19,False,162.0,11.0,0.0,0.0,1216.0


In [77]:
# Second round of manual HomePlanet assignments for individual passengers.
# NOTE(review): the notebook does not record why each planet was chosen;
# presumably it follows the per-planet deck/spending conclusions - confirm.
# The column position is looked up by name instead of the hard-coded 7 so the
# assignment still hits HomePlanet if the column order ever changes.
home_col = test.columns.get_loc('HomePlanet')
individual_planets = {
    2365: 'Europa',
    2043: 'Europa',
    276: 'Mars',
    340: 'Mars',
    464: 'Earth',
    1890: 'Europa',
    3858: 'Earth',
}
for row_pos, planet in individual_planets.items():
    test.iloc[row_pos, home_col] = planet



In [78]:
# List the passengers whose HomePlanet is still unknown after the manual fills.
test.loc[test['HomePlanet'].isna()]

Unnamed: 0,FirstName,LastName,Deck,Room,Starboard,Group,Id,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
481,Mara,Ments,F,192,True,1021,1,,False,TRAPPIST-1e,23,False,0.0,458.0,151.0,256.0,0.0
2471,Pies,Gelle,F,1028,True,5373,1,,False,TRAPPIST-1e,27,False,465.0,0.0,76.0,2.0,883.0
3220,Zedares,Maltorted,E,471,True,7065,1,,True,TRAPPIST-1e,28,False,0.0,0.0,0.0,0.0,
4032,Raston,Maltorted,D,275,False,8775,1,,True,TRAPPIST-1e,40,False,0.0,0.0,0.0,0.0,0.0


In [79]:
# Passengers from Earth with an unknown VIP flag are set to non-VIP.
earth_vip_unknown = (test['HomePlanet'] == 'Earth') & test['VIP'].isna()
test.loc[earth_vip_unknown, 'VIP'] = False

In [80]:
# Anyone with a non-zero total across the amenity columns is marked as not in
# cryosleep. The row sum skips missing amounts, and the flag is overwritten
# for every such row, not only where CryoSleep was missing.
amenetiesCols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
total_spend = test[amenetiesCols].sum(axis=1)
test.loc[total_spend.ne(0), 'CryoSleep'] = False

In [83]:
# Randomly impute missing CryoSleep flags using a per-planet probability of
# being in cryosleep (in percent).
# random.choices defaults to k=1, which previously produced a single draw that
# was broadcast to every missing row of a planet; k is now the number of
# missing rows so each passenger gets an independent draw.
# NOTE(review): the notes above list 36% for Mars as the chance of *not* being
# in cryosleep, while this list uses it as the in-cryosleep probability -
# confirm which is intended.
random.seed(42)  # make the random imputations reproducible across re-runs
planets = ['Europa', 'Mars', 'Earth']
cryoPer = [43.9, 36.0, 31.0]
for planet, pct in zip(planets, cryoPer):
    missing = (test['HomePlanet'] == planet) & test['CryoSleep'].isna()
    n_missing = int(missing.sum())
    if n_missing:
        test.loc[missing, 'CryoSleep'] = random.choices(
            [True, False], weights=(pct, 100 - pct), k=n_missing
        )

In [84]:
# Randomly impute missing VIP flags using a per-planet VIP probability (in
# percent), in the same order as `planets` (defined in the CryoSleep cell).
# random.choices defaults to k=1, which previously produced a single draw that
# was broadcast to every missing row of a planet; k is now the number of
# missing rows so each passenger gets an independent draw.
vipPer = [6.3, 3.8, 0]
for planet, pct in zip(planets, vipPer):
    missing = (test['HomePlanet'] == planet) & test['VIP'].isna()
    n_missing = int(missing.sum())
    if n_missing:
        test.loc[missing, 'VIP'] = random.choices(
            [True, False], weights=(pct, 100 - pct), k=n_missing
        )

In [85]:
# Randomly impute missing decks, choosing uniformly among the decks listed for
# each home planet (same order as `planets`, defined in the CryoSleep cell).
# random.choices defaults to k=1, which previously put every passenger with a
# missing deck from the same planet on one and the same deck; k is now the
# number of missing rows so each passenger gets an independent draw.
deckLists = [['A', 'B', 'C', 'D', 'E', 'T'], ['F', 'E', 'D'], ['G', 'F', 'E']]
for planet, decks in zip(planets, deckLists):
    missing = (test['HomePlanet'] == planet) & test['Deck'].isna()
    n_missing = int(missing.sum())
    if n_missing:
        test.loc[missing, 'Deck'] = random.choices(decks, k=n_missing)

In [86]:
# Give every passenger with a missing room number a room that already exists
# on their deck, preferring rooms that are not yet "full". A room counts as
# full once it holds as many passengers as the most crowded room on that deck
# had before this cell ran.
#
# Fixes compared with the previous version:
#   * the occupancy used to be reduced with `.count().any()`, i.e. a boolean,
#     so the capacity comparison never looked at the real head count;
#   * the per-deck candidate list could contain the missing value itself;
#   * the retry loop could spin forever when every room on a deck was full -
#     candidates are now filtered up front with a fallback to any known room;
#   * the Room column is located by name instead of the hard-coded position 3.

# deck -> {room number: number of passengers currently assigned to it}
room_occupancy = {}
rooms_known = test.dropna(subset=['Deck', 'Room'])
for (deck, room), head_count in rooms_known.groupby(['Deck', 'Room']).size().items():
    if head_count:
        room_occupancy.setdefault(deck, {})[int(room)] = int(head_count)

# deck -> highest observed number of passengers sharing one room
deck_capacity = {deck: max(rooms.values()) for deck, rooms in room_occupancy.items()}

room_col = test.columns.get_loc('Room')
for row_pos in np.flatnonzero(test['Room'].isna().to_numpy()):
    deck_rooms = room_occupancy.get(test['Deck'].iat[row_pos])
    if not deck_rooms:
        # Unknown deck, or a deck without any known room: nothing to sample
        # from, so the value stays missing for the KNN imputation further down.
        continue
    capacity = deck_capacity[test['Deck'].iat[row_pos]]
    open_rooms = [room for room, head_count in deck_rooms.items() if head_count < capacity]
    chosen_room = random.choice(open_rooms or list(deck_rooms))
    test.iloc[row_pos, room_col] = chosen_room
    # Keep the running occupancy up to date for the next passenger.
    deck_rooms[chosen_room] += 1

In [87]:
# Randomly impute missing destinations using per-planet destination weights
# (in percent; same order as `planets`, defined in the CryoSleep cell, and as
# `destinations` within each row). The weights are relative, so they do not
# have to add up to exactly 100.
# random.choices defaults to k=1, which previously sent every passenger with a
# missing destination from the same planet to one and the same place; k is now
# the number of missing rows so each passenger gets an independent draw.
destPer = [[56.8, 42.3, 0.9], [85.0, 11.0, 3.0], [68.9, 15.3, 15.8]]
destinations = ['TRAPPIST-1e', '55 Cancri e', 'PSO J318.5-22']
for planet, weights in zip(planets, destPer):
    missing = test['Destination'].isna() & (test['HomePlanet'] == planet)
    n_missing = int(missing.sum())
    if n_missing:
        test.loc[missing, 'Destination'] = random.choices(
            destinations, weights=weights, k=n_missing
        )

In [88]:
# Missing values per column after the rule-based and random imputations.
test.isna().sum()

FirstName        94
LastName         94
Deck              0
Room              0
Starboard       100
Group             0
Id                0
HomePlanet        4
CryoSleep         0
Destination       0
Age              91
VIP               0
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
dtype: int64

In [89]:
# The name columns are not used from here on, so remove them.
test = test.drop(columns=['FirstName', 'LastName'])

In [91]:
# One-hot encode the categorical columns and min-max scale everything so the
# KNN imputer in the next cell works on comparable ranges.
catVars = ['Deck', 'HomePlanet', 'Destination']
catDummies = pd.get_dummies(test[catVars]).astype(float)
# get_dummies encodes a missing category as an all-zero row. Left like that,
# the imputer sees nothing to fill and the later idxmax-based decoding falls
# back to the first dummy column for those rows. Re-mark the dummies as
# missing wherever the source category is missing so they are actually imputed.
for cat in catVars:
    cat_dummy_cols = [name for name in catDummies.columns if name.startswith(cat + '_')]
    catDummies.loc[test[cat].isna(), cat_dummy_cols] = np.nan
testDum = pd.concat([test.drop(catVars, axis = 1), catDummies], axis = 1)
# Scale Data
scaler = MinMaxScaler()
testDumKNN = pd.DataFrame(scaler.fit_transform(testDum), columns = testDum.columns)

In [92]:
# Fill the remaining gaps in the scaled frame with a 5-nearest-neighbour imputer.
imputer = KNNImputer(n_neighbors=5)
imputed_values = imputer.fit_transform(testDumKNN)
testDumKNN = pd.DataFrame(imputed_values, columns=testDumKNN.columns)
testDumKNN.head()

Unnamed: 0,Room,Starboard,Group,Id,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,Deck_E,Deck_F,Deck_G,Deck_T,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,0.001587,1.0,0.0,0.0,1.0,0.341772,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.002116,1.0,0.00054,0.0,0.0,0.240506,0.0,0.0,0.000356,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,0.000648,0.0,1.0,0.392405,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.000529,1.0,0.000864,0.0,0.0,0.481013,0.0,0.0,0.263206,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0.002646,1.0,0.001079,0.0,0.0,0.253165,0.0,0.000865,0.0,0.07658,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [93]:
# Undo the min-max scaling so the imputed values are back in original units.
unscaled_values = scaler.inverse_transform(testDumKNN)
testDumKNN = pd.DataFrame(unscaled_values, columns=testDumKNN.columns)
testDumKNN

Unnamed: 0,Room,Starboard,Group,Id,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,Deck_E,Deck_F,Deck_G,Deck_T,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,3.0,1.0,13.0,1.0,1.0,27.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,4.0,1.0,18.0,1.0,0.0,19.0,0.0,0.0,9.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,19.0,1.0,1.0,31.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,1.0,1.0,21.0,1.0,0.0,38.0,0.0,0.0,6652.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,5.0,1.0,23.0,1.0,0.0,20.0,0.0,10.0,0.0,635.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,1496.0,1.0,9266.0,2.0,1.0,34.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4273,2.0,0.4,9269.0,1.0,0.0,42.0,0.0,0.0,847.0,17.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4274,296.0,0.0,9271.0,1.0,1.0,30.2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4275,297.0,0.0,9273.0,1.0,0.0,34.4,0.0,0.0,2680.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [94]:
# Undummy columns
decklist = [col for col in testDumKNN if 'Deck_' in col]
testDumKNN['Destination'] = testDumKNN[['Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'Destination_55 Cancri e']].idxmax(axis=1).str.replace('Destination_', '')

In [95]:
# Rebuild Deck from whichever of its dummy columns is largest in each row.
strongest_deck_dummy = testDumKNN[decklist].idxmax(axis=1)
testDumKNN['Deck'] = strongest_deck_dummy.str.replace('Deck_', '')

In [96]:
# Rebuild HomePlanet from whichever of its dummy columns is largest in each row.
planetList = [name for name in testDumKNN.columns if 'HomePlanet_' in name]
strongest_planet_dummy = testDumKNN[planetList].idxmax(axis=1)
testDumKNN['HomePlanet'] = strongest_planet_dummy.str.replace('HomePlanet_', '')

In [97]:
# Every column of `test` that still contains a missing value is replaced
# wholesale by its imputed counterpart (known values are replaced too, by
# their scaled-and-unscaled copies).
columns_with_gaps = test.columns[test.isna().any()]
for column in columns_with_gaps:
    test[column] = testDumKNN[column]

In [98]:
# Confirm that no missing values remain after copying back the imputed columns.
test.isna().sum()

Deck            0
Room            0
Starboard       0
Group           0
Id              0
HomePlanet      0
CryoSleep       0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
dtype: int64

In [99]:
# Final feature preparation: drop the id columns, one-hot encode the
# categorical columns (first level dropped), turn flags into 0/1 integers and
# min-max scale the float columns.
test = test.drop(columns=['Group', 'Id'])
test = pd.concat([pd.get_dummies(test[catVars], drop_first = True), test.drop(catVars, axis = 1)], axis = 1)
test[test.select_dtypes('boolean').columns] = test.select_dtypes('boolean').astype(int)
# Starboard comes back from the KNN imputer as a float and may be fractional
# (e.g. 0.4 or 0.6). Round to the nearest flag before casting; a bare
# astype(int) truncates, turning every fractional value into 0.
test['Starboard'] = test['Starboard'].round().astype(int)
# NOTE(review): this scaler is fitted on the test set itself. If the training
# features were scaled with a scaler fitted on the training set, the two
# frames are on different scales - confirm and reuse the training scaler if so.
scaler = MinMaxScaler()
test[test.select_dtypes(float).columns] = scaler.fit_transform(test.select_dtypes(float))

In [100]:
# Display the encoded and scaled feature frame.
test

Unnamed: 0,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,...,Room,Starboard,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,0,0,0,0,0,1,0,0,0,0,...,3,1,1,0.341772,0,0.000000,0.000000,0.00000,0.000000,0.000000
1,0,0,0,0,1,0,0,0,0,0,...,4,1,0,0.240506,0,0.000000,0.000356,0.00000,0.142260,0.000000
2,0,1,0,0,0,0,0,1,0,0,...,0,1,1,0.392405,0,0.000000,0.000000,0.00000,0.000000,0.000000
3,0,1,0,0,0,0,0,1,0,0,...,1,1,0,0.481013,0,0.000000,0.263206,0.00000,0.009121,0.026266
4,0,0,0,0,1,0,0,0,0,0,...,5,1,0,0.253165,0,0.000865,0.000000,0.07658,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,0,0,0,0,0,1,0,0,0,0,...,1496,1,1,0.430380,0,0.000000,0.000000,0.00000,0.000000,0.000000
4273,0,0,0,1,0,0,0,0,0,0,...,2,0,0,0.531646,0,0.000000,0.033514,0.00205,0.000504,0.006466
4274,0,0,1,0,0,0,0,0,1,0,...,296,0,1,0.382278,0,0.000000,0.000000,0.00000,0.000000,0.000000
4275,0,0,1,0,0,0,0,1,0,0,...,297,0,0,0.435443,0,0.000000,0.106042,0.00000,0.000000,0.023482


In [104]:
# Candidate classifiers, all with default hyperparameters.
# NOTE(review): none of these instances is used in the visible part of the
# notebook (predictions come from the pickled model) - confirm they are needed.

# Tree ensembles and gradient boosting
rf = RandomForestClassifier()
gbc = GradientBoostingClassifier()
lgb = LGBMClassifier()
xgb = XGBClassifier()

# Kernel, linear and neural-network models
svm = SVC()
sgd = SGDClassifier()
mlp = MLPClassifier()

In [112]:
# Check the final dtypes before handing the frame to the model.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Deck_B                     4277 non-null   uint8  
 1   Deck_C                     4277 non-null   uint8  
 2   Deck_D                     4277 non-null   uint8  
 3   Deck_E                     4277 non-null   uint8  
 4   Deck_F                     4277 non-null   uint8  
 5   Deck_G                     4277 non-null   uint8  
 6   Deck_T                     4277 non-null   uint8  
 7   HomePlanet_Europa          4277 non-null   uint8  
 8   HomePlanet_Mars            4277 non-null   uint8  
 9   Destination_PSO J318.5-22  4277 non-null   uint8  
 10  Destination_TRAPPIST-1e    4277 non-null   uint8  
 11  Room                       4277 non-null   Int64  
 12  Starboard                  4277 non-null   int32  
 13  CryoSleep                  4277 non-null   int32

In [118]:
# Cast the uint8 dummy columns to the platform's default integer dtype.
uint8_columns = test.select_dtypes('uint8').columns
test[uint8_columns] = test[uint8_columns].astype(int)

In [123]:
# Room no longer contains missing values, so move it from the nullable Int64
# dtype to a plain integer dtype.
room_numbers = test['Room']
test['Room'] = room_numbers.astype(int)

In [110]:
# Load the previously trained model. The context manager closes the file
# handle, which the bare open() call used to leave open.
# NOTE: unpickling executes arbitrary code from the file - only load
# 'bestmodel.pkl' if it comes from a trusted source.
with open('bestmodel.pkl', 'rb') as model_file:
    bestModel = pickle.load(model_file)

In [125]:
# Predict the target for every row of the prepared test set.
# NOTE(review): the recorded class counts below (4247 vs 30) are extremely
# one-sided - check that the test features (column order, scaling, unscaled
# Room) match what the model was trained on.
testPred = bestModel.predict(test)

In [128]:
# Distribution of the predicted classes.
pd.Series(testPred).value_counts()

0    4247
1      30
dtype: int64

In [129]:
# Load the already-cleaned training set to compare its features with the test set.
train = pd.read_csv('cleanedtrain.csv')

In [130]:
# Columns of the cleaned training set.
train.columns

Index(['Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_T',
       'HomePlanet_Europa', 'HomePlanet_Mars', 'Destination_PSO J318.5-22',
       'Destination_TRAPPIST-1e', 'Room', 'Starboard', 'CryoSleep', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Transported'],
      dtype='object')

In [131]:
# Columns of the prepared test set, for comparison with the training columns.
test.columns

Index(['Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_T',
       'HomePlanet_Europa', 'HomePlanet_Mars', 'Destination_PSO J318.5-22',
       'Destination_TRAPPIST-1e', 'Room', 'Starboard', 'CryoSleep', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'],
      dtype='object')

In [136]:
# Print the training-set value counts for every feature present in the test
# set. The separator was written as '/n', which printed a literal "/n"
# instead of a blank line; it is now a real newline.
for column in test.columns:
    print(f'The value counts for {column} are {train[column].value_counts()}\n')

The value counts for Deck_B are 0    7914
1     779
Name: Deck_B, dtype: int64/n
The value counts for Deck_C are 0    7946
1     747
Name: Deck_C, dtype: int64/n
The value counts for Deck_D are 0    8215
1     478
Name: Deck_D, dtype: int64/n
The value counts for Deck_E are 0    7717
1     976
Name: Deck_E, dtype: int64/n
The value counts for Deck_F are 0    5899
1    2794
Name: Deck_F, dtype: int64/n
The value counts for Deck_G are 0    6035
1    2658
Name: Deck_G, dtype: int64/n
The value counts for Deck_T are 0    8688
1       5
Name: Deck_T, dtype: int64/n
The value counts for HomePlanet_Europa are 0    6518
1    2175
Name: HomePlanet_Europa, dtype: int64/n
The value counts for HomePlanet_Mars are 0    6889
1    1804
Name: HomePlanet_Mars, dtype: int64/n
The value counts for Destination_PSO J318.5-22 are 0    7897
1     796
Name: Destination_PSO J318.5-22, dtype: int64/n
The value counts for Destination_TRAPPIST-1e are 1    6060
0    2633
Name: Destination_TRAPPIST-1e, dtype: int64

In [138]:
# Persist the prepared test features without the index column.
test.to_csv('cleanedtest.csv', index = False)

In [139]:
# Display the frame that was written to cleanedtest.csv.
test

Unnamed: 0,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,...,Room,Starboard,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,0,0,0,0,0,1,0,0,0,0,...,3,1,1,0.341772,0,0.000000,0.000000,0.00000,0.000000,0.000000
1,0,0,0,0,1,0,0,0,0,0,...,4,1,0,0.240506,0,0.000000,0.000356,0.00000,0.142260,0.000000
2,0,1,0,0,0,0,0,1,0,0,...,0,1,1,0.392405,0,0.000000,0.000000,0.00000,0.000000,0.000000
3,0,1,0,0,0,0,0,1,0,0,...,1,1,0,0.481013,0,0.000000,0.263206,0.00000,0.009121,0.026266
4,0,0,0,0,1,0,0,0,0,0,...,5,1,0,0.253165,0,0.000865,0.000000,0.07658,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,0,0,0,0,0,1,0,0,0,0,...,1496,1,1,0.430380,0,0.000000,0.000000,0.00000,0.000000,0.000000
4273,0,0,0,1,0,0,0,0,0,0,...,2,0,0,0.531646,0,0.000000,0.033514,0.00205,0.000504,0.006466
4274,0,0,1,0,0,0,0,0,1,0,...,296,0,1,0.382278,0,0.000000,0.000000,0.00000,0.000000,0.000000
4275,0,0,1,0,0,0,0,1,0,0,...,297,0,0,0.435443,0,0.000000,0.106042,0.00000,0.000000,0.023482
