In [1]:
import pandas as pd
import pickle

In [2]:
data = pd.read_csv('train.csv')

In [3]:
data.shape

(8693, 14)

In [4]:
data.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [5]:
obj_list = []
for types, column in zip(data.dtypes, data.columns) :
    if types == 'object' :
        obj_list.append(column)

In [6]:
for col in obj_list[1:] :
    print(col, data[col].unique())
    print(data[col].value_counts(dropna=False))

HomePlanet ['Europa' 'Earth' 'Mars' nan]
Earth     4602
Europa    2131
Mars      1759
NaN        201
Name: HomePlanet, dtype: int64
CryoSleep [False True nan]
False    5439
True     3037
NaN       217
Name: CryoSleep, dtype: int64
Cabin ['B/0/P' 'F/0/S' 'A/0/S' ... 'G/1499/S' 'G/1500/S' 'E/608/S']
NaN        199
G/734/S      8
C/137/S      7
B/201/P      7
G/109/P      7
          ... 
G/556/P      1
E/231/S      1
G/545/S      1
G/543/S      1
C/178/S      1
Name: Cabin, Length: 6561, dtype: int64
Destination ['TRAPPIST-1e' 'PSO J318.5-22' '55 Cancri e' nan]
TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
NaN               182
Name: Destination, dtype: int64
VIP [False True nan]
False    8291
NaN       203
True      199
Name: VIP, dtype: int64
Name ['Maham Ofracculy' 'Juanna Vines' 'Altark Susent' ... 'Fayey Connon'
 'Celeon Hontichre' 'Propsh Hontichre']
NaN                   200
Sus Coolez              2
Elaney Webstephrey      2
Dia Cartez              2
Grake Por

-------

In [7]:
data.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')

In [8]:
data.groupby('Transported')['Age'].mean()

Transported
False    29.922858
True     27.748834
Name: Age, dtype: float64

In [9]:
data.groupby(['VIP','Transported']).size()

VIP    Transported
False  False          4093
       True           4198
True   False           123
       True             76
dtype: int64

In [10]:
data.groupby(['VIP','HomePlanet']).size()

VIP    HomePlanet
False  Earth         4487
       Europa        1958
       Mars          1653
True   Europa         131
       Mars            63
dtype: int64

In [11]:
data.groupby(['Transported','HomePlanet']).size()

Transported  HomePlanet
False        Earth         2651
             Europa         727
             Mars           839
True         Earth         1951
             Europa        1404
             Mars           920
dtype: int64

---------

## HomePlanet Null 처리

In [12]:
data['Group'] = data['PassengerId'].str[:4]
# data['Group'] = data['PassengerId'].split('_').str[0]

In [13]:
home_group = data.groupby(['Group','HomePlanet']).size().reset_index()

In [14]:
home_dict = {}
for g, hp in zip(home_group['Group'], home_group['HomePlanet']) :
    home_dict[g] = hp

In [15]:
with open('home_dict.pkl', 'wb') as f :
    pickle.dump(home_dict, f)

In [16]:
data['HomePlanet'].fillna(data['Group'].map(home_dict), inplace=True)

In [17]:
data['HomePlanet'].isnull().sum()

111

In [18]:
data[data['HomePlanet'].isnull()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Group
186,0210_01,,True,D/6/P,55 Cancri e,24.0,False,0.0,0.0,,0.0,0.0,Arraid Inicont,True,0210
225,0242_01,,False,F/46/S,TRAPPIST-1e,18.0,False,313.0,1.0,691.0,283.0,0.0,Almone Sté,False,0242
234,0251_01,,True,C/11/S,55 Cancri e,54.0,False,0.0,0.0,0.0,0.0,0.0,Diphah Amsive,True,0251
274,0303_01,,True,G/41/S,TRAPPIST-1e,23.0,False,0.0,0.0,0.0,0.0,0.0,Oraryn Kirklander,True,0303
286,0315_01,,True,G/42/S,PSO J318.5-22,35.0,False,0.0,0.0,0.0,0.0,0.0,Adriet Valezaley,True,0315
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8468,9043_01,,True,F/1848/P,TRAPPIST-1e,25.0,False,0.0,0.0,0.0,0.0,0.0,Cobix Erle,True,9043
8515,9084_01,,False,E/582/P,TRAPPIST-1e,25.0,False,1258.0,0.0,22.0,19.0,0.0,Jurs Mone,False,9084
8666,9248_01,,False,F/1792/S,55 Cancri e,38.0,,28.0,1208.0,973.0,207.0,0.0,Gian Perle,True,9248
8674,9257_01,,False,F/1892/P,TRAPPIST-1e,13.0,False,39.0,0.0,1085.0,24.0,0.0,Ties Apple,False,9257


In [19]:
data['Name2'] = data['Name'].str.split(' ').str[-1]

In [20]:
name_group = data.groupby(['Name2','HomePlanet']).size().reset_index()

In [21]:
name_group[name_group['Name2'].duplicated()]

Unnamed: 0,Name2,HomePlanet,0


In [22]:
name_dict = {}
for n, hp in zip(name_group['Name2'], name_group['HomePlanet']) :
    name_dict[n] = hp

In [23]:
with open('name_dict.pkl', 'wb') as f :
    pickle.dump(name_dict, f)

In [24]:
data['HomePlanet'].fillna(data['Name2'].map(name_dict), inplace=True)

In [25]:
data['HomePlanet'].isnull().sum()

12

In [26]:
data[data['Name2'] == 'Amsive']

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Group,Name2
234,0251_01,,True,C/11/S,55 Cancri e,54.0,False,0.0,0.0,0.0,0.0,0.0,Diphah Amsive,True,251,Amsive


In [27]:
data[data['HomePlanet'].isnull()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Group,Name2
225,0242_01,,False,F/46/S,TRAPPIST-1e,18.0,False,313.0,1.0,691.0,283.0,0.0,Almone Sté,False,242,Sté
234,0251_01,,True,C/11/S,55 Cancri e,54.0,False,0.0,0.0,0.0,0.0,0.0,Diphah Amsive,True,251,Amsive
807,0853_01,,True,A/9/S,55 Cancri e,38.0,False,0.0,0.0,0.0,0.0,0.0,Hamelik Ageurante,True,853,Ageurante
1855,1978_01,,True,G/311/S,TRAPPIST-1e,19.0,False,0.0,0.0,0.0,0.0,0.0,,True,1978,
2274,2443_01,,False,D/72/P,TRAPPIST-1e,31.0,False,1458.0,421.0,76.0,0.0,0.0,,False,2443,
2631,2817_01,,False,F/584/P,TRAPPIST-1e,25.0,False,237.0,0.0,910.0,0.0,12.0,Sealfs Sutty,False,2817,Sutty
3091,3331_01,,False,F/631/S,TRAPPIST-1e,40.0,False,666.0,4.0,83.0,0.0,50.0,,True,3331,
4548,4840_01,,True,F/915/S,TRAPPIST-1e,36.0,False,0.0,0.0,,0.0,0.0,,True,4840,
5252,5603_01,,False,E/365/S,TRAPPIST-1e,34.0,False,170.0,1256.0,0.0,3926.0,7121.0,Kocha Cluitty,False,5603,Cluitty
5634,5989_01,,False,F/1141/S,TRAPPIST-1e,20.0,False,0.0,0.0,,703.0,0.0,Darrie Holcompton,False,5989,Holcompton


In [28]:
data.groupby(['Destination','HomePlanet']).size()

Destination    HomePlanet
55 Cancri e    Earth          700
               Europa         902
               Mars           196
PSO J318.5-22  Earth          725
               Europa          19
               Mars            51
TRAPPIST-1e    Earth         3177
               Europa        1214
               Mars          1515
dtype: int64

In [29]:
data['HomePlanet'] = data['HomePlanet'].fillna(data['Destination'].map(
    {'55 Cancri e':'Europa', 
     'PSO J318.5-22':'Earth', 
     'TRAPPIST-1e':'Earth'}))

In [30]:
data['HomePlanet'].isnull().sum()

0

---------

## VIP Null 처리

In [31]:
data['VIP'].value_counts(dropna=False)

False    8291
NaN       203
True      199
Name: VIP, dtype: int64

In [32]:
data['VIP'].fillna(False, inplace=True)

In [33]:
data['VIP'].isnull().sum()

0

In [34]:
data.isnull().sum()

PassengerId       0
HomePlanet        0
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP               0
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
Group             0
Name2           200
dtype: int64

---

## fee 처리

In [35]:
fee_list = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

In [36]:
for col in fee_list :
    data[col].fillna(data[col].median(), inplace=True)
    

In [37]:
for col in fee_list :
    print(data[col].median())

0.0
0.0
0.0
0.0
0.0


In [38]:
data['fee'] = data['RoomService'] + data['FoodCourt'] + data['ShoppingMall'] + data['Spa'] + data['VRDeck']

In [39]:
data['fee'].isnull().sum()

0

In [40]:
data.isnull().sum()

PassengerId       0
HomePlanet        0
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Name            200
Transported       0
Group             0
Name2           200
fee               0
dtype: int64

---

In [41]:
data['deck'] = data['Cabin'].str.split('/').str[0]

In [42]:
data['side'] = data['Cabin'].str.split('/').str[-1]

In [43]:
data.groupby(['HomePlanet','deck']).size()

HomePlanet  deck
Earth       D          1
            E        404
            F       1652
            G       2559
Europa      A        256
            B        779
            C        747
            D        192
            E        133
            T          5
Mars        D        285
            E        339
            F       1142
dtype: int64

In [44]:
data.groupby(['HomePlanet','side']).size()

HomePlanet  side
Earth       P       2324
            S       2292
Europa      P        972
            S       1140
Mars        P        910
            S        856
dtype: int64

Earth -> G / Europa -> B

In [45]:
data['deck'].fillna(data['HomePlanet'].map({'Earth':'G', 'Europa':'B', 'Mars':'F'}), inplace=True)

In [46]:
data.isnull().sum()

PassengerId       0
HomePlanet        0
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Name            200
Transported       0
Group             0
Name2           200
fee               0
deck              0
side            199
dtype: int64

In [47]:
side_group = data.groupby(['Group', 'side']).size().reset_index()

In [48]:
side_group[side_group['Group'].duplicated()]

Unnamed: 0,Group,side,0


In [49]:
side_dict = {}
for g, s in zip(side_group['Group'], side_group['side']) :
    side_dict[g] = s

In [50]:
with open('side_dict.pkl', 'wb') as f :
    pickle.dump(side_dict, f)

In [51]:
data['side'].fillna(data['Group'].map(side_dict), inplace=True)

In [52]:
data['side'].value_counts()

S    4343
P    4251
Name: side, dtype: int64

In [53]:
data['side'].fillna('S', inplace=True)

In [54]:
data.isnull().sum()

PassengerId       0
HomePlanet        0
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Name            200
Transported       0
Group             0
Name2           200
fee               0
deck              0
side              0
dtype: int64

In [55]:
dest_group = data.groupby(['Group','Destination']).size().reset_index()

In [56]:
dest_group[dest_group['Group'].duplicated()]

Unnamed: 0,Group,Destination,0
8,0008,TRAPPIST-1e,1
17,0017,TRAPPIST-1e,1
19,0020,PSO J318.5-22,1
20,0020,TRAPPIST-1e,3
36,0044,PSO J318.5-22,1
...,...,...,...
6836,9219,TRAPPIST-1e,1
6838,9220,TRAPPIST-1e,2
6844,9227,TRAPPIST-1e,3
6847,9231,TRAPPIST-1e,2


In [57]:
dest_group = dest_group.sort_values(0, ascending=False)

In [58]:
dest_group = dest_group.drop_duplicates()

In [59]:
dest_dict = {}
for g, d in zip(dest_group['Group'], dest_group['Destination']) :
    dest_dict[g] = d

In [60]:
with open('dest_dict.pkl', 'wb') as f :
    pickle.dump(dest_dict, f)

In [61]:
data['Destination'].fillna(data['Group'].map(dest_dict), inplace=True)

In [62]:
data['Destination'].isnull().sum()

103

In [63]:
data['Destination'].fillna('TRAPPIST-1e', inplace=True)

In [64]:
data['Destination'].isnull().sum()

0

In [65]:
data[data['CryoSleep'] == True]['fee'].mean()

0.0

In [66]:
data[data['CryoSleep'].isnull()]['fee']

92         0.0
98       703.0
104     2018.0
111        0.0
152      990.0
         ...  
8620       0.0
8651       0.0
8664       0.0
8675    2056.0
8687    3540.0
Name: fee, Length: 217, dtype: float64

In [67]:
import numpy as np

In [68]:
data['CryoSleep'] = np.where(data['fee'] > 0, False, True)

In [69]:
data['CryoSleep'].isnull().sum()

0

In [70]:
data['Age'].median()

27.0

In [73]:
data['Age'].fillna(data['Age'].median(), inplace=True)

In [74]:
data.isnull().sum()

PassengerId       0
HomePlanet        0
CryoSleep         0
Cabin           199
Destination       0
Age               0
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Name            200
Transported       0
Group             0
Name2           200
fee               0
deck              0
side              0
dtype: int64

In [75]:
del_col = ['PassengerId', 'Cabin', 'Name', 'Name2']

In [76]:
data = data.drop(del_col, axis=1)

In [77]:
data.isnull().sum()

HomePlanet      0
CryoSleep       0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Transported     0
Group           0
fee             0
deck            0
side            0
dtype: int64

In [78]:
data.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Group,fee,deck,side
0,Europa,True,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,1,0.0,B,P
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,2,736.0,F,S
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,3,10383.0,A,S
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,3,5176.0,A,S
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,4,1091.0,F,S


In [79]:
from sklearn.preprocessing import LabelEncoder

In [83]:
obj_col = []
for col, types in zip(data.dtypes.index, data.dtypes.values) :
    if(types == 'object') :
        obj_col.append(col)

In [None]:
data['CryoSleep'].isnull().sum()

In [84]:
obj_col

['HomePlanet', 'Destination', 'Group', 'deck', 'side']

In [86]:
for col in obj_col :
    label = LabelEncoder()
    data[col] = label.fit_transform(data[col])
    
    

In [89]:
data.columns

Index(['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService',
       'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Transported', 'Group',
       'fee', 'deck', 'side'],
      dtype='object')

In [310]:
X = data.drop(['Transported','Group'], axis=1)
Y = data['Transported']

In [311]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier

In [312]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=0)

In [313]:
rfc = RandomForestClassifier()

In [314]:
rfc.fit(x_train, y_train)

RandomForestClassifier()

In [315]:
rfc.score(x_train, y_train)

0.9419039401783147

In [316]:
rfc.score(x_test, y_test)

0.7757331799884991

In [317]:
rfc.feature_names_in_

array(['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP',
       'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'fee',
       'deck', 'side'], dtype=object)

In [318]:
rfc.feature_importances_

array([0.03844737, 0.07113612, 0.02166046, 0.1406655 , 0.00246919,
       0.09619787, 0.09945855, 0.08194453, 0.09589026, 0.09471778,
       0.16650616, 0.07156105, 0.01934517])

In [319]:
abc = AdaBoostClassifier()

In [320]:
abc.fit(x_train, y_train)

AdaBoostClassifier()

In [321]:
abc.score(x_train, y_train)

0.7935001438021283

In [322]:
abc.score(x_test, y_test)

0.7809085681426107

In [323]:
gbc = GradientBoostingClassifier()

In [324]:
gbc.fit(x_train, y_train)

GradientBoostingClassifier()

In [325]:
gbc.score(x_train, y_train)

0.813920046016681

In [326]:
gbc.score(x_test, y_test)

0.78953421506613

In [327]:
model_list = [('abc', abc),('gbc', gbc), ('rfc', rfc)]

In [328]:
vc_model = VotingClassifier(estimators=model_list)

In [329]:
vc_model.fit(x_train, y_train)

VotingClassifier(estimators=[('abc', AdaBoostClassifier()),
                             ('gbc', GradientBoostingClassifier()),
                             ('rfc', RandomForestClassifier())])

In [330]:
vc_model.score(x_test, y_test)

0.7878090856814262

In [331]:
import pickle

In [332]:
with open('vc_model.pkl', 'wb') as f :
    pickle.dump(vc_model, f)

In [None]:
data['deck'] = data['Cabin'].str.split('/').str[0]

In [None]:
data['side'] = data['Cabin'].str.split('/').str[-1]

In [None]:
real['deck'] = real['Cabin'].str.split('/').str[0]

In [None]:
data.groupby(['deck','CryoSleep'])['PassengerId'].count()

In [None]:
deck_dict = {'A':'False','B':'True','C':'False','D':'False','E':'False','F':'False','G':'True','T':'False'}

In [None]:
data['CryoSleep'].fillna(data['deck'].map(deck_dict))

In [None]:
group_deck = data.groupby(['Group','deck'])['PassengerId'].count().reset_index()

In [None]:
data['deck'].value_counts()

In [None]:
data[data['Group'] == '0101']

In [None]:
data[data['VIP'] == True]['deck'].value_counts()

In [None]:
data[data['VIP'] == True]['side'].value_counts()

In [None]:
data[data['Cabin'].isnull()]['Group']

-------

In [None]:
fee_col = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

In [None]:
for col in fee_col :
    print(data[data['Age'] <= 12][col].mean())

In [None]:
data[data['Age'] == 0]['VIP'].value_counts()

In [None]:
data[data['VIP'].isnull()]['HomePlanet'].value_counts()

In [None]:
data[data['VIP'] == True]['HomePlanet'].value_counts()

In [None]:
data.groupby(['side','Transported'])['total'].count()

In [None]:
data.groupby(['HomePlanet','Transported'])['total'].count()

In [None]:
data.groupby(['Transported'])['total'].mean()

In [None]:
data.groupby(['Transported'])['Age'].mean()

In [None]:
data[data['VIP'] == True]

In [None]:
data.groupby('Group').size().reset_index()

In [None]:
data[data['Age'].isnull()]['VIP'].value_counts()

In [None]:
data[data['VIP'] == True]['Group']

In [None]:
data['Group'].value_counts()

In [None]:
data[data['Age'] <= 12]

In [None]:
data[data['Age'].isnull()]

In [None]:
data['total'] = data['RoomService'] + data['FoodCourt'] + data['ShoppingMall'] + data['Spa'] + data['VRDeck']

In [None]:
data[data['VIP'] == False]['total'].max()

In [None]:
data[data['VIP'] == True]['total'].max()

In [None]:
data.groupby('VIP')['total'].median()

In [None]:
data.groupby('VIP')['total'].mean()

---

In [None]:
real = pd.read_csv('test.csv')

In [None]:
real.shape

In [None]:
real.head()

In [None]:
real.isnull().sum()

In [None]:
real[real['VIP'].isnull()]['HomePlanet'].value_counts()

In [None]:
real[real['VIP'] == True]['HomePlanet'].value_counts()

In [None]:
submit = pd.read_csv('sample_submission.csv')

In [None]:
submit.head()