In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer

In [43]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_train.head()
df_test['Transported'] = False
df = pd.concat([df_train, df_test], sort=False)
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [44]:
df.isna().sum()

PassengerId       0
HomePlanet      288
CryoSleep       310
Cabin           299
Destination     274
Age             270
VIP             296
RoomService     263
FoodCourt       289
ShoppingMall    306
Spa             284
VRDeck          268
Name            294
Transported       0
dtype: int64

In [45]:
df[['Deck','Num','Side']] = df['Cabin'].str.split('/' , expand=True)
df= df.drop(columns=['Cabin'])
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,Num,Side
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,0,P
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,0,S
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,0,S
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,0,S
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,1,S


In [46]:
df['Deck'].value_counts()
df['Num'].value_counts()
df['Deck'] = df['Deck'].fillna('U')
df['Num'] = df['Num'].fillna('-1')
df['Side'] = df['Side'].fillna('U')
df['Side'].value_counts()


Side
S    6381
P    6290
U     299
Name: count, dtype: int64

In [47]:
df['Deck'] = df['Deck'].map({'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'T': 7, 'U': 8})
df['Side'] = df['Side'].map({'U': -1, 'S': 1, 'P': 2})


In [48]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,Num,Side
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,1,0,2
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,5,0,1
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,0,0,1
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,0,0,1
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,5,1,1


In [49]:
df.drop(columns=['Name' , 'PassengerId'], axis=1,inplace=True)
impute_list = ['Age','Deck','Num','VIP','FoodCourt','Side','RoomService','ShoppingMall','Spa','VRDeck','CryoSleep']
rest = list(set(df.columns) - set(impute_list))
df_rest = df[rest].reset_index(drop=True)
imp =  KNNImputer(n_neighbors=5)
df_imputed = imp.fit_transform(df[impute_list])
df_imputed = pd.DataFrame(df_imputed, columns=impute_list)
df_imputed = df_imputed.reset_index(drop=True)
df = pd.concat([df_rest, df_imputed], axis=1)

In [50]:
df.isna().sum()

Transported       0
Destination     274
HomePlanet      288
Age               0
Deck              0
Num               0
VIP               0
FoodCourt         0
Side              0
RoomService       0
ShoppingMall      0
Spa               0
VRDeck            0
CryoSleep         0
dtype: int64

In [51]:
df['HomePlanet'] = df['HomePlanet'].fillna('U')
df['Destination'] = df['Destination'].fillna('U')
category_columns = ['HomePlanet','Destination']
for column in category_columns:
    df = pd.concat([df, pd.get_dummies(df[column], prefix=column)], axis=1)
df = df.drop(columns=category_columns)

In [52]:
total_amt = ['RoomService','ShoppingMall','Spa','VRDeck','FoodCourt']
df['amt_spent'] = df[total_amt].sum(axis=1)
df['std_amt_spent'] = df[total_amt].std(axis=1)
df['mean_amt_spent'] = df[total_amt].mean(axis=1)


In [53]:

df.head()


Unnamed: 0,Transported,Age,Deck,Num,VIP,FoodCourt,Side,RoomService,ShoppingMall,Spa,...,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_U,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_U,amt_spent,std_amt_spent,mean_amt_spent
0,False,39.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,True,False,False,False,False,True,False,0.0,0.0,0.0
1,True,24.0,5.0,0.0,0.0,9.0,1.0,109.0,25.0,549.0,...,False,False,False,False,False,True,False,736.0,227.807375,147.2
2,False,58.0,0.0,0.0,1.0,3576.0,1.0,43.0,0.0,6715.0,...,True,False,False,False,False,True,False,10383.0,3013.383198,2076.6
3,False,33.0,0.0,0.0,0.0,1283.0,1.0,0.0,371.0,3329.0,...,True,False,False,False,False,True,False,5176.0,1373.410427,1035.2
4,True,16.0,5.0,1.0,0.0,70.0,1.0,303.0,151.0,565.0,...,False,False,False,False,False,True,False,1091.0,223.988169,218.2


In [54]:
df.corr()['Transported'].sort_values(ascending=False)

Transported                  1.000000
CryoSleep                    0.324373
HomePlanet_Europa            0.131977
Destination_55 Cancri e      0.083625
FoodCourt                    0.034746
HomePlanet_U                 0.006403
HomePlanet_Mars              0.005643
ShoppingMall                 0.004154
Destination_PSO J318.5-22    0.000760
Destination_U               -0.000554
VIP                         -0.018720
Num                         -0.035240
Age                         -0.050548
Side                        -0.058716
Destination_TRAPPIST-1e     -0.072731
Deck                        -0.084981
HomePlanet_Earth            -0.119644
std_amt_spent               -0.121135
amt_spent                   -0.140425
mean_amt_spent              -0.140425
VRDeck                      -0.142783
Spa                         -0.154759
RoomService                 -0.174781
Name: Transported, dtype: float64

In [55]:
df['3_high-corr'] = df['CryoSleep'] + df['HomePlanet_Europa'] + df['Destination_55 Cancri e']
df['3_low-corr'] = df['amt_spent'] + df['mean_amt_spent'] + df['HomePlanet_Earth']
df.corr()['Transported'].sort_values(ascending=False)

Transported                  1.000000
CryoSleep                    0.324373
3_high-corr                  0.284177
HomePlanet_Europa            0.131977
Destination_55 Cancri e      0.083625
FoodCourt                    0.034746
HomePlanet_U                 0.006403
HomePlanet_Mars              0.005643
ShoppingMall                 0.004154
Destination_PSO J318.5-22    0.000760
Destination_U               -0.000554
VIP                         -0.018720
Num                         -0.035240
Age                         -0.050548
Side                        -0.058716
Destination_TRAPPIST-1e     -0.072731
Deck                        -0.084981
HomePlanet_Earth            -0.119644
std_amt_spent               -0.121135
amt_spent                   -0.140425
mean_amt_spent              -0.140425
3_low-corr                  -0.140448
VRDeck                      -0.142783
Spa                         -0.154759
RoomService                 -0.174781
Name: Transported, dtype: float64

In [None]:
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

ModuleNotFoundError: No module named 'lightgbm'

In [None]:
df_train,df_test = df[:df_train.shape[0]], df[df_train.shape[0]:]
df_test = df_test.drop(columns=['Transported'])
df_train.shape  , df_test.shape

((8693, 25), (4277, 24))

In [None]:
X = df_train.drop(columns=['Transported'])
y = df_train['Transported']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model_1 = XGBClassifier()
model_2 = DecisionTreeClassifier()
model_3 = RandomForestClassifier()
model_4 = LogisticRegression()
model_5 = LGBMClassifier()

NameError: name 'train_test_split' is not defined

In [None]:
model_1.fit(X_train, y_train)
pred = model_1.predict(X_test)
accuracy_score(y_test, pred)

NameError: name 'model_1' is not defined

In [None]:
model_2.fit(X_train, y_train)
pred = model_2.predict(X_test)
accuracy_score(y_test, pred)

In [None]:
model_3.fit(X_train, y_train)  
pred = model_3.predict(X_test)
accuracy_score(y_test, pred)

In [None]:
model_4.fit(X_train, y_train)
pred = model_4.predict(X_test)
accuracy_score(y_test, pred)

In [None]:
model_5.fit(X_train, y_train)
pred = model_5.predict(X_test)
accuracy_score(y_test, pred)

In [None]:
df_dummy = pd.readcsv('test.csv')
pred = model_5.predict(df_test)
final = pd.DataFrame()
final['PassengerId'] = df_dummy['PassengerId']
final['Transported'] = pred

final.to_csv('submission.csv', index=False)
