In [91]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
pd.set_option('future.no_silent_downcasting', True)

In [92]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [93]:
print(train.shape)
print(test.shape)

(8693, 14)
(4277, 13)


In [94]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [95]:
test.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [96]:
# checking for imbalance
train.Transported.value_counts()

Transported
True     4378
False    4315
Name: count, dtype: int64

In [97]:
train.Cabin.value_counts()

Cabin
G/734/S     8
F/1194/P    7
B/201/P     7
G/981/S     7
G/109/P     7
           ..
E/56/P      1
A/98/P      1
G/1499/S    1
G/1500/S    1
D/252/P     1
Name: count, Length: 6560, dtype: int64

In [98]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [99]:
print(train.HomePlanet.unique())
print(test.HomePlanet.unique())
print(train.isna().sum())
print(test.isna().sum())

['Europa' 'Earth' 'Mars' nan]
['Earth' 'Europa' 'Mars' nan]
PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64
PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64


In [100]:
# handling na values
amenities = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']

for col in amenities:
    train[col] = train[col].fillna(0)


In [101]:
print(train.isna().sum())

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Name            200
Transported       0
dtype: int64


In [102]:
# filling age with median
train['Age'] = train.Age.fillna(train.Age.median())

In [103]:
#filling categoric data with most frequent
cat_col = ['HomePlanet','Destination','VIP']
for col in cat_col:
    train[col] = train[col].fillna(train[col].mode()[0])

In [104]:
train['Cabin'] = train.Cabin.fillna('Unknown')

In [105]:
train.loc[(train['CryoSleep'].isnull()) & (train[amenities].sum(axis=1) > 0), 'CryoSleep'] = False

In [106]:
train['CryoSleep'] = train.CryoSleep.fillna(train.CryoSleep.mode()[0])

In [107]:
print(train.isna().sum())

PassengerId       0
HomePlanet        0
CryoSleep         0
Cabin             0
Destination       0
Age               0
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Name            200
Transported       0
dtype: int64


In [108]:
#using the fill na values in training dataset in testing dataset
cat_col = ['HomePlanet','Destination','VIP']
for col in cat_col:
    test[col] = train[col].fillna(train[col].mode()[0])

In [109]:
test['Cabin'] = train.Cabin.fillna('Unknown')

In [110]:
test.loc[(test['CryoSleep'].isnull()) & (test[amenities].sum(axis=1) > 0), 'CryoSleep'] = False

In [111]:
test['CryoSleep'] = train.CryoSleep.fillna(train.CryoSleep.mode()[0])

In [112]:
amenities = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']

for col in amenities:
    test[col] = train[col].fillna(0)


In [113]:
test['Age'] = train.Age.fillna(train.Age.median())

In [114]:
test.isna().sum()

PassengerId      0
HomePlanet       0
CryoSleep        0
Cabin            0
Destination      0
Age              0
VIP              0
RoomService      0
FoodCourt        0
ShoppingMall     0
Spa              0
VRDeck           0
Name            94
dtype: int64

In [115]:
test.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Lerome Peckers
2,0019_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Sabih Unhearfus
3,0021_01,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Meratz Caltilter
4,0023_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Brence Harperez


In [116]:
# converting bolean values to integers
#train dataset
tar = ['CryoSleep', 'VIP','Transported']
for col in tar:
    train[col] = train[col].astype('int') # converts True, False to 1,0

#test dataset
tar2 = ['CryoSleep', 'VIP']
for col in tar2:
    test[col] = test[col].astype('int')

In [117]:
train.HomePlanet.unique()

array(['Europa', 'Earth', 'Mars'], dtype=object)

In [118]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
# le = LabelEncoder()
# #train dataset
# train['HomePlanet'] = le.fit_transform(train.HomePlanet)

# #test dataset
# test['HomePlanet'] = le.transform(test.HomePlanet)

In [119]:
test.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Europa,0,B/0/P,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,0,F/0/S,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,Lerome Peckers
2,0019_01,Europa,0,A/0/S,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,Sabih Unhearfus
3,0021_01,Europa,0,A/0/S,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,Meratz Caltilter
4,0023_01,Earth,0,F/1/S,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,Brence Harperez


In [120]:
X_train = train.drop(['PassengerId','Name','Transported'], axis=1)
y_train = train['Transported']

X_test = test.drop(['PassengerId','Name'], axis=1)

In [121]:
X_test

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,Europa,0,B/0/P,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0
1,Earth,0,F/0/S,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0
2,Europa,0,A/0/S,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0
3,Europa,0,A/0/S,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0
4,Earth,0,F/1/S,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...
4272,Earth,0,E/298/S,TRAPPIST-1e,42.0,0,0.0,1.0,0.0,17.0,1601.0
4273,Earth,0,F/853/S,TRAPPIST-1e,28.0,0,0.0,180.0,516.0,0.0,0.0
4274,Mars,0,F/937/P,TRAPPIST-1e,43.0,0,375.0,103.0,1990.0,36.0,0.0
4275,Europa,0,C/143/P,PSO J318.5-22,55.0,1,0.0,102.0,0.0,278.0,5353.0


In [123]:
print(X_train.info())
print(X_test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8693 non-null   object 
 1   CryoSleep     8693 non-null   int64  
 2   Cabin         8693 non-null   object 
 3   Destination   8693 non-null   object 
 4   Age           8693 non-null   float64
 5   VIP           8693 non-null   int64  
 6   RoomService   8693 non-null   float64
 7   FoodCourt     8693 non-null   float64
 8   ShoppingMall  8693 non-null   float64
 9   Spa           8693 non-null   float64
 10  VRDeck        8693 non-null   float64
dtypes: float64(6), int64(2), object(3)
memory usage: 747.2+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    4277 non-null   object 
 1   CryoSleep     4277 non-n

In [124]:
# selecting column types
num_col = X_train.select_dtypes('int64','float64').columns
cat_col = X_train.select_dtypes('object').columns

In [126]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

preprocessing = ColumnTransformer(
    transformers=[
        ('numeric',StandardScaler(), num_col),
        ('categoric',OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1), cat_col)
    ]
)

In [127]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

In [128]:
model = Pipeline(
    steps=[
        ('preprocessing',preprocessing),
        ('smote',SMOTE(random_state=42)),
        ('classifier',RandomForestClassifier(random_state=42, n_estimators=100))
    ]
)

In [130]:
model.fit(X_train, y_train)

In [132]:
y_pred = model.predict(X_test)

In [133]:
y_pred

array([0, 1, 0, ..., 1, 0, 0])

In [136]:
submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Transported": y_pred
})

In [139]:
submission.to_csv('my_sumbission.csv', index=False)

In [142]:
df = pd.read_csv('my_sumbission.csv')
df.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,0
1,0018_01,1
2,0019_01,0
3,0021_01,0
4,0023_01,1


In [143]:
df['Transported'] = df.Transported.astype(bool)

In [146]:
df.to_csv('my_submission.csv',index=False)

In [147]:
df.shape

(4277, 2)