### Importing necessary modules

In [19]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor

In [20]:
train = pd.read_csv('spaceship-titanic/train.csv', index_col = 'PassengerId')
test = pd.read_csv('spaceship-titanic/test.csv', index_col = 'PassengerId')

In [21]:
print('Train Shape: ', train.shape)
print('Test Shape: ', test.shape)

Train Shape:  (8693, 13)
Test Shape:  (4277, 12)


### Removing columns that are not required

In [22]:
train.drop(['Name'], axis = 1, inplace = True)
test.drop(['Name'], axis = 1, inplace = True)

In [23]:
train[['deck', 'num', 'side']] = train['Cabin'].str.split('/', expand = True)
train.drop(['Cabin'], axis = 1, inplace = True)

In [24]:
test[['deck', 'num', 'side']] = test['Cabin'].str.split('/', expand = True)
test.drop(['Cabin'], axis = 1, inplace = True)

### Replacing all categorical values to numeric values

In [25]:
train['deck'] = train['deck'].replace({'A':0, 'B':1, 'C':2, 'D':3, 'E':4, 'F':5, 'G':6, 'T':7})
test['deck'] = test['deck'].replace({'A':0, 'B':1, 'C':2, 'D':3, 'E':4, 'F':5, 'G':6, 'T':7})

In [26]:
train['side'] = train['side'].replace({'P':0, 'S':1})
test['side'] = test['side'].replace({'P':0, 'S':1})

In [27]:
train['HomePlanet'] = train['HomePlanet'].replace({'Europa':0, 'Earth':1, 'Mars':2})
test['HomePlanet'] = test['HomePlanet'].replace({'Europa':0, 'Earth':1, 'Mars':2})

In [28]:
train['Destination'] = train['Destination'].replace({'TRAPPIST-1e':0, '55 Cancri e':1, 'PSO J318.5-22':2})
test['Destination'] = test['Destination'].replace({'TRAPPIST-1e':0, '55 Cancri e':1, 'PSO J318.5-22':2})

In [29]:
train[['CryoSleep', 'VIP', 'Transported']] = (train[['CryoSleep', 'VIP', 'Transported']] == True).astype(int)
test[['CryoSleep', 'VIP']] = (test[['CryoSleep', 'VIP']] == True).astype(int)

### Count of null values

In [30]:
nulls = pd.DataFrame(train.isnull().sum().sort_values(ascending = False)[:25])
nulls.columns = ['Null Count']
nulls.index.name = 'Features'

In [31]:
train = train.select_dtypes(include=[np.number]).interpolate().dropna()
test = test.select_dtypes(include=[np.number]).interpolate().dropna()

sum(train.isnull().sum() != 0)

0

### Applying random forest algorithm on the dataset

In [32]:
X_train = train.drop("Transported", axis=1)
Y_train = train["Transported"]
X_test  = test
X_train.shape, Y_train.shape, X_test.shape

((8693, 12), (8693,), (4277, 12))

In [33]:
model = RandomForestRegressor(n_estimators = 3)
model.fit(X_train, Y_train)
pred_y = model.predict(X_test)
pred_y = pred_y.round(0).astype(bool)

### Testing accuracy

In [34]:
print('Training Score: ', model.score(X_train, Y_train))
print('Testing Score: ', model.score(X_test, pred_y))

Training Score:  0.7345690034412904
Testing Score:  0.8158087668517756


### Creating a csv file called 'submission.csv'

In [35]:
output = pd.DataFrame({'PassengerId': test.index, 'Transported': pred_y})
output.to_csv('spaceship-titanic/submission.csv', index = False)