In [56]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import IsolationForest
import seaborn as sns
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd 
import os
for dirname, _, filenames in os.walk('/kaggle/input/spaceship-titanic'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


In [57]:
train = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
train.head()
test = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
test.head()
complete = pd.concat([train, test])
complete.isna().sum()

PassengerId        0
HomePlanet       288
CryoSleep        310
Cabin            299
Destination      274
Age              270
VIP              296
RoomService      263
FoodCourt        289
ShoppingMall     306
Spa              284
VRDeck           268
Name             294
Transported     4277
dtype: int64

In [58]:
for col in ('HomePlanet', 'CryoSleep', 'Destination', 'VIP'):
    complete[col] = complete[col].fillna(complete[col].mode()[0])

for col in ('Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'):
    complete[col] = complete[col].fillna(np.mean(complete[col]))



Fill na by mean and mode as the case may be.

In [59]:
complete = complete.drop(['Cabin', 'Name'], axis = 1)

train_1 = complete[complete['Transported'].notnull()]
test_1 = complete[complete['Transported'].isnull()]
pid = test_1.PassengerId

test_1 = test_1.drop(['PassengerId','Transported'], axis = 1)



Dropped cabin and name from the dataset.
Split into train and test.

In [60]:
train_1 = pd.get_dummies(train_1, columns = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP' ])

Use one hot encoding using get dummies.

In [None]:
corr = train_1.corr()
corr.style.background_gradient(cmap = 'coolwarm')

Checking for correlation among features and target variable

In [61]:
m1 = IsolationForest()
m1.fit(train_1)
train_1['anamoly'] = m1.predict(train_1)  
train_1= train_1[train_1.anamoly == 1] 
train_1.drop(columns = ['anamoly'], inplace = True)

Isolation forest used to detect outliers. Removing them from the data set

In [62]:
y = train_1.Transported
y = y.astype('int')
features = train_1
features = features.drop('Transported', axis =1 )
features.set_index('PassengerId', inplace = True)
train_features, test_features, train_y,test_y = train_test_split(features, y, random_state = 100, test_size = 0.3) 


Split the train_1 to train and test data set

In [20]:
model = DecisionTreeClassifier()
criterion = ['gini', 'entropy']
min_samples_split = np.array(range(1, 100))
max_depth = np.array(range(2, 100))
param = {'criterion': criterion, 'min_samples_split': min_samples_split, 'max_depth': max_depth}
rdecision = RandomizedSearchCV(model, param, cv= 10)
rdecision.fit(train_features, train_y)
print('criterion', rdecision.best_estimator_.criterion)
print('min_samples_split', rdecision.best_estimator_.min_samples_split)
print('max_depth', rdecision.best_estimator_.max_depth)
print('score', rdecision.best_score_)

criterion gini
min_samples_split 81
max_depth 69
score 0.7877666568692667


Used RandomizedsearchCV for parameter tuning and used the values with the best score

In [21]:
model = DecisionTreeClassifier(criterion = 'gini', min_samples_split = 89, max_depth = 11, random_state = 100)
model.fit(train_features,train_y)
model.score(test_features, test_y)

0.8104635153740248

Used the values from RandomizedSearchCV and the results were predicted with accuracy of 81.04%

In [None]:
model = RandomForestClassifier()
criterion = ['gini', 'entropy']
min_samples_split = np.array(range(1, 100))
max_depth = np.array(range(2, 100))
n_estimators = np.array(range(1, 100))
param = {'criterion': criterion, 'min_samples_split': min_samples_split, 'max_depth': max_depth, 'n_estimators': n_estimators}
rrandom = RandomizedSearchCV(model, param, cv= 10, scoring = 'accuracy')
rrandom.fit(train_features, train_y)
print('criterion', rrandom.best_estimator_.criterion)
print('min_samples_split', rrandom.best_estimator_.min_samples_split)
print('max_depth', rrandom.best_estimator_.max_depth)
print('n_estimators', rrandom.best_estimator_.n_estimators)
print('score', rrandom.best_score_)

Used RandomizedsearchCV for parameter tuning and used the values with the best score

In [22]:
model = RandomForestClassifier(criterion = 'entropy', min_samples_split = 94, max_depth = 26, n_estimators = 97, random_state = 100)
model.fit(train_features,train_y)
model.score(test_features, test_y)

0.8090867370353373

Used the values from RandomizedSearchCV and the results were predicted with accuracy of 80.9%

In [None]:
model = BaggingClassifier()
max_features = np.array(range(0, 17))
n_estimators = np.array(range(1, 100))
param = {'max_features': max_features, 'n_estimators': n_estimators}
rbagging = RandomizedSearchCV(model, param, cv = 10, scoring = 'accuracy')
rbagging.fit(train_features, train_y)
print('max_features', rbagging.best_estimator_.max_features)
print('n_estimators', rbagging.best_estimator_.n_estimators)
print('score', rbagging.best_score_)


Used RandomizedsearchCV for parameter tuning and used the values with the best score

In [23]:
model = BaggingClassifier( max_features = 11, n_estimators = 27)
model.fit(train_features,train_y)
model.score(test_features, test_y)

0.8095456631482332

Used the values from RandomizedSearchCV and the results were predicted with accuracy of 80.9%

In [None]:
model = AdaBoostClassifier()
learning_rate = np.logspace(-6, 0, 100)
n_estimators = np.array(range(1, 100))
param = {'learning_rate': learning_rate , 'n_estimators': n_estimators}
rada = RandomizedSearchCV(model, param, cv = 10, scoring = 'accuracy')
rada.fit(train_features, train_y)
print('learning_rate', rada.best_estimator_.learning_rate)
print('n_estimators', rada.best_estimator_.n_estimators)
print('score', rada.best_score_)


Used RandomizedsearchCV for parameter tuning and used the values with the best score

In [24]:
model = AdaBoostClassifier( learning_rate = 0.572236765935022 , n_estimators = 66)
model.fit(train_features,train_y)
model.score(test_features, test_y)

0.8003671408903167

Used the values from RandomizedSearchCV and the results were predicted with accuracy of 80.03%

In [None]:
model = LogisticRegression()
penalty = ['l1', 'l2', 'elasticnet']
c = np.array([0.001, 0.01, 0.1, 1, 10, 100])
solv =  ['lbfgs','liblinear','newton-cg', 'sag','saga']
max_iter = np.array(range(10000,1000000))
param = {'penalty': penalty, 'C': c, 'solver' : solv, 'max_iter': max_iter}
rlogistic = RandomizedSearchCV(model, param, cv = 10)
rlogistic.fit(train_features,train_y)
print('penalty', rlogistic.best_estimator_.penalty)
print('C', rlogistic.best_estimator_.C)
print('solver', rlogistic.best_estimator_.solver)
print('max_iter', rlogistic.best_estimator_.max_iter)
print('score', rlogistic.best_score_)

Used RandomizedsearchCV for parameter tuning and used the values with the best score

In [25]:
model = LogisticRegression(C = 0.01 , penalty = 'l2' , solver = 'newton-cg', max_iter = 696927, random_state = 100)
model.fit(train_features,train_y)
model.score(test_features, test_y)

0.8067921064708582

Used the values from RandomizedSearchCV and the results were predicted with accuracy of 80.6%

In [64]:
level_0_estimators = dict()
level_0_estimators["logreg"] = LogisticRegression(max_iter = 100000, random_state = 100)
level_0_estimators["forest"] = model = RandomForestClassifier(criterion = 'gini', min_samples_split = 42, max_depth = 41, n_estimators=75, random_state = 100)
 
level_0_columns = [f"{name}_prediction" for name in level_0_estimators.keys()]
 
level_1_estimator = model = RandomForestClassifier(criterion = 'gini', min_samples_split = 42, max_depth = 41, n_estimators=75, random_state = 100)

kfold = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 100)
model = StackingClassifier(estimators=list(level_0_estimators.items()), 
                                    final_estimator=level_1_estimator, 
                                    passthrough=True, cv=kfold, stack_method="predict_proba")

model.fit(train_features,train_y)
model.score(test_features, test_y)

0.806287563569117

Stacking classifier predicted with an accuracy of 81.45%
# The best model

In [7]:
model = SVC()
gamma = [0.001, 0.01, 0.1, 1, 10, 100]
C = [0.001, 0.01, 0.1, 1, 10, 100]
param = {'C': C, 'gamma': gamma}
rsvc = RandomizedSearchCV(model, param, cv = 10)
rsvc.fit(train_features, train_y)
print('C', rsvc.best_estimator_.C)
print('gamma', rsvc.best_estimator_.gamma)
print('score', rsvc.best_score_)

C 10
gamma 0.001
score 0.7362355241552156


Used RandomizedsearchCV for parameter tuning and used the values with the best score

In [8]:
model = SVC(gamma = 0.001 , C = 10 , random_state = 100)
model.fit(train_features,train_y)
model.score(test_features, test_y)

0.7381134969325154

Used the values from RandomizedSearchCV and the results were predicted with accuracy of 73.8%

In [65]:
test_1 = pd.get_dummies(test_1, columns = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP' ])
Transported = model.predict(test_1)

In [66]:
Transported = pd.DataFrame(Transported)
pid = pd.DataFrame(pid)
pid['Transported'] = Transported
pid.Transported = pid.Transported.astype(bool) 

In [67]:
pid = pid.reset_index(drop = True)
pid = pid.set_index('PassengerId')
pid.to_csv('Predictions.csv')