In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

In [3]:
df_train=pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
df_test=pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')

In [4]:
df_train.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')

In [5]:
df_train_x=df_train.drop(['PassengerId','Transported'],axis='columns')
df_train_y=df_train['Transported']
df_test_x=df_test.drop(['PassengerId'],axis='columns')

In [6]:
df_train_x

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy
1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines
2,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent
3,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent
4,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines
...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther
8689,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley
8690,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon
8691,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre


In [7]:
df_train_y

0       False
1        True
2       False
3       False
4        True
        ...  
8688    False
8689    False
8690     True
8691    False
8692     True
Name: Transported, Length: 8693, dtype: bool

In [8]:
y_prepared=list(map(int,df_train_y))
y_train=np.array(y_prepared)
y_train.reshape(-1)
y_train

array([0, 1, 0, ..., 1, 0, 1])

# data preprocessing pipeline

In [9]:
object_col=['HomePlanet','CryoSleep','Cabin','Destination','VIP','Name']
num_col=['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']

num_pipeline=Pipeline(
    [
        ('imputer',SimpleImputer(strategy='median')),
        ('std_scaler',StandardScaler())
    ])
cat_pipeline=Pipeline(
    [
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('ordinal_encoder',OrdinalEncoder())
    ])
full_pipeline= ColumnTransformer([
        ('num',num_pipeline,num_col),
        ('cat',cat_pipeline,object_col)
    ])

In [10]:
object_col=['HomePlanet','CryoSleep','Cabin','Destination','VIP','Name']
num_col=['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']

X_train=full_pipeline.fit_transform(df_train_x)
X_test=full_pipeline.fit_transform(df_test_x)


In [11]:
X_train

array([[ 7.11945365e-01, -3.33104659e-01, -2.81026731e-01, ...,
         2.00000000e+00,  0.00000000e+00,  5.25200000e+03],
       [-3.34037485e-01, -1.68073432e-01, -2.75386568e-01, ...,
         2.00000000e+00,  0.00000000e+00,  4.50200000e+03],
       [ 2.03685698e+00, -2.68000597e-01,  1.95999765e+00, ...,
         2.00000000e+00,  1.00000000e+00,  4.57000000e+02],
       ...,
       [-1.94573105e-01, -3.33104659e-01, -2.81026731e-01, ...,
         2.00000000e+00,  0.00000000e+00,  3.00200000e+03],
       [ 2.23820035e-01, -3.33104659e-01,  3.76365488e-01, ...,
         0.00000000e+00,  0.00000000e+00,  1.59600000e+03],
       [ 1.06060632e+00, -1.42334616e-01,  2.65687100e+00, ...,
         2.00000000e+00,  0.00000000e+00,  6.38000000e+03]])

In [12]:
rnd_clf=RandomForestClassifier()
log_clf=LogisticRegression()
voting_clf=VotingClassifier(estimators=[('lr',log_clf),('rf',rnd_clf)],voting='soft')


In [13]:
cross_val_score(voting_clf, X_train,y_train,cv=3,scoring="accuracy")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([0.78571429, 0.7826087 , 0.80531584])

In [14]:
voting_clf.fit(X_train,y_train)

In [15]:
y_pred=voting_clf.predict(X_test)

# prepared y for submission

In [16]:
y_pred=list(map(bool,y_pred))
y_pred=np.array(y_pred)
y_pred.reshape(-1)
y_pred


array([ True, False,  True, ...,  True,  True,  True])

# Submission

In [17]:
df_sub=pd.read_csv('/kaggle/input/spaceship-titanic/sample_submission.csv')


In [18]:
output = pd.DataFrame({'PassengerId': df_sub.PassengerId, 'Transported': y_pred})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
