In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Configuration: every tunable value for this cell lives here.
DATA_DIR = "/kaggle/input/titanic"   # directory containing train.csv and test.csv
TRAIN_FRACTION = 0.8                 # share of labelled rows used for fitting
VALID_FRACTION = 0.2                 # share of labelled rows held out for validation
RANDOM_STATE = 0                     # seed that makes the split repeatable
MAX_CATEGORICAL_CARDINALITY = 10     # text columns need fewer unique values than this


def _is_text_column(series):
    """Return True when `series` holds text, whatever dtype pandas stored it as.

    Text columns are `object` dtype by default, but they can also arrive with
    pandas' dedicated string dtype (for example when string inference is
    enabled). Comparing only against 'object' would silently drop such columns
    from the categorical features, so both representations are accepted.
    """
    return (pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series))


# Read the data
train_data = pd.read_csv(DATA_DIR + "/train.csv")
test_data = pd.read_csv(DATA_DIR + "/test.csv")

# Separate target from predictors; PassengerId is removed from the predictors
y = train_data.Survived
X = train_data.drop(['Survived', 'PassengerId'], axis=1)
X_test = test_data.drop(['PassengerId'], axis=1)

# Divide data into training/validation sets
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    train_size=TRAIN_FRACTION,
    test_size=VALID_FRACTION,
    random_state=RANDOM_STATE,
)

# Select categorical (text) columns with relatively low cardinality.
# Cardinality is measured on the training split only.
categorical_cols = [
    cname for cname in X_train.columns
    if _is_text_column(X_train[cname])
    and X_train[cname].nunique() < MAX_CATEGORICAL_CARDINALITY
]

# Select numerical columns
numerical_cols = [
    cname for cname in X_train.columns
    if X_train[cname].dtype in ['int64', 'float64']
]

# Keep selected columns only, in the same order for every frame
my_cols = categorical_cols + numerical_cols
X_train = X_train[my_cols]
X_valid = X_valid[my_cols]
X_full = X[my_cols]
X_test = X_test[my_cols]

In [2]:
# Preview the first five rows of the training predictors
X_train.head(n=5)

Unnamed: 0,Sex,Embarked,Pclass,Age,SibSp,Parch,Fare
140,female,C,3,,0,2,15.2458
439,male,S,2,31.0,0,0,10.5
817,male,C,2,31.0,1,1,37.0042
378,male,C,3,20.0,0,0,4.0125
491,male,S,3,21.0,0,0,7.25


In [3]:
# Preview the first five rows of the test predictors
X_test.head(n=5)

Unnamed: 0,Sex,Embarked,Pclass,Age,SibSp,Parch,Fare
0,male,Q,3,34.5,0,0,7.8292
1,female,S,3,47.0,1,0,7.0
2,male,Q,2,62.0,0,0,9.6875
3,male,S,3,27.0,0,0,8.6625
4,female,S,3,22.0,1,1,12.2875


In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numerical features: replace missing values with a constant.
# fill_value=0 is what strategy='constant' uses by default for numeric data;
# it is written out here so the choice is visible to the reader.
numerical_transformer = SimpleImputer(strategy='constant', fill_value=0)

# Categorical features: fill gaps with the most common value, then one-hot
# encode. handle_unknown='ignore' keeps transform from raising on categories
# that were not present when the encoder was fit.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns to its own transformer
column_transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [5]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees; the fixed seed keeps results repeatable
model = RandomForestClassifier(
    n_estimators=100,
    random_state=0,
)

In [6]:
from sklearn.metrics import accuracy_score

# Chain preprocessing and the classifier into a single estimator
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', model),
]
my_pipeline = Pipeline(steps=pipeline_steps)

# Fit preprocessing and model on the training split
my_pipeline.fit(X_train, y_train)

# Predict the validation split and report accuracy against its labels
preds = my_pipeline.predict(X_valid)
score = accuracy_score(y_valid, preds)
print('Accuracy: ', score)

Accuracy:  0.8379888268156425


In [7]:
# Refit the same pipeline on all labelled rows (training + validation)
my_pipeline.fit(X_full, y)

# Predict survival for the test predictors
preds_test = my_pipeline.predict(X_test)

# Pair each test PassengerId with its prediction and write the submission file
submission_columns = {
    'PassengerId': test_data.PassengerId,
    'Survived': preds_test,
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
