In [2]:
import pandas as pd
import numpy as np

from sklearn.pipeline import FeatureUnion, make_pipeline, Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer

In [3]:
train = pd.read_csv('./train.csv', index_col=0)
test = pd.read_csv('./test.csv', index_col=0)
train.shape, test.shape

((891, 11), (418, 10))

In [4]:
train.drop('Cabin', inplace=True, axis=1)
train['Fare_per_person'] = train.Fare / train.groupby('Ticket')['Pclass'].transform('count')
train['Survived_per_ticket'] = train.groupby('Ticket')['Survived'].transform('sum')
train['Age'] = train['Age'].fillna(train.groupby(['Sex', 'SibSp', 'Parch'])['Age'].transform('mean'))
train['Age'] = train['Age'].fillna(train.groupby(['Sex'])['Age'].transform('mean'))
train['Embarked'] = train.Embarked.fillna(train.Embarked.value_counts().index[0])

In [8]:
test.drop('Cabin', inplace=True, axis=1)
test['Fare_per_person'] = test.Fare / test.groupby('Ticket')['Pclass'].transform('count')
test['Survived_per_ticket'] = test.groupby('Ticket')['Sex'].transform('count')
test['Age'] = test['Age'].fillna(test.groupby(['Sex', 'SibSp', 'Parch'])['Age'].transform('mean'))
test['Age'] = test['Age'].fillna(test.groupby(['Sex'])['Age'].transform('mean'))
test['Embarked'] = test.Embarked.fillna(test.Embarked.value_counts().index[0])
test.head()

In [299]:
train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Fare_per_person,Survived_per_ticket,People_per_ticket
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,7.25,0,1
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,71.2833,1,1
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,7.925,1,1
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,26.55,1,2
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,8.05,0,1


In [6]:
missing_val_count_by_column_train = (train.isnull().sum())
missing_val_count_by_column_train

Survived               0
Pclass                 0
Name                   0
Sex                    0
Age                    0
SibSp                  0
Parch                  0
Ticket                 0
Fare                   0
Embarked               0
Fare_per_person        0
Survived_per_ticket    0
dtype: int64

In [9]:
missing_val_count_by_column_test = (test.isnull().sum())
missing_val_count_by_column_test

Pclass                 0
Name                   0
Sex                    0
Age                    0
SibSp                  0
Parch                  0
Ticket                 0
Fare                   1
Embarked               0
Fare_per_person        1
Survived_per_ticket    0
dtype: int64

In [313]:
train.loc[:, ['Age',  'Fare', 'SibSp', 'Parch', 'Survived_per_ticket', 'Pclass', 'Sex', 'Embarked']]

Unnamed: 0_level_0,Age,Fare,SibSp,Parch,Survived_per_ticket,Pclass,Sex,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,22.0,7.2500,1,0,0,3,male,S
2,38.0,71.2833,1,0,1,1,female,C
3,26.0,7.9250,0,0,1,3,female,S
4,35.0,53.1000,1,0,1,1,female,S
5,35.0,8.0500,0,0,0,3,male,S
...,...,...,...,...,...,...,...,...
887,27.0,13.0000,0,0,0,2,male,S
888,19.0,30.0000,0,0,1,1,female,S
889,21.2,23.4500,1,2,0,3,female,S
890,26.0,30.0000,0,0,1,1,male,C


In [14]:
preprocess = make_column_transformer(
    ( SimpleImputer(), ['Age',  'Fare', 'SibSp', 'Parch', 'Survived_per_ticket']),
    ( make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(sparse=False)), [ 'Pclass', 'Sex', 'Embarked']),
)

In [15]:
clf = Pipeline(steps=[('pre', preprocess), ('NB', GaussianNB())])
cv = cross_val_score(clf, train.iloc[:, 1:], train.Survived, cv=5, scoring='accuracy')
cv

array([0.84916201, 0.85955056, 0.86516854, 0.83146067, 0.90449438])

In [17]:
clf.fit(train.iloc[:, 1:])

ValueError: bad input shape ()