# Model

In [1]:
import numpy as np
import pandas as pd

In [None]:
## Load data

In [39]:
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')
df_test_y = pd.read_csv('./data/gender_submission.csv')

In [51]:
df_train.isna().sum(axis=0)/len(df_train)

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.000000
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          0.771044
Embarked       0.002245
familysize     0.000000
dtype: float64

In [50]:
df_test.isna().sum(axis=0)/len(df_test)

PassengerId    0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.000000
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          0.782297
Embarked       0.000000
familysize     0.000000
dtype: float64

In [10]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [44]:
df_train[['Age','Fare']].values

array([[22.    ,  7.25  ],
       [38.    , 71.2833],
       [26.    ,  7.925 ],
       ...,
       [    nan, 23.45  ],
       [26.    , 30.    ],
       [32.    ,  7.75  ]])

In [47]:
def preprocessing_data(df_train, df_test):
    # remove NaN
    imp = SimpleImputer(missing_values=np.nan, strategy='median')
    df_train[['Age','Fare']] = imp.fit_transform(df_train[['Age','Fare']].values)
    df_test[['Age','Fare']] = imp.transform(df_test[['Age','Fare']].values)
    
    # encoder categorical variable
    labelEncoder = LabelEncoder()
    df_train['Sex'] = labelEncoder.fit_transform(df_train['Sex'])
    df_test['Sex'] = labelEncoder.transform(df_test['Sex'])
    
    # new variables
    df_train['familysize'] = df_train['SibSp'] + df_train['Parch']
    df_test['familysize'] = df_test['SibSp'] + df_test['Parch']
    
    X_train = df_train[['Sex','Age','Fare','familysize']].values
    X_test = df_test[['Sex','Age','Fare','familysize']].values
    y_train = df_train['Survived'].values

    return X_train, X_test, y_train

In [52]:
X_train, X_test, y_train = preprocessing_data(df_train, df_test)
y_test = df_test_y['Survived'].values

## Random Forest

In [53]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

In [69]:
classifier = RandomForestClassifier(n_estimators=1000, criterion = 'entropy')
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test,y_pred)
print(cm)
print(accuracy_score(y_test,y_pred))

[[266   0]
 [  1 151]]
0.9976076555023924


### Submission

In [74]:
df = df_test_y
df['Survived'] = y_pred
df.to_csv('submission.csv', index=False)

### Save model

In [75]:
import joblib

In [76]:
joblib.dump(classifier, 'model-RandomForestClassifier.pkl')

['model-RandomForestClassifier.pkl']