In [1]:
import os

import pandas
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [28]:
# Read both CSV splits from the sibling data directory.
train = pandas.read_csv("../data/train.csv")
test = pandas.read_csv("../data/test.csv")

In [102]:
# Preview the first five rows of the test split.
test.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [36]:
# Distinct values of each categorical column, in the order Sex, Embarked, Pclass.
tuple(train[column].unique() for column in ("Sex", "Embarked", "Pclass"))

(array(['male', 'female'], dtype=object),
 array(['S', 'C', 'Q', nan], dtype=object),
 array([3, 1, 2]))

In [103]:
# Target: the survival label of each training passenger.
y = train["Survived"]

# Features: every column except the target, the PassengerId identifier and the
# Cabin, Ticket and Name columns.
X = train.drop(columns=["Survived", "PassengerId", "Cabin", "Ticket", "Name"])

# This cell used to bind the frame to lowercase `x` only, while every later
# cell reads uppercase `X` (which therefore had to come from stale kernel
# state). Keep `x` as an alias so nothing that used the old name breaks.
x = X

In [104]:
# Split features and target into train/evaluation parts; the fixed seed keeps
# the partition identical across runs.
_split_parts = train_test_split(X, y, random_state=0)
X_train, X_test, y_train, y_test = _split_parts

In [105]:
# Show the first rows of the training features next to their labels.
tuple(part.head() for part in (X_train, y_train))

(     Pclass  Sex   Age  SibSp  Parch     Fare Embarked
 105       3    1  28.0      0      0   7.8958        S
 68        3    1  17.0      4      2   7.9250        S
 253       3    1  30.0      1      0  16.1000        S
 320       3    1  22.0      0      0   7.2500        S
 706       2    1  45.0      0      0  13.5000        S,
 105    0
 68     1
 253    0
 320    0
 706    1
 Name: Survived, dtype: int64)

In [106]:
# Numeric columns: fill missing values with the column median, then standardize.
numeric_features = ["Age", "Fare", "SibSp", "Parch"]
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: one-hot encode, ignoring categories not seen during fit.
categorical_features = ["Sex", "Pclass", "Embarked"]
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Send each column group through its own transformer.
column_routes = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Full prediction pipeline: preprocessing followed by a logistic-regression
# classifier with default settings.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression()),
    ]
)

In [108]:
# Fit on the training part, then report the score on the held-out part.
clf.fit(X_train, y_train).score(X_test, y_test)

0.7219730941704036

In [111]:
# Preview the first five training rows before preprocessing.
X_train.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
105,3,1,28.0,0,0,7.8958,S
68,3,1,17.0,4,2,7.925,S
253,3,1,30.0,1,0,16.1,S
320,3,1,22.0,0,0,7.25,S
706,2,1,45.0,0,0,13.5,S


In [110]:
# Inspect the matrix the classifier actually sees. Note that this refits the
# same `preprocessor` object that is embedded in the pipelines built from it.
preprocessor.fit(X_train).transform(X_train)

array([[-0.13279288, -0.48380773, -0.46037161, ...,  0.        ,
         1.        ,  0.        ],
       [-0.98165075, -0.48323058,  2.98532288, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.02154491, -0.32165051,  0.40105202, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.05562399, -0.48701956, -0.46037161, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.48455829, -0.29595582,  0.40105202, ...,  0.        ,
         1.        ,  0.        ],
       [ 2.3366118 ,  0.13097136,  0.40105202, ...,  0.        ,
         1.        ,  0.        ]])

668

In [115]:
def classify(**kwargs):
    """Fit a random-forest pipeline and return its score on the held-out split.

    All keyword arguments are forwarded unchanged to
    ``RandomForestClassifier``. The function reads the notebook-level
    ``preprocessor``, ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    """
    forest = RandomForestClassifier(**kwargs)
    model = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", forest)])
    return model.fit(X_train, y_train).score(X_test, y_test)

In [131]:
# Compare split criteria and forest sizes. No seed is passed to the forest, so
# each configuration is fit 10 times and the mean held-out score is printed.
forest_grid = [
    {"criterion": criterion, "n_estimators": n_trees}
    for criterion in ("gini", "entropy")
    for n_trees in (100, 200, 300, 400, 800)
]
for settings in forest_grid:
    scores = [classify(**settings) for _ in range(10)]
    print(sum(scores) / 10, settings)

0.7318385650224215 {'criterion': 'gini', 'n_estimators': 100}
0.7372197309417041 {'criterion': 'gini', 'n_estimators': 200}
0.7327354260089687 {'criterion': 'gini', 'n_estimators': 300}
0.7318385650224215 {'criterion': 'gini', 'n_estimators': 400}
0.7246636771300448 {'criterion': 'gini', 'n_estimators': 800}
0.7376681614349776 {'criterion': 'entropy', 'n_estimators': 100}
0.7322869955156952 {'criterion': 'entropy', 'n_estimators': 200}
0.7385650224215248 {'criterion': 'entropy', 'n_estimators': 300}
0.7372197309417039 {'criterion': 'entropy', 'n_estimators': 400}
0.7367713004484305 {'criterion': 'entropy', 'n_estimators': 800}


In [133]:
# Keep criterion and forest size fixed and vary only the tree depth; the first
# entry leaves max_depth at its default for comparison.
base_settings = {"criterion": "entropy", "n_estimators": 300}
depth_grid = [dict(base_settings)]
depth_grid += [{**base_settings, "max_depth": depth} for depth in (4, 6, 8, 10, 12)]
for settings in depth_grid:
    scores = [classify(**settings) for _ in range(10)]
    print(sum(scores) / 10, settings)

0.7318385650224215 {'criterion': 'entropy', 'n_estimators': 300}
0.7340807174887893 {'criterion': 'entropy', 'n_estimators': 300, 'max_depth': 4}
0.7452914798206278 {'criterion': 'entropy', 'n_estimators': 300, 'max_depth': 6}
0.747982062780269 {'criterion': 'entropy', 'n_estimators': 300, 'max_depth': 8}
0.7551569506726457 {'criterion': 'entropy', 'n_estimators': 300, 'max_depth': 10}
0.7443946188340808 {'criterion': 'entropy', 'n_estimators': 300, 'max_depth': 12}


In [134]:
# Re-check the chosen configuration: mean held-out score over 10 fits.
settings = {"criterion": "entropy", "n_estimators": 300, "max_depth": 10}
scores = [classify(**settings) for _ in range(10)]
print(sum(scores) / 10, settings)

0.7551569506726458 {'criterion': 'entropy', 'n_estimators': 300, 'max_depth': 10}
0.7457399103139014 {'criterion': 'entropy', 'n_estimators': 300, 'max_depth': 10, 'bootstrap': False}


In [136]:
from operator import itemgetter

def classify(**kwargs):
    """Fit a random-forest pipeline and return ``(held_out_score, fitted_pipeline)``.

    All keyword arguments are forwarded unchanged to
    ``RandomForestClassifier``. The function reads the notebook-level
    ``preprocessor``, ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    NOTE: this definition replaces the earlier ``classify`` that returned only
    the score; the averaging loops earlier in the notebook expect that
    score-only version and will fail if re-run after this cell.
    """
    clf = Pipeline(
        steps=[
            # Clone so the returned (and later persisted) pipeline owns its
            # preprocessor instead of sharing the global object, which other
            # cells refit directly.
            ("preprocessor", clone(preprocessor)),
            ("classifier", RandomForestClassifier(**kwargs)),
        ]
    )
    clf.fit(X_train, y_train)
    return clf.score(X_test, y_test), clf

In [138]:
settings = {"criterion": "entropy", "n_estimators": 300, "max_depth": 10}

# Fit the same configuration 1000 times and keep the (score, pipeline) pair
# with the highest held-out score; on ties the earliest fit wins.
# NOTE(review): the winner is chosen by its score on X_test, so that score is
# likely optimistic as an estimate for unseen data - confirm this is acceptable.
candidates = (classify(**settings) for _ in range(1000))
result = max(candidates, key=lambda scored: scored[0])
result

(0.7713004484304933,
 Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('num',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(strategy='median')),
                                                                   ('scaler',
                                                                    StandardScaler())]),
                                                   ['Age', 'Fare', 'SibSp',
                                                    'Parch']),
                                                  ('cat',
                                                   OneHotEncoder(handle_unknown='ignore'),
                                                   ['Sex', 'Pclass',
                                                    'Embarked'])])),
                 ('classifier',
                  RandomForestClassifier(criterion='entropy', max_depth=10,
    

In [139]:
import pickle

In [2]:
from joblib import dump, load

In [141]:
# Persist the selected fitted pipeline at the path the loading cell reads
# ("../model/model.joblib"). This cell used to write "model.joblib" into the
# working directory, so the load only worked after moving the file by hand.
model_path = "../model/model.joblib"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
dump(result[1], model_path)

['model.joblib']

In [4]:
# Reload the persisted pipeline. This rebinds `clf`, which earlier held the
# logistic-regression pipeline.
# NOTE: joblib.load unpickles the file, so only load model files you trust.
model_path = "../model/model.joblib"
clf = load(model_path)

In [25]:
# One hand-built passenger row used to probe the loaded model.
# NOTE(review): Pclass 0 is not among the classes observed in train (1, 2, 3);
# presumably a placeholder - confirm the intended value.
a = pandas.DataFrame(
    {
        "Age": [2],
        "SibSp": [0],
        "Parch": [0],
        "Fare": [0],
        "Sex": ["male"],
        "Pclass": [0],
        "Embarked": ["S"],
    }
)

In [19]:
# Show the loaded pipeline's instance attributes (steps, memory, verbose).
vars(clf)

{'steps': [('preprocessor',
   ColumnTransformer(transformers=[('num',
                                    Pipeline(steps=[('imputer',
                                                     SimpleImputer(strategy='median')),
                                                    ('scaler', StandardScaler())]),
                                    ['Age', 'Fare', 'SibSp', 'Parch']),
                                   ('cat', OneHotEncoder(handle_unknown='ignore'),
                                    ['Sex', 'Pclass', 'Embarked'])])),
  ('classifier',
   RandomForestClassifier(criterion='entropy', max_depth=10, n_estimators=300))],
 'memory': None,
 'verbose': False}

In [26]:
# Run the loaded pipeline on the single hand-built row `a`; the result is an
# array with one predicted label (presumably the `Survived` class - confirm
# that the loaded file is the model trained above).
clf.predict(a)

array([0])