In [1]:
import pandas as pd
# Labelled training passengers; this raw frame is left untouched below.
titanic_data = pd.read_csv('data/train.csv')

# Separate the target column from the feature columns.
y = titanic_data["Survived"].copy()
X = titanic_data.drop(columns="Survived")

# Second CSV, kept raw here; the final cell predicts on it.
X_test = pd.read_csv('data/test.csv')

In [2]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler


def pd_transform(data, y_data = 0):
    """Turn a raw Titanic passenger frame into a standardized numeric matrix.

    Steps, in order:
      1. drop the ``Name``, ``Ticket``, ``Cabin`` and ``PassengerId`` columns;
      2. fill missing ``Age`` values with 30;
      3. drop rows whose ``Embarked`` is missing (and the matching labels);
      4. replace ``SibSp`` and ``Parch`` by their sum, ``Family``;
      5. encode ``Sex`` as 0 (male) / 1 (female);
      6. one-hot encode ``Embarked``;
      7. standardize every column with a ``StandardScaler`` fit on this frame.

    Neither ``data`` nor ``y_data`` is modified; new objects are returned, so
    the function can be called repeatedly on the same raw frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw passenger data containing the columns named above.
    y_data : pandas.Series or any other value, default 0
        Labels indexed like ``data``. When it is a Series, the rows removed
        in step 3 are removed from it as well (its index is otherwise kept).
        Any non-Series value is returned unchanged.

    Returns
    -------
    tuple
        ``(features, labels)`` where ``features`` is a DataFrame of scaled
        values with a fresh RangeIndex and integer column labels, in the
        order: remaining original columns, ``Family``, then one column per
        ``Embarked`` category (sorted).
    """
    # Work on a new frame: drop(columns=...) returns a copy, so the caller's
    # data is never touched.
    features = data.drop(columns=["Name", "Ticket", "Cabin", "PassengerId"])

    # Assign the result back instead of calling fillna/replace with
    # inplace=True on a column selection, which acts on a temporary object
    # under pandas copy-on-write and would leave the frame unchanged.
    features["Age"] = features["Age"].fillna(30)

    rows_dropped = features.index[features["Embarked"].isnull()]
    features = features.drop(index=rows_dropped)

    features["Family"] = features["SibSp"] + features["Parch"]
    features = features.drop(columns=["SibSp", "Parch"])

    features["Sex"] = features["Sex"].replace({'male': 0, 'female': 1})

    if isinstance(y_data, pd.Series):
        # Keep labels in step with the feature rows that survived step 3.
        y_data = y_data.drop(index=rows_dropped)

    # One indicator column per Embarked category, in sorted category order.
    # The dummies share the features' index, so concat aligns row by row
    # without any reset_index bookkeeping.
    embarked_onehot = pd.get_dummies(features["Embarked"], dtype=float)
    features = pd.concat(
        [features.drop(columns="Embarked"), embarked_onehot], axis=1
    )

    # NOTE(review): the scaler is fit on whatever frame is passed in, so a
    # test frame is scaled with its own statistics, not the training ones.
    scaled = StandardScaler().fit_transform(features)

    return pd.DataFrame(scaled), y_data

In [3]:
def results(search):
    """Print the best score and best parameters of a fitted search object.

    ``search`` must expose ``best_score_`` and ``best_params_``. The report
    is framed above and below by a 60-character dashed rule.
    """
    rule = "-" * 60
    print(rule)
    print("best score was {}".format(search.best_score_))
    print()
    print("best params were {}".format(search.best_params_))
    print(rule)

In [4]:
# Build X and y from the untouched raw frame rather than from the previous
# X / y, so this cell gives the same result every time it is run (feeding the
# already-transformed X back in fails because the raw columns are gone).
# The label column is copied because pd_transform may drop rows from it.
X, y = pd_transform(
    titanic_data.drop(columns="Survived"),
    titanic_data["Survived"].copy(),
)

In [5]:
# Peek at the first four transformed rows.
X.head(n=4)

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.825209,-0.735342,-0.595082,-0.50024,0.057853,-0.482711,-0.307941,0.616794
1,-1.572211,1.359911,0.639311,0.788947,0.057853,2.071634,-0.307941,-1.621287
2,0.825209,1.359911,-0.286483,-0.48665,-0.561804,-0.482711,-0.307941,0.616794
3,-1.572211,1.359911,0.407863,0.422861,0.057853,-0.482711,-0.307941,0.616794


In [6]:
from sklearn.model_selection import train_test_split
# Set 10% of the rows aside as a validation split; the fixed seed keeps the
# partition repeatable. (X_val / y_val are not used in the cells shown here.)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=16,
    test_size=0.1,
)

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Baseline: a random forest with default hyper-parameters, scored on the
# training split with cross_val_score's default cross-validation.
model = RandomForestClassifier(random_state=16)
scores = cross_val_score(estimator=model, X=X_train, y=y_train)
scores.mean()

0.80625

In [8]:
from sklearn.model_selection import GridSearchCV

# First grid over forest size and tree depth for the random forest.
params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [2, 4, 5, 8],
}

search = GridSearchCV(estimator=model, param_grid=params)
search.fit(X_train, y_train)

GridSearchCV(estimator=RandomForestClassifier(random_state=16),
             param_grid={'max_depth': [2, 4, 5, 8],
                         'n_estimators': [50, 100, 200]})

In [9]:
# Summarise the first random-forest grid search.
results(search=search)

------------------------------------------------------------
best score was 0.82875

best params were {'max_depth': 8, 'n_estimators': 200}
------------------------------------------------------------


In [10]:
# Second random-forest grid: same kind of search with smaller forests.
params2 = {
    'n_estimators': [15, 25, 50],
    'max_depth': [2, 4, 6, 8],
}

search2 = GridSearchCV(estimator=model, param_grid=params2)
search2.fit(X_train, y_train)

GridSearchCV(estimator=RandomForestClassifier(random_state=16),
             param_grid={'max_depth': [2, 4, 6, 8],
                         'n_estimators': [15, 25, 50]})

In [11]:
# Summarise the second random-forest grid search.
results(search=search2)

------------------------------------------------------------
best score was 0.825

best params were {'max_depth': 8, 'n_estimators': 15}
------------------------------------------------------------


In [12]:
from sklearn import svm
# Grid search over SVC kernel and polynomial degree, using all CPU cores.
# NOTE(review): `degree` is presumably only used by the 'poly' kernel, so the
# 'linear' and 'rbf' candidates are likely fitted three times each - confirm
# against the scikit-learn SVC documentation.
svm_model = svm.SVC()
params4 = {
    'kernel': ['linear', 'poly', 'rbf'],
    'degree': [2, 3, 4],
}
search4 = GridSearchCV(estimator=svm_model, param_grid=params4, n_jobs=-1)
search4.fit(X_train, y_train)

GridSearchCV(estimator=SVC(), n_jobs=-1,
             param_grid={'degree': [2, 3, 4],
                         'kernel': ['linear', 'poly', 'rbf']})

In [13]:
# Summarise the SVC grid search.
results(search=search4)

------------------------------------------------------------
best score was 0.8237500000000001

best params were {'degree': 2, 'kernel': 'rbf'}
------------------------------------------------------------


In [24]:
from xgboost import XGBClassifier

# Grid search over tree depth and number of boosting rounds for XGBoost,
# using all CPU cores. The trailing semicolon hides the fitted-object repr.
xgb_model = XGBClassifier()
params5 = {
    'max_depth': [3, 5, 10, 15],
    'n_estimators': [50, 100, 200],
}
search5 = GridSearchCV(estimator=xgb_model, param_grid=params5, n_jobs=-1)
search5.fit(X_train, y_train);

In [25]:
# Summarise the XGBoost grid search.
results(search=search5)

------------------------------------------------------------
best score was 0.8324999999999999

best params were {'max_depth': 3, 'n_estimators': 50}
------------------------------------------------------------


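XGBoost achieved the best cross-validated score of the models tried (0.8325, versus 0.82875 for the random forest and 0.82375 for the SVC), so it is used for the final model.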

Next, rebuild it with the best parameters found by the grid search and fit it on the full training set.

In [26]:
# Take the winning hyper-parameters straight from the XGBoost grid search
# instead of re-typing them, so this cell cannot drift out of sync when the
# search is re-run. The model is then fit on all of X / y, not only on the
# training split used during the search.
xgb_model = XGBClassifier(**search5.best_params_)
xgb_model.fit(X, y);

In [27]:
# Raw test file: supplies both the PassengerId column for the submission and
# the features to predict on.
test_2 = pd.read_csv('data/test.csv')

# The model was fit on pd_transform output, so the test rows must go through
# the same preprocessing before predict; the raw frame has a different set of
# columns. A copy is passed because pd_transform may modify its argument.
# NOTE(review): pd_transform fits a fresh StandardScaler per call, so these
# features are scaled with test-set statistics - consider reusing the scaler
# fitted on the training data.
test_features, _ = pd_transform(test_2.copy())
preds2 = xgb_model.predict(test_features)

# pd_transform drops rows whose Embarked is missing; keep only the ids of the
# rows that were actually predicted so both columns have the same length.
kept_ids = test_2.loc[test_2['Embarked'].notna(), 'PassengerId']
output = pd.DataFrame({'PassengerId': kept_ids.to_numpy(),
                       'Survived': preds2})
output.to_csv('./submissionv2.csv', index=False)

Result: this submission placed in the top 22% of Kaggle's Titanic competition.