In [20]:
import numpy as np 
import pandas as pd 

import os
# Show every file that is available under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

from sklearn.model_selection import StratifiedShuffleSplit

# Load the Kaggle Titanic training data.
df = pd.read_csv('/kaggle/input/titanic/train.csv')

# clean data
df = df.drop(columns=['Cabin']) # dropping 'Cabin' column because it has a lot of null values.
# Fill NA values in the numeric columns with the column median (could be also random).
# numeric_only=True is required: the frame still holds non-numeric columns
# (e.g. Name, Ticket, Embarked) and pandas >= 2.0 raises TypeError when asked
# for the median of those.
df = df.fillna(df.median(numeric_only=True))

# create validation set (stratified sampling based on Age)
df["Age_cat"] = pd.cut(df["Age"], bins=[0., 16, 32, 48, 64, np.inf], labels=[1, 2, 3, 4, 5])
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields exactly one (train, validation) pair of index arrays.
train_index, valid_index = next(split.split(df, df["Age_cat"]))

# The splitter returns row positions, so select with iloc rather than label-based loc.
# dropna deletes rows with empty values (should only affect categorical column Embarked);
# it returns a new frame instead of mutating a slice of df in place.
# Only the training split is filtered; the validation split is kept as-is.
train = df.iloc[train_index].dropna()
valid = df.iloc[valid_index]


/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [21]:
# Columns removed before modelling: the label itself, the irrelevant columns,
# and the helper column that was only created for the stratified split.
drop_cols = ['Survived', 'PassengerId', 'Name', 'Ticket', 'Age_cat']

# Training features / labels. Nothing is mutated in place, so `train` and `valid`
# stay intact and this cell can be re-run safely.
y = train['Survived'] # labels
X = pd.get_dummies(train.drop(columns=drop_cols), drop_first=True) # convert non-numerical variables to dummy variables
# ----- #
# Validation features / labels. The frame is encoded without drop_first and then
# re-indexed to the training columns, so it always has exactly the same dummy
# columns (same names, same order) as X even if a category is absent from this split.
yp = valid['Survived'] # labels
Xp = pd.get_dummies(valid.drop(columns=drop_cols)).reindex(columns=X.columns, fill_value=0)

# clean and prepare test set
test = pd.read_csv("/kaggle/input/titanic/test.csv") # load the testing data
ids = test[['PassengerId']] # create a sub-dataset for submission file and saving it
test = test.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin']) # drop the irrelevant columns
# numeric_only=True: pandas >= 2.0 raises TypeError on the median of non-numeric columns.
test = test.fillna(test.median(numeric_only=True))
# Encode and align to the training columns, same as for the validation set.
test = pd.get_dummies(test).reindex(columns=X.columns, fill_value=0)

In [22]:
# Decision trees
from sklearn import tree

# Fix the seed so repeated runs of the notebook produce the same tree.
clf_tree = tree.DecisionTreeClassifier(random_state=42)
clf_tree.fit(X, y)
print("training accuracy = %.3f" %clf_tree.score(X, y), "\nvalidation accuracy = %.3f" %clf_tree.score(Xp, yp))
# Re-build the tree with whole data (training + validation rows)
clf_tree.fit(pd.concat([X, Xp]), pd.concat([y, yp]))
# Accuracy on the same rows the tree was just fitted on.
print("accuracy = %.3f" %clf_tree.score(pd.concat([X, Xp]), pd.concat([y, yp])))

training accuracy = 0.980 
validation accuracy = 0.832
accuracy = 0.980


In [37]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed depth and seed; fit on the training split first.
clf_forest = RandomForestClassifier(max_depth=9, random_state=2)
clf_forest.fit(X, y)
forest_train_acc = clf_forest.score(X, y)
forest_valid_acc = clf_forest.score(Xp, yp)
print("training accuracy = %.3f" % forest_train_acc, "\nvalidation accuracy = %.3f" % forest_valid_acc)

# Re-build the forest with whole data (training + validation rows),
# then report accuracy on those same rows.
X_all = pd.concat([X, Xp])
y_all = pd.concat([y, yp])
clf_forest.fit(X_all, y_all)
print("accuracy = %.3f" % clf_forest.score(X_all, y_all))


# from sklearn.model_selection import GridSearchCV
# param_grid = [ {'max_depth': range(1,10)} ]

# clf_forest = RandomForestClassifier()
# grid_search = GridSearchCV(clf_forest, param_grid)
# grid_search.fit(X, y)
# print("training accuracy = %.3f" %grid_search.score(X, y), "\nvalidation accuracy = %.3f" %grid_search.score(Xp, yp))


training accuracy = 0.937 
validation accuracy = 0.883
accuracy = 0.930


In [9]:
# Search for the best random-forest depth on the training split.
# The search is built and fitted in this cell so that `grid_search` exists
# when the notebook is run top to bottom on a fresh kernel.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(RandomForestClassifier(random_state=2), param_grid={'max_depth': range(1, 10)})
grid_search.fit(X, y)
grid_search.best_params_

{'max_depth': 5}

In [None]:
from xgboost import XGBClassifier

# Hyper-parameters of the gradient-boosted classifier, collected in one place.
xgb_params = dict(
    eval_metric="logloss",
    use_label_encoder=False,
    learning_rate=0.02,
    n_estimators=2000,
    max_depth=5,
    min_child_weight=2,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=-1,
    scale_pos_weight=1,
)
clf_xgb = XGBClassifier(**xgb_params)
clf_xgb.fit(X, y)

# Predicted labels for the validation features
y_pred = clf_xgb.predict(Xp)
xgb_train_acc = clf_xgb.score(X, y)
xgb_valid_acc = clf_xgb.score(Xp, yp)
print("training accuracy = %.3f" % xgb_train_acc, "\nvalidation accuracy = %.3f" % xgb_valid_acc)

# Re-fit the model on the whole data (training + validation rows),
# then report accuracy on those same rows.
X_all = pd.concat([X, Xp])
y_all = pd.concat([y, yp])
clf_xgb.fit(X_all, y_all)
print("accuracy = %.3f" % clf_xgb.score(X_all, y_all))

In [None]:
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Support-vector classifier; the pipeline standardises the features before the SVC sees them.
clf_svm = Pipeline([("scaler", StandardScaler()), ("svc", svm.SVC(C=1000))])
clf_svm.fit(X, y)
# Score the pipeline that was just fitted (clf_svm); no estimator named `clf` exists in this notebook.
print("training accuracy = %.3f" %clf_svm.score(X, y), "\nvalidation accuracy = %.3f" %clf_svm.score(Xp, yp))
# Re-fit the SVM pipeline with whole data (training + validation rows)
clf_svm.fit(pd.concat([X, Xp]), pd.concat([y, yp]))
# Accuracy on the same rows the pipeline was just fitted on.
print("accuracy = %.3f" %clf_svm.score(pd.concat([X, Xp]), pd.concat([y, yp])))

# grid_search =GridSearchCV(estimator=svm.SVC(), param_grid={'C': [1, 100], 'kernel': ('linear', 'rbf')})
# grid_search.fit(X, y)
# print("training accuracy = %.3f" %grid_search.score(X, y), "\nvalidation accuracy = %.3f" %grid_search.score(Xp, yp))

In [38]:

# Predict with the random forest on the prepared test features and
# write one row per passenger id to the submission file.
predictions = clf_forest.predict(test)
results = ids.copy()
results["Survived"] = predictions  # attach predictions to ids
results.to_csv("results.csv", index=False)  # write the final dataset to a csv file.