In [1]:
import numpy as np 
import pandas as pd 

import os
# Show every file available under the Kaggle input directory, one full path per line.
input_paths = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# Load the raw Titanic training data.
df = pd.read_csv('/kaggle/input/titanic/train.csv')

# clean data
# 'Cabin' is dropped because it has a lot of null values.
df = df.drop(columns=['Cabin'])
# Fill missing values in numeric columns with that column's median.
# numeric_only=True is required: the frame still holds string columns
# (Name, Sex, Ticket, Embarked) and pandas >= 2.0 raises TypeError when asked
# for their median. Missing values in non-numeric columns are left untouched.
df = df.fillna(df.median(numeric_only=True))

from sklearn.model_selection import StratifiedShuffleSplit

# create validation set (stratified sampling)
# Bucket Age into five bands (labels 1-5) and use the band as the stratification key,
# so the train and validation sets have a similar age distribution.
df["Age_cat"] = pd.cut(df["Age"], bins=[0., 16, 32, 48, 64, np.inf], labels=[1, 2, 3, 4, 5])

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields exactly one (train, validation) pair of index arrays.
train_index, valid_index = next(split.split(df, df["Age_cat"]))

# split() returns *positional* indices, so select with .iloc rather than the
# label-based .loc (the two only coincide while df keeps its default RangeIndex).
# Rows with remaining empty values are removed from the training set only
# (should only affect the categorical column Embarked); valid is left as is.
train = df.iloc[train_index].dropna()
valid = df.iloc[valid_index]


/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [3]:
# Columns that are not used as model inputs: the label itself plus identifier-like fields.
NON_FEATURE_COLS = ['Survived', 'PassengerId', 'Name', 'Ticket']

# Training labels and features.
# drop(columns=...) returns a new frame, so `train` itself stays intact and this
# cell can be re-run safely. (The previous drop(..., 1, inplace=True) returned None,
# relied on the positional axis argument removed in pandas 2.0, and deleted
# 'Survived' from `train`, so a second run failed.)
y = train['Survived']
X = pd.get_dummies(train.drop(columns=NON_FEATURE_COLS), drop_first=True)  # non-numerical variables -> dummy variables
# ----- #
# Validation labels and features, built the same way.
yp = valid['Survived']
Xp = pd.get_dummies(valid.drop(columns=NON_FEATURE_COLS), drop_first=True)

# The two frames are encoded independently; fail loudly if a category is missing
# from one split and the dummy columns therefore differ.
assert list(X.columns) == list(Xp.columns), "train/validation dummy columns differ"

In [None]:
# Decision trees
# NOTE: every statement in this cell is commented out, so running it does nothing
# and no `dtc` model is created.
# from sklearn import tree
# dtc = tree.DecisionTreeClassifier()
# dtc.fit(X, y)

In [4]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with depth-limited trees, fitted on the training split only.
clf = RandomForestClassifier(max_depth=6, random_state=2)
clf.fit(X, y)

# Accuracy on the data the model was fitted on vs. the held-out validation split.
train_acc = clf.score(X, y)
valid_acc = clf.score(Xp, yp)
print("training accuracy = %.3f" % train_acc, "\nvalidation accuracy = %.3f" % valid_acc)

# Refit the same estimator on training + validation rows combined.
# The score printed below is measured on the data just used for fitting.
X_all = pd.concat([X, Xp])
y_all = pd.concat([y, yp])
clf.fit(X_all, y_all)
print("accuracy = %.3f" % clf.score(X_all, y_all))

training accuracy = 0.859 
validation accuracy = 0.872
accuracy = 0.865


In [5]:
from xgboost import XGBClassifier

# Hyper-parameters for the gradient-boosted model.
xgb_params = {
    "learning_rate": 0.02,
    "n_estimators": 2000,
    "max_depth": 5,
    "min_child_weight": 2,
    "gamma": 0.9,  # a gamma=1 variant was left commented out here
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "objective": 'binary:logistic',
    "nthread": -1,
    "scale_pos_weight": 1,
}
clf = XGBClassifier(**xgb_params)
clf.fit(X, y)

# Predictions for the validation split (stored in y_pred; not used further in this cell).
y_pred = clf.predict(Xp)

# Accuracy on the data the model was fitted on vs. the held-out validation split.
train_acc = clf.score(X, y)
valid_acc = clf.score(Xp, yp)
print("training accuracy = %.3f" % train_acc, "\nvalidation accuracy = %.3f" % valid_acc)

# Refit the boosted model on training + validation rows combined.
# The score printed below is measured on the data just used for fitting.
X_all = pd.concat([X, Xp])
y_all = pd.concat([y, yp])
clf.fit(X_all, y_all)
print("accuracy = %.3f" % clf.score(X_all, y_all))



training accuracy = 0.928 
validation accuracy = 0.860
accuracy = 0.919


In [6]:
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Standardise the features, then fit a support-vector classifier with C=1000.
# (A bare svm.SVC(C=1000) without the scaler was left commented out as an alternative.)
pipeline_steps = [
    ("scaler", StandardScaler()),
    ("svc", svm.SVC(C=1000)),
]
clf = Pipeline(pipeline_steps)
clf.fit(X, y)

# Accuracy on the data the model was fitted on vs. the held-out validation split.
train_acc = clf.score(X, y)
valid_acc = clf.score(Xp, yp)
print("training accuracy = %.3f" % train_acc, "\nvalidation accuracy = %.3f" % valid_acc)

# Refit the whole pipeline (scaler + SVC) on training + validation rows combined.
# The score printed below is measured on the data just used for fitting.
X_all = pd.concat([X, Xp])
y_all = pd.concat([y, yp])
clf.fit(X_all, y_all)
print("accuracy = %.3f" % clf.score(X_all, y_all))

training accuracy = 0.894 
validation accuracy = 0.860
accuracy = 0.897


In [7]:
# clean and prepare test set
test = pd.read_csv("/kaggle/input/titanic/test.csv")  # load the testing data
ids = test[['PassengerId']]  # keep the ids separately; they become the first column of the submission file

# Drop the columns that are not used as features. The keyword form is used because
# the positional axis argument (drop([...], 1)) was removed in pandas 2.0.
test = test.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

# Fill missing numeric values with the column median. numeric_only=True is needed
# because Sex and Embarked are still string columns at this point and pandas >= 2.0
# raises TypeError when asked for their median.
# NOTE(review): these are the test set's own medians, not the ones used to fill
# the training data.
test = test.fillna(test.median(numeric_only=True))

# Same age bands and dummy encoding as applied to the training data, so the
# resulting columns line up with what the models were fitted on.
test["Age_cat"] = pd.cut(test["Age"], bins=[0., 16, 32, 48, 64, np.inf], labels=[1, 2, 3, 4, 5])
test = pd.get_dummies(test, drop_first=True)  # convert non-numerical variables to dummy variables


In [8]:
# Predict with whichever estimator is currently bound to `clf`
# (when the notebook is run top to bottom, that is the scaler + SVC pipeline).
predictions = clf.predict(test)

# Build the submission table: PassengerId plus the predicted Survived column.
results = ids.copy()
results["Survived"] = predictions

# Write the final dataset to a csv file, without the index column.
results.to_csv("titanic-results.csv", index=False)
