In [1]:
import numpy as np
import pandas as pd

In [33]:
train = pd.read_csv("train.csv", dtype={"Age": np.float64}, )
test  = pd.read_csv("test.csv", dtype={"Age": np.float64}, )
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [7]:
train.size, test.size

(10692, 4598)

In [16]:
train_corr = train.corr()
train_corr

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,1.0,-0.005007,-0.035144,-0.042939,0.036847,-0.057527,-0.001652,0.012658,-0.017443
Survived,-0.005007,1.0,-0.338481,0.543351,-0.077221,-0.035322,0.081629,0.257307,-0.125953
Pclass,-0.035144,-0.338481,1.0,-0.1319,-0.369226,0.083081,0.018443,-0.5495,0.305762
Sex,-0.042939,0.543351,-0.1319,1.0,-0.093254,0.114631,0.245489,0.182333,-0.022521
Age,0.036847,-0.077221,-0.369226,-0.093254,1.0,-0.308247,-0.189119,0.096067,-0.040924
SibSp,-0.057527,-0.035322,0.083081,0.114631,-0.308247,1.0,0.414838,0.159651,0.030874
Parch,-0.001652,0.081629,0.018443,0.245489,-0.189119,0.414838,1.0,0.216225,-0.035957
Fare,0.012658,0.257307,-0.5495,0.182333,0.096067,0.159651,0.216225,1.0,-0.268865
Embarked,-0.017443,-0.125953,0.305762,-0.022521,-0.040924,0.030874,-0.035957,-0.268865,1.0


In [34]:
def correct_data(titanic_data, all_data):

    titanic_data.Age = titanic_data.Age.fillna(all_data.Age.median())

    titanic_data.Sex = titanic_data.Sex.replace(['male', 'female'], [0, 1])

    titanic_data.Embarked = titanic_data.Embarked.fillna("S")
    titanic_data.Embarked = titanic_data.Embarked.replace(['C', 'S', 'Q'], [0, 1, 2])

    titanic_data.Fare = titanic_data.Fare.fillna(all_data.Fare.median())

    return titanic_data

train_data = correct_data(train, test)
test_data  = correct_data(test, test)

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

models = []

models.append(("LogisticRegression",LogisticRegression()))
models.append(("SVC",SVC()))
models.append(("LinearSVC",LinearSVC()))
models.append(("KNeighbors",KNeighborsClassifier()))
models.append(("DecisionTree",DecisionTreeClassifier()))
models.append(("RandomForest",RandomForestClassifier()))
models.append(("MLPClassifier",MLPClassifier(solver='lbfgs', random_state=0)))

In [36]:
results = []
names = []
for name,model in models:
    result = cross_val_score(model, train_data[predictors], train_data["Survived"],  cv=3)
    names.append(name)
    results.append(result)

In [45]:
for i in range(len(names)):
    print(names[i],results[i].mean())


LogisticRegression 0.7856341189674523
SVC 0.6891133557800223
LinearSVC 0.7171717171717171
KNeighbors 0.7037037037037037
DecisionTree 0.7687991021324354
RandomForest 0.7912457912457912
MLPClassifier 0.7732884399551065


In [32]:
alg = RandomForestClassifier()
alg.fit(train_data[predictors], train_data["Survived"])

predictions = alg.predict(test_data[predictors])

submission = pd.DataFrame({
        "PassengerId": test_data["PassengerId"],
        "Survived": predictions
    })

submission.to_csv('submission.csv', index=False)


In [43]:
parameters = {
        'n_estimators'      : [5, 10, 20, 30, 50, 100, 300],
        'random_state'      : [0],
#         'n_jobs'            : [1],
        'max_depth'         : [3, 5, 10, 15, 20, 25, 30, 40, 50, 100]
}
gsc = GridSearchCV(RandomForestClassifier(), parameters,cv=3)
gsc.fit(train_data[predictors], train_data["Survived"])

print(gsc.best_estimator_)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=25, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)


In [44]:
predictions = gsc.predict(test_data[predictors])

submission = pd.DataFrame({
        "PassengerId": test_data["PassengerId"],
        "Survived": predictions
    })

submission.to_csv('submission2.csv', index=False)


In [52]:
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] }
gsc = GridSearchCV(LogisticRegression(), param_grid,cv=3)
gsc.fit(train_data[predictors], train_data["Survived"])

print(gsc.best_estimator_)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [53]:
predictions = gsc.predict(test_data[predictors])

submission = pd.DataFrame({
        "PassengerId": test_data["PassengerId"],
        "Survived": predictions
    })

submission.to_csv('submission3.csv', index=False)
