In [1]:
import pandas as pd
from pandas import Series,DataFrame

import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

In [2]:
# Get train & test csv files as a DataFrame
train_df = pd.read_csv("./titanic/train.csv")
test_df  = pd.read_csv("./titanic/test.csv")

In [3]:
# PassengerId, Name, Ticket
train_df.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
test_df.drop(['Name', 'Ticket'], axis=1, inplace=True)

In [4]:
# Embarked
train_df.drop(['Embarked'], axis=1, inplace=True)
test_df.drop(['Embarked'], axis=1, inplace=True)

In [5]:
# Fare

# Only for test_df, since there is a missing "Fare" values
test_df["Fare"].fillna(test_df["Fare"].median(), inplace=True)

# Convert from float to int
train_df['Fare'] = train_df['Fare'].astype(int)
test_df['Fare']  = test_df['Fare'].astype(int)

In [6]:
# Age 

# Get average, std, and number of NaN values in train_df
average_age_train   = train_df["Age"].mean()
std_age_train       = train_df["Age"].std()
count_nan_age_train = train_df["Age"].isnull().sum()

# Get average, std, and number of NaN values in test_df
average_age_test   = test_df["Age"].mean()
std_age_test       = test_df["Age"].std()
count_nan_age_test = test_df["Age"].isnull().sum()

# Generate random numbers between (mean - std) & (mean + std)
rand_1 = np.random.randint(average_age_train - std_age_train,
                           average_age_train + std_age_train,
                           size = count_nan_age_train)
rand_2 = np.random.randint(average_age_test - std_age_test,
                           average_age_test + std_age_test,
                           size = count_nan_age_test)

# Fill NaN values in Age column with random values generated       
train_df["Age"][np.isnan(train_df["Age"])] = rand_1
test_df["Age"][np.isnan(test_df["Age"])] = rand_2

# Convert from float to int
train_df['Age'] = train_df['Age'].astype(int)
test_df['Age']  = test_df['Age'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [7]:
# Cabin
# It has a lot of NaN values, so it won't cause a remarkable impact on prediction
train_df.drop(["Cabin"], axis=1, inplace=True)
test_df.drop(["Cabin"], axis=1, inplace=True)

In [8]:
# Family

# Instead of having two columns Parch (Parents/Children Aboard) & SibSp (Siblings/Spouses Aboard), 
# we can have only one column represent if the passenger had any family member aboard or not,
# Meaning, if having any family member(whether parent, brother, ...etc) will increase chances of Survival or not.
train_df['Family'] =  train_df["Parch"] + train_df["SibSp"]
#train_df['Family'].loc[train_df['Family'] > 0] = 1

test_df['Family'] =  test_df["Parch"] + test_df["SibSp"]
#test_df['Family'].loc[test_df['Family'] > 0] = 1

# Drop Parch & SibSp
train_df.drop(['SibSp', 'Parch'], axis=1, inplace=True)
test_df.drop(['SibSp', 'Parch'], axis=1, inplace=True)

In [9]:
# Sex

# Children(age < ~16) on aboard seem to have a high chances for Survival.
# So, we can classify passengers as males, females, and child
def get_person(passenger):
    age, sex = passenger
    return 'child' if age < 16 else sex

train_df['Person'] = train_df[['Age', 'Sex']].apply(get_person, axis=1)
test_df['Person']  = test_df[['Age', 'Sex']].apply(get_person, axis=1)

# No need to use Sex column since we created Person column
train_df.drop(['Sex'], axis=1, inplace=True)
test_df.drop(['Sex'], axis=1, inplace=True)

# Create dummy variables for Person column, & drop Male as it has the lowest average of survived passengers
person_dummies_train  = pd.get_dummies(train_df['Person'])
person_dummies_train.columns = ['Child', 'Female', 'Male']
person_dummies_train.drop(['Male'], axis=1, inplace=True)

person_dummies_test  = pd.get_dummies(test_df['Person'])
person_dummies_test.columns = ['Child', 'Female', 'Male']
person_dummies_test.drop(['Male'], axis=1, inplace=True)

train_df = train_df.join(person_dummies_train)
test_df  = test_df.join(person_dummies_test)

# No need to use Person column since we created dummy variables
train_df.drop(['Person'], axis=1, inplace=True)
test_df.drop(['Person'], axis=1, inplace=True)

In [10]:
# Pclass

# Create dummy variables for Pclass column, & drop 3rd class as it has the lowest average of survived passengers
pclass_dummies_train  = pd.get_dummies(train_df['Pclass'])
pclass_dummies_train.columns = ['Class_1', 'Class_2', 'Class_3']
pclass_dummies_train.drop(['Class_3'], axis=1, inplace=True)

pclass_dummies_test  = pd.get_dummies(test_df['Pclass'])
pclass_dummies_test.columns = ['Class_1' ,'Class_2', 'Class_3']
pclass_dummies_test.drop(['Class_3'], axis=1, inplace=True)

train_df = train_df.join(pclass_dummies_train)
test_df  = test_df.join(pclass_dummies_test)

# No need to use Pclass column since we created dummy variables
train_df.drop(['Pclass'], axis=1, inplace=True)
test_df.drop(['Pclass'], axis=1, inplace=True)

In [11]:
# Define training, testing, & final sets

x_train, x_test, y_train, y_test = train_test_split(train_df.drop("Survived", axis=1),
                                                    train_df["Survived"],
                                                    test_size = 0.3)
x_test, x_box, y_test, y_box = train_test_split(x_test, y_test, test_size = 0.3)
x_final  = test_df.drop("PassengerId", axis=1).copy()

In [12]:
# Logistic Regression

max_score = 0
C = 0
logreg = 0

for some_c in np.arange(0.01, 1, 0.01):
    
    some_agent = LogisticRegression(C=some_c, random_state=int(np.random.uniform(low=1,high=100)))
    some_agent.fit(x_train, y_train)
    
    if max_score < some_agent.score(x_test, y_test):
        C = some_c
        max_score = some_agent.score(x_test, y_test)
        logreg = some_agent
        
print('score:', logreg.score(x_test, y_test), 'C:', C)

score: 0.828877005348 C: 0.08


In [13]:
# Random Forests

n_estimators = 0
criterion = 0
max_features = 0
max_depth = 0
random_forest = 0
max_score = 0

for some_est in [10, 50, 100, 200]:
    for some_criterion in ['gini', 'entropy']:
        for some_max_features in ['auto', 'sqrt', 'log2', None]:
            for some_max_depth in [None, 2, 3, 4, 5, 10]:
                
                some_agent = RandomForestClassifier(n_estimators = some_est,
                                                    criterion = some_criterion,
                                                    max_features = some_max_features,
                                                    max_depth = some_max_depth)
                some_agent.fit(x_train, y_train)
                
                if max_score < some_agent.score(x_test, y_test):
                    
                    n_estimators = some_est
                    criterion = some_criterion
                    max_features = some_max_features
                    max_depth = some_max_depth
                    max_score = some_agent.score(x_test, y_test)
                    random_forest = some_agent
                
print('score:', random_forest.score(x_test, y_test), 'n_estimators:', n_estimators,
      'criterion:', criterion, 'max_features:', max_features, 'max_depth:', max_depth)

score: 0.850267379679 n_estimators: 10 criterion: entropy max_features: auto max_depth: 5


In [14]:
# Gaussian Naive Bayes

gaussian = GaussianNB()
gaussian.fit(x_train, y_train)

gaussian.score(x_test, y_test)

0.81283422459893051

In [18]:
# Multi-layer Perceptron classifier

mlp = MLPClassifier(hidden_layer_sizes=(20,5),
                    activation='tanh',
                    solver='adam',
                    alpha=0.0001,
                    max_iter=300)
mlp.fit(x_train, y_train)

mlp.score(x_test, y_test)

0.79679144385026734

In [19]:
[mlp, gaussian, random_forest, logreg]
print('mlp    ', mlp.score(x_box, y_box))
print('gaus   ', gaussian.score(x_box, y_box))
print('rf     ', random_forest.score(x_box, y_box))
print('logreg ', logreg.score(x_box, y_box))

mlp     0.851851851852
gaus    0.876543209877
rf      0.83950617284
logreg  0.83950617284


In [61]:
some = np.ndarray(shape=(len(x_box)))
m = mlp.predict(x_box)
g = gaussian.predict(x_box)
r = random_forest.predict(x_box)
l = logreg.predict(x_box)
for i in range(len(x_box)):
    vote = m[i] + g[i] + r[i] + l[i]
    if vote > 2:
        some[i] = 1
    else:
        if vote == 2:
            if np.random.uniform(0, 1) < 0.5:
                some[i] = 1
            else:
                some[i] = 0
        else:
            some[i] = 0

score = 0
for i in range(len(some)):
    if some[i] == y_box.as_matrix()[i]:
        score += 1
print(score/len(some))

0.8641975308641975


In [68]:
y_final = np.ndarray(shape=(len(x_final)), dtype=int)
m = mlp.predict(x_final)
g = gaussian.predict(x_final)
r = random_forest.predict(x_final)
l = logreg.predict(x_final)
for i in range(len(x_final)):
    vote = m[i] + g[i] + r[i] + l[i]
    if vote > 2:
        y_final[i] = 1
    else:
        if vote == 2:
            if np.random.uniform(0, 1) < 0.5:
                y_final[i] = 1
            else:
                y_final[i] = 0
        else:
            y_final[i] = 0

In [69]:
submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": y_final
    })
submission.to_csv('titanic.csv', index=False)