In [8]:
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.cross_validation import cross_val_score

train_df = pd.read_csv("data/train.csv")

et = ExtraTreesClassifier(n_estimators=100, max_depth=None, min_samples_split=1, random_state=0)

columns = ["Fare", "Pclass"]

labels = train_df["Survived"].values
features = train_df[list(columns)].values

et_score = cross_val_score(et, features, labels, n_jobs=-1).mean()

print("{} -> {}".format(columns, et_score))

['Fare', 'Pclass'] -> 0.685746352413


In [13]:
columns = ["Fare", "Pclass", "Sex"]

train_df["Sex"] = train_df["Sex"].apply(lambda sex: 0 if sex == "male" else 1)
features = train_df[list(columns)].values

et_score = cross_val_score(et, features, labels, n_jobs=-1).mean()
print("{} -> {}".format(columns, et_score))

['Fare', 'Pclass', 'Sex'] -> 0.68911335578


In [14]:
test_df = pd.read_csv("data/test.csv")
et.fit(features, labels)
et.predict(test_df[columns].values)

ValueError: could not convert string to float: male

# Handling Sex (string to int conversion)

In [19]:
def replace_non_numeric(df):
    df["Sex"] = df["Sex"].apply(lambda sex: 0 if sex == "male" else 1)
    return df

train_df = replace_non_numeric(pd.read_csv("data/train.csv"))
test_df = replace_non_numeric(pd.read_csv("data/test.csv"))

# Handling NaN

In [20]:
from sklearn.preprocessing import Imputer

imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(features)

et.fit(features, labels)
print (et.predict(imp.transform(test_df[columns].values)))

[0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 1 1 0 0 1 0 0
 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 0 1 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 1 1 1 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0
 1 1 1 0 1 1 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 1 1 1 0 1 1 0 0
 1 0 0 0 0 1 0 0 1 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0
 0 0 0 1 0 0 0 0 1 1 0 1 1 0 1 0 0 0 0 1 1 0 0 0 0 1 1 0 1 0 1 0 0 0 0 0 0
 0 1 0 1 1 1 0 0 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 1 0 0 1 0 0 0 1 1 0 1 0 1 0 0 0 0
 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0 0 0 1 1
 0 1 0 0 1 1 0 0 0 1 0 1 0 0 0 1 0 0 0 0 1 1 0 0 1 1 0 0 0 0 1 0 1 0 0 1 0
 0 0 0 0 1 0 0 0 0 0 1]


# Write to file Kaggle Submission

In [23]:
predictions = et.predict(imp.transform(test_df[columns].values))
test_df["Survived"] = pd.Series(predictions)

test_df.to_csv("output/first_submission.csv", columns=['PassengerId', 'Survived'], index=False)