In [73]:
import pandas as pd
import numpy as np

from sklearn import cross_validation
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.cross_validation import KFold

In [74]:
# Importing the dataset...
titanic = pd.read_csv("../../datasets/train.csv")
titanic_test = pd.read_csv("../../datasets/test.csv")

# Replaces missing ages w/ the median
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())

# Replaces sex with integers
titanic.loc[titanic["Sex"] == "male", "Sex"] = 0
titanic.loc[titanic["Sex"] == "female", "Sex"] = 1

# Replaces missing embarked data to "S" and converts to integers
titanic["Embarked"] = titanic["Embarked"].fillna("S")
titanic.loc[titanic["Embarked"] == "S", "Embarked"] = 0
titanic.loc[titanic["Embarked"] == "C", "Embarked"] = 1
titanic.loc[titanic["Embarked"] == "Q", "Embarked"] = 2

In [75]:
# Parameters to be considered:
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Initializes linear regression algorithm and cross-validation folds
alg1 = LinearRegression()
kf = KFold(titanic.shape[0], n_folds=3, random_state=1)

predictions = []
for train, test in kf:
    # Selects training folds
    train_predictors = (titanic[predictors].iloc[train,:])
    # Selects target fold
    train_target = titanic["Survived"].iloc[train]
    
    # Calculates and stores prediction
    alg.fit(train_predictors, train_target)
    test_predictions = alg.predict(titanic[predictors].iloc[test,:])
    predictions.append(test_predictions)
    
# Flattens the predictions matrix
predictions = np.concatenate(predictions, axis=0)
predictions[predictions <= 0.5] = 0
predictions[predictions > 0.5] = 1

# Calculates the accuracy of this algorithm
correct = sum(predictions[predictions == titanic["Survived"]])
total = len(predictions)
accuracy = float(correct) / total

print accuracy

0.789001122334




In [76]:
# Initializes logistic regression algorithm
alg2 = LogisticRegression(random_state=1)

# Computes accuracy for cross-validation folds
scores = cross_validation.cross_val_score(alg2, titanic[predictors], titanic["Survived"], cv=3)

# Prints the mean of the score for all folds
print scores.mean()

0.787878787879


In [77]:
# Processing the test data...
titanic_test["Age"] = titanic_test["Age"].fillna(titanic["Age"].median())

titanic_test.loc[titanic_test["Sex"] == "male", "Sex"] = 0
titanic_test.loc[titanic_test["Sex"] == "female", "Sex"] = 1

titanic_test["Embarked"] = titanic_test["Embarked"].fillna("S")
titanic_test.loc[titanic_test["Embarked"] == "S", "Embarked"] = 0
titanic_test.loc[titanic_test["Embarked"] == "C", "Embarked"] = 1
titanic_test.loc[titanic_test["Embarked"] == "Q", "Embarked"] = 2

titanic_test["Fare"] = titanic_test["Fare"].fillna(titanic["Fare"].median())

In [83]:
# Performs logistic regression and generates a submission
alg = LogisticRegression(random_state=1)
alg.fit(titanic[predictors], titanic["Survived"])
predictions = alg.predict(titanic_test[predictors])
submission = pd.DataFrame({
        "PassengerId": titanic_test["PassengerId"],
        "Survived": predictions
    })

submission.to_csv("submission_iteration1.csv", index=False)

The accuracy of this model is 75.10%