In [1]:
import pandas as pd
from sklearn import cross_validation, svm, preprocessing, pipeline

In [2]:
# locations of the training and testing CSV files (relative to the notebook)
train_file, test_file = "../data/train.csv", "../data/test.csv"

# load each file into its own pandas DataFrame
train, test = (pd.read_csv(path) for path in (train_file, test_file))

In [3]:
# compute default values for missing data; every statistic is taken from the
# training frame so the test frame is imputed with training-set values
med_age = train.Age.median()  # median age
mod_emb = train.Embarked.mode().values[0]  # most frequent embarkation point
med_fare = train.Fare.median()  # median fare
med_fair = med_fare  # alias kept for code that still uses the old misspelled name

# fill missing values: apply the SAME defaults to both frames so that a NaN in
# any of these columns (e.g. Fare in train or Embarked in test) cannot slip
# through to the classifier; columns without missing values are left untouched
defaults = {'Age': med_age, 'Embarked': mod_emb, 'Fare': med_fare}
train.fillna(defaults, inplace=True)
test.fillna(defaults, inplace=True)

# recode categorical variables as integer codes (same mapping for both frames)
sex = {'female': 0, 'male': 1}
emb = {'S': 0, 'C': 1, 'Q': 2}
train.replace({'Sex': sex, 'Embarked': emb}, inplace=True)
test.replace({'Sex': sex, 'Embarked': emb}, inplace=True)

In [4]:
# columns used as model inputs, in the order they appear in the feature matrix
features = ['Sex', 'Pclass', 'Age', 'Fare', 'SibSp', 'Parch', 'Embarked']

# feature matrix and label vector as plain numpy arrays
X = train.loc[:, features].values
y = train.Survived.values

In [5]:
# build the classifier: standardize the features, then fit a linear SVM
scaler = preprocessing.StandardScaler()
linear_svm = svm.LinearSVC()
clf = pipeline.make_pipeline(scaler, linear_svm)

# 5-fold cross validation on the training data; prints one score per fold
# NOTE(review): `sklearn.cross_validation` is no longer available in newer
# scikit-learn releases (its replacement is `sklearn.model_selection`) --
# confirm against the installed version
scores = cross_validation.cross_val_score(clf, X, y, cv=5)
print(scores)

[ 0.78212291  0.7877095   0.78089888  0.75280899  0.8079096 ]


In [6]:
# predict on testing data: refit the pipeline on the full training set, then
# store the predicted labels as a new 'Survived' column on the test frame
clf.fit(X, y)
# pass a plain array (as was done for X) so fit and predict receive the same
# input type; newer scikit-learn warns when a model fitted without feature
# names is given a DataFrame at predict time
test['Survived'] = clf.predict(test[features].values)

In [7]:
# save the passenger ids with their predicted labels as a CSV (no index column)
pred_file = '../data/svm.csv'
test.to_csv(pred_file, columns=['PassengerId', 'Survived'], index=False)