In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Read both CSV files from the working directory: train first, then test.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Preview the first rows of the training frame.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [89]:
# Feature engineering: bin Age and Fare into labelled categories and one-hot
# encode Age, Pclass, Sex and Fare on both the train and test frames.
#
# NOTE: this cell rebinds `train`/`test` and overwrites the numeric 'Age'
# column with its category, so it is meant to run once after the load cell.


def add_dummies(frame, column, prefix):
    """Return `frame` with one-hot columns for `column` appended on the right.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame containing `column`; it is not modified in place.
    column : str
        Name of the column to encode.
    prefix : str
        Prefix passed to `pd.get_dummies`; new columns are named
        `<prefix>_<value>`.

    Returns
    -------
    pd.DataFrame
        A new frame: the original columns followed by the dummy columns.
    """
    dummies = pd.get_dummies(frame[column], prefix=prefix)
    return pd.concat([frame, dummies], axis=1)


# Impute missing ages in BOTH frames with the mean computed on train only.
age_mean = train['Age'].mean()
train['Age'] = train['Age'].fillna(age_mean)
test['Age'] = test['Age'].fillna(age_mean)

# NOTE(review): since missing ages are mean-imputed above, the (-1, 0]
# "Missing" bin is presumably never populated and 'Age_Missing' is expected
# to be all zeros — confirm whether a sentinel fill (e.g. -0.5) was intended.
cut_points = [-1,0,5,12,18,35,60,100]
age_labels = ["Missing","Infant","Child","Teenager","Young Adult","Adult","Senior"]

train['Age'] = pd.cut(train['Age'],cut_points, labels=age_labels)
test['Age'] = pd.cut(test['Age'],cut_points, labels=age_labels)

# One-hot encode in the same order as before so the resulting column layout
# (Age_*, Pclass_*, Sex_*) is unchanged.
for column, prefix in [('Age', 'Age'), ('Pclass', 'Pclass'), ('Sex', 'Sex')]:
    train = add_dummies(train, column, prefix)
    test = add_dummies(test, column, prefix)

fare_bins = [0, 7, 9, 11, 14, 31, 513]
fare_labels = ['low', 'low2', 'med', 'med2', 'med3', 'high']

# include_lowest=True closes the first bin on the left. Without it the first
# interval is (0, 7], so a fare of exactly 0 gets no category (NaN) and all of
# its Fare_* dummy columns end up 0.
train['Fare_categories'] = pd.cut(train['Fare'], fare_bins, labels=fare_labels, include_lowest=True)
test['Fare_categories'] = pd.cut(test['Fare'], fare_bins, labels=fare_labels, include_lowest=True)

train = add_dummies(train, 'Fare_categories', 'Fare')
test = add_dummies(test, 'Fare_categories', 'Fare')

In [91]:
# Show the first five rows of the test frame, including the engineered columns.
test.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,...,Pclass_3,Sex_female,Sex_male,Fare_categories,Fare_low,Fare_low2,Fare_med,Fare_med2,Fare_med3,Fare_high
0,892,3,"Kelly, Mr. James",male,Young Adult,0,0,330911,7.8292,,...,1,0,1,low2,0,1,0,0,0,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,Adult,1,0,363272,7.0,,...,1,1,0,low,1,0,0,0,0,0
2,894,2,"Myles, Mr. Thomas Francis",male,Senior,0,0,240276,9.6875,,...,0,0,1,med,0,0,1,0,0,0
3,895,3,"Wirz, Mr. Albert",male,Young Adult,0,0,315154,8.6625,,...,1,0,1,low2,0,1,0,0,0,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,Young Adult,1,1,3101298,12.2875,,...,1,1,0,med2,0,0,0,1,0,0


In [92]:
# One-hot feature columns fed to the model (Pclass, Age, Sex and Fare dummies).
# The order is kept as-is because it defines the feature order seen by the tree.
columns = ['Pclass_1', 'Pclass_2', 'Pclass_3', 'Age_Missing', 
           'Age_Child', 'Age_Adult', 'Age_Infant', 'Age_Teenager', 'Age_Young Adult', 'Age_Senior', 'Sex_male', 
           'Sex_female', 'Fare_low', 'Fare_low2', 'Fare_med', 'Fare_med2', 'Fare_med3', 'Fare_high']

# Fix the seed: the tree permutes features when searching for splits, so
# without random_state the fitted tree (and the submission) can differ
# between runs.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(train[columns], train['Survived'])
predictions = dt.predict(train[columns])

# accuracy_score expects (y_true, y_pred).
# NOTE: this is accuracy on the same rows the tree was fitted on, so it is an
# optimistic figure rather than an estimate of performance on unseen data.
accuracy_score(train['Survived'], predictions)

0.84624017957351294

In [94]:
# Predict on the test frame with the fitted tree, using the same feature columns.
predictions = dt.predict(test[columns])

# Pair each passenger id with its predicted label.
submission = dict(PassengerId=test['PassengerId'], Survived=predictions)

# Write the two-column result to disk without the DataFrame index.
submission_df = pd.DataFrame(data=submission)
submission_df.to_csv('submission.csv', index=False)