In [2]:
import csv as csv
# Open the training data through a named handle (the same pattern the
# test.csv cell uses) so it can be closed explicitly with train_file.close()
# once all rows have been read.  It has to stay open here because the reader
# is consumed lazily by the loading loop that follows.
train_file = open('train.csv', 'rb')
csv_file_object = csv.reader(train_file)

# The first row is the column-name header; pulling it off now leaves the
# reader positioned on the first data row.  The builtin next() drives the
# iterator protocol directly instead of calling the reader's .next() method.
header = next(csv_file_object)

print(header)

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [3]:
import numpy as np

# Drain every remaining row from the reader (the header was already consumed)
# and pack the rows into a NumPy array.  No conversion happens here: every
# cell keeps the text it had in the CSV file.
data = np.array(list(csv_file_object))

print(data)

[['1' '0' '3' ..., '7.25' '' 'S']
 ['2' '1' '1' ..., '71.2833' 'C85' 'C']
 ['3' '1' '3' ..., '7.925' '' 'S']
 ..., 
 ['889' '0' '3' ..., '23.45' '' 'S']
 ['890' '1' '1' ..., '30' 'C148' 'C']
 ['891' '0' '3' ..., '7.75' '' 'Q']]


In [12]:
# Column 1 is 'Survived', held as the text '0' / '1'.  Convert it to numbers
# once and reuse the result for both the head-count and the survivor total.
# The builtin float is used as the dtype: np.float was only an alias for it,
# and that alias is deprecated and no longer exists in newer NumPy releases.
survived_flags = data[:, 1].astype(float)

number_passengers = np.size(survived_flags)   # one entry per passenger row
number_survived = np.sum(survived_flags)      # each survivor contributes 1.0
proportion_survivors = number_survived / number_passengers

print(number_passengers)
print(number_survived)
print(proportion_survivors)

891
342.0
0.383838383838


In [23]:
# Peek at the 'Sex' column (index 4) for the first ten passengers.
print(data[:10, 4])

['male' 'female' 'female' 'female' 'male' 'male' 'male' 'male' 'female'
 'female']


In [27]:
# Boolean row masks built from the 'Sex' column (index 4): True marks the
# rows that belong to the group.  Every row that is not "female" lands in
# the men's mask, so the two masks are exact complements of each other.
women_only_stats = (data[:, 4] == "female")
men_only_stats = ~women_only_stats

print(women_only_stats[:10])

[False  True  True  True False False False False  True  True]


In [38]:
# Use the boolean masks to pick each group's rows, keep only the 'Survived'
# column (index 1) and convert its text values to numbers.
# The builtin float is the dtype here: np.float was merely an alias for it,
# and that alias is deprecated and absent from newer NumPy releases.
women_onboard = data[women_only_stats, 1].astype(float)
men_onboard = data[men_only_stats, 1].astype(float)

print(women_onboard[:10])

[ 1.  1.  1.  1.  1.  1.  1.  0.  1.  0.]


In [41]:
# Survival rate per group: the survival flags are 0.0 / 1.0, so their sum is
# the number of survivors and the array size is the number of people.
proportion_women_survived = women_onboard.sum() / women_onboard.size
proportion_men_survived = men_onboard.sum() / men_onboard.size

print('Proportion of women who survived is %s' % proportion_women_survived)
print('Proportion of men who survived is %s' % proportion_men_survived)

Proportion of women who survived is 0.742038216561
Proportion of men who survived is 0.188908145581


In [46]:
# Open the test set.  The handle keeps its own name because it is closed
# later, after the prediction rows have been written.
test_file = open('test.csv', 'rb')
test_file_object = csv.reader(test_file)

# Consume the header row so that only passenger records remain in the reader.
test_header = next(test_file_object)

print(test_header)

['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [48]:
# Destination for the gender-based predictions.  The file is only opened and
# wrapped in a CSV writer here; rows are written, and the handle is closed,
# by the cell that follows.
prediction_file = open('genderbasedmodel.csv', 'wb')
prediction_file_object = csv.writer(prediction_file)

In [None]:
# Write the gender-based prediction for every passenger in the test set:
# women are predicted to survive ('1'), everyone else is not ('0').
# In the test file column 0 is 'PassengerId' and column 3 is 'Sex'.
#
# The try/finally guarantees that both files are closed even if a row cannot
# be processed, so a failure half-way through does not leak open handles or
# leave output sitting unflushed in the writer's buffer.
try:
    prediction_file_object.writerow(["PassengerId", "Survived"])

    for row in test_file_object:
        # csv.reader yields an empty list for a blank line; such a row has no
        # passenger to predict for and would otherwise fail on row[3].
        if not row:
            continue
        predicted_survival = '1' if row[3] == 'female' else '0'
        prediction_file_object.writerow([row[0], predicted_survival])
finally:
    test_file.close()
    prediction_file.close()

In [54]:
# Cap the fares so that every passenger falls below the ceiling and can be
# placed in one of the fixed-width price brackets used later on.
fare_ceiling = 40

# Column 9 is 'Fare'.  Any fare at or above the ceiling is overwritten with
# ceiling - 1 (i.e. 39.0).  `data` is an array of strings, so the replacement
# is stored as the text '39.0'.
# The builtin float is the dtype: np.float was only an alias for it and is
# deprecated / absent in newer NumPy releases.
fare_at_or_above_ceiling = data[:, 9].astype(float) >= fare_ceiling
data[fare_at_or_above_ceiling, 9] = fare_ceiling - 1.0

print(data[:10, 9])

['7.25' '39.0' '7.925' '39.0' '8.05' '8.4583' '39.0' '21.075' '11.1333'
 '30.0708']


In [55]:
# Width of one fare bracket, and how many brackets fit below the ceiling.
# Floor division keeps the count an integer regardless of the division rules
# in effect; plain `/` would give a float under true division, and a float
# cannot be used as an array dimension or as a loop bound.
fare_bracket_size = 10
number_of_price_brackets = fare_ceiling // fare_bracket_size

# There were 1st, 2nd and 3rd classes on board, but rather than hard-coding 3
# the count is derived from the data: the number of distinct values found in
# the 'Pclass' column (index 2).
number_of_classes = len(np.unique(data[:, 2]))

print(number_of_classes)

3


In [57]:
# Empty table of survival rates, one cell per (sex, class, fare bracket).
# The first axis has two entries: index 0 is filled with the women's rates
# and index 1 with the men's rates by the loop that follows.
table_shape = (2, number_of_classes, number_of_price_brackets)
survival_table = np.zeros(shape=table_shape)

print(survival_table)

[[[ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]]

 [[ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]]]


In [61]:
# Fill survival_table with the mean survival rate of every
# (sex, class, fare bracket) group.
#
# The three columns that define the groups are converted once, up front,
# instead of being re-converted for every class/bracket combination.
# Column indices: 1 = 'Survived', 2 = 'Pclass', 4 = 'Sex', 9 = 'Fare'.
# The builtin float is the dtype: np.float was only an alias for it and is
# deprecated / absent in newer NumPy releases.
passenger_is_female = data[:, 4] == "female"
passenger_class = data[:, 2].astype(float)
passenger_fare = data[:, 9].astype(float)

for i in range(number_of_classes):
    # NOTE(review): this assumes classes are labelled 1..number_of_classes.
    passenger_in_class = passenger_class == i + 1

    for j in range(number_of_price_brackets):
        # Half-open bracket: lower bound included, upper bound excluded.
        # A fare at or above the last bracket's upper bound matches no
        # bracket, so this relies on the fares having been capped beforehand.
        bracket_floor = j * fare_bracket_size
        bracket_top = (j + 1) * fare_bracket_size
        passenger_in_bracket = (
            (passenger_fare >= bracket_floor) & (passenger_fare < bracket_top)
        )
        passenger_in_group = passenger_in_class & passenger_in_bracket

        # 'Survived' values (still text) of the women / men in this group.
        # NOTE(review): these two names are also used for the boolean sex
        # masks in the earlier gender-model cell; they are rebound here, so
        # that cell must be re-run before its dependants are re-run.
        women_only_stats = data[passenger_is_female & passenger_in_group, 1]
        men_only_stats = data[~passenger_is_female & passenger_in_group, 1]

        # An empty group has no meaningful mean (np.mean would return NaN and
        # emit a warning), so its rate is recorded as 0 directly.
        if women_only_stats.size:
            survival_table[0, i, j] = np.mean(women_only_stats.astype(float))
        else:
            survival_table[0, i, j] = 0.

        if men_only_stats.size:
            survival_table[1, i, j] = np.mean(men_only_stats.astype(float))
        else:
            survival_table[1, i, j] = 0.

print(survival_table)

[[[ 0.          0.          0.83333333  0.97727273]
  [ 0.          0.91428571  0.9         1.        ]
  [ 0.59375     0.58139535  0.33333333  0.125     ]]

 [[ 0.          0.          0.4         0.38372093]
  [ 0.          0.15873016  0.16        0.21428571]
  [ 0.11153846  0.23684211  0.125       0.24      ]]]
