In [129]:
import numpy as np 
import pandas as pd 

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [2]:
# Input data files are available in the "../input/" directory.
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/titanic/gender_submission.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/train.csv


In [7]:
data = pd.read_csv("/kaggle/input/titanic/train.csv")
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
validate_data = pd.read_csv("/kaggle/input/titanic/test.csv")
validate_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [68]:
rate_people = sum(data.Survived)/len(data)
print("% of people who survived:", rate_people)

women = data.loc[data.Sex == 'female']["Survived"]
rate_women = sum(women)/len(women)
print("% of women who survived:", rate_women)

men = data.loc[data.Sex == 'male']["Survived"]
rate_men = sum(men)/len(men)
print("% of men who survived:", rate_men)

% of people who survived: 0.3838383838383838
% of women who survived: 0.7420382165605095
% of men who survived: 0.18890814558058924


In [69]:
data.isnull().sum()

PassengerId       0
Survived          0
Pclass            0
Name              0
Sex               0
Age             177
SibSp             0
Parch             0
Ticket            0
Fare              0
Cabin           687
Embarked          2
Cabin_Letter      0
Cabin_Number      0
Child             0
Young_Child       0
dtype: int64

In [70]:
#look to see if children had a better survival rate

In [72]:
data[data.Age.isnull()].head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Cabin_Letter,Cabin_Number,Child,Young_Child
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,X,0.0,False,False
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0,,S,X,0.0,False,False
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.225,,C,X,0.0,False,False
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.225,,C,X,0.0,False,False
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q,X,0.0,False,False


In [73]:
data[data.Age < 10].head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Cabin_Letter,Cabin_Number,Child,Young_Child
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,X,0.0,True,True
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S,G,6.0,True,True
16,17,0,3,"Rice, Master. Eugene",male,2.0,4,1,382652,29.125,,Q,X,0.0,True,True
24,25,0,3,"Palsson, Miss. Torborg Danira",female,8.0,3,1,349909,21.075,,S,X,0.0,True,True
43,44,1,2,"Laroche, Miss. Simonne Marie Anne Andree",female,3.0,1,2,SC/Paris 2123,41.5792,,C,X,0.0,True,True


In [75]:
data.Age = data.Age.fillna(data.Age.mean())
validate_data.Age = validate_data.Age.fillna(data.Age.mean())
data.isnull().sum()

PassengerId       0
Survived          0
Pclass            0
Name              0
Sex               0
Age               0
SibSp             0
Parch             0
Ticket            0
Fare              0
Cabin           687
Embarked          2
Cabin_Letter      0
Cabin_Number      0
Child             0
Young_Child       0
dtype: int64

In [77]:
data["Child"] = data.Age <15
validate_data["Child"] = validate_data.Age <15

data["Young_Child"] = data.Age <10
validate_data["Young_Child"] = validate_data.Age <10

In [78]:
data.corr()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Cabin_Number,Child,Young_Child
PassengerId,1.0,-0.005007,-0.035144,0.033207,-0.057527,-0.001652,0.012658,0.017502,-0.026833,-0.022602
Survived,-0.005007,1.0,-0.338481,-0.069809,-0.035322,0.081629,0.257307,0.213129,0.122978,0.128812
Pclass,-0.035144,-0.338481,1.0,-0.331339,0.083081,0.018443,-0.5495,-0.54211,0.118457,0.104857
Age,0.033207,-0.069809,-0.331339,1.0,-0.232625,-0.179191,0.091566,0.213566,-0.569274,-0.539087
SibSp,-0.057527,-0.035322,0.083081,-0.232625,1.0,0.414838,0.159651,-0.054887,0.364654,0.330474
Parch,-0.001652,0.081629,0.018443,-0.179191,0.414838,1.0,0.216225,-0.058468,0.361001,0.34681
Fare,0.012658,0.257307,-0.5495,0.091566,0.159651,0.216225,1.0,0.296027,-0.003117,-0.008962
Cabin_Number,0.017502,0.213129,-0.54211,0.213566,-0.054887,-0.058468,0.296027,1.0,-0.091659,-0.074598
Child,-0.026833,0.122978,0.118457,-0.569274,0.364654,0.361001,-0.003117,-0.091659,1.0,0.88291
Young_Child,-0.022602,0.128812,0.104857,-0.539087,0.330474,0.34681,-0.008962,-0.074598,0.88291,1.0


In [79]:
features = ["Pclass", "Sex", "SibSp", "Parch","Age"]
X_explore = pd.get_dummies(data.drop(columns = ["Name","Ticket","Cabin"]))
X_explore.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Cabin_Number,Child,Young_Child,...,Embarked_S,Cabin_Letter_A,Cabin_Letter_B,Cabin_Letter_C,Cabin_Letter_D,Cabin_Letter_E,Cabin_Letter_F,Cabin_Letter_G,Cabin_Letter_T,Cabin_Letter_X
0,1,0,3,22.0,1,0,7.25,0.0,False,False,...,1,0,0,0,0,0,0,0,0,1
1,2,1,1,38.0,1,0,71.2833,85.0,False,False,...,0,0,0,1,0,0,0,0,0,0
2,3,1,3,26.0,0,0,7.925,0.0,False,False,...,1,0,0,0,0,0,0,0,0,1
3,4,1,1,35.0,1,0,53.1,123.0,False,False,...,1,0,0,1,0,0,0,0,0,0
4,5,0,3,35.0,0,0,8.05,0.0,False,False,...,1,0,0,0,0,0,0,0,0,1


In [80]:
X_explore.corr()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Cabin_Number,Child,Young_Child,...,Embarked_S,Cabin_Letter_A,Cabin_Letter_B,Cabin_Letter_C,Cabin_Letter_D,Cabin_Letter_E,Cabin_Letter_F,Cabin_Letter_G,Cabin_Letter_T,Cabin_Letter_X
PassengerId,1.0,-0.005007,-0.035144,0.033207,-0.057527,-0.001652,0.012658,0.017502,-0.026833,-0.022602,...,0.022148,-0.003256,0.069552,-0.040957,0.022828,0.042351,-0.035772,-0.060049,-0.013814,-0.019919
Survived,-0.005007,1.0,-0.338481,-0.069809,-0.035322,0.081629,0.257307,0.213129,0.122978,0.128812,...,-0.15566,0.022287,0.175095,0.114652,0.150716,0.145321,0.057935,0.01604,-0.026456,-0.316912
Pclass,-0.035144,-0.338481,1.0,-0.331339,0.083081,0.018443,-0.5495,-0.54211,0.118457,0.104857,...,0.08172,-0.204934,-0.369572,-0.417048,-0.27869,-0.230091,0.011063,0.055561,-0.052496,0.725541
Age,0.033207,-0.069809,-0.331339,1.0,-0.232625,-0.179191,0.091566,0.213566,-0.569274,-0.539087,...,-0.027121,0.12192,0.091394,0.113149,0.132319,0.117211,-0.077209,-0.077253,0.039469,-0.233123
SibSp,-0.057527,-0.035322,0.083081,-0.232625,1.0,0.414838,0.159651,-0.054887,0.364654,0.330474,...,0.070941,-0.046266,-0.034538,0.029251,-0.017575,-0.036865,0.001706,-0.001402,-0.015907,0.04046
Parch,-0.001652,0.081629,0.018443,-0.179191,0.414838,1.0,0.216225,-0.058468,0.361001,0.34681,...,0.063036,-0.040325,0.056498,0.030736,-0.019125,-0.016554,0.023694,0.072388,-0.015878,-0.036987
Fare,0.012658,0.257307,-0.5495,0.091566,0.159651,0.216225,1.0,0.296027,-0.003117,-0.008962,...,-0.166603,0.019549,0.386297,0.364318,0.098878,0.053717,-0.033093,-0.02518,0.002224,-0.482075
Cabin_Number,0.017502,0.213129,-0.54211,0.213566,-0.054887,-0.058468,0.296027,1.0,-0.091659,-0.074598,...,-0.08149,0.052835,0.231563,0.644441,0.123634,0.297033,0.008311,-0.010149,-0.012925,-0.707616
Child,-0.026833,0.122978,0.118457,-0.569274,0.364654,0.361001,-0.003117,-0.091659,1.0,0.88291,...,0.023269,-0.009665,-0.037564,-0.050544,-0.060746,-0.038442,0.094783,0.098002,-0.010383,0.055371
Young_Child,-0.022602,0.128812,0.104857,-0.539087,0.330474,0.34681,-0.008962,-0.074598,0.88291,1.0,...,0.041263,-0.001501,-0.064535,-0.037348,-0.053633,-0.029079,0.113868,0.113595,-0.009167,0.044043


In [25]:
#look at cabin letter
data["Cabin_Letter"] = data.Cabin.fillna('X').astype(str).str[0]
validate_data["Cabin_Letter"] = validate_data.fillna('X').Cabin.astype(str).str[0]

#look at cabin number
s =  data.Cabin.fillna('000').astype(str).str[1:]
data["Cabin_Number"] = pd.to_numeric(s, 'coerce').fillna(0)
s = validate_data.Cabin.fillna('000').astype(str).str[1:]
validate_data["Cabin_Number"] = pd.to_numeric(s, 'coerce').fillna(0)

In [81]:
data.Cabin_Letter.unique()

array(['X', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [82]:
data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Cabin_Letter,Cabin_Number,Child,Young_Child
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,X,0.0,False,False
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,C,85.0,False,False
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,X,0.0,False,False
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,C,123.0,False,False
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,X,0.0,False,False


In [83]:
features = ["Pclass", "Sex", "SibSp", "Parch","Age"]
X_explore = pd.get_dummies(data.drop(columns = ["Name","Ticket","Cabin"]))
X_explore.corr()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Cabin_Number,Child,Young_Child,...,Embarked_S,Cabin_Letter_A,Cabin_Letter_B,Cabin_Letter_C,Cabin_Letter_D,Cabin_Letter_E,Cabin_Letter_F,Cabin_Letter_G,Cabin_Letter_T,Cabin_Letter_X
PassengerId,1.0,-0.005007,-0.035144,0.033207,-0.057527,-0.001652,0.012658,0.017502,-0.026833,-0.022602,...,0.022148,-0.003256,0.069552,-0.040957,0.022828,0.042351,-0.035772,-0.060049,-0.013814,-0.019919
Survived,-0.005007,1.0,-0.338481,-0.069809,-0.035322,0.081629,0.257307,0.213129,0.122978,0.128812,...,-0.15566,0.022287,0.175095,0.114652,0.150716,0.145321,0.057935,0.01604,-0.026456,-0.316912
Pclass,-0.035144,-0.338481,1.0,-0.331339,0.083081,0.018443,-0.5495,-0.54211,0.118457,0.104857,...,0.08172,-0.204934,-0.369572,-0.417048,-0.27869,-0.230091,0.011063,0.055561,-0.052496,0.725541
Age,0.033207,-0.069809,-0.331339,1.0,-0.232625,-0.179191,0.091566,0.213566,-0.569274,-0.539087,...,-0.027121,0.12192,0.091394,0.113149,0.132319,0.117211,-0.077209,-0.077253,0.039469,-0.233123
SibSp,-0.057527,-0.035322,0.083081,-0.232625,1.0,0.414838,0.159651,-0.054887,0.364654,0.330474,...,0.070941,-0.046266,-0.034538,0.029251,-0.017575,-0.036865,0.001706,-0.001402,-0.015907,0.04046
Parch,-0.001652,0.081629,0.018443,-0.179191,0.414838,1.0,0.216225,-0.058468,0.361001,0.34681,...,0.063036,-0.040325,0.056498,0.030736,-0.019125,-0.016554,0.023694,0.072388,-0.015878,-0.036987
Fare,0.012658,0.257307,-0.5495,0.091566,0.159651,0.216225,1.0,0.296027,-0.003117,-0.008962,...,-0.166603,0.019549,0.386297,0.364318,0.098878,0.053717,-0.033093,-0.02518,0.002224,-0.482075
Cabin_Number,0.017502,0.213129,-0.54211,0.213566,-0.054887,-0.058468,0.296027,1.0,-0.091659,-0.074598,...,-0.08149,0.052835,0.231563,0.644441,0.123634,0.297033,0.008311,-0.010149,-0.012925,-0.707616
Child,-0.026833,0.122978,0.118457,-0.569274,0.364654,0.361001,-0.003117,-0.091659,1.0,0.88291,...,0.023269,-0.009665,-0.037564,-0.050544,-0.060746,-0.038442,0.094783,0.098002,-0.010383,0.055371
Young_Child,-0.022602,0.128812,0.104857,-0.539087,0.330474,0.34681,-0.008962,-0.074598,0.88291,1.0,...,0.041263,-0.001501,-0.064535,-0.037348,-0.053633,-0.029079,0.113868,0.113595,-0.009167,0.044043


In [84]:
data.groupby('Cabin_Letter').Survived.sum()

Cabin_Letter
A      7
B     35
C     35
D     25
E     24
F      8
G      2
T      0
X    206
Name: Survived, dtype: int64

In [86]:
data.groupby('Cabin_Letter').agg({'Survived': ['sum','count','mean']})

Unnamed: 0_level_0,Survived,Survived,Survived
Unnamed: 0_level_1,sum,count,mean
Cabin_Letter,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
A,7,15,0.466667
B,35,47,0.744681
C,35,59,0.59322
D,25,33,0.757576
E,24,32,0.75
F,8,13,0.615385
G,2,4,0.5
T,0,1,0.0
X,206,687,0.299854


In [137]:
# Try Random Random Forest Classifier
y = data["Survived"]

#features = ["Pclass", "Sex", "SibSp", "Parch","Young_Child","Cabin_Letter","Cabin_Number"]
#data_with_dummies = pd.get_dummies(data[features]).drop(columns= ['Sex_male', 'Cabin_Letter_B', 'Cabin_Letter_D',
#       'Cabin_Letter_E'])
#X_train, X_test, Y_train, Y_test = train_test_split(data_with_dummies, y, test_size=0.33, random_state=42)

features = ["Pclass", "Sex", "SibSp", "Parch","Young_Child"]
data_with_dummies = pd.get_dummies(data[features])
X_train, X_test, Y_train, Y_test = train_test_split(data_with_dummies, y, test_size=0.33, random_state=42)


#model = RandomForestClassifier(n_estimators=100, max_depth=4, random_state=1)
#model = GaussianNB()
model = SVC()

model.fit(X_train, Y_train)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_x.PassengerId, 'Survived': predictions})

#0.8305084745762712
#0.8338983050847457
#0.8372881355932204
accuracy_score(Y_test,output["Survived"])



0.8372881355932204

In [140]:
X_train

Unnamed: 0,Pclass,SibSp,Parch,Young_Child,Sex_female,Sex_male
0,3,1,0,False,0,1
1,1,1,0,False,1,0
2,3,0,0,False,1,0
3,1,1,0,False,1,0
4,3,0,0,False,0,1
...,...,...,...,...,...,...
886,2,0,0,False,0,1
887,1,0,0,False,1,0
888,3,1,2,False,1,0
889,1,0,0,False,0,1


In [138]:
Y_train = data["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch","Young_Child"]
X_train = pd.get_dummies(data[features])
X_validate = pd.get_dummies(validate_data[features])

#model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model = SVC()
model.fit(X_train, Y_train)
predictions = model.predict(X_validate)

output = pd.DataFrame({'PassengerId': validate_data.PassengerId, 'Survived': predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")



Your submission was successfully saved!


In [None]:
#To Do:
# Cross Validation
# further data exploration
# better feature selection
# Other models
# Parameter Selection
# add comments