# Kaggle Titanic Competition
## Titanic: Machine Learning from Disaster


***
## 1. Data Gathering
### Import Libraries

In [189]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

### Load Data 

In [111]:
# Read the two competition CSVs from the notebook's working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

### Check data frame structure 

In [112]:
# Column dtypes of the training frame. Name, Sex, Ticket, Cabin and Embarked
# load as object columns; the rest are int64 or float64.
train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [113]:
# Preview the first five rows of the training frame.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


***
## 2. Data Cleaning
### Check and fill missing values 

In [115]:
# Non-null counts per column. A cell only renders its last expression, so the
# output shown is test.count(); the train.count() result is computed and discarded.
train.count()
test.count()

PassengerId    418
Pclass         418
Name           418
Sex            418
Age            332
SibSp          418
Parch          418
Ticket         418
Fare           417
Cabin           91
Embarked       418
dtype: int64

In [116]:
# Fill gaps so that every column used as a model feature is complete.
# Each frame is imputed from its own statistics: Age gets that frame's median
# and Embarked gets the constant "S".
for frame in (train, test):
    frame["Age"] = frame["Age"].fillna(frame["Age"].median())
    frame["Embarked"] = frame["Embarked"].fillna("S")

# Fare is only imputed on the test frame, using the test median.
test["Fare"] = test["Fare"].fillna(test["Fare"].median())

# Displayed result: non-null counts of the test frame after imputation.
test.count()

PassengerId    418
Pclass         418
Name           418
Sex            418
Age            418
SibSp          418
Parch          418
Ticket         418
Fare           418
Cabin           91
Embarked       418
dtype: int64

### Convert string to numeric columns 

In [117]:
# Encode Sex as an integer code, applying one mapping to both frames.
# Series.map turns any value that is not a key of the dict into NaN, so
# re-running this cell on already-encoded columns would blank them out.
sex = {'male':0, 'female':1}
for frame in (train, test):
    frame["Sex"] = frame["Sex"].map(sex)

# Displayed result: the first ten encoded values of the test frame.
test["Sex"].head(10)

0    0
1    1
2    0
3    0
4    1
5    0
6    1
7    0
8    1
9    0
Name: Sex, dtype: int64

In [118]:
# Encode the port of embarkation as an integer code, applying one mapping to both frames.
# Series.map turns any value that is not a key of the dict into NaN, so
# re-running this cell on already-encoded columns would blank them out.
embarked = {'S':0, 'C':1, 'Q':2}
for frame in (train, test):
    frame["Embarked"] = frame["Embarked"].map(embarked)

# Displayed result: the first ten encoded values of the test frame.
test["Embarked"].head(10)

0    2
1    0
2    2
3    0
4    0
5    0
6    2
7    0
8    1
9    0
Name: Embarked, dtype: int64

***
## 3. Analyze the Data
### Show data correlation 

***
## 4. Train and Evaluate the Model
### Set target, features to train model 

In [197]:
# Build the target vector and one train/test feature matrix per experiment.
# Train and test matrices of the same experiment always use the same columns
# in the same order.
target = train["Survived"].values

# Experiment one: the four base columns.
base_cols = ["Pclass", "Sex", "Age", "Fare"]
features_one = train[base_cols].values
test_features_one = test[base_cols].values

# Experiments two and four use the same column set.
extended_cols = ["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]
features_two = train[extended_cols].values
test_features_two = test[extended_cols].values

# Experiment three: add family_size = SibSp + Parch + 1 on copies of the frames,
# leaving train and test themselves untouched.
family_cols = base_cols + ["SibSp", "Parch", "family_size"]
train_three = train.assign(family_size=train["SibSp"] + train["Parch"] + 1)
test_three = test.assign(family_size=test["SibSp"] + test["Parch"] + 1)
features_three = train_three[family_cols].values
test_features_three = test_three[family_cols].values

features_four = train[extended_cols].values
test_features_four = test[extended_cols].values

# Experiment five: the family_size columns plus Embarked.
family_embarked_cols = base_cols + ["SibSp", "Parch", "Embarked", "family_size"]
train_five = train.assign(family_size=train["SibSp"] + train["Parch"] + 1)
test_five = test.assign(family_size=test["SibSp"] + test["Parch"] + 1)
features_five = train_five[family_embarked_cols].values
test_features_five = test_five[family_embarked_cols].values

# Displayed result: non-null counts of the engineered training frame.
train_five.count()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            891
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       891
family_size    891
dtype: int64

### Select and train the model 

In [200]:
# Fit one classifier per feature set.
# Every estimator is given random_state=1. The original first and third trees
# had no seed, so their importances and predictions could change between runs.

# Experiment one: unconstrained decision tree on the four base features.
my_tree_one = tree.DecisionTreeClassifier(random_state = 1)
my_tree_one = my_tree_one.fit(features_one, target)

# Experiment two: depth- and split-limited decision tree.
my_tree_two = tree.DecisionTreeClassifier(max_depth = 10, min_samples_split = 5, random_state = 1)
my_tree_two = my_tree_two.fit(features_two, target)

# Experiment three: unconstrained decision tree with the family_size feature.
my_tree_three = tree.DecisionTreeClassifier(random_state = 1)
my_tree_three = my_tree_three.fit(features_three, target)

# Experiments four and five: random forests with identical hyper-parameters,
# differing only in the feature matrix they are fit on.
my_tree_four = RandomForestClassifier(max_depth=10, min_samples_split=2, n_estimators=100, random_state=1)
my_tree_four = my_tree_four.fit(features_four, target)

my_tree_five = RandomForestClassifier(max_depth=10, min_samples_split=2, n_estimators=100, random_state=1)
my_tree_five = my_tree_five.fit(features_five, target)

In [201]:
# Print each model's feature importances followed by its accuracy.
# The score is computed on the same rows the model was fit on, so it is a
# training-set accuracy, not a held-out estimate.
fitted_models = [
    (my_tree_one, features_one),
    (my_tree_two, features_two),
    (my_tree_three, features_three),
    (my_tree_four, features_four),
    (my_tree_five, features_five),
]

for clf, train_matrix in fitted_models:
    print(clf.feature_importances_)
    print(clf.score(train_matrix, target))

[ 0.12482906  0.31274009  0.22948397  0.33294688]
0.977553310887
[ 0.14130255  0.17906027  0.41616727  0.17938711  0.05039699  0.01923751
  0.0144483 ]
0.905723905724
[ 0.11014746  0.31088095  0.2111209   0.27360016  0.03860819  0.02930762
  0.02633471]
0.979797979798
[ 0.10384741  0.20139027  0.31989322  0.24602858  0.05272693  0.04159232
  0.03452128]
0.939393939394
[ 0.09915728  0.31168455  0.20147389  0.22532166  0.03731813  0.03178077
  0.03580558  0.05745814]
0.938271604938


***
## 5. Predict the Test set

### Select test model features 

In [202]:
# Non-null counts of the engineered test frame. Only Cabin has gaps, and Cabin
# is not part of any feature list.
test_five.count()

PassengerId    418
Pclass         418
Name           418
Sex            418
Age            418
SibSp          418
Parch          418
Ticket         418
Fare           418
Cabin           91
Embarked       418
family_size    418
dtype: int64

### Make predictions 

In [203]:
# Predict the test set with each fitted model, pairing every model with the
# test matrix built from the same columns it was trained on.
# The first model must use test_features_one: the original referenced
# `test_features`, which is never defined and raises NameError on a fresh kernel.
# reshape(-1, 1) produces an (n_rows, 1) column without hard-coding the row count.
my_prediction = my_tree_one.predict(test_features_one).reshape(-1, 1)
my_prediction_two = my_tree_two.predict(test_features_two).reshape(-1, 1)
my_prediction_three = my_tree_three.predict(test_features_three).reshape(-1, 1)
my_prediction_four = my_tree_four.predict(test_features_four).reshape(-1, 1)
my_prediction_five = my_tree_five.predict(test_features_five).reshape(-1, 1)

# Displayed result: shape of the fifth model's prediction column.
my_prediction_five.shape

(418, 1)

***
## 6. Save Results
### Create the Solution 

In [204]:
# Wrap the fifth model's predictions in a one-column frame whose index is the
# test frame's PassengerId column.
passenger_ids = test["PassengerId"]
my_solution = pd.DataFrame(
    data=my_prediction_five,
    index=passenger_ids,
    columns=["Survived"],
)

# Displayed result: the index name inherited from the PassengerId series.
my_solution.index.name

'PassengerId'

### Save to CSV file 

In [205]:
# Write the solution to disk. to_csv writes the index by default, so the
# PassengerId index becomes the first column next to Survived.
my_solution.to_csv("my_solution_five.csv")

***
***
# Rough Work 

In [17]:
import time

# Compare a vectorized dot product with the same sum accumulated in a Python loop.
N = 1000000
a = np.random.rand(N)
b = np.random.rand(N)

# Vectorized version: a single NumPy call.
tic = time.time()
c = np.dot(a,b)
toc = time.time()
vectorized_ms = 1000*(toc-tic)

print(c)
print("Vectorized exec time: " + str(vectorized_ms) + "ms")

# Loop version: one multiply-add per element, in index order.
c = 0
tic = time.time()
for i in range(N):
    c += a[i]*b[i]
toc = time.time()
loop_ms = 1000*(toc-tic)

print(c)
print("For-loop exec time: " + str(loop_ms) + "ms")

249972.075277
Vectorized exec time: 1.5177726745605469ms
249972.075277
For-loop exec time: 624.934196472168ms


In [18]:
import numpy as np

# A 3x4 matrix of sample values, built from a named list of rows.
rows = [
    [56.0, 0.0, 4.4, 68.0],
    [1.2, 104.0, 52.0, 8.0],
    [1.8, 135.0, 99.0, 0.9],
]
A = np.array(rows)

print(A)

[[  56.     0.     4.4   68. ]
 [   1.2  104.    52.     8. ]
 [   1.8  135.    99.     0.9]]


In [19]:
# Column totals: summing over axis 0 collapses the rows and leaves one value per column.
cal = np.sum(A, axis=0)
print(cal)

[  59.   239.   155.4   76.9]


In [20]:
# reshape returns a new (1, 4) array. The result is not assigned, so cal itself is unchanged.
cal.reshape(1,4)

array([[  59. ,  239. ,  155.4,   76.9]])

In [22]:
# Each entry as a percentage of its column total. cal has one value per column
# and is broadcast across the rows of A.
p = np.divide(100*A, cal)
print(p)

[[ 94.91525424   0.           2.83140283  88.42652796]
 [  2.03389831  43.51464435  33.46203346  10.40312094]
 [  3.05084746  56.48535565  63.70656371   1.17035111]]


In [27]:
# One-dimensional integer array holding 1 through 4.
B = np.arange(1, 5)

In [28]:
# Plain-text rendering of the 1-D array.
print(B)

[1 2 3 4]


In [29]:
# Transposing a 1-D array changes nothing: the printed result is the same as print(B).
print(B.T)

[1 2 3 4]


In [38]:
# Bare expression: shows the array's repr (array([...])) instead of the print() text form.
B


array([1, 2, 3, 4])

In [37]:
# The repr of the transpose is identical to B's: a 1-D array has no second axis to swap.
B.T

array([1, 2, 3, 4])

In [36]:
# (4,) — a single axis of length 4, so B is neither a row vector nor a column vector.
B.shape

(4,)

In [34]:
# reshape returns a new 4x1 array. The result is not assigned, so B keeps its 1-D shape.
B.reshape(4,1)

array([[1],
       [2],
       [3],
       [4]])

In [39]:
# B is still 1-D here: the earlier un-assigned reshape did not modify it.
B

array([1, 2, 3, 4])

In [40]:
# Rebind B to its 4x1 (column) view so the new shape persists in later cells.
B = B.reshape((4, 1))

In [45]:
# After the reassignment B displays as a 4x1 column, one element per row.
B

array([[1],
       [2],
       [3],
       [4]])

In [42]:
# (4, 1) — B now has two axes.
B.shape

(4, 1)

In [43]:
# With two axes the transpose does change the layout: the 4x1 column becomes a 1x4 row.
B.T


array([[1, 2, 3, 4]])

In [44]:
# IPython line magic: render matplotlib figures inline in the notebook output.
# NOTE(review): no plotting code is visible in this notebook — confirm this is still needed.
%matplotlib inline