# Import Libraries

In [1]:
import math
from statistics import mode

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Data Dictionary

- survived	      Survival	0 = No, 1 = Yes
- pclass	      Ticket class	1 = 1st, 2 = 2nd, 3 = 3rd
    - pclass: A proxy for socio-economic status (SES)
       - 1st = Upper
       - 2nd = Middle
       - 3rd = Lower
- sex	          Sex	
- age	          Age in years
- sibsp	          # of siblings / spouses aboard the Titanic
    - sibsp: The dataset defines family relations in this way...
        - Sibling = brother, sister, stepbrother, stepsister
        - Spouse = husband, wife (mistresses and fiancés were ignored)

- parch	          # of parents / children aboard the Titanic	
    - parch: The dataset defines family relations in this way...
        - Parent = mother, father
        - Child = daughter, son, stepdaughter, stepson
        - Some children travelled only with a nanny, therefore parch=0 for them.
        
- ticket	      Ticket number	
- fare	          Passenger fare	
- cabin	          Cabin number	
- embarked	      Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton

## Import Data Files

In [2]:
# Only the complete passenger list is loaded. The Kaggle train/test files
# ('../DataSets/Kaggle/train.csv', '../DataSets/Kaggle/test.csv') are not read
# by this notebook at the moment.
titanic_complete = pd.read_csv(
    filepath_or_buffer='../DataSets/TensorFlow/Titanic.csv'
)

In [3]:
# Preview the first five rows of the freshly loaded frame
titanic_complete.head(n=5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,?,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,?,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,?,135,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"


In [4]:
# (rows, columns) of the frame as loaded, before any cleaning
titanic_complete.shape

(1309, 14)

# Data Cleaning

In [5]:
# Drop the columns that are not used in the rest of the analysis.
# `columns=` is spelled out because passing the axis positionally
# (`drop(labels, 1)`) was deprecated in pandas 1.3 and raises a TypeError
# from pandas 2.0 onwards.
titanic_complete = titanic_complete.drop(
    columns=['name', 'cabin', 'body', 'boat', 'home.dest']
)
titanic_complete

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,ticket,fare,embarked
0,1,1,female,29,0,0,24160,211.3375,S
1,1,1,male,0.9167,1,2,113781,151.55,S
2,1,0,female,2,1,2,113781,151.55,S
3,1,0,male,30,1,2,113781,151.55,S
4,1,0,female,25,1,2,113781,151.55,S
...,...,...,...,...,...,...,...,...,...
1304,3,0,female,14.5,1,0,2665,14.4542,C
1305,3,0,female,?,1,0,2665,14.4542,C
1306,3,0,male,26.5,0,0,2656,7.225,C
1307,3,0,male,27,0,0,2670,7.225,C


### Finding Missing Values ('?') and Fixing Them

In [6]:
# The file marks missing entries with '?'; turn them into NaN so pandas
# recognises them as missing values.
titanic_complete = titanic_complete.replace(to_replace='?', value=np.nan)


In [7]:
# Inspect the column dtypes after the '?' -> NaN replacement; age and fare
# still show up as `object` here and are cast to float in the next cell.
titanic_complete.dtypes

pclass       int64
survived     int64
sex         object
age         object
sibsp        int64
parch        int64
ticket      object
fare        object
embarked    object
dtype: object

In [9]:
# Cast the age and fare columns from object to float in a single astype call,
# then display the resulting dtypes to confirm the conversion.
titanic_complete = titanic_complete.astype({'age': float, 'fare': float})
titanic_complete.dtypes

pclass        int64
survived      int64
sex          object
age         float64
sibsp         int64
parch         int64
ticket       object
fare        float64
embarked     object
dtype: object

In [10]:
# Count the missing values per column, largest count first
missing_data = (
    titanic_complete
    .isna()
    .sum()
    .sort_values(ascending=False)
)
missing_data

age         263
embarked      2
fare          1
pclass        0
survived      0
sex           0
sibsp         0
parch         0
ticket        0
dtype: int64

### Age

In [11]:
# Fill the missing ages with the mean age, rounded UP to a whole year.
# Series.mean() skips NaN by default, so no explicit dropna() is needed.
# `math` is imported in the notebook's top import cell.
mean = titanic_complete['age'].mean()
print(mean)
# math.ceil makes the fill value a whole number of years (the next integer
# above the printed mean) instead of the fractional mean itself.
titanic_complete['age'] = titanic_complete['age'].fillna(math.ceil(mean))


29.8811345124283


### Embarked

In [13]:
# Fill missing ports of embarkation with the most frequent port (the mode).
# The mode is computed over the non-missing values only: if NaN were counted
# and happened to be the most common entry, it would be chosen as the fill
# value and fillna would silently change nothing.
# `mode` (statistics.mode) is imported in the notebook's top import cell.
titanic_complete['embarked'] = titanic_complete['embarked'].fillna(
    mode(titanic_complete['embarked'].dropna())
)

### Fare

In [14]:
# Fill the missing fare with the most frequent fare (the mode).
# The mode is computed over the non-missing values only, so a NaN entry can
# never be selected as the fill value.
# `mode` (statistics.mode) is imported in the notebook's top import cell.
titanic_complete['fare'] = titanic_complete['fare'].fillna(
    mode(titanic_complete['fare'].dropna())
)


In [17]:
# Re-count the missing values per column after imputation; every count
# should now be zero.
missing_data = (
    titanic_complete
    .isna()
    .sum()
    .sort_values(ascending=False)
)
missing_data

pclass      0
survived    0
sex         0
age         0
sibsp       0
parch       0
ticket      0
fare        0
embarked    0
dtype: int64

## Converting Categorical Data

In [None]:
# Use Pandas get_dummies to convert categorical data
# NOTE(review): this step is currently disabled, so the object-typed columns
# (sex, ticket, embarked) reach the modelling cells unencoded — confirm that
# this is intended. Calling get_dummies on the whole frame would also encode
# `ticket`; presumably only sex/embarked should be encoded — TODO confirm.

#titanic_complete = pd.get_dummies(titanic_complete)
#titanic_complete.head()


## Logistic Regression
Logistic Regression is a statistical method for predicting binary outcomes from data.

Examples of this are "yes" vs "no" or "young" vs "old". 

Each category is encoded as 0 or 1, and the model estimates the probability that an observation belongs to class 1.

In [None]:
# Split the frame into the target vector y (the `survived` column) and the
# feature matrix X (every remaining column), then report both shapes.
y = titanic_complete['survived']
X = titanic_complete.drop(columns=['survived'])
print(X.shape, y.shape)

### Split Data into Training and Testing

In [None]:
# NOTE(review): the train/test split below is disabled, so this cell does not
# define X_train, X_test, y_train or y_test. The fixed random_state=1 in the
# disabled call would pin the shuffle seed once it is re-enabled.
#from sklearn.model_selection import train_test_split

#X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)