# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Data Dictionary

- survival        0 = No, 1 = Yes
- pclass	      Ticket class	1 = 1st, 2 = 2nd, 3 = 3rd
    - pclass: A proxy for socio-economic status (SES)
       - 1st = Upper
       - 2nd = Middle
       - 3rd = Lower
- sex	          Sex	
- Age	          Age in years	
- sibsp	          # of siblings / spouses aboard the Titanic
    -sibsp: The dataset defines family relations in this way...
        - Sibling = brother, sister, stepbrother, stepsister
        - Spouse = husband, wife (mistresses and fiancés were ignored)

- parch	          # of parents / children aboard the Titanic	
    - parch: The dataset defines family relations in this way...
        - Parent = mother, father
        - Child = daughter, son, stepdaughter, stepson
        - Some children travelled only with a nanny, therefore parch=0 for them.
        
- ticket	      Ticket number	
- fare	          Passenger fare	
- cabin	          Cabin number	
- embarked	      Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton

## Import DataFiles

In [2]:
#kaggle_train = pd.read_csv('../DataSets/Kaggle/train.csv')
#kaggle_test = pd.read_csv('../DataSets/Kaggle/test.csv')
titanic_complete = pd.read_csv('../DataSets/TensorFlow/Titanic.csv')

In [3]:
titanic_complete.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,?,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,?,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,?,135,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"


In [4]:
titanic_complete.shape

(1309, 14)

# Data Cleaning

In [5]:
# Dropping unwanted columns
titanic_complete=titanic_complete.drop(['name','ticket','cabin','body','boat','home.dest','embarked'],1)
titanic_complete

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,1,1,female,29,0,0,211.3375
1,1,1,male,0.9167,1,2,151.55
2,1,0,female,2,1,2,151.55
3,1,0,male,30,1,2,151.55
4,1,0,female,25,1,2,151.55
...,...,...,...,...,...,...,...
1304,3,0,female,14.5,1,0,14.4542
1305,3,0,female,?,1,0,14.4542
1306,3,0,male,26.5,0,0,7.225
1307,3,0,male,27,0,0,7.225


### Finding Missing Values('?') and Fixing them

In [6]:
titanic_complete = titanic_complete.replace('?',np.nan)


In [8]:
titanic_complete.dtypes

pclass       int64
survived     int64
sex         object
age         object
sibsp        int64
parch        int64
fare        object
dtype: object

In [7]:
#Converting age and fare into float
titanic_complete['age'] = titanic_complete['age'].astype(float)
titanic_complete['fare'] = titanic_complete['fare'].astype(float)
titanic_complete.dtypes

pclass        int64
survived      int64
sex          object
age         float64
sibsp         int64
parch         int64
fare        float64
dtype: object

In [8]:
#View Missing Values
missing_data = titanic_complete.isnull().sum().sort_values(ascending = False)
missing_data

age         263
fare          1
parch         0
sibsp         0
sex           0
survived      0
pclass        0
dtype: int64

#### AGE

In [9]:
# Fill the missing values with the average of all Ages - MEAN
import math
mean = titanic_complete['age'].dropna().mean()
print(mean)
titanic_complete['age'] = titanic_complete['age'].fillna(math.ceil(mean))


29.8811345124283


### Embarked

# fill nan with Unknown - MODE
from statistics import mode
titanic_complete['embarked'] = titanic_complete['embarked'].fillna(mode(titanic_complete['embarked']))

### Fare

In [10]:
# fill nan with the most frequent values - MODE
from statistics import mode

titanic_complete['fare'] = titanic_complete['fare'].fillna(mode(titanic_complete['fare']))


In [11]:
#View Missing Values
missing_data = titanic_complete.isnull().sum().sort_values(ascending = False)
missing_data

fare        0
parch       0
sibsp       0
age         0
sex         0
survived    0
pclass      0
dtype: int64

In [29]:
data = titanic_complete.copy()
data_binary_encoded = pd.get_dummies(data, columns=["sex"],drop_first=True)
data_binary_encoded.head()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,sex_male
0,1,1,29.0,0,0,211.3375,0
1,1,1,0.9167,1,2,151.55,1
2,1,0,2.0,1,2,151.55,0
3,1,0,30.0,1,2,151.55,1
4,1,0,25.0,1,2,151.55,0


In [13]:
#new_data = data_binary_encoded.drop(["sex_female"],axis=1)

In [30]:
data_binary_encoded

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,sex_male
0,1,1,29.0000,0,0,211.3375,0
1,1,1,0.9167,1,2,151.5500,1
2,1,0,2.0000,1,2,151.5500,0
3,1,0,30.0000,1,2,151.5500,1
4,1,0,25.0000,1,2,151.5500,0
...,...,...,...,...,...,...,...
1304,3,0,14.5000,1,0,14.4542,0
1305,3,0,30.0000,1,0,14.4542,0
1306,3,0,26.5000,0,0,7.2250,1
1307,3,0,27.0000,0,0,7.2250,1


In [31]:
# Assign X (data) and y (target)
X = data_binary_encoded.drop('survived', axis = 1)
y = data_binary_encoded['survived']
print(X.shape, y.shape)

(1309, 6) (1309,)


In [32]:
X


Unnamed: 0,pclass,age,sibsp,parch,fare,sex_male
0,1,29.0000,0,0,211.3375,0
1,1,0.9167,1,2,151.5500,1
2,1,2.0000,1,2,151.5500,0
3,1,30.0000,1,2,151.5500,1
4,1,25.0000,1,2,151.5500,0
...,...,...,...,...,...,...
1304,3,14.5000,1,0,14.4542,0
1305,3,30.0000,1,0,14.4542,0
1306,3,26.5000,0,0,7.2250,1
1307,3,27.0000,0,0,7.2250,1


In [33]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler 


In [37]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1)

In [38]:
print(X_train.shape, y_train.shape)

(981, 6) (981,)


In [39]:
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [40]:
from sklearn.linear_model import LogisticRegression
model_LogisticRegression = LogisticRegression()
model_LogisticRegression 

LogisticRegression()

In [41]:

model_LogisticRegression.fit(X_train_scaled , y_train)

LogisticRegression()

In [42]:
print(f"Training Data Score: {model_LogisticRegression.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model_LogisticRegression.score(X_test_scaled, y_test)}")

Training Data Score: 0.7747196738022426
Testing Data Score: 0.8140243902439024


In [43]:
predictions = model_LogisticRegression.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
201,0,0
115,1,0
255,0,1
1103,1,0
195,1,1
...,...,...
34,0,0
85,0,1
984,0,1
156,0,0


## Converting Categorical Data

In [None]:
# tune up the model ....base line model

In [None]:
# Use Pandas get_dummies to convert categorical data

#titanic_complete = pd.get_dummies(titanic_complete)
#titanic_complete.head()


In [None]:
# Remove Extra Categorical Columns

In [48]:
# impo
list_pickle_path = 'titanicData.pkl'
list_pickle = open(list_pickle_path, 'wb')

pickle.dump(model_LogisticRegression, list_pickle)
list_pickle.close()

b'\x80\x04\x95\xea\x02\x00\x00\x00\x00\x00\x00\x8c\x1esklearn.linear_model._logistic\x94\x8c\x12LogisticRegression\x94\x93\x94)\x81\x94}\x94(\x8c\x07penalty\x94\x8c\x02l2\x94\x8c\x04dual\x94\x89\x8c\x03tol\x94G?\x1a6\xe2\xeb\x1cC-\x8c\x01C\x94G?\xf0\x00\x00\x00\x00\x00\x00\x8c\rfit_intercept\x94\x88\x8c\x11intercept_scaling\x94K\x01\x8c\x0cclass_weight\x94N\x8c\x0crandom_state\x94N\x8c\x06solver\x94\x8c\x05lbfgs\x94\x8c\x08max_iter\x94Kd\x8c\x0bmulti_class\x94\x8c\x04auto\x94\x8c\x07verbose\x94K\x00\x8c\nwarm_start\x94\x89\x8c\x06n_jobs\x94N\x8c\x08l1_ratio\x94N\x8c\x0en_features_in_\x94K\x06\x8c\x08classes_\x94\x8c\x15numpy.core.multiarray\x94\x8c\x0c_reconstruct\x94\x93\x94\x8c\x05numpy\x94\x8c\x07ndarray\x94\x93\x94K\x00\x85\x94C\x01b\x94\x87\x94R\x94(K\x01K\x02\x85\x94h\x1c\x8c\x05dtype\x94\x93\x94\x8c\x02i8\x94\x89\x88\x87\x94R\x94(K\x03\x8c\x01<\x94NNNJ\xff\xff\xff\xffJ\xff\xff\xff\xffK\x00t\x94b\x89C\x10\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x94t\x94b\x

## Logistic Regression
Logistic Regression is a statistical method for predicting binary outcomes from data.

Examples of this are "yes" vs "no" or "young" vs "old". 

These are categories that translate to probability of being a 0 or a 1.

### Split Data into Training and Testing

In [None]:
#from sklearn.model_selection import train_test_split

#X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)