# Import Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Data Dictionary

- survival        0 = No, 1 = Yes
- pclass	      Ticket class	1 = 1st, 2 = 2nd, 3 = 3rd
    - pclass: A proxy for socio-economic status (SES)
       - 1st = Upper
       - 2nd = Middle
       - 3rd = Lower
- sex	          Sex	
- Age	          Age in years	
- sibsp	          # of siblings / spouses aboard the Titanic
    -sibsp: The dataset defines family relations in this way...
        - Sibling = brother, sister, stepbrother, stepsister
        - Spouse = husband, wife (mistresses and fiancés were ignored)

- parch	          # of parents / children aboard the Titanic	
    - parch: The dataset defines family relations in this way...
        - Parent = mother, father
        - Child = daughter, son, stepdaughter, stepson
        - Some children travelled only with a nanny, therefore parch=0 for them.
        
- ticket	      Ticket number	
- fare	          Passenger fare	
- cabin	          Cabin number	
- embarked	      Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton

## Import DataFiles

In [4]:
#kaggle_train = pd.read_csv('../DataSets/Kaggle/train.csv')
#kaggle_test = pd.read_csv('../DataSets/Kaggle/test.csv')
titanic_complete = pd.read_csv('../DataSets/TensorFlow/Titanic.csv')

In [5]:
titanic_complete.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,?,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,?,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,?,135,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"


In [6]:
titanic_complete.shape

(1309, 14)

# Data Cleaning

In [7]:
# Dropping unwanted columns
titanic_complete=titanic_complete.drop(['name','ticket','cabin','body','boat','home.dest','embarked'],1)
titanic_complete

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,1,1,female,29,0,0,211.3375
1,1,1,male,0.9167,1,2,151.55
2,1,0,female,2,1,2,151.55
3,1,0,male,30,1,2,151.55
4,1,0,female,25,1,2,151.55
...,...,...,...,...,...,...,...
1304,3,0,female,14.5,1,0,14.4542
1305,3,0,female,?,1,0,14.4542
1306,3,0,male,26.5,0,0,7.225
1307,3,0,male,27,0,0,7.225


### Finding Missing Values('?') and Fixing them

In [8]:
titanic_complete = titanic_complete.replace('?',np.nan)


In [9]:
titanic_complete.dtypes

pclass       int64
survived     int64
sex         object
age         object
sibsp        int64
parch        int64
fare        object
dtype: object

In [10]:
#Converting age and fare into float
titanic_complete['age'] = titanic_complete['age'].astype(float)
titanic_complete['fare'] = titanic_complete['fare'].astype(float)
titanic_complete.dtypes

pclass        int64
survived      int64
sex          object
age         float64
sibsp         int64
parch         int64
fare        float64
dtype: object

In [11]:
#View Missing Values
missing_data = titanic_complete.isnull().sum().sort_values(ascending = False)
missing_data

age         263
fare          1
parch         0
sibsp         0
sex           0
survived      0
pclass        0
dtype: int64

#### AGE

In [12]:
# Fill the missing values with the average of all Ages - MEAN
import math
mean = titanic_complete['age'].dropna().mean()
print(mean)
titanic_complete['age'] = titanic_complete['age'].fillna(math.ceil(mean))


29.8811345124283


### Embarked

# fill nan with Unknown - MODE
from statistics import mode
titanic_complete['embarked'] = titanic_complete['embarked'].fillna(mode(titanic_complete['embarked']))

### Fare

In [13]:
# fill nan with the most frequent values - MODE
from statistics import mode

titanic_complete['fare'] = titanic_complete['fare'].fillna(mode(titanic_complete['fare']))


In [14]:
#View Missing Values
missing_data = titanic_complete.isnull().sum().sort_values(ascending = False)
missing_data

fare        0
parch       0
sibsp       0
age         0
sex         0
survived    0
pclass      0
dtype: int64

In [15]:
data = titanic_complete.copy()
data_binary_encoded = pd.get_dummies(data, columns=["sex"],drop_first=True)
data_binary_encoded.head()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,sex_male
0,1,1,29.0,0,0,211.3375,0
1,1,1,0.9167,1,2,151.55,1
2,1,0,2.0,1,2,151.55,0
3,1,0,30.0,1,2,151.55,1
4,1,0,25.0,1,2,151.55,0


In [None]:
#new_data = data_binary_encoded.drop(["sex_female"],axis=1)

In [16]:
data_binary_encoded

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,sex_male
0,1,1,29.0000,0,0,211.3375,0
1,1,1,0.9167,1,2,151.5500,1
2,1,0,2.0000,1,2,151.5500,0
3,1,0,30.0000,1,2,151.5500,1
4,1,0,25.0000,1,2,151.5500,0
...,...,...,...,...,...,...,...
1304,3,0,14.5000,1,0,14.4542,0
1305,3,0,30.0000,1,0,14.4542,0
1306,3,0,26.5000,0,0,7.2250,1
1307,3,0,27.0000,0,0,7.2250,1


In [17]:
# Assign X (data) and y (target)
X = data_binary_encoded.drop('survived', axis = 1)
y = data_binary_encoded['survived']
print(X.shape, y.shape)

(1309, 6) (1309,)


In [16]:
X


Unnamed: 0,pclass,age,sibsp,parch,fare,sex_male
0,1,29.0000,0,0,211.3375,0
1,1,0.9167,1,2,151.5500,1
2,1,2.0000,1,2,151.5500,0
3,1,30.0000,1,2,151.5500,1
4,1,25.0000,1,2,151.5500,0
...,...,...,...,...,...,...
1304,3,14.5000,1,0,14.4542,0
1305,3,30.0000,1,0,14.4542,0
1306,3,26.5000,0,0,7.2250,1
1307,3,27.0000,0,0,7.2250,1


In [23]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler 


In [24]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1,test_size=0.3)

In [25]:
print(X_train.shape, y_train.shape)

(916, 6) (916,)


In [26]:
target_names = ["notsurvived", "survived"]

In [27]:
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [28]:
# Support vector machine linear classifier
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_train, y_train)

SVC(kernel='linear')

In [29]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

SVC(gamma='auto', kernel='linear')

In [30]:
# Model Accuracy
print('Test Acc: %.3f' % model.score(X_test, y_test))

Test Acc: 0.789


In [32]:
# Calculate classification report
from sklearn.metrics import classification_report
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

 notsurvived       0.81      0.85      0.83       236
    survived       0.75      0.70      0.73       157

    accuracy                           0.79       393
   macro avg       0.78      0.77      0.78       393
weighted avg       0.79      0.79      0.79       393



In [33]:
import pickle
list_pickle_path = 'titanicData_SVM.pkl'
list_pickle = open(list_pickle_path, 'wb')

pickle.dump(model, list_pickle)
list_pickle.close()