## Mounting Colab on my Google Drive

In [51]:
# from google.colab import drive
# drive.mount("/content/drive")

Changing the directory 

In [52]:
# %cd "code here"

Checking the contents of the current directory

In [53]:
# !ls

# Tutorial 1 The Titanic Disaster

## Project Summary
Predict which passengers survived the Titanic shipwreck.

## Importing Some Basic Libraries

In [54]:
import numpy as np 
import pandas as pd 
from matplotlib import pyplot as plt

## Importing the Dataset

In [55]:
dataset = pd.read_csv('Titanic_Data.csv')

## Showing the Dataset in a Table

In [56]:
dataset.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,Number of Siblings/Spouses Aboard,Number of Parents/Children Aboard,Fare,Embarked,Survived
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,No
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,Yes
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,Yes
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,Yes
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,No


## A Quick Review of the Data

In [57]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   PassengerId                        891 non-null    int64  
 1   Pclass                             891 non-null    int64  
 2   Name                               891 non-null    object 
 3   Sex                                891 non-null    object 
 4   Age                                714 non-null    float64
 5   Number of Siblings/Spouses Aboard  891 non-null    int64  
 6   Number of Parents/Children Aboard  891 non-null    int64  
 7   Fare                               891 non-null    float64
 8   Embarked                           889 non-null    object 
 9   Survived                           891 non-null    object 
dtypes: float64(2), int64(4), object(4)
memory usage: 69.7+ KB


## Encoding Categorical Data

### Encoding the Input Data

**For the gender column**

In [58]:
gender = {"male":0.0, "female":1.0}

For the 'embarked' column

In [59]:
dataset['Sex'] = dataset['Sex'].map(gender)

In [60]:
dataset['Sex'].head()

0    0.0
1    1.0
2    1.0
3    1.0
4    0.0
Name: Sex, dtype: float64

In [61]:
print(dataset['Embarked'].unique())

['S' 'C' 'Q' nan]


In [62]:
ports = {'S':0.0, 'C':1.0, 'Q':2.0}

In [63]:
dataset['Embarked'] = dataset['Embarked'].map(ports)

In [64]:
dataset['Embarked'].head()

0    0.0
1    1.0
2    0.0
3    0.0
4    0.0
Name: Embarked, dtype: float64

### Encoding the Output Data (Labels)

In [65]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
dataset['Survived'] = le.fit_transform(dataset['Survived'])

In [66]:
dataset['Survived'].head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int32

## Dropping Irrelevant Input Data 

In [67]:
dataset.drop(['PassengerId', 'Name'], axis=1,inplace=True)

## Checking the Preprocessed Dataset

In [68]:
dataset.head()

Unnamed: 0,Pclass,Sex,Age,Number of Siblings/Spouses Aboard,Number of Parents/Children Aboard,Fare,Embarked,Survived
0,3,0.0,22.0,1,0,7.25,0.0,0
1,1,1.0,38.0,1,0,71.2833,1.0,1
2,3,1.0,26.0,0,0,7.925,0.0,1
3,1,1.0,35.0,1,0,53.1,0.0,1
4,3,0.0,35.0,0,0,8.05,0.0,0


## Seperate the input and output

In [78]:
X = dataset.iloc[:, 0:7]
y = dataset.iloc[:, -1]

'code here'

## Showing the Input Data in a Table Format

In [71]:
"code here"

'code here'

## A Quick Check of the Output Data

In [79]:
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int32

## Taking Care of Missing Data Inputs

In [80]:
from sklearn.impute import SimpleImputer

In [82]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

In [84]:
X = imputer.fit_transform(X)

## Splitting the Dataset into the Training Set and Test Set

In [85]:
from sklearn.model_selection import train_test_split

In [87]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [88]:
X_train.shape

(712, 7)

In [89]:
X_test.shape

(179, 7)

## Scaling the Features

In [98]:
# Scale the age and fare
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X[:, [2, 5]] = sc.fit_transform(X[:, [2, 5]])
X

array([[ 3.00000000e+00,  0.00000000e+00, -5.92480600e-01, ...,
         0.00000000e+00, -5.02445171e-01,  0.00000000e+00],
       [ 1.00000000e+00,  1.00000000e+00,  6.38789012e-01, ...,
         0.00000000e+00,  7.86845294e-01,  1.00000000e+00],
       [ 3.00000000e+00,  1.00000000e+00, -2.84663197e-01, ...,
         0.00000000e+00, -4.88854258e-01,  0.00000000e+00],
       ...,
       [ 3.00000000e+00,  1.00000000e+00, -2.03353982e-16, ...,
         2.00000000e+00, -1.76263239e-01,  0.00000000e+00],
       [ 1.00000000e+00,  0.00000000e+00, -2.84663197e-01, ...,
         0.00000000e+00, -4.43810379e-02,  1.00000000e+00],
       [ 3.00000000e+00,  0.00000000e+00,  1.77062908e-01, ...,
         0.00000000e+00, -4.92377828e-01,  2.00000000e+00]])

We only scale on the training set

## Training and Testing Predictive Models

In [99]:
from sklearn.metrics import accuracy_score

In [100]:
# Support vector machine
from sklearn.svm import SVC
sv_classfier = SVC(kernel = 'rbf')
sv_classfier.fit(X_train, y_train)
Y_pred = sv_classfier.predict(X_test)
print(accuracy_score(y_test, Y_pred))

0.6536312849162011


about 65%

In [101]:
from sklearn.linear_model import LogisticRegression
# Linear regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
Y_pred = lr.predict(X_test)
print(accuracy_score(y_test, Y_pred))

0.7988826815642458


about 80%