In [154]:
import numpy as np
import pandas as pd


In [191]:
# Importing the dataset
titanic = pd.read_csv('/content/titanic_dataset .csv')

# Understanding the data:

In [192]:
titanic.head(4)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S


In [193]:
#Size of Dataframe
titanic.shape

(891, 12)

In [194]:
# Data types of columns
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [195]:
#statistics of the numerical and categorical columns
titanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [196]:
# check for null values
titanic.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [197]:
#Replacing missing values in 'Age'
median_age=titanic['Age'].median()
titanic['Age']= titanic['Age'].fillna(median_age)
#checking null values for 'Age'
titanic.isnull().sum()


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [198]:
#Replacing missing values in 'Cabin'
titanic['Cabin']=titanic['Cabin'].fillna(titanic['Cabin'].mode()[0])
#checking null values for 'Cabin'
titanic.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       2
dtype: int64

In [199]:
#Replacing missing values in 'Embarked'
titanic['Embarked']=titanic['Embarked'].fillna(titanic['Embarked'].mode()[0])
#checking null values for 'Embarked'
titanic.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [200]:
#check for duplicate values
titanic[titanic.duplicated(keep='first')]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked


There are no duplicate values in the dataset

In [201]:
#replace 'Sex' with numeric values
titanic['Sex'] = titanic.Sex.replace({'male':0,'female':1})


# Dividing data into features and labels

In [202]:
feature_columns =['Pclass', 'Sex', 'Fare']
X = titanic[feature_columns].values
y = titanic['Survived'].values


# Spliting dataset into training set and test set
KNN Classifier Model
To start with, we need to split the dataset into two sets:
a training set and a test set.
We will use the training set to train the model where the model will memorize both the input features and the output variable.
Then we will use the test set to see that if the model can predict if the passenger survived using the ‘P-class’, ‘Sex’, and, ‘Fare’.
The method ‘train_test_split’ is going to help to split the data.

In [203]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 1. K-Nearest Neighbors

## Fit the model

In [204]:
# Fitting clasifier to the Training set
from sklearn.neighbors import KNeighborsClassifier

# Instantiate learning model (k =5)
classifier = KNeighborsClassifier(n_neighbors=5)

# Fitting the model
classifier.fit(X_train, y_train)

Here, n_neighbors is 5.

That means when we will ask our trained model to predict the survival chance of a new instance, it will take 5 closest training data.
Based on the labels of those 5 training data, the model will predict the label of the new instance.

# Testing the KNN algorithm on the Testing Data

In [205]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Evaluate the model

In [206]:
#Let’s see how much accuracy it can give us on training data
print('Accuracy of our model is equal ' + str(round(classifier.score(X_train, y_train)*100,2))+ ' %.')


Accuracy of our model is equal 83.31 %.


The training data accuracy I got is  83%.

Now check, how much accurately it can predict the label of the test dataset.

In [207]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)*100
print('Accuracy of our model is equal ' + str(round(accuracy, 2)) + ' %.')

Accuracy of our model is equal 77.61 %.


### The training set accuracy is a bit higher than the test set accuracy. That’s overfitting.

# SVM models

In [208]:
 # Importing C-Support Vector Classification from scikit-learn
from sklearn.svm import SVC

# Declaring the SVC with no tunning
svcclassifier = SVC()

# Fitting the data. This is where the SVM will learn
svcclassifier.fit(X_train, y_train)


# Evaluate the model

In [210]:
#Let’s see how much accuracy it can give us on training data
print('Accuracy of our model is equal ' + str(round(svcclassifier.score(X_train, y_train)*100,2))+ ' %.')

Accuracy of our model is equal 66.77 %.


### The training data accuracy I got is 67%.

Now check, how much accurately it can predict the label of the test dataset.

In [211]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)*100

print('Accuracy of our model is equal ' + str(round(accuracy, 2)) + ' %.')

Accuracy of our model is equal 77.61 %.


## The testing set accuracy is a bit higher than the training set accuracy.

# K-fold Cross Validation

In [212]:
from sklearn.model_selection import KFold
kfold_validator=KFold(10)
for train, test in kfold_validator.split(X,y):
  print('Training Index:', train)
  print('Testing Index:', test)

Training Index: [ 90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251
 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269
 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287
 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305
 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323
 324 325 326 327 328 329 330 331 33

## To find the average accuracy score of KNN Classifier using K-fold Cross Validation

In [213]:
from sklearn.model_selection import cross_val_score
cv_result= cross_val_score(classifier,X,y,cv=kfold_validator)

In [214]:
cv_result

array([0.73333333, 0.78651685, 0.71910112, 0.74157303, 0.86516854,
       0.80898876, 0.76404494, 0.83146067, 0.78651685, 0.80898876])

In [215]:
#find the average accuracy
np.mean(cv_result)

0.7845692883895131

## To find the average accuracy score of SVM models using K-fold Cross Validation

In [216]:
cv_result_svm= cross_val_score(svcclassifier,X,y,cv=kfold_validator)

In [217]:
cv_result_svm

array([0.58888889, 0.68539326, 0.66292135, 0.66292135, 0.64044944,
       0.69662921, 0.65168539, 0.68539326, 0.73033708, 0.68539326])

In [218]:
#find the average accuracy
np.mean(cv_result_svm)

0.6690012484394507

# Stratified k-fold cross validation

In [219]:
from sklearn.model_selection import StratifiedKFold
skfold_validator= StratifiedKFold(n_splits=10)

In [220]:
for train, test in skfold_validator.split(X,y):
  print('Training Index:', train)
  print('Testing Index:', test)

Training Index: [ 82  84  85  88  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251
 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269
 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287
 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305
 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323
 324 325 326 327 328 329 330 331 33

## To find the average accuracy score of KNN Classifier using Stratified k-fold cross validation

In [221]:
cv_result= cross_val_score(classifier,X,y,cv=skfold_validator)

In [222]:
cv_result

array([0.7       , 0.7752809 , 0.68539326, 0.74157303, 0.88764045,
       0.79775281, 0.71910112, 0.80898876, 0.76404494, 0.80898876])

In [223]:
#find the average accuracy
np.mean(cv_result)

0.7688764044943821

## To find the average accuracy score of SVM models using Stratified k-fold cross validation

In [224]:
cv_result_svm= cross_val_score(svcclassifier,X,y,cv=skfold_validator)

In [225]:
cv_result_svm

array([0.61111111, 0.59550562, 0.68539326, 0.74157303, 0.66292135,
       0.68539326, 0.68539326, 0.68539326, 0.69662921, 0.66292135])

In [226]:
#find the average accuracy
np.mean(cv_result_svm)

0.6712234706616729