In [1]:
import pandas as pd 

# Read the Titanic training and test CSV files into DataFrames.
train = pd.read_csv('../data/titanic/train.csv')
test = pd.read_csv('../data/titanic/test.csv')

#### Load preprocessed train and test arrays (optional; requires the `.npy` files written in the "Splitting data" section)

In [2]:
import numpy as np

# Each file is read with two consecutive loads: first the inputs, then the labels.
with open('../data/titanic/train.npy', 'rb') as train_file:
    train_input, train_labels = np.load(train_file), np.load(train_file)
with open('../data/titanic/test.npy', 'rb') as test_file:
    test_input, test_labels = np.load(test_file), np.load(test_file)

In [2]:
# Report the dimensions of both data sets.
train_rows, train_cols = train.shape
test_rows, test_cols = test.shape
print('train has {} rows and {} columns'.format(train_rows, train_cols))
print('test has {} rows and {} columns'.format(test_rows, test_cols))

train has 891 rows and 12 columns
test has 418 rows and 11 columns


#### Analyzing the data

In [3]:
# Summarize the training frame: column dtypes and non-null counts.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
# Summarize the test frame: column dtypes and non-null counts.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [5]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Cleaning the data

#### General cleaning

In [6]:
# Discard the rows that have no "Embarked" value, then drop the "Cabin"
# column altogether because it carries very little information.
train = train.dropna(subset=["Embarked"]).drop("Cabin", axis=1)

# Impute the missing ages with the mean age of the remaining rows.
mean = train["Age"].mean()
train["Age"] = train["Age"].fillna(mean)

train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Name         889 non-null    object 
 4   Sex          889 non-null    object 
 5   Age          889 non-null    float64
 6   SibSp        889 non-null    int64  
 7   Parch        889 non-null    int64  
 8   Ticket       889 non-null    object 
 9   Fare         889 non-null    float64
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 83.3+ KB


#### Managing unique data

In [7]:
# Count the distinct values of the identifier-like columns before reporting them.
unique_ids = train["PassengerId"].nunique()
unique_names = train["Name"].nunique()
unique_tickets = train["Ticket"].nunique()

print('There are {} different (unique) PassengerIds in the data'.format(unique_ids))
print('There are {} different (unique) names in the data'.format(unique_names))
print('There are {} different (unique) ticket numbers in the data'.format(unique_tickets))

There are 889 different (unique) PassengerIds in the data
There are 889 different (unique) names in the data
There are 680 different (unique) ticket numbers in the data


In [8]:
# Remove the identifier-like columns in a single call.
train = train.drop(["PassengerId", "Name", "Ticket"], axis=1)

train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  889 non-null    int64  
 1   Pclass    889 non-null    int64  
 2   Sex       889 non-null    object 
 3   Age       889 non-null    float64
 4   SibSp     889 non-null    int64  
 5   Parch     889 non-null    int64  
 6   Fare      889 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 62.5+ KB


#### Handling text and categorical attributes

In [10]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Replace each text column with integer codes. The one encoder is refitted for
# every column, so after the loop it only holds the classes of the last column.
for col in ['Sex','Embarked']:
    train[col] = le.fit(train[col]).transform(train[col])

train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


#### Feature scaling

In [12]:
# Look at the largest values to see how different the column scales are.
max_age = train["Age"].max()
max_fare = train["Fare"].max()
print('The maximum age is {}'.format(max_age))
print('The maximum fare is {}'.format(max_fare))

The maximum age is 80.0
The maximum fare is 512.3292


In [13]:
# The scaler hands back a NumPy array, so from this cell on `train` is no
# longer a Pandas DataFrame.
# NOTE(review): the scaler is fitted on every row, including the rows that are
# later held out for testing - confirm this is intended.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
train = scaler.fit(train).transform(train)

lowest, highest = train.min(), train.max()
print('The minimum value is {} and the maximum value is {}'.format(lowest, highest))

The minimum value is 0.0 and the maximum value is 1.0


### Splitting data

In [30]:
from sklearn.model_selection import train_test_split

# Fixed seed: without it every run shuffles differently, so the saved .npy
# files and all scores computed from them change from run to run.
RANDOM_STATE = 42

# Column 0 of the scaled array is the label ("Survived"); columns 1-7 are the
# input features.
input_data = train[:,1:8]
labels = train[:,0]

# Hold out 20 % of the rows for testing.
train_input, test_input, train_labels, test_labels = train_test_split(
    input_data, labels, test_size=0.2, random_state=RANDOM_STATE
)

print('we have {} training and {} testing rows'.format(train_input.shape[0],test_input.shape[0]))
print('There are {} input columns'.format(train_input.shape[1]))

we have 711 training and 178 testing rows
There are 7 input columns


In [31]:
# Persist both splits for training and testing; each file receives the inputs
# first and the labels second.
import numpy as np 

for path, arrays in (
    ('../data/titanic/train.npy', (train_input, train_labels)),
    ('../data/titanic/test.npy', (test_input, test_labels)),
):
    with open(path, 'wb') as f:
        for array in arrays:
            np.save(f, array)

### Training

In [3]:
# Simple algorithm - random classifier
import random
# Use a fixed seed instead of `a=None` (which seeds from the system time or OS
# randomness) so the random baseline gives the same predictions on every run.
random.seed(a=42, version=2)

def classify(passenger):
    """Random baseline: ignore `passenger` and return 0 or 1 with equal chance."""
    return random.randrange(2)

def run(f_classify,x):
    """Apply `f_classify` to every item of `x` and return the results as a list."""
    return [f_classify(item) for item in x]

# Classify every training row with the random baseline.
results = run(classify,train_input)
#print(results)

def evaluate(predictions,actual):
    """Summarize how many predictions match the actual labels.

    Predictions and labels are compared pairwise, in order.

    Args:
        predictions: sequence of predicted labels.
        actual: sequence of true labels; its length is reported as the total.

    Returns:
        A sentence with the number of matches, the total, and the accuracy as
        a whole-number percentage. When `actual` is empty the accuracy is
        reported as 0 % instead of raising ZeroDivisionError.
    """
    total = len(actual)
    # Count the matches directly rather than materializing the matching pairs.
    correct = sum(
        1 for predicted, expected in zip(predictions, actual)
        if predicted == expected
    )
    accuracy = 100 * correct / total if total else 0
    return '{} correct predictions out of {}. Accuracy {:.0f} %' \
            .format(correct, total, accuracy)

# Report how the random baseline scores against the training labels.
print(evaluate(results,train_labels))

373 correct predictions out of 711. Accuracy 52 %


In [1]:
import numpy as np

# Each file is read with two consecutive loads: first the inputs, then the labels.
with open('../data/titanic/train.npy', 'rb') as train_file:
    train_input, train_labels = np.load(train_file), np.load(train_file)
with open('../data/titanic/test.npy', 'rb') as test_file:
    test_input, test_labels = np.load(test_file), np.load(test_file)

In [4]:
def predict_death(item):
    """Baseline classifier: ignore `item` and always predict 0 (did not survive)."""
    return 0

# Score the always-0 baseline against the training labels.
death_predictions = run(predict_death, train_input)
print(evaluate(death_predictions, train_labels))

442 correct predictions out of 711. Accuracy 62 %


In [5]:
# Choose which baseline classifier the metric cells evaluate.
#classify_function = classify
classify_function = predict_death


In [6]:
# Confusion matrix (rows: actual class, columns: predicted class)
# True Negatives  | False Positives
# False Negatives | True Positives

from sklearn.metrics import confusion_matrix

# Predict every training row with the selected classifier, then tabulate the
# predictions against the true labels.
predictions = run(classify_function,train_input)
confusion_matrix(train_labels,predictions)


array([[442,   0],
       [269,   0]])

In [14]:
# Precision: is the “accuracy of the positive predictions.” It only looks at the positive predictions. These are predictions that the passenger survived.
# sum(TruePositives)/sum(AllPredictedPositives)
# A classifier that never predicts a positive makes this ratio 0/0;
# `zero_division=0` reports that case as 0 instead of emitting an
# UndefinedMetricWarning.

from sklearn.metrics import precision_score
precision = precision_score(train_labels, predictions, zero_division=0)
print('The precision score of the {} function is {}'.format(classify_function.__name__, precision))

The precision score of the predict_death function is 0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
# Recall: is the “accuracy of the actual positives.” It only looks at the actual positives.
# sum(TruePositives)/sum(AllActualPositives)

from sklearn.metrics import recall_score
recall = recall_score(train_labels, predictions)
function_name = classify_function.__name__
print('The recall score of the {} function is {}'.format(function_name, recall))

The recall score of the predict_death function is 0.0


In [16]:
# Specificity: is the “accuracy of the actual negatives.” It only looks at actual negatives (deaths)
# sum(TrueNegatives)/sum(AllActualNegatives)

# “Negative predictive value” (NPV): is the “accuracy of the negative predictions.”
# sum(TrueNegatives)/sum(AllPredictedNegatives)

def specificity(matrix):
    """Return the share of actual negatives that were predicted negative.

    Args:
        matrix: 2x2 confusion matrix laid out as
            [[true negatives, false positives],
             [false negatives, true positives]].
            Only `matrix[row][col]` indexing is used, so nested lists work as
            well as NumPy arrays.

    Returns:
        TN / (TN + FP), or 0 when there are no actual negatives.
    """
    true_negatives = matrix[0][0]
    false_positives = matrix[0][1]
    actual_negatives = true_negatives + false_positives
    if actual_negatives > 0:
        return true_negatives / actual_negatives
    return 0

def npv(matrix):
    """Return the negative predictive value of a 2x2 confusion matrix.

    This is the top-left entry divided by the sum of the first column
    (true negatives over all predicted negatives), or 0 when that sum is
    not positive.
    """
    true_negatives = matrix[0][0]
    predicted_negatives = true_negatives + matrix[1][0]
    if predicted_negatives > 0:
        return true_negatives / predicted_negatives
    return 0

# Compute both negative-class metrics from one confusion matrix and report them.
cm = confusion_matrix(train_labels, predictions)
function_name = classify_function.__name__

specificity_message = 'The specificity score of the {} function is {:.2f}'
npv_message = 'The npv score of the {} function is {:.2f}'

print(specificity_message.format(function_name, specificity(cm)))
print(npv_message.format(function_name, npv(cm)))




The specificity score of the predict_death function is 1.00
The npv score of the predict_death function is 0.62


In [17]:
# Compute the same four metrics for the random baseline from a fresh set of
# random predictions.
random_predictions = run(classify, train_input)
random_cm = confusion_matrix(train_labels, random_predictions)

random_scores = (
    ('The precision score of the random classifier is {:.2f}', precision_score(train_labels, random_predictions)),
    ('The recall score of the random classifier is {:.2f}', recall_score(train_labels, random_predictions)),
    ('The specificity score of the random classifier is {:.2f}', specificity(random_cm)),
    ('The npv score of the random classifier is {:.2f}', npv(random_cm)),
)
for template, score in random_scores:
    print(template.format(score))

The precision score of the random classifier is 0.37
The recall score of the random classifier is 0.48
The specificity score of the random classifier is 0.52
The npv score of the random classifier is 0.62
