# Titanic - Machine Learning from Disaster

##### Python 3.8, TensorFlow 2.7 environment

## Import libraries

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
sns.set()

## Load data

In [None]:
train_data = pd.read_csv('train.csv')
train_data

## Copy Dataframe

In [None]:
train_df = train_data.copy()

## Explore data

In [None]:
train_df.describe()

In [None]:
# Count missing values for train data
train_df.isna().sum()

In [None]:
# Plot a heat map of the pairwise correlations between the numeric features and the target variable (Survived).
# The higher the absolute value of a coefficient, the stronger the correlation.
# The figure size is configured BEFORE plotting: rc settings only apply to figures created afterwards,
# so setting it after the heatmap call would leave this plot at the previous default size.
sns.set(rc={'figure.figsize':(7,5)})
heatmap = sns.heatmap(train_df[['Survived', 'SibSp', 'Parch', 'Age', 'Fare', 'Pclass']].corr(), annot = True)

In [None]:
# Fare distribution, stacked by survival outcome
fare_survived = train_df.loc[train_df['Survived'] == 1, 'Fare']
fare_died = train_df.loc[train_df['Survived'] == 0, 'Fare']

plt.figure(figsize=(25, 7))
plt.hist(
    [fare_survived, fare_died],
    bins=30,
    stacked=True,
    label=['Survived', 'Died'],
    color=['dodgerblue', 'navy'],
)
plt.xlabel('Fare')
plt.ylabel('Number of passenger')
plt.legend()

In [None]:
# Family size (parents/children + siblings/spouses aboard), stacked by survival outcome
family_members = train_df['Parch'] + train_df['SibSp']
family_survived = family_members[train_df['Survived'] == 1]
family_died = family_members[train_df['Survived'] == 0]

plt.figure(figsize=(25, 7))
plt.hist(
    [family_survived, family_died],
    bins=30,
    stacked=True,
    label=['Survived', 'Died'],
    color=['steelblue', 'navy'],
)
plt.xlabel('Family')
plt.ylabel('Number of passenger')
plt.legend()

###### It seems that passengers with 1 to 3 family members had a better chance of surviving

In [None]:
# Age distribution, stacked by survival outcome
age_survived = train_df.loc[train_df['Survived'] == 1, 'Age']
age_died = train_df.loc[train_df['Survived'] == 0, 'Age']

plt.figure(figsize=(25, 7))
plt.hist(
    [age_survived, age_died],
    bins=30,
    stacked=True,
    label=['Survived', 'Died'],
    color=['darkturquoise', 'navy'],
)
plt.xlabel('Age')
plt.ylabel('Number of passenger')
plt.legend()

###### It seems that only children younger than about 8 had a noticeably higher survival rate

In [None]:
# Passenger class, stacked by survival outcome
class_survived = train_df.loc[train_df['Survived'] == 1, 'Pclass']
class_died = train_df.loc[train_df['Survived'] == 0, 'Pclass']

plt.figure(figsize=(15, 5))
plt.hist(
    [class_survived, class_died],
    bins=30,
    stacked=True,
    label=['Survived', 'Died'],
    color=['darkcyan', 'navy'],
)
plt.xlabel('Class')
plt.ylabel('Number of passenger')
plt.legend()

In [None]:
# Correlation between Gender and Surviving
plt.figure(figsize=(7, 5))
plt.hist([train_df[train_df['Survived']==1]['Sex'], train_df[train_df['Survived']==0]['Sex']], 
         stacked=True, color=['aqua','navy'],
         bins=3, label=['Survived', 'Died']
        )
plt.xlabel('Sex')
# The stacked bars count every passenger (survivors and casualties), not only the
# survivors, so the axis uses the same label as the other histograms.
plt.ylabel('Number of passenger')
plt.legend()

###### It seems that women had a better chance of surviving

In [None]:
# Correlation between Embarked and Surviving
# Missing ports are converted to the string 'nan' so they show up as their own category.
embarked_labels = train_df['Embarked'].map(str)

# Use one bin per distinct category. A fixed bins=3 merges two categories into a single
# bar whenever the column holds more than three distinct values (e.g. three ports plus
# 'nan' for missing entries), because the last histogram bin is closed on both sides.
n_embarked_categories = embarked_labels.nunique()

plt.figure(figsize=(7, 5))
plt.hist([embarked_labels[train_df['Survived']==1],
          embarked_labels[train_df['Survived']==0]],
          stacked=True, color=['teal','navy'],
          bins=n_embarked_categories, label=['Survived', 'Died']
         )
plt.xlabel('Embarked')
# The stacked bars count every passenger, not only the survivors.
plt.ylabel('Number of passenger')
plt.legend()

## Data processing

### Missing age values

In [None]:
# Fill missing age values with random between mean-std and mean+std
age_mean = train_df["Age"].mean()
age_std = train_df["Age"].std()

# Remember age_mean and age_std to preprocess_data dictionary
preprocess_data = {'age_mean':age_mean, 'age_std':age_std}

def fill_missing_age(df, mean, std, random_state=None):
    """Fill missing ``Age`` values with random integers around ``mean``.

    Every missing age is replaced by an integer drawn uniformly from
    ``[int(mean - std), int(mean + std))``. The ``Age`` column is then cast
    to ``int`` and written back into ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``Age`` column. It is modified in place.
    mean, std : float
        Centre and half-width of the sampling range.
    random_state : int, optional
        Seed for a private random generator so the imputation can be
        reproduced. When omitted, NumPy's global random state is used.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``Age`` filled and cast to ``int``.

    Raises
    ------
    ValueError
        If values are missing and ``mean`` or ``std`` is not a finite number.
    """
    ages = df["Age"].copy()
    missing = ages.isna()
    n_missing = int(missing.sum())

    # Only touch the random generator (and validate the bounds) when needed,
    # so a complete column never fails because of unusable mean/std values.
    if n_missing:
        if not (np.isfinite(mean) and np.isfinite(std)):
            raise ValueError("mean and std must be finite to fill missing ages")
        low = int(mean - std)
        # randint needs low < high; a small std can truncate both bounds to
        # the same integer, in which case every gap is filled with `low`.
        high = max(int(mean + std), low + 1)
        if random_state is None:
            draw = np.random.randint
        else:
            draw = np.random.RandomState(random_state).randint
        ages[missing] = draw(low, high, size=n_missing)

    df["Age"] = ages.astype(int)
    return df

train_df = fill_missing_age(train_df, age_mean, age_std)

### Missing fare values

In [None]:
# Impute missing fares with the mean fare of the training data
fare_mean = train_df["Fare"].mean()

# Keep the mean in the preprocess_data dictionary so new data can be filled with the same value
preprocess_data['fare_mean'] = fare_mean

train_df['Fare'] = train_df['Fare'].fillna(fare_mean)

### Missing embarked values

In [None]:
# Fill missing values with most common
embarked_mode = train_df['Embarked'].mode().iloc[0]

# Remember embarked mode to preprocess_data dictionary
preprocess_data['embarked_mode'] = embarked_mode

train_df['Embarked'] = train_df['Embarked'].fillna(embarked_mode)

In [None]:
train_df.isna().sum()

## Feature engineering

### Cabin

In [None]:
# Turn Cabin number into Deck 
decks = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', np.nan]

# Remember decks to preprocess_data dictionary
preprocess_data['decks'] = decks

def substring_in(inp, sub):
    """Return the first candidate in ``sub`` that matches ``inp``.

    ``inp`` is compared through its string form. A string candidate matches
    when it occurs anywhere inside that string; a non-string candidate (for
    example ``np.nan``) matches only when its string form equals it exactly.
    Non-string candidates are never passed to a substring search, which would
    raise ``TypeError``.

    Parameters
    ----------
    inp : object
        Value to classify, e.g. a cabin number or NaN.
    sub : iterable
        Candidates, tested in order.

    Returns
    -------
    object or None
        The matching element of ``sub`` (the element itself, not a copy),
        or ``None`` when nothing matches.
    """
    text = str(inp)
    for candidate in sub:
        if isinstance(candidate, str):
            # Substring containment also covers exact equality.
            if candidate in text:
                return candidate
        elif text == str(candidate):
            return candidate
    return None

train_df['Deck'] = train_df['Cabin'].map(lambda x: substring_in(x, decks))

### Relatives

In [None]:
# Combine SibSp, Parch (siblings/spouse/parents/children) into new feature
train_df['Relatives'] = train_df['SibSp']+train_df['Parch']

In [None]:
# Extend field 'Relatives' into 3 categories: Singleton, SmallFamily, LargerFamily
def family_size(relatives):
    """Map a relatives count to a family-size category.

    Returns 'Singleton' for 0, 'SmallFamily' for values greater than 0 and
    at most 3, and 'LargerFamily' for everything else.
    """
    if relatives == 0:
        return 'Singleton'
    return 'SmallFamily' if 0 < relatives <= 3 else 'LargerFamily'
    
train_df['Relatives'] = train_df['Relatives'].map(lambda x: family_size(x))
train_df.head()

### Title

In [None]:
# Extract from Name field title (it might be that some education or profession correlate with surviving)
train_df['Title'] = train_df['Name'].map(lambda name: name.split(',')[1].split('.')[0].strip())
train_df['Title'].unique()

### Drop extra features

In [None]:
train_df = train_df.drop(['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin'], axis=1)
train_df.head()

## Scale numerical data

In [None]:
train_df.dtypes

In [None]:
# Change Pclass type as it is a categorical variable
train_df['Pclass'] = train_df['Pclass'].astype(str)

In [None]:
from sklearn.preprocessing import StandardScaler

# Copy the DataFrame before scaling so train_df keeps the unscaled values
train_scaled = train_df.copy()

# Determine features to scale: every numeric column except the target.
# Selecting by the generic 'number' dtype instead of ['float64', 'int32'] matters because
# astype(int) yields int32 on some platforms and int64 on others; with the explicit dtype
# list the Age column would silently stay unscaled wherever it ends up as int64.
# The target is excluded by name: it must not be scaled and is absent from new data.
target_column = 'Survived'
numeric_columns = train_scaled.select_dtypes(include='number').columns
features_to_scale = [column for column in numeric_columns if column != target_column]

# Fit scaler with train data
scaler = StandardScaler()
scaler.fit(train_scaled[features_to_scale])

# Remember features_to_scale and scaler to scale_data dictionary
scale_data = {'features_to_scale':features_to_scale, 'scaler':scaler}

# Transform numerical values
train_scaled[features_to_scale] = scaler.transform(train_scaled[features_to_scale])
train_scaled.head()

## Transform categorical variables into dummies

In [None]:
# Determine other features to encode with dummies
features_to_encode = list(train_scaled.select_dtypes(include=['object']).columns)
features_to_encode

In [None]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the categorical features with a fitted OneHotEncoder (rather than ad-hoc dummies)
# so that new data can later be encoded into exactly the same set of columns.
encoder = OneHotEncoder(drop='first', dtype='int64', handle_unknown = 'ignore')

# Keep the feature list and the encoder object together; the encoder is fitted just below,
# and the dictionary holds a reference to that same object.
encode_data = dict(features_to_encode=features_to_encode, encoder=encoder)

# Fit on the training data and get the sparse dummy matrix
sparse_dummies = encoder.fit_transform(train_scaled[features_to_encode])

# Build a DataFrame whose columns carry the generated dummy names
encoded_features = encoder.get_feature_names_out(features_to_encode)
encoded_df = pd.DataFrame(sparse_dummies.todense(), columns=encoded_features)
encoded_df

In [None]:
# Concatenate scaled DataFrame and encoded DataFrame, drop already encoded features
train_encoded = pd.concat([train_scaled.copy(), encoded_df], axis=1)
train_encoded = train_encoded.drop(features_to_encode, axis=1)
train_encoded.head()

## Save data used for preprocess, scale and encode features into files

In [None]:
preprocess_data, scale_data, encode_data

In [None]:
import pickle

# Persist each helper dictionary to its own pickle file, in the order
# preprocess -> scale -> encode.
artifacts = {
    'preprocess_data.pkl': preprocess_data,
    'scale_data.pkl': scale_data,
    'encode_data.pkl': encode_data,
}

for filename, payload in artifacts.items():
    with open(filename, 'wb') as f:
        pickle.dump(payload, f)

## Balance training data

## Divide training data into train, validation and test sets

## Build prediction models

## Load and preprocess new data

In [None]:
# Load new data
test_data = pd.read_csv('test.csv')
test_data

In [None]:
# Copy data and explore
test_df = test_data.copy()
test_df.describe()

### Preprocess new data

In [None]:
def new_data_preprocess(df):
    """Clean a new DataFrame and derive its features from the saved training statistics.

    Reads 'preprocess_data.pkl' and then, in order: fills missing Age, Fare
    and Embarked values, derives Deck from Cabin, turns SibSp + Parch into the
    categorical Relatives feature, extracts Title from Name, and drops the
    columns that are no longer needed.

    The filled values and new columns are written into the frame that was
    passed in; the returned frame is the result of the final ``drop``.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only load files written by this notebook.
    with open('preprocess_data.pkl', 'rb') as f:
        preprocess_data = pickle.load(f)

    # Missing values: random ages around the training mean, training mean fare, training mode port
    age_mean, age_std = preprocess_data['age_mean'], preprocess_data['age_std']
    df = fill_missing_age(df, age_mean, age_std)
    df['Fare'] = df['Fare'].fillna(preprocess_data['fare_mean'])
    df['Embarked'] = df['Embarked'].fillna(preprocess_data['embarked_mode'])

    # Cabin number -> Deck, using the deck list saved during training
    known_decks = preprocess_data['decks']
    df['Deck'] = df['Cabin'].map(lambda cabin: substring_in(cabin, known_decks))

    # Siblings/spouses + parents/children -> Singleton / SmallFamily / LargerFamily
    df['Relatives'] = (df['SibSp'] + df['Parch']).map(family_size)

    # Title is the text between the comma and the first following period in Name
    df['Title'] = df['Name'].map(lambda name: name.split(',')[1].split('.')[0].strip())

    # Drop the columns that are not used as model features
    return df.drop(['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin'], axis=1)

test_df = new_data_preprocess(test_df)
test_df

### Scale new data numerical values

In [None]:
def new_data_scale(df):
    """Scale the numerical columns of ``df`` with the scaler saved in 'scale_data.pkl'.

    Pclass is first converted to strings because it is treated as a
    categorical variable. ``df`` is modified in place and also returned.
    """
    # Pclass is categorical, so it is stored as text rather than as a number
    df['Pclass'] = df['Pclass'].astype(str)

    # NOTE(review): pickle.load can execute arbitrary code; only load files written by this notebook.
    with open('scale_data.pkl', 'rb') as f:
        scale_data = pickle.load(f)

    # Apply the already-fitted scaler to the columns chosen at training time
    fitted_scaler = scale_data['scaler']
    numeric_columns = scale_data['features_to_scale']
    df[numeric_columns] = fitted_scaler.transform(df[numeric_columns])

    return df

# Copy Dataframes before scaling
test_scaled = test_df.copy()

test_scaled = new_data_scale(test_scaled)
test_scaled

### Transform new data categorical variables into dummies

In [None]:
import warnings
# Silence only UserWarnings raised from scikit-learn modules (presumably the
# OneHotEncoder notice about categories unseen during fit - confirm) instead of
# ignoring every warning for the rest of the session, which would also hide
# deprecation notices and real problems reported by other libraries.
warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')

def new_data_one_hot(df):
    """One-hot encode the categorical columns of ``df`` with the saved encoder.

    Reads the fitted encoder and the list of columns to encode from
    'encode_data.pkl', appends the resulting dummy columns to ``df`` and
    drops the original categorical columns.

    Returns a new DataFrame; the index of ``df`` is preserved.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only load files written by this notebook.
    with open('encode_data.pkl', 'rb') as f:
        encode_data = pickle.load(f)
    
    # Transform features listed in features_to_encode into dummies
    encoder, features_to_encode = encode_data['encoder'], encode_data['features_to_encode']
    encoded_matrix = encoder.transform(df[features_to_encode])

    # Get DataFrame from dummies matrix.
    # It reuses df's index: pd.concat(axis=1) aligns rows by index label, so a fresh
    # RangeIndex would misalign the dummies (and add NaN rows) for any frame whose
    # index is not 0..n-1, e.g. after filtering or sampling.
    encoded_features = encoder.get_feature_names_out(features_to_encode)    
    encoded_df = pd.DataFrame(encoded_matrix.todense(), columns=encoded_features, index=df.index)
    
    # Concatenate scaled DataFrame and encoded DataFrame, drop already encoded features
    df = pd.concat([df, encoded_df], axis=1)
    df = df.drop(features_to_encode, axis=1)    
    return df

# Copy Dataframes before scaling
test_encoded = test_scaled.copy()

test_encoded = new_data_one_hot(test_encoded)
test_encoded

### Check there is no mismatch in features in train and new data sets

In [None]:
# Compare the feature names (and their order) of the train and new data sets.
# The target is excluded by name rather than by position, and plain lists are compared
# so that a different number of columns gives False instead of an error from an
# element-wise comparison of arrays with different lengths.
train_features = [column for column in train_encoded.columns if column != 'Survived']
test_features = list(test_encoded.columns)
equal_features = train_features == test_features
  
print(equal_features)

## Run the best-performing model on new data and save the predictions as submission.csv