In [7]:
# Import Dependencies
import numpy as np
import pandas as pd
import os

In [8]:
# Read the test CSV into a DataFrame and preview its first rows.
TEST_CSV_PATH = "./titanic_data/test.csv"
test_df = pd.read_csv(TEST_CSV_PATH)
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [9]:
def preprocess(df):
    """Return a copy of ``df`` with ticket, family-size and fare features.

    The input frame is not modified. Compared with ``df`` the result has:

    * ``TicketNum``  - the last space-separated token of ``Ticket``.
    * ``TicketStr``  - all tokens before the last one joined with ``"_"``,
      or ``"NONE"`` when the ticket is a single token.
    * ``FamilySize`` - ``SibSp + Parch``.
    * ``Fare``       - the existing fare rounded to two decimals.
    """

    def _trailing_token(ticket):
        # "A/5. 2151" -> "2151"; a single-token ticket is returned unchanged.
        return ticket.rsplit(" ", 1)[-1]

    def _leading_tokens(ticket):
        # "SC/AH Basle 541" -> "SC/AH_Basle"; "330911" -> "NONE".
        *prefix_parts, _ = ticket.split(" ")
        return "_".join(prefix_parts) if prefix_parts else "NONE"

    tickets = df["Ticket"]
    return df.assign(
        TicketNum=tickets.map(_trailing_token),
        TicketStr=tickets.map(_leading_tokens),
        FamilySize=df["SibSp"] + df["Parch"],
        Fare=df["Fare"].map(lambda fare: round(fare, 2)),
    )


# Function to process titles
def process_titles(df):
    """Add a ``Title`` column extracted from ``Name`` and merge rare titles.

    The title is the text after the first comma of ``Name`` up to the next
    period, stripped of whitespace (``"Kelly, Mr. James"`` -> ``"Mr"``).
    Rare spellings are then folded into a smaller set of groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Name`` column. It is modified in place: the
        ``Title`` column is added (or overwritten).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    # One lookup table for every merge. No target value is also a key, so a
    # single pass gives the same result as successive replacements.
    # NOTE(review): 'Jonkheer' is grouped under 'Lady' as in the original
    # mapping - confirm this grouping is intended.
    title_aliases = {
        'Mme': 'Mlle',
        'Capt': 'Sir', 'Don': 'Sir', 'Major': 'Sir', 'Col': 'Sir',
        'Dona': 'Lady', 'the Countess': 'Lady', 'Jonkheer': 'Lady',
        'Ms': 'Miss',
    }

    titles = df['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())

    # Assign the replaced Series back to the frame. Calling
    # `df['Title'].replace(..., inplace=True)` would act on a temporary
    # Series and leave `df` unchanged when pandas Copy-on-Write is active.
    df['Title'] = titles.replace(title_aliases)

    return df

# Function to preprocess the 'TicketStr' column
def preprocess_ticket_column(df):
    """Normalise spelling variants in ``TicketStr`` and make it categorical.

    Each known variant of a ticket prefix (differences in dots, slashes and
    letter case) is replaced by one canonical, punctuation-free code. Values
    that are not in the table are left as they are. Matching is on the whole
    value, not on substrings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``TicketStr`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``TicketStr`` as a ``category`` column.
    """
    # variant -> canonical code. No canonical code is itself a variant, so a
    # single replacement pass is equivalent to applying the groups in turn.
    ticket_prefix_aliases = {
        # A/5
        "A./5.": "A5", "A.5.": "A5", "A/5.": "A5", "A/5": "A5",
        # A/4
        "A/4": "A4", "A/4.": "A4", "A4.": "A4",
        # A/S
        "A/S": "AS",
        # CA
        "C.A.": "CA", "CA.": "CA",
        # CASOTON
        "C.A./SOTON": "CASOTON",
        # FC / FCC
        "F.C.": "FC", "F.C.C.": "FCC",
        # P/PP
        "P/PP": "PPP",
        # S.C./A.4.
        "S.C./A.4.": "SCA4",
        # S.C./PARIS and SC/PARIS
        "S.C./PARIS": "SCPARIS", "S.C./Paris": "SCPARIS",
        "SC/PARIS": "SCPARIS", "SC/Paris": "SCPARIS",
        # S.O./P.P.
        "S.O./P.P.": "SOPP",
        # S.O.C. and SO/C
        "S.O.C.": "SOC", "SO/C": "SOC",
        # SCO/W
        "SCO/W": "SCOW",
        # SOTON/O2
        "SOTON/O2": "SOTONO2", "SOTON/O2.": "SOTONO2",
        # SOTON/OQ
        "SOTON/OQ": "SOTONOQ", "SOTON/O.Q.": "SOTONOQ",
        # STON/O and STON/O2
        "STON/O": "STONO", "STON/O2.": "STONO2",
        # SW/PP
        "SW/PP": "SWPP",
        # W.E.P. and WE/P
        "W.E.P.": "WEP", "WE/P": "WEP",
        # WC
        "W./C.": "WC", "W/C": "WC",
    }

    # Assign the result back to the frame. Calling
    # `df['TicketStr'].replace(..., inplace=True)` would act on a temporary
    # Series and leave `df` unchanged when pandas Copy-on-Write is active.
    normalised = df['TicketStr'].replace(ticket_prefix_aliases)

    # Turn into categorical type
    df['TicketStr'] = normalised.astype('category')

    return df


# Build the ticket / family-size / fare features, then extract the titles.
preprocessed_test_df = preprocess(test_df)
preprocessed_test_df = process_titles(preprocessed_test_df)

# Name and Ticket have been turned into features above; drop them with Cabin.
columns_to_drop = ['Name', 'Ticket', 'Cabin']
preprocessed_test_df = preprocessed_test_df.drop(columns=columns_to_drop)

# Remove rows whose ticket is the literal 'LINE': it has no numeric part, so
# the integer cast applied to TicketNum in a later cell would fail on it.
# `.copy()` gives an independent frame, so the column assignments that follow
# do not write into a slice of the previous frame.
preprocessed_test_df = preprocessed_test_df[preprocessed_test_df['TicketNum'] != 'LINE'].copy()
preprocessed_test_df = preprocess_ticket_column(preprocessed_test_df)

# Fill missing fares with the mean of the (already rounded) fares.
# The result is assigned back: `df['Fare'].fillna(..., inplace=True)` acts on
# a temporary Series and leaves the frame unchanged under Copy-on-Write.
average_fare_test = preprocessed_test_df['Fare'].mean()
preprocessed_test_df['Fare'] = preprocessed_test_df['Fare'].fillna(average_fare_test)

preprocessed_test_df.head()



Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,TicketNum,TicketStr,FamilySize,Title
0,892,3,male,34.5,0,0,7.83,Q,330911,NONE,0,Mr
1,893,3,female,47.0,1,0,7.0,S,363272,NONE,1,Mrs
2,894,2,male,62.0,0,0,9.69,Q,240276,NONE,0,Mr
3,895,3,male,27.0,0,0,8.66,S,315154,NONE,0,Mr
4,896,3,female,22.0,1,1,12.29,S,3101298,NONE,2,Mrs


In [10]:
# Ticket numbers were extracted as strings; store them as integers.
preprocessed_test_df['TicketNum'] = preprocessed_test_df['TicketNum'].astype(int)

# Undo the categorical dtype set by preprocess_ticket_column.
preprocessed_test_df['TicketStr'] = preprocessed_test_df['TicketStr'].astype(object)

# NOTE(review): despite its name, this is the mean Age of the *test* frame,
# not of the training data. The name is kept so existing references keep
# working - confirm whether the training mean should be used instead.
average_age_train = round(preprocessed_test_df['Age'].mean(), 2)

# Assign the filled Series back: `df['Age'].fillna(..., inplace=True)` acts
# on a temporary Series and leaves the frame unchanged under Copy-on-Write.
preprocessed_test_df['Age'] = preprocessed_test_df['Age'].fillna(average_age_train)

In [11]:
from sklearn.preprocessing import LabelEncoder

# Text columns to convert into integer codes.
columns_to_encode = ['Sex', 'Embarked', 'TicketStr', 'Title']

# NOTE(review): the encoder is fitted on this frame alone, so the integer
# assigned to a label depends on which labels occur here. If the training
# data is encoded separately the codes may not line up - confirm.
label_encoder = LabelEncoder()

for col in columns_to_encode:
    column = preprocessed_test_df[col]
    # Encode text columns only. Checking `dtype == 'object'` alone misses
    # columns stored with pandas' dedicated string dtype, which would then be
    # silently left unencoded. Already-encoded (numeric) columns are skipped,
    # so re-running this cell is harmless.
    if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
        # fit_transform refits the shared encoder for each column in turn.
        preprocessed_test_df[col] = label_encoder.fit_transform(column)

# Print the dtype summary to confirm every column is numeric and non-null.
preprocessed_test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Sex          418 non-null    int32  
 3   Age          418 non-null    float64
 4   SibSp        418 non-null    int64  
 5   Parch        418 non-null    int64  
 6   Fare         418 non-null    float64
 7   Embarked     418 non-null    int32  
 8   TicketNum    418 non-null    int32  
 9   TicketStr    418 non-null    int32  
 10  FamilySize   418 non-null    int64  
 11  Title        418 non-null    int32  
dtypes: float64(2), int32(5), int64(5)
memory usage: 34.3 KB


In [12]:
# Write the processed frame to disk without the index column.
output_csv_path = "./processed_data/preprocessed_test_data.csv"

# to_csv does not create missing directories, so make sure the folder exists.
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
preprocessed_test_df.to_csv(output_csv_path, index=False)