In [93]:
# Import Dependencies
import numpy as np
import pandas as pd
import os

In [94]:
# Load Dataset
train_df = pd.read_csv("./titanic_data/train.csv")

train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [95]:
def preprocess(df):
    df = df.copy()
    
    def ticket_number(x):
        return x.split(" ")[-1]
        
    def ticket_str(x):
        items = x.split(" ")
        if len(items) == 1:
            return "NONE"
        return "_".join(items[0:-1])
    
    def family_size(x, y):
        return x + y + 1
        
    def format_fare(x):
        return round(x, 2)


    df["TicketNum"] = df["Ticket"].apply(ticket_number)
    df["TicketStr"] = df["Ticket"].apply(ticket_str)
    df["FamilySize"] = family_size(df["SibSp"], df["Parch"])
    df["Fare"] = df["Fare"].apply(format_fare)
    
    return df


# Function to process titles
def process_titles(df):
    df['Title'] = df['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
    
    # Combine some titles for better meaning
    df['Title'].replace({'Mme': 'Mlle', 'Mlle': 'Mlle'}, inplace=True)
    
    # Combine titles
    df['Title'].replace({'Capt': 'Sir', 'Don': 'Sir', 'Major': 'Sir', 'Sir': 'Sir', 'Col': 'Sir'}, inplace=True)
    df['Title'].replace({'Dona': 'Lady', 'Lady': 'Lady', 'the Countess': 'Lady', 'Jonkheer': 'Lady'}, inplace=True)
    df['Title'].replace({'Ms': 'Miss'}, inplace=True)
    
    return df

# Function to preprocess the 'TicketStr' column
def preprocess_ticket_column(df):
    # Some additional fixing
    # A/5
    df['TicketStr'].replace({"A./5.": "A5", "A.5.": "A5", "A/5.": "A5", "A/5": "A5"}, inplace=True)
    # A/4
    df['TicketStr'].replace({"A/4": "A4", "A/4.": "A4", "A4.": "A4"}, inplace=True)
    # A/S
    df['TicketStr'].replace({"A/S": "AS"}, inplace=True)
    # CA
    df['TicketStr'].replace({"C.A.": "CA", "CA.": "CA"}, inplace=True)
    # CASOTON
    df['TicketStr'].replace({"C.A./SOTON": "CASOTON"}, inplace=True)

    # FC FCC
    df['TicketStr'].replace({"F.C.": "FC", "F.C.C.": "FCC"}, inplace=True)
    # P/PP
    df['TicketStr'].replace({"P/PP": "PPP"}, inplace=True)
    # S.C./A.4.
    df['TicketStr'].replace({"S.C./A.4.": "SCA4"}, inplace=True)
    # S.C./PARIS
    df['TicketStr'].replace({"S.C./PARIS": "SCPARIS", "S.C./Paris": "SCPARIS"}, inplace=True)
    # S.O./P.P
    df['TicketStr'].replace({"S.O./P.P.": "SOPP"}, inplace=True)

    # S.O.C
    df['TicketStr'].replace({"S.O.C.": "SOC"}, inplace=True)

    # SC/Paris
    df['TicketStr'].replace({"SC/PARIS": "SCPARIS", "SC/Paris": "SCPARIS"}, inplace=True)

    # SCO/W
    df['TicketStr'].replace({"SCO/W": "SCOW"}, inplace=True)

    # SO/C
    df['TicketStr'].replace({"SO/C": "SOC"}, inplace=True)

    # SOTON/O2
    df['TicketStr'].replace({"SOTON/O2": "SOTONO2"}, inplace=True)

    # SOTON/OQ
    df['TicketStr'].replace({"SOTON/OQ": "SOTONOQ"}, inplace=True)

    # STON/O
    df['TicketStr'].replace({"STON/O": "STONO"}, inplace=True)

    # STON/O
    df['TicketStr'].replace({"STON/O2.": "STONO2"}, inplace=True)

    # "SOTON/O2"
    df['TicketStr'].replace({"SOTON/O2.": "SOTONO2"}, inplace=True)

    # "SOTON/OQ"
    df['TicketStr'].replace({"SOTON/O.Q.": "SOTONOQ"}, inplace=True)

    # SW/PP
    df['TicketStr'].replace({"SW/PP": "SWPP"}, inplace=True)

    # W.E.P
    df['TicketStr'].replace({"W.E.P.": "WEP"}, inplace=True)

    # WC
    df['TicketStr'].replace({"W./C.": "WC", "W/C": "WC"}, inplace=True)

    # WE/P
    df['TicketStr'].replace({"WE/P": "WEP"}, inplace=True)

    # Turn into categorical type
    df['TicketStr'] = df['TicketStr'].astype('category')

    return df

preprocessed_train_df = preprocess(train_df)

# Process titles for training dataset
preprocessed_train_df = process_titles(preprocessed_train_df)

columns_to_drop = ['Name', 'Ticket', 'Cabin']
preprocessed_train_df = preprocessed_train_df.drop(columns=columns_to_drop)


# Remove LINE from TicketNum column
preprocessed_train_df = preprocessed_train_df[preprocessed_train_df['TicketNum'] != 'LINE']

# Apply the preprocessing function to your DataFrames
preprocessed_train_df = preprocess_ticket_column(preprocessed_train_df)

preprocessed_train_df.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,TicketNum,TicketStr,FamilySize,Title
0,1,0,3,male,22.0,1,0,7.25,S,21171,A5,2,Mr
1,2,1,1,female,38.0,1,0,71.28,C,17599,PC,2,Mrs
2,3,1,3,female,26.0,0,0,7.92,S,3101282,STONO2,1,Miss
3,4,1,1,female,35.0,1,0,53.1,S,113803,NONE,2,Mrs
4,5,0,3,male,35.0,0,0,8.05,S,373450,NONE,1,Mr
5,6,0,3,male,,0,0,8.46,Q,330877,NONE,1,Mr
6,7,0,1,male,54.0,0,0,51.86,S,17463,NONE,1,Mr
7,8,0,3,male,2.0,3,1,21.07,S,349909,NONE,5,Master
8,9,1,3,female,27.0,0,2,11.13,S,347742,NONE,3,Mrs
9,10,1,2,female,14.0,1,0,30.07,C,237736,NONE,2,Mrs


In [96]:
# Convert the column to 'int'
preprocessed_train_df['TicketNum'] = preprocessed_train_df['TicketNum'].astype(int)

# Convert 'TicketStr' column to object type
preprocessed_train_df['TicketStr'] = preprocessed_train_df['TicketStr'].astype(object)


print(preprocessed_train_df['TicketStr'].unique())

average_age_train = round(preprocessed_train_df['Age'].mean(), 2)

# Replace NaN values in the 'Age' column with the average value
preprocessed_train_df['Age'].fillna(average_age_train, inplace=True)

preprocessed_train_df.head()

['A5' 'PC' 'STONO2' 'NONE' 'PP' 'CA' 'SCPARIS' 'SCA4' 'A4' 'S.P.' 'SOC'
 'WC' 'SOTONOQ' 'WEP' 'STON/O_2.' 'C' 'S.O.P.' 'Fa' 'FCC' 'SWPP' 'SCOW'
 'PPP' 'SC' 'SC/AH' 'AS' 'SC/AH_Basle' 'S.W./PP' 'SOPP' 'FC' 'SOTONO2'
 'CASOTON']


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,TicketNum,TicketStr,FamilySize,Title
0,1,0,3,male,22.0,1,0,7.25,S,21171,A5,2,Mr
1,2,1,1,female,38.0,1,0,71.28,C,17599,PC,2,Mrs
2,3,1,3,female,26.0,0,0,7.92,S,3101282,STONO2,1,Miss
3,4,1,1,female,35.0,1,0,53.1,S,113803,NONE,2,Mrs
4,5,0,3,male,35.0,0,0,8.05,S,373450,NONE,1,Mr


In [97]:

from sklearn.preprocessing import LabelEncoder

columns_to_encode = ['Sex', 'Embarked', 'TicketStr', 'Title']

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Iterate through each specified column
for col in columns_to_encode:
    # Check if the column contains string values
    if preprocessed_train_df[col].dtype == 'object':
        # Fit and transform the LabelEncoder on the training data
        preprocessed_train_df[col] = label_encoder.fit_transform(preprocessed_train_df[col])

# Display the head of the encoded training data to check the results
preprocessed_train_df.head()
preprocessed_train_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 887 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  887 non-null    int64  
 1   Survived     887 non-null    int64  
 2   Pclass       887 non-null    int64  
 3   Sex          887 non-null    int32  
 4   Age          887 non-null    float64
 5   SibSp        887 non-null    int64  
 6   Parch        887 non-null    int64  
 7   Fare         887 non-null    float64
 8   Embarked     887 non-null    int32  
 9   TicketNum    887 non-null    int32  
 10  TicketStr    887 non-null    int32  
 11  FamilySize   887 non-null    int64  
 12  Title        887 non-null    int32  
dtypes: float64(2), int32(5), int64(6)
memory usage: 79.7 KB


In [98]:
import pandas as pd
# Use the to_csv method to write the DataFrame to a CSV file
preprocessed_train_df.to_csv("./processed_data/preprocessed_train_data.csv", index=False)