### Import Dependencies

In [313]:
# Import Dependencies
import numpy as np
import pandas as pd
import os

### Load Dataset

In [314]:
# Load Dataset
train_df = pd.read_csv("./titanic_data/train.csv")
test_df = pd.read_csv("./titanic_data/test.csv")

train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [315]:
test_df.head(10)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


### Preprocess Data

In [316]:
def preprocess(df):
    df = df.copy()
    
    def ticket_number(x):
        return x.split(" ")[-1]
        
    def ticket_str(x):
        items = x.split(" ")
        if len(items) == 1:
            return "NONE"
        return "_".join(items[0:-1])
    
    def family_size(x, y):
        return x + y
        
    def format_fare(x):
        return round(x, 2)


    df["TicketNum"] = df["Ticket"].apply(ticket_number)
    df["TicketStr"] = df["Ticket"].apply(ticket_str)
    df["FamilySize"] = family_size(df["SibSp"], df["Parch"])
    df["Fare"] = df["Fare"].apply(format_fare)

    return df


# Function to process titles
def process_titles(df):
    df['Title'] = df['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
    
    # Combine some titles for better meaning
    df['Title'].replace({'Mme': 'Mlle', 'Mlle': 'Mlle'}, inplace=True)
    
    # Combine titles
    df['Title'].replace({'Capt': 'Sir', 'Don': 'Sir', 'Major': 'Sir', 'Sir': 'Sir', 'Col': 'Sir'}, inplace=True)
    df['Title'].replace({'Dona': 'Lady', 'Lady': 'Lady', 'the Countess': 'Lady', 'Jonkheer': 'Lady'}, inplace=True)
    df['Title'].replace({'Ms': 'Miss'}, inplace=True)
    
    return df

# Function to preprocess the 'TicketStr' column
def preprocess_ticket_column(df):
    # Some additional fixing
    # A/5
    df['TicketStr'].replace({"A./5.": "A5", "A.5.": "A5", "A/5.": "A5", "A/5": "A5"}, inplace=True)
    # A/4
    df['TicketStr'].replace({"A/4": "A4", "A/4.": "A4", "A4.": "A4"}, inplace=True)
    # A/S
    df['TicketStr'].replace({"A/S": "AS"}, inplace=True)
    # CA
    df['TicketStr'].replace({"C.A.": "CA", "CA.": "CA"}, inplace=True)
    # CASOTON
    df['TicketStr'].replace({"C.A./SOTON": "CASOTON"}, inplace=True)

    # FC FCC
    df['TicketStr'].replace({"F.C.": "FC", "F.C.C.": "FCC"}, inplace=True)
    # P/PP
    df['TicketStr'].replace({"P/PP": "PPP"}, inplace=True)
    # S.C./A.4.
    df['TicketStr'].replace({"S.C./A.4.": "SCA4"}, inplace=True)
    # S.C./PARIS
    df['TicketStr'].replace({"S.C./PARIS": "SCPARIS", "S.C./Paris": "SCPARIS"}, inplace=True)
    # S.O./P.P
    df['TicketStr'].replace({"S.O./P.P.": "SOPP"}, inplace=True)

    # S.O.C
    df['TicketStr'].replace({"S.O.C.": "SOC"}, inplace=True)

    # SC/Paris
    df['TicketStr'].replace({"SC/PARIS": "SCPARIS", "SC/Paris": "SCPARIS"}, inplace=True)

    # SCO/W
    df['TicketStr'].replace({"SCO/W": "SCOW"}, inplace=True)

    # SO/C
    df['TicketStr'].replace({"SO/C": "SOC"}, inplace=True)

    # SOTON/O2
    df['TicketStr'].replace({"SOTON/O2": "SOTONO2"}, inplace=True)

    # SOTON/OQ
    df['TicketStr'].replace({"SOTON/OQ": "SOTONOQ"}, inplace=True)

    # STON/O
    df['TicketStr'].replace({"STON/O": "STONO"}, inplace=True)

    # STON/O
    df['TicketStr'].replace({"STON/O2.": "STONO2"}, inplace=True)

    # "SOTON/O2"
    df['TicketStr'].replace({"SOTON/O2.": "SOTONO2"}, inplace=True)

    # "SOTON/OQ"
    df['TicketStr'].replace({"SOTON/O.Q.": "SOTONOQ"}, inplace=True)

    # SW/PP
    df['TicketStr'].replace({"SW/PP": "SWPP"}, inplace=True)

    # W.E.P
    df['TicketStr'].replace({"W.E.P.": "WEP"}, inplace=True)

    # WC
    df['TicketStr'].replace({"W./C.": "WC", "W/C": "WC"}, inplace=True)

    # WE/P
    df['TicketStr'].replace({"WE/P": "WEP"}, inplace=True)

    # Turn into categorical type
    df['TicketStr'] = df['TicketStr'].astype('category')

    return df

preprocessed_train_df = preprocess(train_df)
preprocessed_test_df = preprocess(test_df)

# Process titles for both training and test datasets
preprocessed_train_df = process_titles(preprocessed_train_df)
preprocessed_test_df = process_titles(preprocessed_test_df)

columns_to_drop = ['Name', 'Ticket', 'Cabin', 'Fare']
preprocessed_train_df = preprocessed_train_df.drop(columns=columns_to_drop)
preprocessed_test_df = preprocessed_test_df.drop(columns=columns_to_drop)

# Remove LINE from TicketNum column
preprocessed_train_df = preprocessed_train_df[preprocessed_train_df['TicketNum'] != 'LINE']
preprocessed_test_df = preprocessed_test_df[preprocessed_test_df['TicketNum'] != 'LINE']

# Apply the preprocessing function to your DataFrames
preprocessed_train_df = preprocess_ticket_column(preprocessed_train_df)
preprocessed_test_df = preprocess_ticket_column(preprocessed_test_df)

In [317]:
preprocessed_test_df.head(20)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Embarked,TicketNum,TicketStr,FamilySize,Title
0,892,3,male,34.5,0,0,Q,330911,NONE,0,Mr
1,893,3,female,47.0,1,0,S,363272,NONE,1,Mrs
2,894,2,male,62.0,0,0,Q,240276,NONE,0,Mr
3,895,3,male,27.0,0,0,S,315154,NONE,0,Mr
4,896,3,female,22.0,1,1,S,3101298,NONE,2,Mrs
5,897,3,male,14.0,0,0,S,7538,NONE,0,Mr
6,898,3,female,30.0,0,0,Q,330972,NONE,0,Miss
7,899,2,male,26.0,1,1,S,248738,NONE,2,Mr
8,900,3,female,18.0,0,0,C,2657,NONE,0,Mrs
9,901,3,male,21.0,2,0,S,48871,A4,2,Mr


In [318]:
# Set Categorical Variables
import pandas as pd

# # Convert 'Survived' column to categorical
# preprocessed_train_df['Survived'] = pd.Categorical(preprocessed_train_df['Survived'])

# # Convert 'Pclass' columns to categorical for both Train and Test data
# preprocessed_train_df['Pclass'] = pd.Categorical(preprocessed_train_df['Pclass'])
# preprocessed_test_df['Pclass'] = pd.Categorical(preprocessed_test_df['Pclass'])

# # Convert 'Sex' columns to categorical for both Train and Test data
# preprocessed_train_df['Sex'] = pd.Categorical(preprocessed_train_df['Sex'])
# preprocessed_test_df['Sex'] = pd.Categorical(preprocessed_test_df['Sex'])

# # Convert 'Embarked' columns to categorical for both Train and Test data
# preprocessed_train_df['Embarked'] = pd.Categorical(preprocessed_train_df['Embarked'])
# preprocessed_test_df['Embarked'] = pd.Categorical(preprocessed_test_df['Embarked'])

# # Convert 'TicketStr' columns to categorical for both Train and Test data
# preprocessed_train_df['TicketStr'] = pd.Categorical(preprocessed_train_df['TicketStr'])
# preprocessed_test_df['TicketStr'] = pd.Categorical(preprocessed_test_df['TicketStr'])

# # Convert 'Title' columns to categorical for both Train and Test data
# preprocessed_train_df['Title'] = pd.Categorical(preprocessed_train_df['Title'])
# preprocessed_test_df['Title'] = pd.Categorical(preprocessed_test_df['Title'])

# Convert the column to 'int'
preprocessed_train_df['TicketNum'] = preprocessed_train_df['TicketNum'].astype(int)
preprocessed_test_df['TicketNum'] = preprocessed_test_df['TicketNum'].astype(int)

# Convert 'TicketStr' column to object type
preprocessed_train_df['TicketStr'] = preprocessed_train_df['TicketStr'].astype(object)
preprocessed_test_df['TicketStr'] = preprocessed_test_df['TicketStr'].astype(object)

print(preprocessed_train_df['TicketStr'].unique())

average_age_train = preprocessed_train_df['Age'].mean()
# Replace NaN values in the 'Age' column with the average value
preprocessed_train_df['Age'].fillna(average_age_train, inplace=True)

preprocessed_train_df.head()

['A5' 'PC' 'STONO2' 'NONE' 'PP' 'CA' 'SCPARIS' 'SCA4' 'A4' 'S.P.' 'SOC'
 'WC' 'SOTONOQ' 'WEP' 'STON/O_2.' 'C' 'S.O.P.' 'Fa' 'FCC' 'SWPP' 'SCOW'
 'PPP' 'SC' 'SC/AH' 'AS' 'SC/AH_Basle' 'S.W./PP' 'SOPP' 'FC' 'SOTONO2'
 'CASOTON']


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked,TicketNum,TicketStr,FamilySize,Title
0,1,0,3,male,22.0,1,0,S,21171,A5,1,Mr
1,2,1,1,female,38.0,1,0,C,17599,PC,1,Mrs
2,3,1,3,female,26.0,0,0,S,3101282,STONO2,0,Miss
3,4,1,1,female,35.0,1,0,S,113803,NONE,1,Mrs
4,5,0,3,male,35.0,0,0,S,373450,NONE,0,Mr


### Separate Target and Attributes

In [319]:
# Create Training Partition
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# Assuming 'columns_to_encode' is a list containing the column names to encode
columns_to_encode = ['Sex', 'Embarked', 'TicketStr', 'Title']

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Iterate through each specified column
for col in columns_to_encode:
    # Check if the column contains string values
    if preprocessed_train_df[col].dtype == 'object':
        # Fit and transform the LabelEncoder on the training data
        preprocessed_train_df[col] = label_encoder.fit_transform(preprocessed_train_df[col])

# Display the head of the encoded training data to check the results
preprocessed_train_df.head()

y = preprocessed_train_df['Survived']
X = preprocessed_train_df.iloc[:, preprocessed_train_df.columns != 'Survived']
preprocessed_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 887 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  887 non-null    int64  
 1   Survived     887 non-null    int64  
 2   Pclass       887 non-null    int64  
 3   Sex          887 non-null    int64  
 4   Age          887 non-null    float64
 5   SibSp        887 non-null    int64  
 6   Parch        887 non-null    int64  
 7   Embarked     887 non-null    int64  
 8   TicketNum    887 non-null    int64  
 9   TicketStr    887 non-null    int64  
 10  FamilySize   887 non-null    int64  
 11  Title        887 non-null    int64  
dtypes: float64(1), int64(11)
memory usage: 90.1 KB


### Split the Dataset and Create the Model

In [322]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train['TicketStr'].unique())

# Logistic Regression
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)
logistic_predictions = logistic_model.predict(X_test)
logistic_accuracy = accuracy_score(y_test, logistic_predictions)
print("Logistic Regression Accuracy:", logistic_accuracy)

[ 9 10 25  7  0 27 22  1 21 26  4 29 15 30  3 12 20 23 18  8 19 28 24 13
 16 11  2  5]
Logistic Regression Accuracy: 0.5730337078651685
