This is my first Jupyter Notebook for my first Kaggle Competition

In [149]:
import pandas as pd
import numpy as np  
import sklearn.impute as imputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier


In [11]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

if 'Survived' not in df_test.columns:
    df_test['Survived'] = 0

#train.head()

In [45]:
# Approximate boundaries for cabin locations
# These are VERY rough estimates and highly simplified.
# Actual locations depend on detailed ship architecture and specific cabin placement.
# The model assumes cabin numbers generally increase from bow to stern within blocks,
# but this is a generalization.
#
# Format for deck_sections:
# Deck_Letter: [(max_number_for_section1, "section1_name"),
#               (max_number_for_section2, "section2_name"),
#               (float('inf'), "last_section_name")]
# The number represents the upper bound for that section. float('inf') captures all higher numbers.

deck_sections_simplified = {
    'A': [(12, "front"), (24, "middle"), (float('inf'), "back")],  # A-Deck cabins (e.g., A1 to A37+)
    'B': [(35, "front"), (70, "middle"), (float('inf'), "back")],  # B-Deck cabins (e.g., B1 to B100+)
    'C': [(45, "front"), (95, "middle"), (float('inf'), "back")],  # C-Deck cabins (e.g., C1 to C140+)
    # D-Deck had large dining saloons mid-ship. Cabins were often fore and aft of these.
    'D': [(20, "front"), (35, "middle"), (float('inf'), "back")], # Simplified: D1-D20 (front), D21-D35 (cabins near/between saloons - 'middle'), D36+ (aft)
    'E': [(40, "front"), (85, "middle"), (float('inf'), "back")],  # E-Deck cabins (e.g., E1 to E120+)
    # F-Deck had 3rd class dining, pool, Turkish baths mid/aft. Also 2nd class cabins.
    # Some cabins physically on F-Deck were designated with 'G' (e.g., G6), these are generally aft.
    # This rule is primarily for F-numbered cabins.
    'F': [(15, "front"), (30, "middle"), (float('inf'), "back")],
    # G-Deck was the lowest passenger deck, mostly 3rd class, and could be interrupted by machinery spaces.
    'G': [(25, "front"), (float('inf'), "back")], # Simplified: front and back sections for cabins, assuming machinery might be mid.
    # Special non-passenger cabin areas or unique cabins
    'T': [(float('inf'), "special_location_boat_deck")] # e.g., Marconi room, some officer's quarters
}

def get_titanic_cabin_location(cabin_identifier_str):
    """
    Attempts to categorize a Titanic cabin into 'front', 'middle', or 'back'.

    Args:
        cabin_identifier_str (str): The cabin identifier, e.g., "C23", "B58".
                                    It should start with a deck letter.

    Returns:
        str: A string indicating the estimated location ("front", "middle", "back"),
             a special location, or an error/unknown message.
    """
    if not cabin_identifier_str or not isinstance(cabin_identifier_str, str):
        return "invalid_input_cabin_empty_or_not_string"

    cabin_identifier_str = cabin_identifier_str.strip().upper()

    if not cabin_identifier_str:
        return "invalid_input_cabin_empty_after_strip"

    deck_char = cabin_identifier_str[0]
    number_part_str = cabin_identifier_str[1:]

    if deck_char not in deck_sections_simplified:
        # Handle decks not in our simplified list (e.g., Orlop, Tank Top, or invalid)
        if deck_char in ['O', 'R', 'L', 'P']: # Orlop deck letters sometimes seen in data
             return f"deck_{deck_char}_likely_cargo_or_machinery_not_passenger_cabins"
        return f"unknown_deck_{deck_char}"

    # For decks like 'T' that might not have numbers or are special
    if not number_part_str and deck_char == 'T':
        return deck_sections_simplified['T'][0][1] # Returns "special_location_boat_deck"

    # Extract numeric part of the cabin number, allowing for suffixes like 'A' in 'C85A'
    numeric_digits = ''.join(filter(str.isdigit, number_part_str))

    if not numeric_digits:
        # This could happen if input is just "C" or "CABIN" or "F G6" (which is complex)
        # For "F G6", this simple parser would struggle. It expects "G6".
        if deck_char == 'F' and 'G' in number_part_str: # Basic attempt to catch "F Gxx"
            return "likely_aft_third_class_on_F_deck"
        return f"cabin_number_missing_or_unparsable_for_deck_{deck_char}"

    try:
        cabin_num = int(numeric_digits)
    except ValueError:
        return f"cabin_number_contains_non_numeric_parts_unparsable_for_deck_{deck_char}"

    sections = deck_sections_simplified.get(deck_char)
    if sections:
        for limit, location_name in sections:
            if cabin_num <= limit:
                return location_name
        # This case should ideally not be reached if float('inf') is the last limit,
        # but as a fallback:
        return f"location_could_not_be_determined_on_deck_{deck_char}_num_{cabin_num}"

    return f"rules_not_defined_for_deck_{deck_char}" # Should be caught by initial deck_char check

# --- Example Usage ---
#if __name__ == "__main__":
#    cabins_to_test = [
#        "C23", "A10", "A30", "A37",
#        "B5", "B50", "B90",
#        "D10", "D30", "D45",
#        "E1", "E77", "E100",
#        "F2", "F20", "F33",
#        "G6", # This is a 3rd class cabin, often listed on F deck plans but G-numbered
#        "T",  # Marconi room
#        "C", "B10A", "X100", "F G6", "DeckZ99"
#
#
#    print("Titanic Cabin Location Estimations (Approximate):")
#    print("-------------------------------------------------")
#    for cabin in cabins_to_test:
#        location = get_titanic_cabin_location(cabin)
#        print(f"Cabin '{cabin}': {location}"
#
#    print("\nNote: These locations are highly simplified estimations.")
#    print("The actual position of a cabin is complex and requires detailed deck plans.")
#    print("For example, 'G6' was often physically on F-Deck and would be considered 'aft'.")
#    print("The parser handles 'G6' as a G-deck cabin if not specially treated.")



In [136]:
def preprocess(df_train, df_test):
    df = pd.concat([df_train, df_test], axis=0)
    df = df.drop(['Name', 'Ticket'], axis=1)

    # age work
    female_median = df[df['Sex'] == 'female']['Age'].median()
    male_median = df[df['Sex'] == 'male']['Age'].median()

    median_age_dict = {'male':male_median, 'female':female_median}
    
    df['Age'] = df['Age'].fillna(df['Sex'].map(median_age_dict))

    # fare
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())

    # cabin
    df['Cabin_loc'] = df['Cabin'].map(lambda x: get_titanic_cabin_location(x))
    df.loc[df['Cabin_loc'].isna() | ~df['Cabin_loc'].isin(['front', 'middle', 'back']), 'Cabin_loc'] = 'unknown'

    df['Cabin_letter'] = df['Cabin'].str.extract(r'([A-Za-z])', expand=False)
    df.loc[df['Cabin_letter'].isna(), 'Cabin_letter'] = 'U'

    df['Cabin_number'] = df['Cabin'].str.extract(r'(\d+)', expand=False)
    df.loc[df['Cabin_number'].isna(), 'Cabin_number'] = 0
    df['Cabin_number'] = pd.to_numeric(df['Cabin_number'])

    
    # embarked
    df['Embarked'] = df['Embarked'].fillna('U')

    # get dummies
    df = pd.get_dummies(df, columns=['Sex'], prefix='Sex')
    df = pd.get_dummies(df, columns=['Embarked'], prefix='Embarked')
    df = pd.get_dummies(df, columns=['Cabin_loc'], prefix='Cabin_loc')
    df = pd.get_dummies(df, columns=['Cabin_letter'], prefix='Cabin_letter')

    #drop columns
    df = df.drop(['Cabin','Embarked_U','Cabin_loc_unknown', 'Cabin_letter_U'], axis=1)

    # get sets

    df_train = df[:len(df_train)]
    df_test = df[len(df_train):]

    df_test = df_test.drop(['Survived'], axis=1)

    return df_train, df_test

reference_df = preprocess(df_train, df_test)

reference_df.info()
# look for nan values

missing_counts = reference_df.isnull().sum()
filtered_counts = missing_counts.loc[missing_counts > 0]
print(filtered_counts)


# Age (263) -- we'll use the median age based on gender of the passenger
# Fare (1) -- we'll use the median fare
# Cabin (1014) -- despite being sparse, we'll see if we can use some values
# Embarked (2) -- this should be dropped as it should have no influence on whether a passenger survived
reference_df.head(10)

In [142]:
train_df, test_df = preprocess(df_train, df_test)


In [143]:
X = train_df.drop(['Survived'], axis=1)
y = train_df['Survived']


x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

y_train = np.reshape(y_train.values, (-1, 1))

In [144]:
x_train.shape, y_train.shape

((712, 23), (712, 1))

In [145]:
model_1 = LogisticRegression()
model_1 = model_1.fit(x_train, y_train)

y_pred = model_1.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)

accuracy

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8100558659217877

In [146]:
model_2 = XGBClassifier(enable_categorical=True)
model_2 = model_2.fit(x_train, y_train)
y_pred = model_2.predict(x_test)
accuracy_score(y_test, y_pred)

0.7932960893854749

In [148]:
train_df.corr()['Survived'].sort_values(ascending=False)

Survived            1.000000
Sex_female          0.543351
Fare                0.257307
Cabin_number        0.229756
Cabin_loc_front     0.215539
Cabin_letter_B      0.175095
Embarked_C          0.168240
Cabin_loc_back      0.167918
Cabin_letter_D      0.150716
Cabin_letter_E      0.145321
Cabin_loc_middle    0.119742
Cabin_letter_C      0.114652
Parch               0.081629
Cabin_letter_F      0.057935
Cabin_letter_A      0.022287
Cabin_letter_G      0.016040
Embarked_Q          0.003650
PassengerId        -0.005007
Cabin_letter_T     -0.026456
SibSp              -0.035322
Age                -0.067644
Embarked_S         -0.155660
Pclass             -0.338481
Sex_male           -0.543351
Name: Survived, dtype: float64

In [151]:
model_3 = RandomForestClassifier()
model_3 = model_3.fit(x_train, y_train)
y_pred = model_3.predict(x_test)
accuracy_score(y_test, y_pred)

  return fit_method(estimator, *args, **kwargs)


0.8212290502793296