# preprocessing

In [1]:
import pandas as pd

traindf = pd.read_csv('data/train.csv')
testdf = pd.read_csv('data/test.csv')
traindf['Set'] = 'Train'
testdf['Set'] = 'Test'
df = pd.concat([testdf,traindf])


In [2]:
def splits(data_frame):
    split_df = data_frame['PassengerId'].str.split('_', expand=True)
    data_frame[['Group', 'GroupNumber']] = split_df

    split_df =  data_frame['Cabin'].str.split("/", expand = True)
    data_frame[['Deck', 'CabinNumber', 'Side']]= split_df
    data_frame.CabinNumber = data_frame.CabinNumber.astype('Int64')
    
    split_df = data_frame['Name'].str.split(" ",expand = True)
    data_frame[['FirstName','LastName']] = split_df


    return data_frame

df = splits(df)


In [3]:

def group_size_apply(row):
    group = df[df.Group == row.Group]
    return len(group)

df['GroupSize'] = df.apply(group_size_apply, axis = 1)

df['Bills'] = df['RoomService'] + df['FoodCourt'] + df['ShoppingMall'] + df['Spa'] + df['VRDeck']
df.loc[(df['Age'] < 13), 'Bills'] = 0
df.loc[(df['CryoSleep'] == True),'Bills'] = 0 
    
    
    


    


In [4]:

def fill_potential_decks_apply(row):
    
    potential_decks_by_planet = {
    'Earth':['E','F','G'],
    'Europa': ['A','B','C','D','E','T'],
    'Mars': ['D','E','F']
    }

    potential_decks_by_planet_no_bills = {
        'Earth':['G'],
        'Europa':['B'],
        'Mars': ['E','F']
    }

    if pd.isna(row.Cabin):
        if row.Bills == 0 and not pd.isna(row.HomePlanet):
            if len(df[df.Group == row.Group].dropna(subset = 'Deck').Deck.unique()) > 1:
                return potential_decks_by_planet_no_bills[row.HomePlanet]
            
        if not pd.isna(row.HomePlanet):
            return potential_decks_by_planet[row.HomePlanet]
        
        else:
            return list(df.dropna(subset = ['Deck']).Deck.unique())
            

def fill_potential_sides_apply(row):
    if pd.isna(row.Cabin):
        if row.GroupSize > 1:
            group = df[df.Group == row.Group].dropna(subset = 'Side')
            if len(group) > 0:
                return [group.iloc[0].Side]
        return ['P','S']
    
    
df['potential_decks'] = df.apply(fill_potential_decks_apply, axis = 1)
    
df['potential_sides'] = df.apply(fill_potential_sides_apply,axis = 1)




In [5]:
def impute_based_on_shared_features(df,attribute,feature):
    for index, row in df[df[attribute].isna()].iterrows():
        # Find the group sharing the same feature value, excluding missing values in the attribute
        group = df[df[feature] == row[feature]].dropna(subset=[attribute])
        
        if not group.empty:
            # Assign the first non-missing value of the attribute in the group to the missing value
            df.loc[index, attribute] = group[attribute].iloc[0]

    return df

df = impute_based_on_shared_features(df,'HomePlanet','Group')
df = impute_based_on_shared_features(df,'HomePlanet','LastName')

In [6]:
df = df.sort_values(by = ['Group','GroupNumber'])
df = df.reset_index(drop = True)


# workings 2

In [7]:
def each_room_per_passenger(df):
    passenger_options = {}
    passengers_without_cabin = df[df['Cabin'].isna()]

    for index, passenger in passengers_without_cabin.iterrows():
        potential_options = {}

        for deck in passenger.potential_decks:
            for side in passenger.potential_sides:
                mask = (df['Deck'] == deck) & (df['Side'] == side)

                # Get the rooms before and after the current passenger index
                before = df.loc[mask & (df.index < index), 'CabinNumber'].dropna().unique()
                after = df.loc[mask & (df.index > index), 'CabinNumber'].dropna().unique()
                
                if len(before) == 0 or len(after) == 0:
                    continue
                
                elif max(before) + 1 >= min(after):
                    continue
                
                else:
                    if deck not in potential_options:
                        potential_options[deck] = {}
                    potential_options[deck][side] = [max(before), min(after)]

        passenger_options[index] = potential_options

    return passenger_options





# solo group and only one room that fits

In [8]:
def solo_group_one_option(df):
    potential_options = each_room_per_passenger(df)

    for index in potential_options.keys():
        if df.iloc[index].GroupSize != 1:
            continue
        empty_cabin_options_for_passenger = []
        
        refresh_options = False
        
        for deck, sides in potential_options[index].items():
            for side, numbers in sides.items():
                if numbers[0] + 2 <= numbers[1] and numbers[0] >= 0:
                    empty_cabin_options_for_passenger.append(f"{deck}/{numbers[0] + 1}/{side}")
                    if numbers[0] + 2 < numbers[1]:
                        refresh_options = True

        if len(empty_cabin_options_for_passenger) == 1:
            cabin = empty_cabin_options_for_passenger[0]
            df.loc[index,['Cabin','Deck','CabinNumber','Side']] = [cabin,cabin.split("/")[0],int(cabin.split("/")[1]),cabin.split("/")[2]]
            if refresh_options:
                potential_options = each_room_per_passenger(df)
                

    return df


In [9]:
df2 = df.copy()

In [10]:
df3 = df2.copy()

# no free rooms so has to share

In [11]:
def no_free_rooms_so_shares(df):
    potential_options = each_room_per_passenger(df)
    
    for index,passenger_potential_options in potential_options.items():
        empty_cabin_options_for_passenger = 0
        
        for deck in passenger_potential_options:
            for side in passenger_potential_options[deck]:
                numbers = passenger_potential_options[deck][side]
                if numbers[0] + 2 <= numbers[1] and numbers[0] >= 0:
                    empty_cabin_options_for_passenger += 1
                    
        if empty_cabin_options_for_passenger == 0:
            passenger = df.loc[index]
            potential_cabins = df[(df['Group'] == passenger['Group']) &
                                  (df.index != index) &
                                  (df['Deck'].isin(passenger['potential_decks'])) &
                                  (df['Side'].isin(passenger['potential_sides']))]['Cabin'].unique()
            if len(potential_cabins) == 1:
                cabin = potential_cabins[0]
                df.loc[index,['Cabin','Deck','CabinNumber','Side']] = [cabin,cabin.split("/")[0],int(cabin.split("/")[1]),cabin.split("/")[2]]
    return df
    


# only passenger that can take that cabin

In [12]:

                    
def rooms_to_fill(df):
    rooms = {}
    for deck in list(df.dropna(subset = ['Deck']).Deck.unique()):
        rooms[deck] = {'P': [], 'S': []} 
        for side in rooms[deck].keys():
            rooms_seen = df[(df.Deck == deck) & (df.Side == side) & (df.CabinNumber.notna())].CabinNumber.astype(int).tolist()
            largest_room_number = max(rooms_seen, default=-1) 

            for i in range(largest_room_number + 1):
                if i not in rooms_seen:
                    rooms[deck][side].append(f"{deck}/{i}/{side}")

    return rooms
           

In [13]:
def only_passenger_that_fits(df):
    free_rooms_dict = rooms_to_fill(df)
    free_passengers = each_room_per_passenger(df)
    for deck,deck_items in free_rooms_dict.items():
        for side,side_items in deck_items.items():
            for cabin in side_items:
                
                room_number = int(cabin.split('/')[1])
                passenger_options_for_empty_cabins = []
                
                for passenger_ind,passenger in free_passengers.items():
                    if deck not in passenger or side not in passenger[deck]:
                        continue
                    
                    if passenger[deck][side][0] + 1 == room_number == passenger[deck][side][1] - 1:
                        passenger_options_for_empty_cabins.append(passenger_ind)
                        
                if len(passenger_options_for_empty_cabins) == 1:
                    passenger_to_fill_ind = passenger_options_for_empty_cabins[0]
                    
                    df.loc[passenger_to_fill_ind,['Cabin','Deck','CabinNumber','Side']] = [cabin,cabin.split("/")[0],int(cabin.split("/")[1]),cabin.split("/")[2]]
                    
                            
    return df
    

# all imputes

In [14]:
def all_imputes(df):
    df = solo_group_one_option(df)
    df = no_free_rooms_so_shares(df)
    df = only_passenger_that_fits(df)

    df = solo_group_one_option(df)
    df = no_free_rooms_so_shares(df)
    df = only_passenger_that_fits(df)
    
    return df
    
df = all_imputes(df)
df.isna().sum()

PassengerId            0
HomePlanet            12
CryoSleep            310
Cabin                 37
Destination          274
Age                  270
VIP                  296
RoomService          263
FoodCourt            289
ShoppingMall         306
Spa                  284
VRDeck               268
Name                 294
Set                    0
Transported         4277
Group                  0
GroupNumber            0
Deck                  37
CabinNumber           37
Side                  37
FirstName            294
LastName             294
GroupSize              0
Bills                785
potential_decks    12671
potential_sides    12671
dtype: int64

# stop

In [17]:
def fill(df,cabin,index):
    df.loc[index,['Cabin','Deck','Number','Side']] = [cabin,cabin.split("/")[0],int(cabin.split("/")[1]),cabin.split("/")[2]]
    return df

df = fill(df,1429,'E/58/P')
df.loc[1429,'Cabin'] = 'E/58/P'
df.loc[8413,'Cabin'] = 'A/57/P'
df.loc[9265,'Cabin'] = 'F/1267/S'
df.loc[9267,'Cabin'] = 'F/1267/S'

df.loc[12892,'Cabin'] = 'F/1785/S' # maybe only one is from this room and the other is joined in the other room
df.loc[12893,'Cabin'] = 'F/1785/S'

df = fill_deck_and_side_from_cabin(df)


# workings


In [18]:
def all_cabin_options_for_each_row(df):
    options = each_room_per_passenger(df)
    for index, passenger in options.items():
        print()
        print("Index:",index, "GroupSize:", df.iloc[index].GroupSize)
        print("Free cabins that match:")
        for deck in passenger.keys():
            for side, numbers in passenger[deck].items():
                
                if numbers[0] + 2 == numbers[1]:
                    print(deck + "/" + str(numbers[0] + 1) + "/" + side )
                elif numbers[0] + 3 == numbers[1]:
                    print(deck + "/" + str(numbers[0] + 1) + "/" + side, deck + "/" + str(numbers[0] + 2) + "/" + side )



In [19]:
all_cabin_options_for_each_row(df)


index 404 GroupSize 1
free cabins that match
B/13/P
C/13/S

index 421 GroupSize 1
free cabins that match
B/13/P
C/13/S

index 479 GroupSize 2
free cabins that match
E/20/P E/21/P

index 505 GroupSize 2
free cabins that match
E/20/P E/21/P

index 517 GroupSize 2
free cabins that match
E/20/P E/21/P

index 1466 GroupSize 1
free cabins that match
C/40/S
D/36/S

index 1543 GroupSize 1
free cabins that match
C/40/S
D/36/S

index 2442 GroupSize 7
free cabins that match

index 2970 GroupSize 5
free cabins that match

index 3529 GroupSize 1
free cabins that match
E/150/P
F/519/P

index 3530 GroupSize 1
free cabins that match
E/150/P
F/519/P

index 4569 GroupSize 3
free cabins that match

index 4751 GroupSize 7
free cabins that match

index 5016 GroupSize 1
free cabins that match
G/590/P
G/579/S

index 5017 GroupSize 1
free cabins that match
G/590/P
G/579/S

index 8450 GroupSize 1
free cabins that match
D/191/P
E/387/P

index 8465 GroupSize 1
free cabins that match
D/191/P
E/387/P

index 10081

In [16]:
df_to_comp = pd.read_csv('data/31remaining.csv')

In [19]:
for index,row in df.iterrows():
    if not (pd.isna(row.Cabin) and pd.isna(df_to_comp.iloc[index].Cabin)):
        if row.Cabin != df_to_comp.iloc[index].Cabin:
            print(row.Cabin, df_to_comp.iloc[index].Cabin)

nan E/58/P
nan A/57/P
nan F/1267/S
nan F/1267/S
nan F/1785/S
nan F/1785/S
