In [136]:
import pandas as pd
import numpy as np

# Load the pre-processed passenger table and store cabin numbers as nullable
# integers so rows without a cabin number can stay missing.
df = pd.read_csv('pre_processed.csv').astype({'Number': 'Int64'})



In [186]:

def fill_deck_and_side_from_cabin(df):
    """Fill 'Deck', 'Number' and 'Side' from 'Cabin' where they are still missing.

    Only rows whose 'Number' is missing and whose 'Cabin' is present are
    touched. 'Cabin' is expected to look like ``'Deck/Number/Side'``
    (e.g. ``'B/98/P'``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Cabin', 'Deck', 'Number' and 'Side'.

    Returns
    -------
    pandas.DataFrame
        The same frame object, modified in place.
    """
    # Rows that have a cabin string but no parsed components yet.
    mask = df['Number'].isna() & df['Cabin'].notna()
    if not mask.any():
        # Nothing to parse; skip the split/assignment entirely.
        return df

    # One vectorised split instead of building a Series per row.
    parts = df.loc[mask, 'Cabin'].str.split('/', expand=True)
    df.loc[mask, 'Deck'] = parts[0]
    df.loc[mask, 'Number'] = parts[1].astype(int)  # room number is the middle component
    df.loc[mask, 'Side'] = parts[2]

    return df

In [187]:
# Candidate decks per home planet (hand-written lookup; presumably derived
# from the observed data -- TODO confirm).
decks_by_planet = {
    'Earth': ['E', 'F', 'G'],
    'Europa': ['A', 'B', 'C', 'D', 'E', 'T'],
    'Mars': ['D', 'E', 'F'],
}

# Narrower candidate decks, used by fill_potential_decks for passengers with
# Bills == 0 whose group spans more than one deck.
decks_by_planet_no_bills = {
    'Earth': ['G'],
    'Europa': ['B'],
    'Mars': ['E', 'F'],
}

# Reverse lookup: home planets per deck.
planet_by_deck = {
    'A': ['Europa'],
    'B': ['Europa'],
    'C': ['Europa'],
    'D': ['Europa', 'Mars'],
    'E': ['Europa', 'Mars', 'Earth'],
    'F': ['Earth', 'Mars'],
    'G': ['Earth'],
    'T': ['Europa'],
}

homeplanets = ['Earth', 'Europa', 'Mars']

all_cabin_sides = ['P', 'S']

# Every deck that occurs in the data, in order of first appearance.
all_cabin_decks = list(df['Deck'].dropna().unique())


In [188]:
def multiple_decks_in_group(df,row):
    """Return True when the members of ``row``'s group have more than one known deck.

    Members whose 'Deck' is missing are ignored.
    """
    known_decks = df.loc[df['Group'] == row.Group, 'Deck'].dropna()
    return known_decks.nunique() > 1
        

In [189]:
def fill_potential_decks(df):
    """Add a 'potential_decks' column listing candidate decks for cabin-less rows.

    For a row whose 'Cabin' is missing:

    * unknown 'HomePlanet'  -> every deck in ``all_cabin_decks``;
    * 'Bills' == 0 and the row's group spans more than one known deck
      -> ``decks_by_planet_no_bills[HomePlanet]``;
    * otherwise -> ``decks_by_planet[HomePlanet]``.

    Rows that already have a cabin get ``None``.

    Each cell receives its own list, so editing one row's candidates cannot
    alter the module-level lookup tables or other rows.

    Returns the same frame object, modified in place.
    """

    def func_potential_decks(row):
        if not pd.isna(row.Cabin):
            return None

        if pd.isna(row.HomePlanet):
            return list(all_cabin_decks)

        # Reuse the shared helper instead of repeating its group filter inline.
        if row.Bills == 0 and multiple_decks_in_group(df, row):
            return list(decks_by_planet_no_bills[row.HomePlanet])

        return list(decks_by_planet[row.HomePlanet])

    df['potential_decks'] = df.apply(func_potential_decks, axis = 1)
    return df

In [190]:
def fill_potential_sides(df):
    """Add a 'potential_sides' column listing candidate sides for cabin-less rows.

    A cabin-less passenger in a group of more than one gets the side of the
    first group member whose 'Side' is known; everyone else without a cabin
    gets both sides. Rows that already have a cabin get ``None``.

    Returns the same frame object, modified in place.
    """

    def func_potential_sides(row):
        if not pd.isna(row.Cabin):
            return None

        if row.GroupSize > 1:
            known_sides = df.loc[df['Group'] == row.Group, 'Side'].dropna()
            if not known_sides.empty:
                return [known_sides.iloc[0]]

        return ['P','S']

    df['potential_sides'] = df.apply(func_potential_sides, axis = 1)
    return df

In [191]:
# Annotate every cabin-less passenger with candidate decks, then candidate sides.
df = df.pipe(fill_potential_decks).pipe(fill_potential_sides)

# Filling missed Cabins


#### Fill passengers who must be sharing a room with someone in their own group
I.e., from their home planet we know the decks they could be on. If, for each candidate deck and side, the cabin numbers just before and just after them are consecutive (no gap between them), then there is no free room for them, so they must be sharing with someone from their group. Make sure that their group has only one cabin.

Need to make sure there is only one person in the group whose cabin is NaN; otherwise one of the NaN passengers might be in another cabin.

Finding groups that have more than one member, whose members all share the same cabin, and that don't have other NaNs in the group.
*Make sure that there aren't other NaNs in the group, or else the result might change: there might be other cabins this passenger could go to if the other passenger's cabin weren't NaN.*

## Empty room with only one compatible passenger


In [192]:
def rooms_to_fill(df, decks=None, sides=None):
    """List the cabin numbers that nobody occupies, per deck and side.

    For every deck/side pair, every number from 0 up to (but excluding) the
    largest observed room number that does not appear in the data is reported
    as an empty cabin ``'Deck/Number/Side'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Deck', 'Side' and 'Number'.
    decks : iterable of str, optional
        Decks to scan. Defaults to the module-level ``all_cabin_decks``.
    sides : iterable of str, optional
        Sides to scan. Defaults to the module-level ``all_cabin_sides``.

    Returns
    -------
    dict
        ``{deck: {side: [cabin, ...]}}``. A deck appears only if it has at
        least one empty cabin; it then has an entry (possibly empty) for
        every side. Cabins are listed in ascending room number.
    """
    decks = all_cabin_decks if decks is None else list(decks)
    sides = all_cabin_sides if sides is None else list(sides)

    rooms = {}
    for deck in decks:
        for side in sides:
            on_deck_side = (df['Deck'] == deck) & (df['Side'] == side)
            # A set makes the membership test below constant-time.
            occupied = set(df.loc[on_deck_side, 'Number'].dropna().astype(int).tolist())
            if not occupied:
                # No numbered cabin on this deck/side, so there is no upper
                # bound to scan up to (max() would raise on an empty input).
                continue
            for number in range(max(occupied)):
                if number in occupied:
                    continue
                if deck not in rooms:
                    rooms[deck] = {s: [] for s in sides}
                rooms[deck][side].append(deck + "/" + str(number) + "/" + side)
    return rooms

                    
                    
                    
            

In [193]:


def empty_room_one_compatible(df):
    """Assign each empty cabin that exactly one cabin-less passenger could occupy.

    A cabin-less passenger is compatible with an empty cabin when the cabin's
    deck and side are among the passenger's 'potential_decks' and
    'potential_sides', and the cabin number lies strictly between the largest
    room number seen on that deck/side in the rows before the passenger and
    the smallest one seen in the rows after. Passengers with no such row
    before or after them are not considered compatible.

    The row label is used as a position for slicing, so the frame is assumed
    to carry a default 0..n-1 index.

    Returns the frame after 'Deck', 'Number' and 'Side' have been re-derived
    from the newly assigned cabins.
    """
    empty_cabins = rooms_to_fill(df)
    for deck, cabins_by_side in empty_cabins.items():
        for side, cabins in cabins_by_side.items():
            for cabin in cabins:
                room_number = int(cabin.split("/")[1])
                indices_of_compatible_rows = []
                # Re-select for every cabin: a passenger assigned a cabin
                # earlier in this loop must no longer be a candidate.
                for index, row in df[df.Cabin.isna()].iterrows():
                    if deck not in row.potential_decks or side not in row.potential_sides:
                        continue

                    before_slice = df.iloc[:index]
                    after_slice = df.iloc[index + 1:]
                    before = before_slice.loc[
                        (before_slice.Deck == deck) & (before_slice.Side == side), 'Number'
                    ].dropna()
                    after = after_slice.loc[
                        (after_slice.Deck == deck) & (after_slice.Side == side), 'Number'
                    ].dropna()
                    # Without a neighbour on both sides there is no gap to
                    # bound; skip instead of calling min()/max() on nothing.
                    if before.empty or after.empty:
                        continue
                    if before.max() < room_number < after.min():
                        indices_of_compatible_rows.append(index)

                if len(indices_of_compatible_rows) == 1:
                    df.loc[indices_of_compatible_rows[0], 'Cabin'] = cabin

    df = fill_deck_and_side_from_cabin(df)
    return df
                    
                    
                        

In [194]:
# Give each empty cabin to its passenger when exactly one cabin-less passenger is compatible with it.
df = empty_room_one_compatible(df)



In [195]:
# Per-column count of missing values after the empty-room pass.
df.isna().sum()

PassengerId            0
HomePlanet            13
CryoSleep            310
Cabin                 41
Destination          274
Age                  270
VIP                  296
RoomService          263
FoodCourt            289
ShoppingMall         306
Spa                  284
VRDeck               268
Name                 294
Set                    0
Transported         4277
Group                  0
GroupNumber            0
Deck                  41
Number                41
Side                  41
FirstName            294
LastName             294
GroupSize              0
Bills                785
potential_decks    12929
potential_sides    12929
dtype: int64

## No free rooms left, so the passenger shares a cabin

In [196]:
def no_free_rooms_so_shares_3(df):
    """Give a cabin-less passenger their group's cabin when no free room fits.

    For every passenger without a cabin, each candidate deck/side is checked
    for a gap: the largest room number in the rows before the passenger and
    the smallest in the rows after must differ by more than one. When no
    deck/side has such a gap and the passenger's group has exactly one cabin
    on the candidate decks and sides, that cabin is assigned.

    Progress is printed for every passenger. The row label is used as a
    position for slicing, so a default 0..n-1 index is assumed.

    NOTE(review): nothing here checks that the passenger is the only group
    member without a cabin -- confirm whether that is required.
    """
    for index, passenger in df[df.Cabin.isna()].iterrows():
        print()
        print(index)

        rows_before = df.iloc[:index]
        rows_after = df.iloc[index+1:]

        options = []
        for deck in passenger.potential_decks:
            for side in passenger.potential_sides:
                top_room = rows_before[(rows_before.Deck == deck) & (rows_before.Side == side)].Number
                if len(top_room) == 0:
                    continue
                top_room_number_before = max(top_room)

                smallest_room = rows_after[(rows_after.Deck == deck) & (rows_after.Side == side)].Number
                if len(smallest_room) == 0:
                    continue
                smallest_room_number_after = min(smallest_room)

                # More than one step between the neighbours means a free room exists.
                if top_room_number_before + 1 < smallest_room_number_after:
                    options.append([deck,side,top_room_number_before,smallest_room_number_after])
        print(options)

        if len(options) != 0:
            continue

        in_group = df.Group == passenger.Group
        on_candidate_deck = df.Deck.isin(passenger.potential_decks)
        on_candidate_side = df.Side.isin(passenger.potential_sides)

        # Diagnostic only: counts group cabins on candidate decks, ignoring side.
        print(len(df[in_group & on_candidate_deck].dropna(subset = 'Cabin').Cabin.unique()))

        group_cabins = df[in_group & on_candidate_deck & on_candidate_side].dropna(subset = 'Cabin').Cabin
        if len(group_cabins.unique()) == 1:
            df.loc[index,'Cabin'] = group_cabins.iloc[0]

    df = fill_deck_and_side_from_cabin(df)

    return df
    


In [197]:
# Assign the group's cabin to passengers for whom no free room fits; prints the options found per row.
df = no_free_rooms_so_shares_3(df)


404
[['B', 'P', 12, 14], ['C', 'S', 12, 14]]

421
[['B', 'P', 12, 14], ['C', 'S', 12, 14]]

479
[['E', 'P', 19, 22]]

505
[['E', 'P', 19, 22]]

517
[['E', 'P', 19, 22]]

1429
[['E', 'P', 57, 59]]

1466
[['C', 'S', 39, 41], ['D', 'S', 35, 37], ['E', 'P', 57, 59]]

1543
[['C', 'S', 39, 41], ['D', 'S', 35, 37]]

2442
[]
3

2970
[]
3

3529
[['E', 'P', 149, 151], ['F', 'P', 518, 520]]

3530
[['E', 'P', 149, 151], ['F', 'P', 518, 520]]

4233
[['B', 'P', 97, 100]]

4254
[['B', 'P', 97, 100]]

4569
[]
2

4751
[]
2

5016
[['G', 'P', 589, 591], ['G', 'S', 578, 580]]

5017
[['G', 'P', 589, 591], ['G', 'S', 578, 580]]

6493
[['E', 'S', 299, 302]]

6514
[['E', 'S', 299, 302]]

8413
[['D', 'P', 190, 192]]

8450
[['D', 'P', 190, 192], ['E', 'P', 386, 388]]

8465
[['D', 'P', 190, 192], ['E', 'P', 386, 388]]

9265
[['F', 'S', 1266, 1268]]

9267
[['F', 'S', 1266, 1268]]

10081
[['F', 'P', 1488, 1490], ['G', 'P', 1156, 1158]]

10082
[['F', 'P', 1488, 1490], ['G', 'P', 1156, 1158]]

10290
[['C', 'S', 269

In [198]:
# Per-column count of missing values after the shared-cabin pass.
df.isna().sum()

PassengerId            0
HomePlanet            13
CryoSleep            310
Cabin                 41
Destination          274
Age                  270
VIP                  296
RoomService          263
FoodCourt            289
ShoppingMall         306
Spa                  284
VRDeck               268
Name                 294
Set                    0
Transported         4277
Group                  0
GroupNumber            0
Deck                  41
Number                41
Side                  41
FirstName            294
LastName             294
GroupSize              0
Bills                785
potential_decks    12929
potential_sides    12929
dtype: int64

# in a group of one and only one option

In [199]:
def solo_group_and_one_option(df):
    """Assign a cabin to solo passengers who have exactly one possible gap.

    For each passenger with GroupSize == 1 and no cabin, every empty cabin on
    a candidate deck/side is tested: it is an option when its number lies
    strictly between the largest room number in the rows before the passenger
    and the smallest in the rows after. An option records the deck, side and
    those two neighbouring numbers. When exactly one option is found and the
    neighbours are two apart, the single room between them is assigned.

    Diagnostics are printed for every passenger and every rejected cabin.
    The row label is used as a position for slicing, so a default 0..n-1
    index is assumed.
    """
    empty_cabins = rooms_to_fill(df)

    solo_without_cabin = df[(df.GroupSize == 1) & (df.Cabin.isna())]
    for index, passenger in solo_without_cabin.iterrows():
        print()
        print("index",index)

        rows_before = df.iloc[:index]
        rows_after = df.iloc[index+1:]

        options = []
        for deck, cabins_by_side in empty_cabins.items():
            if deck not in passenger.potential_decks:
                continue

            for side, cabins in cabins_by_side.items():
                if side not in passenger.potential_sides:
                    continue

                for cabin in cabins:
                    room_number = int(cabin.split("/")[1])

                    top_room = rows_before[(rows_before.Deck == deck) & (rows_before.Side == side)].Number
                    if len(top_room) == 0:
                        continue
                    top_room_number_before = max(top_room)

                    smallest_room = rows_after[(rows_after.Deck == deck) & (rows_after.Side == side)].Number
                    if len(smallest_room) == 0:
                        continue
                    smallest_room_number_after = min(smallest_room)

                    if not (top_room_number_before < room_number):
                        # Cabin number is not above the rooms seen before this passenger.
                        print("trnb",top_room_number_before,room_number)
                    elif smallest_room_number_after > room_number:
                        options.append([deck,side,top_room_number_before,smallest_room_number_after])
                    else:
                        # Cabin number is not below the rooms seen after this passenger.
                        print("srna",smallest_room_number_after,room_number)

        print(options)

        if len(options) != 1:
            continue

        only_deck, only_side, room_below, room_above = options[0]
        if room_below + 2 == room_above:
            # Exactly one room sits between the two neighbours.
            print()
            df.loc[index,'Cabin'] = only_deck + "/" + str(int(room_below) + 1) + "/" + only_side
            print("found",index,options)
        else:
            print()
            print("multigap")
            print(index,options)

    df = fill_deck_and_side_from_cabin(df)

    return df
                
                    
                    

In [200]:
# Fill cabins for solo passengers with a single one-room gap; prints the candidates examined per row.
df = solo_group_and_one_option(df)


index 404
srna 14 98
srna 14 99
srna 3 94
srna 14 40
srna 14 270
srna 14 298
srna 17 20
srna 17 21
srna 17 58
srna 17 150
srna 17 387
srna 18 300
srna 18 301
srna 18 528
srna 11 191
srna 11 235
srna 11 36
[['B', 'P', 12, 14], ['C', 'S', 12, 14]]

index 421
srna 14 98
srna 14 99
srna 4 94
srna 14 40
srna 14 270
srna 14 298
srna 17 20
srna 17 21
srna 17 58
srna 17 150
srna 17 387
srna 20 300
srna 20 301
srna 20 528
srna 12 191
srna 12 235
srna 11 36
[['B', 'P', 12, 14], ['C', 'S', 12, 14]]

index 1466
trnb 37 13
srna 38 98
srna 38 99
srna 8 94
trnb 39 13
srna 41 270
srna 41 298
trnb 57 20
trnb 57 21
srna 59 150
srna 59 387
srna 75 300
srna 75 301
srna 75 528
srna 40 191
srna 40 235
[['C', 'S', 39, 41], ['E', 'P', 57, 59], ['D', 'S', 35, 37]]

index 1543
trnb 38 13
srna 39 98
srna 39 99
srna 9 94
trnb 39 13
srna 41 270
srna 41 298
trnb 61 20
trnb 61 21
trnb 61 58
srna 62 150
srna 62 387
srna 76 300
srna 76 301
srna 76 528
srna 43 191
srna 43 235
[['C', 'S', 39, 41], ['D', 'S', 35, 37]]



In [205]:
# Per-column count of missing values after the solo-passenger pass.
df.isna().sum()

PassengerId            0
HomePlanet            13
CryoSleep            310
Cabin                 41
Destination          274
Age                  270
VIP                  296
RoomService          263
FoodCourt            289
ShoppingMall         306
Spa                  284
VRDeck               268
Name                 294
Set                    0
Transported         4277
Group                  0
GroupNumber            0
Deck                  41
Number                41
Side                  41
FirstName            294
LastName             294
GroupSize              0
Bills                785
potential_decks    12929
potential_sides    12929
dtype: int64

# Functions to help

In [213]:


# Hand-picked cabins, keyed by row label, applied in the order listed.
manual_cabin_assignments = {
    4233: 'B/98/P',
    4254: 'B/99/P',
    6493: 'E/300/S',
    6514: 'E/301/S',
    12892: 'F/1785/S',  # maybe only one is from this room and the other is joined in the other room
    12893: 'F/1785/S',
    9265: 'F/1267/S',
    9267: 'F/1267/S',
    # NOTE(review): the gap search printed for row 8413 lists only deck D, side P
    # (190-192) -- confirm that A/57/P is the intended cabin.
    8413: 'A/57/P',
    1429: 'E/58/P',
}

for row_label, cabin_code in manual_cabin_assignments.items():
    df.loc[row_label, 'Cabin'] = cabin_code

# Derive Deck / Number / Side for the rows just assigned.
df = fill_deck_and_side_from_cabin(df)


In [84]:
def all_cabin_options_for_each_row(df):
    """Print, for every cabin-less passenger, the deck/side gaps they could fit in.

    For each candidate deck and side, the largest room number in the rows
    before the passenger and the smallest in the rows after are compared; the
    pair is reported as an option unless the two numbers are equal or
    consecutive. Pairs with no room before or after are skipped.

    After all passengers, the number of passengers with no option is printed.
    Nothing is returned and the frame is not modified. The row label is used
    as a position for slicing, so a default 0..n-1 index is assumed.
    """
    passengers_without_option = 0
    for index, passenger in df[df.Cabin.isna()].iterrows():
        print("\nindex", index)
        print("passenger",passenger.PassengerId)
        print("GroupSize", passenger.GroupSize)

        rows_before = df.iloc[:index]
        rows_after = df.iloc[index+1:]

        options = []
        for deck in passenger.potential_decks:
            for side in passenger.potential_sides:
                top_room_number_before = np.max(
                    rows_before[(rows_before.Deck == deck) & (rows_before.Side == side)].Number
                )
                smallest_room_number_after = np.min(
                    rows_after[(rows_after.Deck == deck) & (rows_after.Side == side)].Number
                )
                # A missing bound means no room on this deck/side before or after.
                if pd.isna(top_room_number_before) or pd.isna(smallest_room_number_after):
                    continue

                is_consecutive = top_room_number_before + 1 == smallest_room_number_after
                is_same_room = top_room_number_before == smallest_room_number_after
                if not is_consecutive and not is_same_room:
                    options.append([deck,side,top_room_number_before,smallest_room_number_after])

        print(options)
        print(passenger.potential_sides)
        if not options:
            passengers_without_option += 1
    print(passengers_without_option)


In [85]:
def all_rows_for_each_cabin_option(df):
    """Print, for every empty cabin, the cabin-less rows that could occupy it.

    A row is compatible when the cabin's deck and side are among its
    'potential_decks' and 'potential_sides' and the cabin number lies strictly
    between the largest room number on that deck/side in the rows before it
    and the smallest in the rows after it.

    Nothing is returned and the frame is not modified. The row label is used
    as a position for slicing, so a default 0..n-1 index is assumed.
    """
    empty_cabins = rooms_to_fill(df)
    # The frame is never changed here, so the candidates can be selected once.
    rows_without_cabin = df[df.Cabin.isna()]

    for deck, cabins_by_side in empty_cabins.items():
        for side, cabins in cabins_by_side.items():
            for cabin in cabins:
                room_number = int(cabin.split("/")[1])
                indices_of_compatible_rows = []

                for index, row in rows_without_cabin.iterrows():
                    if deck not in row.potential_decks or side not in row.potential_sides:
                        continue

                    rows_before = df.iloc[:index]
                    rows_after = df.iloc[index+1:]
                    before = list(rows_before[(rows_before.Deck == deck) & (rows_before.Side == side)].Number.unique())
                    after = list(rows_after[(rows_after.Deck == deck) & (rows_after.Side == side)].Number.unique())

                    # Both neighbours must exist and must bracket the cabin number.
                    if before and max(before) < room_number and after and min(after) > room_number:
                        indices_of_compatible_rows.append(index)

                print()
                print("cabin", cabin)
                print(indices_of_compatible_rows)

In [86]:
# Diagnostic listing: the deck/side gaps available to each passenger still without a cabin.
all_cabin_options_for_each_row(df)


index 404
passenger 0293_01
GroupSize 1
[['B', 'P', 12, 14], ['C', 'S', 12, 14]]
['P', 'S']

index 421
passenger 0310_01
GroupSize 1
[['B', 'P', 12, 14], ['C', 'S', 12, 14]]
['P', 'S']

index 479
passenger 0348_02
GroupSize 2
[['E', 'P', 19, 22]]
['P']

index 505
passenger 0364_02
GroupSize 2
[['E', 'P', 19, 22]]
['P']

index 517
passenger 0374_02
GroupSize 2
[['E', 'P', 19, 22]]
['P']

index 1429
passenger 1011_01
GroupSize 2
[['E', 'P', 57, 59]]
['P']

index 1466
passenger 1041_01
GroupSize 1
[['C', 'S', 39, 41], ['D', 'S', 35, 37], ['E', 'P', 57, 59]]
['P', 'S']

index 1543
passenger 1095_01
GroupSize 1
[['C', 'S', 39, 41], ['D', 'S', 35, 37]]
['P', 'S']

index 2442
passenger 1709_03
GroupSize 7
[]
['S']

index 2970
passenger 2092_03
GroupSize 5
[]
['S']

index 3529
passenger 2513_01
GroupSize 1
[['E', 'P', 149, 151], ['F', 'P', 518, 520]]
['P', 'S']

index 3530
passenger 2514_01
GroupSize 1
[['E', 'P', 149, 151], ['F', 'P', 518, 520]]
['P', 'S']

index 4233
passenger 3034_01
Group

In [868]:
# Diagnostic listing: the cabin-less rows compatible with each empty cabin.
# NOTE(review): df2 is not defined in any cell visible here (the other cells use df) --
# confirm where it comes from before re-running on a fresh kernel.
all_rows_for_each_cabin_option(df2)


cabin B/13/P
[404, 421]

cabin F/519/P
[3529, 3530]

cabin F/1489/P
[10081, 10082]

cabin F/1544/P
[10434, 10440]

cabin F/1424/S
[10394, 10408, 10411, 10434]

cabin A/94/P
[12651, 12668]

cabin G/590/P
[5016, 5017]

cabin G/1157/P
[10081, 10082]

cabin G/579/S
[5016, 5017]

cabin G/1206/S
[10408, 10411]

cabin G/1212/S
[10434, 10440]

cabin C/13/S
[404, 421]

cabin C/40/S
[1466, 1543]

cabin C/270/S
[10290, 10313]

cabin C/298/S
[11129, 11148]

cabin E/20/P
[479, 505, 517]

cabin E/21/P
[479, 505, 517]

cabin E/58/P
[1429, 1466]

cabin E/150/P
[3529, 3530]

cabin E/387/P
[8450, 8465]

cabin E/528/S
[11129, 11148]

cabin D/191/P
[8413, 8450, 8465]

cabin D/235/P
[10313, 10394]

cabin D/36/S
[1466, 1543]
