# preprocessing

In [4]:
import pandas as pd

# Load both splits and tag every row with its origin so the combined frame
# can be separated again after preprocessing.
traindata = pd.read_csv('data/train.csv')
testdata = pd.read_csv('data/test.csv')
traindata['Set'] = 'Train'
testdata['Set'] = 'Test'
# ignore_index=True gives the combined frame a unique 0..n-1 index. Without it
# both files keep their own 0-based labels, so a label-based write such as
# df.loc[label, col] = value would hit one train row AND one test row.
df = pd.concat([traindata, testdata], ignore_index=True)


In [5]:
def splits(data_frame):
    """Derive group, cabin and name component columns from the raw string columns.

    Adds (in place) and returns:
      * ``Group`` / ``GroupNumber`` from ``PassengerId`` split on ``_``
      * ``Deck`` / ``CabinNumber`` / ``Side`` from ``Cabin`` split on ``/``
        (``CabinNumber`` is converted to the nullable ``Int64`` dtype)
      * ``FirstName`` / ``LastName`` from ``Name`` split on the first space

    Rows whose source value is missing get missing values in the derived columns.
    """
    data_frame[['Group', 'GroupNumber']] = data_frame['PassengerId'].str.split('_', expand=True)

    data_frame[['Deck', 'CabinNumber', 'Side']] = data_frame['Cabin'].str.split("/", expand=True)
    data_frame['CabinNumber'] = data_frame['CabinNumber'].astype('Int64')

    # n=1 caps the split at two pieces: a name with more than two words would
    # otherwise expand to three or more columns and the two-column assignment
    # would raise. Everything after the first space is kept as the last name.
    data_frame[['FirstName', 'LastName']] = data_frame['Name'].str.split(" ", n=1, expand=True)

    return data_frame

# Add the PassengerId-, Cabin- and Name-derived columns to the combined frame.
df = splits(df)


In [6]:
def calculate_group_size(dataframe):
    """Add a ``GroupSize`` column: the number of rows sharing each row's ``Group``.

    The frame is modified in place and also returned.
    """
    group_keys = dataframe['Group']
    # Broadcast each group's row count back onto every member of the group.
    dataframe['GroupSize'] = group_keys.groupby(group_keys).transform('count')
    return dataframe


df = calculate_group_size(df)

# Total on-board spend. Plain addition is used so that a missing value in any
# amenity column leaves Bills missing for that row.
spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
total_spend = df[spend_columns[0]]
for spend_column in spend_columns[1:]:
    total_spend = total_spend + df[spend_column]
df['Bills'] = total_spend

# Passengers under 13 and passengers in cryosleep are given a bill of zero,
# overriding whatever the sum produced (including a missing total).
zero_bill = (df['Age'] < 13) | (df['CryoSleep'] == True)
df.loc[zero_bill, 'Bills'] = 0
    
    
    


    


In [7]:

def fill_potential_decks(dataframe):
    """Add a ``potential_decks`` column listing candidate decks for rows with no cabin.

    Rows that already have a ``Cabin`` get ``None``. For the others:
      * unknown ``HomePlanet``: every deck observed anywhere in the frame;
      * known ``HomePlanet`` with ``Bills == 0`` and a group whose known decks
        span more than one distinct deck: the narrower "no bills" list;
      * otherwise: the general deck list for that home planet.

    The frame is modified in place and also returned.
    """
    decks_for_planet = {
        'Earth': ['E', 'F', 'G'],
        'Europa': ['A', 'B', 'C', 'D', 'E', 'T'],
        'Mars': ['D', 'E', 'F'],
    }
    decks_for_planet_without_bills = {
        'Earth': ['G'],
        'Europa': ['B'],
        'Mars': ['E', 'F'],
    }

    def candidate_decks(row):
        # Only rows with a missing cabin receive candidates.
        if not pd.isna(row.Cabin):
            return None

        if pd.isna(row.HomePlanet):
            return list(dataframe.dropna(subset=['Deck']).Deck.unique())

        if row.Bills == 0:
            in_same_group = dataframe.Group == row.Group
            group_decks = dataframe.loc[in_same_group, 'Deck'].dropna().unique()
            if len(group_decks) > 1:
                return decks_for_planet_without_bills[row.HomePlanet]

        return decks_for_planet[row.HomePlanet]

    dataframe['potential_decks'] = dataframe.apply(candidate_decks, axis=1)
    return dataframe
    
            

def fill_potential_sides(dataframe):
    """Add a ``potential_sides`` column listing candidate ship sides for rows with no cabin.

    Rows that already have a ``Cabin`` get ``None``. A cabin-less passenger in a
    group of more than one takes the first known ``Side`` found in that group
    (as a one-element list); everyone else gets both sides, ``['P', 'S']``.

    The frame is modified in place and also returned.
    """

    def candidate_sides(row):
        if not pd.isna(row.Cabin):
            return None

        if row.GroupSize > 1:
            in_same_group = dataframe.Group == row.Group
            known_sides = dataframe.loc[in_same_group, 'Side'].dropna()
            if not known_sides.empty:
                return [known_sides.iloc[0]]

        return ['P', 'S']

    dataframe['potential_sides'] = dataframe.apply(candidate_sides, axis=1)
    return dataframe
    
    
# Attach the candidate deck and side lists used later by the cabin-imputation steps.
df = fill_potential_decks(df)
df = fill_potential_sides(df)



In [8]:
def impute_based_on_shared_features(dataframe,attribute,shared_feature):
    """Fill missing ``attribute`` values from rows that share the same ``shared_feature``.

    For every row whose ``attribute`` is missing, the first non-missing
    ``attribute`` value (in row order) among the rows with the same
    ``shared_feature`` value is copied in. Rows whose ``shared_feature`` is
    itself missing, or whose group has no known value, are left untouched.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to update; modified in place.
    attribute : str
        Column whose missing values should be filled.
    shared_feature : str
        Column that defines which rows belong together.

    Returns
    -------
    pandas.DataFrame
        The same frame, for call chaining.
    """
    # Only rows with a usable grouping key can borrow a value.
    fillable = (dataframe[attribute].isna() & dataframe[shared_feature].notna()).to_numpy()
    if not fillable.any():
        return dataframe

    # First non-missing attribute value of each group, broadcast to every member.
    first_known = dataframe.groupby(shared_feature)[attribute].transform('first').to_numpy()

    # Write through a positional boolean mask rather than index labels, so a
    # frame with repeated index labels never has unrelated rows overwritten.
    has_donor = fillable & ~pd.isna(first_known)
    if has_donor.any():
        dataframe.loc[has_donor, attribute] = first_known[has_donor]

    return dataframe

# Fill missing HomePlanet values, first from rows in the same Group and then,
# for whatever is still missing, from rows with the same LastName.
df = impute_based_on_shared_features(df,'HomePlanet','Group')
df = impute_based_on_shared_features(df,'HomePlanet','LastName')

In [9]:
# Order rows by group and position within the group, then renumber the index
# 0..n-1 so that index order matches that ordering.
df = (
    df
    .sort_values(by=['Group', 'GroupNumber'])
    .reset_index(drop=True)
)


In [10]:
# Keep a copy of the frame as it stands before any cabin imputation is applied.
df_perm = df.copy()

In [16]:
def impute_from_cabin(dataframe,cabin,index):
    """Write a ``deck/number/side`` cabin string and its components into one row.

    ``cabin`` is split on ``/``; the row labelled ``index`` gets the full string
    in ``Cabin``, the first piece in ``Deck``, the second piece (as an int) in
    ``CabinNumber`` and the third piece in ``Side``. The frame is modified in
    place and also returned.
    """
    pieces = cabin.split("/")
    cabin_fields = {
        'Cabin': cabin,
        'Deck': pieces[0],
        'CabinNumber': int(pieces[1]),
        'Side': pieces[2],
    }
    dataframe.loc[index, list(cabin_fields)] = list(cabin_fields.values())
    return dataframe

# workings 2

In [17]:
def room_number_constraints_for_passengers(dataframe):
    """List, for every row without a cabin, the cabin strings that fit its position.

    For each candidate deck/side pair of a cabin-less row, the cabin numbers
    seen on that deck and side in rows with a smaller index and in rows with a
    larger index are collected. Every number strictly between the largest
    "before" number and the smallest "after" number is a candidate. A pair
    with no known cabin before or no known cabin after contributes nothing.

    Returns a dict mapping row index -> list of ``"deck/number/side"`` strings
    (possibly empty).
    """
    options_by_passenger = {}

    for index, passenger in dataframe[dataframe['Cabin'].isna()].iterrows():
        rows_before = dataframe.index < index
        rows_after = dataframe.index > index
        candidates = []

        for deck in passenger.potential_decks:
            for side in passenger.potential_sides:
                on_deck_side = (dataframe['Deck'] == deck) & (dataframe['Side'] == side)

                numbers_before = dataframe.loc[on_deck_side & rows_before, 'CabinNumber'].dropna()
                numbers_after = dataframe.loc[on_deck_side & rows_after, 'CabinNumber'].dropna()
                if numbers_before.empty or numbers_after.empty:
                    continue

                first_free = max(numbers_before) + 1
                upper_bound = min(numbers_after)
                # Only a genuine gap between the neighbouring numbers yields candidates.
                if first_free < upper_bound:
                    candidates.extend(
                        f"{deck}/{number}/{side}" for number in range(first_free, upper_bound)
                    )

        options_by_passenger[index] = candidates

    return options_by_passenger





# Solo passenger with exactly one free cabin that fits

In [18]:
def solo_group_one_option(dataframe):
    """Assign a cabin to solo travellers that have exactly one candidate cabin.

    Candidate cabins come from ``room_number_constraints_for_passengers``,
    computed once at the start of the call. Every row with a missing ``Cabin``
    and ``GroupSize == 1`` whose candidate list has exactly one entry is given
    that cabin via ``impute_from_cabin``.

    The frame is modified in place and also returned.
    """
    potential_options = room_number_constraints_for_passengers(dataframe)

    # Select rows from the frame that was passed in (not a notebook-level
    # global), so the function works on any frame it is given.
    solo_without_cabin = dataframe.index[dataframe.Cabin.isna() & (dataframe.GroupSize == 1)]

    for index in solo_without_cabin:
        options = potential_options[index]
        if len(options) == 1:
            dataframe = impute_from_cabin(dataframe, options[0], index)

    return dataframe


In [19]:
# Display the candidate cabins for every passenger that still lacks one.
room_number_constraints_for_passengers(df)

{15: ['G/2/S'],
 66: [],
 137: ['F/20/P'],
 150: [],
 315: ['E/15/S', 'G/39/P'],
 331: ['E/15/S'],
 336: ['E/15/S'],
 382: [],
 394: [],
 404: ['B/13/P', 'C/13/S'],
 412: ['G/40/S'],
 421: ['B/13/P', 'C/13/S'],
 440: ['F/70/P', 'G/46/S'],
 444: ['F/70/P'],
 479: ['E/20/P', 'E/21/P'],
 492: ['E/20/P', 'E/21/P', 'G/49/S'],
 505: ['E/20/P', 'E/21/P'],
 517: ['E/20/P', 'E/21/P'],
 529: ['B/12/S'],
 623: [],
 650: ['G/73/P'],
 695: [],
 701: ['E/30/S'],
 715: ['G/75/S'],
 732: ['G/79/S'],
 809: ['F/106/S'],
 982: [],
 1003: [],
 1016: ['B/30/P'],
 1042: ['F/152/P', 'G/110/S'],
 1049: ['G/110/S'],
 1069: [],
 1077: [],
 1175: ['F/157/S'],
 1178: ['G/121/P'],
 1188: ['C/35/S'],
 1244: ['E/49/P', 'F/181/P'],
 1251: ['F/181/P'],
 1319: ['F/170/S'],
 1401: [],
 1423: [],
 1429: ['E/58/P'],
 1441: ['E/58/P', 'G/151/P'],
 1466: ['C/40/S', 'D/36/S', 'E/58/P'],
 1543: ['C/40/S', 'D/36/S'],
 1574: ['F/213/S'],
 1601: ['G/171/P'],
 1603: ['E/65/P'],
 1605: [],
 1631: [],
 1712: ['F/230/S'],
 1759: ['F

In [20]:
# Apply the solo-passenger rule, then show the remaining missing values per column.
df = solo_group_one_option(df)
df.isna().sum()

PassengerId            0
HomePlanet            12
CryoSleep            310
Cabin                189
Destination          274
Age                  270
VIP                  296
RoomService          263
FoodCourt            289
ShoppingMall         306
Spa                  284
VRDeck               268
Name                 294
Transported         4277
Set                    0
Group                  0
GroupNumber            0
Deck                 189
CabinNumber          189
Side                 189
FirstName            294
LastName             294
GroupSize              0
Bills                785
potential_decks    12671
potential_sides    12671
dtype: int64

# No free cabin fits, so the passenger must share a cabin with their group

In [21]:
def no_free_rooms_so_shares(dataframe):
    """Place passengers with no free candidate cabin into their group's cabin.

    For every cabin-less row whose candidate list from
    ``room_number_constraints_for_passengers`` is empty, the cabins already held
    by members of the same ``Group`` on one of the row's ``potential_decks`` and
    ``potential_sides`` are collected. If exactly one distinct cabin remains,
    the row is given that cabin via ``impute_from_cabin``.

    The frame is modified in place and also returned.
    """
    options_by_passenger = room_number_constraints_for_passengers(dataframe)

    for index, cabin_options in options_by_passenger.items():
        # Passengers that still have a free candidate cabin are not handled here.
        if cabin_options:
            continue

        passenger = dataframe.loc[index]
        same_group = dataframe['Group'] == passenger['Group']
        deck_allowed = dataframe['Deck'].isin(passenger['potential_decks'])
        side_allowed = dataframe['Side'].isin(passenger['potential_sides'])

        group_cabins = dataframe.loc[same_group & deck_allowed & side_allowed, 'Cabin'].dropna().unique()

        if len(group_cabins) == 1:
            dataframe = impute_from_cabin(dataframe, group_cabins[0], index)

    return dataframe
    


In [22]:
# Apply the shared-cabin rule, then show the remaining missing values per column.
df = no_free_rooms_so_shares(df)
df.isna().sum()

PassengerId            0
HomePlanet            12
CryoSleep            310
Cabin                 91
Destination          274
Age                  270
VIP                  296
RoomService          263
FoodCourt            289
ShoppingMall         306
Spa                  284
VRDeck               268
Name                 294
Transported         4277
Set                    0
Group                  0
GroupNumber            0
Deck                  91
CabinNumber           91
Side                  91
FirstName            294
LastName             294
GroupSize              0
Bills                785
potential_decks    12671
potential_sides    12671
dtype: int64

# Free cabin that only one passenger can take

In [23]:

                    
def rooms_to_fill(dataframe):
    """Return the cabins that are missing from each deck/side numbering sequence.

    For every deck present in the frame and each side (``'P'`` and ``'S'``),
    the cabin numbers that occur are collected; every number from 0 up to the
    largest one seen that does not occur is reported as ``"deck/number/side"``.
    Decks are visited in order of first appearance, port side before starboard,
    and numbers in ascending order.
    """
    rooms = []
    has_number = dataframe.CabinNumber.notna()

    for deck in dataframe.Deck.dropna().unique():
        for side in ['P', 'S']:
            on_deck_side = (dataframe.Deck == deck) & (dataframe.Side == side) & has_number
            # A set makes the membership test below constant-time instead of
            # scanning a list once per cabin number.
            rooms_seen = set(dataframe.loc[on_deck_side, 'CabinNumber'].astype(int).tolist())
            largest_room_number = max(rooms_seen, default=-1)

            rooms.extend(
                f"{deck}/{number}/{side}"
                for number in range(largest_room_number + 1)
                if number not in rooms_seen
            )

    return rooms
           

In [24]:
# Display the cabins currently missing from the deck/side numbering sequences.
rooms_to_fill(df)

['B/13/P',
 'B/98/P',
 'B/99/P',
 'F/152/P',
 'F/181/P',
 'F/293/P',
 'F/307/P',
 'F/519/P',
 'F/559/P',
 'F/772/P',
 'F/849/P',
 'F/891/P',
 'F/1093/P',
 'F/1122/P',
 'F/1178/P',
 'F/1237/P',
 'F/1489/P',
 'F/1544/P',
 'F/157/S',
 'F/213/S',
 'F/235/S',
 'F/674/S',
 'F/827/S',
 'F/1267/S',
 'F/1376/S',
 'F/1424/S',
 'F/1785/S',
 'A/94/P',
 'G/39/P',
 'G/151/P',
 'G/273/P',
 'G/590/P',
 'G/775/P',
 'G/1053/P',
 'G/1157/P',
 'G/1252/P',
 'G/1286/P',
 'G/1474/P',
 'G/46/S',
 'G/49/S',
 'G/280/S',
 'G/334/S',
 'G/399/S',
 'G/534/S',
 'G/579/S',
 'G/759/S',
 'G/874/S',
 'G/878/S',
 'G/1062/S',
 'G/1164/S',
 'G/1200/S',
 'G/1206/S',
 'G/1212/S',
 'G/1282/S',
 'C/138/P',
 'C/13/S',
 'C/40/S',
 'C/214/S',
 'C/270/S',
 'C/277/S',
 'C/298/S',
 'E/20/P',
 'E/21/P',
 'E/49/P',
 'E/58/P',
 'E/150/P',
 'E/161/P',
 'E/257/P',
 'E/387/P',
 'E/541/P',
 'E/174/S',
 'E/300/S',
 'E/301/S',
 'E/528/S',
 'D/191/P',
 'D/235/P',
 'D/281/P',
 'D/36/S',
 'D/212/S']

In [25]:
def only_passenger_that_fits(dataframe):
    """Fill each free cabin that exactly one cabin-less passenger could occupy.

    Free cabins come from ``rooms_to_fill`` and per-passenger candidates from
    ``room_number_constraints_for_passengers``; both are computed once at the
    start of the call. A free cabin that appears in exactly one passenger's
    candidate list is given to that passenger via ``impute_from_cabin``.

    The frame is modified in place and also returned.
    """
    free_cabins = rooms_to_fill(dataframe)
    options_by_passenger = room_number_constraints_for_passengers(dataframe)

    for cabin in free_cabins:
        candidates = [
            passenger_index
            for passenger_index, passenger_options in options_by_passenger.items()
            if cabin in passenger_options
        ]
        # NOTE(review): candidates are not refreshed after an assignment, so a
        # passenger who is the sole candidate for several cabins ends up with
        # the last of them. Confirm this is intended.
        if len(candidates) == 1:
            dataframe = impute_from_cabin(dataframe, cabin, candidates[0])

    return dataframe
    

In [26]:
# Apply the "only one passenger fits this cabin" rule.
df = only_passenger_that_fits(df)

# all imputes

In [27]:
# Start again from the pre-imputation snapshot before running all rules together.
df = df_perm.copy()

In [28]:
def all_imputes(dataframe):
    """Run the three cabin-imputation rules in sequence, twice over.

    Each round applies ``solo_group_one_option``, ``no_free_rooms_so_shares``
    and ``only_passenger_that_fits`` in that order. The second round lets
    cabins filled in the first round feed into the rules again.
    """
    impute_steps = (
        solo_group_one_option,
        no_free_rooms_so_shares,
        only_passenger_that_fits,
    )
    for _ in range(2):
        for impute_step in impute_steps:
            dataframe = impute_step(dataframe)

    return dataframe
    
# Run every imputation rule (two rounds), then show the remaining missing values per column.
df = all_imputes(df)
df.isna().sum()

PassengerId            0
HomePlanet            12
CryoSleep            310
Cabin                 41
Destination          274
Age                  270
VIP                  296
RoomService          263
FoodCourt            289
ShoppingMall         306
Spa                  284
VRDeck               268
Name                 294
Transported         4277
Set                    0
Group                  0
GroupNumber            0
Deck                  41
CabinNumber           41
Side                  41
FirstName            294
LastName             294
GroupSize              0
Bills                785
potential_decks    12671
potential_sides    12671
dtype: int64

In [29]:
# Missing values per column (same check as at the end of the previous cell).
df.isna().sum()

PassengerId            0
HomePlanet            12
CryoSleep            310
Cabin                 41
Destination          274
Age                  270
VIP                  296
RoomService          263
FoodCourt            289
ShoppingMall         306
Spa                  284
VRDeck               268
Name                 294
Transported         4277
Set                    0
Group                  0
GroupNumber            0
Deck                  41
CabinNumber           41
Side                  41
FirstName            294
LastName             294
GroupSize              0
Bills                785
potential_decks    12671
potential_sides    12671
dtype: int64

# workings


In [30]:
def all_cabin_options_for_each_row(dataframe):
    """Print the candidate cabins and group size of every row that lacks a cabin.

    Candidates come from ``room_number_constraints_for_passengers``, whose keys
    are index labels of ``dataframe``. Nothing is returned.
    """
    options = room_number_constraints_for_passengers(dataframe)
    for index, cabin_options in options.items():
        print()
        # The keys are index labels, so look the row up by label (.loc), not by
        # position; the two only coincide on a 0..n-1 index.
        print("Index:", index, "GroupSize:", dataframe.loc[index, 'GroupSize'])
        print("Free cabins that match:")
        print(cabin_options)
             


In [31]:
# Print the remaining cabin-less passengers with their candidate cabins for manual review.
all_cabin_options_for_each_row(df)


Index: 404 GroupSize: 1
Free cabins that match:
['B/13/P', 'C/13/S']

Index: 421 GroupSize: 1
Free cabins that match:
['B/13/P', 'C/13/S']

Index: 479 GroupSize: 2
Free cabins that match:
['E/20/P', 'E/21/P']

Index: 505 GroupSize: 2
Free cabins that match:
['E/20/P', 'E/21/P']

Index: 517 GroupSize: 2
Free cabins that match:
['E/20/P', 'E/21/P']

Index: 1429 GroupSize: 2
Free cabins that match:
['E/58/P']

Index: 1466 GroupSize: 1
Free cabins that match:
['C/40/S', 'D/36/S', 'E/58/P']

Index: 1543 GroupSize: 1
Free cabins that match:
['C/40/S', 'D/36/S']

Index: 2442 GroupSize: 7
Free cabins that match:
[]

Index: 2970 GroupSize: 5
Free cabins that match:
[]

Index: 3529 GroupSize: 1
Free cabins that match:
['E/150/P', 'F/519/P']

Index: 3530 GroupSize: 1
Free cabins that match:
['E/150/P', 'F/519/P']

Index: 4233 GroupSize: 1
Free cabins that match:
['B/98/P', 'B/99/P']

Index: 4254 GroupSize: 1
Free cabins that match:
['B/98/P', 'B/99/P']

Index: 4569 GroupSize: 3
Free cabins that 

In [34]:
def update_cabins(dataframe, cabin_list):
    """Apply a list of manual ``(index, cabin)`` assignments to the frame.

    For each pair, the ``deck/number/side`` cabin string is split on ``/`` and
    the row labelled ``index`` gets the full string in ``Cabin``, the deck in
    ``Deck``, the number (as an int) in ``CabinNumber`` and the side in ``Side``.
    The frame is modified in place and also returned.
    """
    target_columns = ['Cabin', 'Deck', 'CabinNumber', 'Side']
    for index, cabin in cabin_list:
        pieces = cabin.split("/")
        new_values = [cabin, pieces[0], int(pieces[1]), pieces[2]]
        dataframe.loc[index, target_columns] = new_values
    return dataframe

# Manual cabin assignments as (row index, cabin) pairs. The indices refer to the
# sorted, re-indexed frame; presumably chosen from the options printed above — TODO confirm.
# Rows 9265 and 9267 are both given cabin F/1267/S.
cabin_list = [(1429,'E/58/P'),(8413,'A/57/P'),(9265,'F/1267/S'),(9267,'F/1267/S')]

# Second batch of manual assignments; the author's original note ("maybe have to do") marks these as less certain.

cabin_list2 = [(4233,'B/98/P'),(4254,'B/99/P'),(6493,'E/300/S'),(6514,'E/301/S')]

# Apply both batches to the frame.
df = update_cabins(df,cabin_list)
df = update_cabins(df,cabin_list2)

"""
df.loc[12892,'Cabin'] = 'F/1785/S' # maybe only one is from this room and the other is joined in the other room
df.loc[12893,'Cabin'] = 'F/1785/S'"""

"\ndf.loc[12892,'Cabin'] = 'F/1785/S' # maybe only one is from this room and the other is joined in the other room\ndf.loc[12893,'Cabin'] = 'F/1785/S'"

In [35]:
# Separate the combined frame back into its train and test rows using the Set tag.
traindata = df[df.Set == 'Train']
testdata = df[df.Set == 'Test']

# End


In [36]:
# Load a reference file to compare cabins against, and align its cabin-number
# column name and dtype (nullable Int64) with those used in df.
# NOTE(review): the provenance of 31remaining.csv is not shown here — confirm what it contains.
df_to_comp = pd.read_csv('data/31remaining.csv')
df_to_comp = df_to_comp.rename(columns = {'Number':'CabinNumber'})
df_to_comp['CabinNumber'] = df_to_comp['CabinNumber'].astype('Int64')


In [37]:
# Print every row whose Cabin differs from the reference file; rows where both
# values are missing are skipped.
# The reference row is fetched by position (.iloc) using df's index label, so
# this assumes df has a 0..n-1 index in the same row order as the reference — TODO confirm.
for index,row in df.iterrows():
    if not (pd.isna(row.Cabin) and pd.isna(df_to_comp.iloc[index].Cabin)):
        if row.Cabin != df_to_comp.iloc[index].Cabin:
            print(index,row.Cabin, df_to_comp.iloc[index].Cabin)

12892 nan F/1785/S
12893 nan F/1785/S
