# preprocessing

In [59]:
import pandas as pd

traindata = pd.read_csv('data/train.csv')
testdata = pd.read_csv('data/test.csv')
traindata['Set'] = 'Train'
testdata['Set'] = 'Test'
df = pd.concat([traindata,testdata])


In [60]:
def splits(data_frame):
    split_df = data_frame['PassengerId'].str.split('_', expand=True)
    data_frame[['Group', 'GroupNumber']] = split_df

    split_df =  data_frame['Cabin'].str.split("/", expand = True)
    data_frame[['Deck', 'CabinNumber', 'Side']]= split_df
    data_frame.CabinNumber = data_frame.CabinNumber.astype('Int64')
    
    split_df = data_frame['Name'].str.split(" ",expand = True)
    data_frame[['FirstName','LastName']] = split_df


    return data_frame

df = splits(df)


In [61]:
def calculate_group_size(dataframe):
    # Calculate group sizes and assign directly to a new column
    dataframe['GroupSize'] = dataframe.groupby('Group')['Group'].transform('count')
    return dataframe


df = calculate_group_size(df)

df['Bills'] = df['RoomService'] + df['FoodCourt'] + df['ShoppingMall'] + df['Spa'] + df['VRDeck']
df.loc[(df['Age'] < 13), 'Bills'] = 0
df.loc[(df['CryoSleep'] == True),'Bills'] = 0 
    
    
    


    


In [62]:

def fill_potential_decks(dataframe):
    
    
    potential_decks_by_homeplanet = {
    'Earth':['E','F','G'],
    'Europa': ['A','B','C','D','E','T'],
    'Mars': ['D','E','F']
    }

    potential_decks_by_homeplanet_no_bills = {
        'Earth':['G'],
        'Europa':['B'],
        'Mars': ['E','F']
    }
    
    def func_potential_decks_apply(row):
        if pd.isna(row.Cabin):
            if row.Bills == 0 and not pd.isna(row.HomePlanet):
                if len(dataframe[dataframe.Group == row.Group].dropna(subset = 'Deck').Deck.unique()) > 1:
                    return potential_decks_by_homeplanet_no_bills[row.HomePlanet]
                
            if not pd.isna(row.HomePlanet):
                return potential_decks_by_homeplanet[row.HomePlanet]
            
            else:
                return list(dataframe.dropna(subset = ['Deck']).Deck.unique())
            
    dataframe['potential_decks'] = dataframe.apply(func_potential_decks_apply,axis = 1)
    return dataframe
    
            

def fill_potential_sides(dataframe):
    
    def func_potential_sides_apply(row):
        if pd.isna(row.Cabin):
            if row.GroupSize > 1:
                group = dataframe[dataframe.Group == row.Group].dropna(subset = 'Side')
                if len(group) > 0:
                    return [group.iloc[0].Side]
            return ['P','S']
        
    dataframe['potential_sides'] = dataframe.apply(func_potential_sides_apply,axis = 1)
    return dataframe
    
    
df = fill_potential_decks(df)
df = fill_potential_sides(df)



In [63]:
def impute_based_on_shared_features(dataframe,attribute,shared_feature):
    for index, row in dataframe[dataframe[attribute].isna()].iterrows():
        # Find the group sharing the same feature value, excluding missing values in the attribute
        entries_with_shared_feature = dataframe[dataframe[shared_feature] == row[shared_feature]].dropna(subset=[attribute])
        
        if not entries_with_shared_feature.empty:
            # Assign the first non-missing value of the attribute in the group to the missing value
            dataframe.loc[index, attribute] = entries_with_shared_feature[attribute].iloc[0]

    return dataframe

df = impute_based_on_shared_features(df,'HomePlanet','Group')
df = impute_based_on_shared_features(df,'HomePlanet','LastName')

In [64]:
df = df.sort_values(by = ['Group','GroupNumber'])
df = df.reset_index(drop = True)


# workings 2

In [65]:
def room_number_constraints_for_passengers(dataframe):
    passenger_options = {}
    passengers_without_cabin = dataframe[dataframe['Cabin'].isna()]

    for index, passenger in passengers_without_cabin.iterrows():
        potential_options = {}

        for deck in passenger.potential_decks:
            for side in passenger.potential_sides:
                mask = (dataframe['Deck'] == deck) & (dataframe['Side'] == side)

                # Get the rooms before and after the current passenger index
                before = dataframe.loc[mask & (dataframe.index < index), 'CabinNumber'].dropna().unique()
                after = dataframe.loc[mask & (dataframe.index > index), 'CabinNumber'].dropna().unique()
                
                if len(before) == 0 or len(after) == 0:
                    continue
                
                elif max(before) + 1 >= min(after):
                    continue
                
                else:
                    if deck not in potential_options:
                        potential_options[deck] = {}
                    
                    potential_options[deck][side] = [i  for i in range(max(before) + 1, min(after))]
        

        passenger_options[index] = potential_options

    return passenger_options





# solo group and only one room that fits

# this should all be changed to see if theres only a single entry in the room_number_constraints_for passengers

In [88]:
def solo_group_one_option(dataframe):
    potential_options = room_number_constraints_for_passengers(dataframe)

    for index,passenger in potential_options.items():
        if dataframe.iloc[index].GroupSize != 1:
            continue
        empty_cabin_options_for_passenger = []
        
        refresh_options = False
        
        
         
        for deck in passenger.keys():
            for side in passenger[deck].keys():
                for number in passenger[deck][side]:
                    empty_cabin_options_for_passenger.append(f"{deck}/{number}/{side}")
        

        if len(empty_cabin_options_for_passenger) == 1:
            cabin = empty_cabin_options_for_passenger[0]
            dataframe.loc[index,['Cabin','Deck','CabinNumber','Side']] = [cabin,cabin.split("/")[0],int(cabin.split("/")[1]),cabin.split("/")[2]]
            if refresh_options:
                potential_options = room_number_constraints_for_passengers(dataframe)
                

    return dataframe


In [89]:
df = solo_group_one_option(df)
df.isna().sum()

PassengerId            0
HomePlanet            12
CryoSleep            310
Cabin                 46
Destination          274
Age                  270
VIP                  296
RoomService          263
FoodCourt            289
ShoppingMall         306
Spa                  284
VRDeck               268
Name                 294
Transported         4277
Set                    0
Group                  0
GroupNumber            0
Deck                  46
CabinNumber           46
Side                  46
FirstName            294
LastName             294
GroupSize              0
Bills                785
potential_decks    12671
potential_sides    12671
dtype: int64

# no free rooms so has to share

In [90]:
def no_free_rooms_so_shares(dataframe):
    potential_options = room_number_constraints_for_passengers(dataframe)
    
    for index,passenger in potential_options.items():
        
        if not bool(passenger):
            row = dataframe.loc[index]
            potential_cabins = dataframe[(dataframe['Group'] == row['Group']) &
                                  (dataframe.index != index) &
                                  (dataframe['Deck'].isin(row['potential_decks'])) &
                                  (dataframe['Side'].isin(row['potential_sides']))]['Cabin'].unique()
            if len(potential_cabins) == 1:
                cabin = potential_cabins[0]
                dataframe.loc[index,['Cabin','Deck','CabinNumber','Side']] = [cabin,cabin.split("/")[0],int(cabin.split("/")[1]),cabin.split("/")[2]]
    return dataframe
    


In [91]:
df = no_free_rooms_so_shares(df)
df.isna().sum()

PassengerId            0
HomePlanet            12
CryoSleep            310
Cabin                 46
Destination          274
Age                  270
VIP                  296
RoomService          263
FoodCourt            289
ShoppingMall         306
Spa                  284
VRDeck               268
Name                 294
Transported         4277
Set                    0
Group                  0
GroupNumber            0
Deck                  46
CabinNumber           46
Side                  46
FirstName            294
LastName             294
GroupSize              0
Bills                785
potential_decks    12671
potential_sides    12671
dtype: int64

# only passenger that can take that cabin

In [92]:

                    
def rooms_to_fill(dataframe):
    rooms = {}
    for deck in list(dataframe.dropna(subset = ['Deck']).Deck.unique()):
        rooms[deck] = {'P': [], 'S': []} 
        for side in rooms[deck].keys():
            rooms_seen = dataframe[(dataframe.Deck == deck) & (dataframe.Side == side) & (dataframe.CabinNumber.notna())].CabinNumber.astype(int).tolist()
            largest_room_number = max(rooms_seen, default=-1) 

            for i in range(largest_room_number + 1):
                if i not in rooms_seen:
                    rooms[deck][side].append(f"{deck}/{i}/{side}")

    return rooms
           

In [93]:
def only_passenger_that_fits(dataframe):
    free_rooms_dict = rooms_to_fill(dataframe)
    free_passengers = room_number_constraints_for_passengers(dataframe)
    for deck,deck_items in free_rooms_dict.items():
        for side,side_items in deck_items.items():
            for cabin in side_items:
                
                room_number = int(cabin.split('/')[1])
                passenger_options_for_empty_cabins = []
                
                for passenger_ind,passenger in free_passengers.items():
                    if deck not in passenger or side not in passenger[deck]:
                        continue
                    
                    if room_number in passenger[deck][side]:
                        passenger_options_for_empty_cabins.append(passenger_ind)
                        
                if len(passenger_options_for_empty_cabins) == 1:
                    passenger_to_fill_ind = passenger_options_for_empty_cabins[0]
                    
                    dataframe.loc[passenger_to_fill_ind,['Cabin','Deck','CabinNumber','Side']] = [cabin,cabin.split("/")[0],int(cabin.split("/")[1]),cabin.split("/")[2]]
                    
                            
    return dataframe
    

In [94]:
df = only_passenger_that_fits(df)

In [95]:
df.isna().sum()

PassengerId            0
HomePlanet            12
CryoSleep            310
Cabin                 41
Destination          274
Age                  270
VIP                  296
RoomService          263
FoodCourt            289
ShoppingMall         306
Spa                  284
VRDeck               268
Name                 294
Transported         4277
Set                    0
Group                  0
GroupNumber            0
Deck                  41
CabinNumber           41
Side                  41
FirstName            294
LastName             294
GroupSize              0
Bills                785
potential_decks    12671
potential_sides    12671
dtype: int64

# all imputes

In [13]:
def all_imputes(dataframe):
    dataframe = solo_group_one_option(dataframe)
    dataframe = no_free_rooms_so_shares(dataframe)
    dataframe = only_passenger_that_fits(dataframe)

    dataframe = solo_group_one_option(dataframe)
    dataframe = no_free_rooms_so_shares(dataframe)
    dataframe = only_passenger_that_fits(dataframe)
    
    return dataframe
    
df = all_imputes(df)
df.isna().sum()

IndexError: list index out of range

# workings


In [43]:
def all_cabin_options_for_each_row(dataframe):
    options = room_number_constraints_for_passengers(dataframe)
    for index, passenger in options.items():
        print()
        print("Index:",index, "GroupSize:", dataframe.iloc[index].GroupSize)
        print("Free cabins that match:")
        for deck in passenger.keys():
            for side, numbers in passenger[deck].items():
                print(deck,side,numbers)
                
             


In [55]:
df_to_comp = df_to_comp.rename(columns = {'Number':'CabinNumber'})


In [57]:
df_to_comp['CabinNumber'] = df_to_comp['CabinNumber'].astype('Int64')

In [58]:
all_cabin_options_for_each_row(df_to_comp)


Index: 404 GroupSize: 1
Free cabins that match:
B P [13]
C S [13]

Index: 421 GroupSize: 1
Free cabins that match:
B P [13]
C S [13]

Index: 479 GroupSize: 2
Free cabins that match:
E P [20, 21]

Index: 505 GroupSize: 2
Free cabins that match:
E P [20, 21]

Index: 517 GroupSize: 2
Free cabins that match:
E P [20, 21]

Index: 1466 GroupSize: 1
Free cabins that match:
C S [40]
D S [36]

Index: 1543 GroupSize: 1
Free cabins that match:
C S [40]
D S [36]

Index: 2442 GroupSize: 7
Free cabins that match:

Index: 2970 GroupSize: 5
Free cabins that match:

Index: 3529 GroupSize: 1
Free cabins that match:
E P [150]
F P [519]

Index: 3530 GroupSize: 1
Free cabins that match:
E P [150]
F P [519]

Index: 4569 GroupSize: 3
Free cabins that match:

Index: 4751 GroupSize: 7
Free cabins that match:

Index: 5016 GroupSize: 1
Free cabins that match:
G P [590]
G S [579]

Index: 5017 GroupSize: 1
Free cabins that match:
G P [590]
G S [579]

Index: 8450 GroupSize: 1
Free cabins that match:
D P [191]
E P 

In [None]:
def update_cabins(dataframe, cabin_list):
    for index, cabin in cabin_list:
        dataframe.loc[index, ['Cabin', 'Deck', 'Number', 'Side']] = [cabin,cabin.split("/")[0],int(cabin.split("/")[1]),cabin.split("/")[2]]
    return dataframe

cabin_list = [(1429,'E/58/P'),(8413,'A/57/P'),(9265,'F/1267/S'),(9267,'F/1267/S')]

df = update_cabins(df,cabin_list)



df.loc[12892,'Cabin'] = 'F/1785/S' # maybe only one is from this room and the other is joined in the other room
df.loc[12893,'Cabin'] = 'F/1785/S'

In [None]:
traindata = df[df.Set == 'Train']
testdata = df[df.Set == 'Test']

# End


In [40]:
df_to_comp = pd.read_csv('data/31remaining.csv')

In [87]:
for index,row in df.iterrows():
    if not (pd.isna(row.Cabin) and pd.isna(df_to_comp.iloc[index].Cabin)):
        if row.Cabin != df_to_comp.iloc[index].Cabin:
            print(index,row.Cabin, df_to_comp.iloc[index].Cabin)

1251 nan F/181/P
1429 nan E/58/P
3484 nan E/174/S
3701 nan E/161/P
4233 nan B/98/P
4254 nan B/99/P
6493 nan E/300/S
6514 nan E/301/S
7576 nan G/874/S
8413 nan A/57/P
9265 nan F/1267/S
9267 nan F/1267/S
11074 nan G/1286/P
12892 nan F/1785/S
12893 nan F/1785/S
