# TODO: go through and simplify the functions, then rename the variables

# preprocessing

In [1]:
import pandas as pd

# Load both competition files and tag every row with the file it came from,
# so the combined frame can be split back into train/test later on.
traindata = pd.read_csv('data/train.csv')
testdata = pd.read_csv('data/test.csv')
traindata['Set'] = 'Train'
testdata['Set'] = 'Test'

# ignore_index=True gives every row of the combined frame a unique label.
# Without it both files keep their own 0..n labels, so a label-based write
# such as `df.loc[index, col] = value` would hit one train row AND one test
# row at the same time.
df = pd.concat([traindata, testdata], ignore_index=True)


In [2]:
def splits(data_frame):
    """Split the composite identifier columns into their components.

    Adds these columns to ``data_frame`` (in place) and returns it:

    * ``Group`` / ``GroupNumber``    - the two parts of ``PassengerId`` around ``_``.
    * ``Deck`` / ``CabinNumber`` / ``Side`` - the three parts of ``Cabin`` around ``/``;
      ``CabinNumber`` is cast to the nullable ``Int64`` dtype.
    * ``FirstName`` / ``LastName``   - ``Name`` split at the first space.

    The number of splits is pinned with ``n=`` so that ``expand=True`` always
    yields exactly as many columns as are being assigned; an unexpected extra
    separator (e.g. a three-word name) no longer makes the assignment fail,
    the remainder simply stays in the last component.
    """
    data_frame[['Group', 'GroupNumber']] = data_frame['PassengerId'].str.split('_', n=1, expand=True)

    data_frame[['Deck', 'CabinNumber', 'Side']] = data_frame['Cabin'].str.split('/', n=2, expand=True)
    # Nullable integer dtype keeps missing cabins as <NA> instead of forcing floats.
    data_frame['CabinNumber'] = data_frame['CabinNumber'].astype('Int64')

    data_frame[['FirstName', 'LastName']] = data_frame['Name'].str.split(' ', n=1, expand=True)

    return data_frame

# Derive Group/GroupNumber, Deck/CabinNumber/Side and First/LastName columns.
df = df.pipe(splits)


In [3]:
# Total spend across the five amenities; the result is NaN as soon as any
# single amenity value is missing.
spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
total_spend = df[spend_columns[0]]
for column in spend_columns[1:]:
    total_spend = total_spend + df[column]
df['Bills'] = total_spend

# Passengers under 13 and passengers in cryosleep are recorded as spending nothing.
no_spend = (df['Age'] < 13) | (df['CryoSleep'] == True)
df.loc[no_spend, 'Bills'] = 0
    

In [4]:
def calculate_group_size(dataframe):
    """Add a ``GroupSize`` column holding the number of rows in each ``Group``.

    The column is written onto ``dataframe`` itself, which is also returned.
    """
    group_labels = dataframe['Group']
    # transform('count') broadcasts each group's row count back onto its members.
    members_per_row = group_labels.groupby(group_labels).transform('count')
    dataframe['GroupSize'] = members_per_row
    return dataframe


# Attach the size of each travel group to every passenger row.
df = df.pipe(calculate_group_size)


In [5]:

def fill_potential_decks(dataframe):
    """Add a ``potential_decks`` column listing candidate decks for rows without a cabin.

    For every row whose ``Cabin`` is missing the value is a list of deck letters:

    * ``HomePlanet`` missing            -> every deck that occurs in ``dataframe``.
    * ``Bills == 0`` and the passenger's ``Group`` already spans more than one
      known deck                        -> the narrower "no bills" list for the planet.
    * otherwise                         -> the general list for the planet.

    Rows that already have a cabin get no value (missing). The column is
    written onto ``dataframe`` itself, which is also returned.
    """
    potential_decks_by_homeplanet = {
        'Earth': ['E', 'F', 'G'],
        'Europa': ['A', 'B', 'C', 'D', 'E', 'T'],
        'Mars': ['D', 'E', 'F'],
    }

    potential_decks_by_homeplanet_no_bills = {
        'Earth': ['G'],
        'Europa': ['B'],
        'Mars': ['E', 'F'],
    }

    # Both lookups are independent of the row being processed, so compute them
    # once instead of re-scanning the whole frame for every row.
    all_known_decks = list(dataframe['Deck'].dropna().unique())
    distinct_decks_per_group = dataframe.groupby('Group')['Deck'].nunique().to_dict()

    def func_potential_decks_apply(row):
        if not pd.isna(row.Cabin):
            return None

        if pd.isna(row.HomePlanet):
            # Fresh list per row so rows never share one mutable object.
            return list(all_known_decks)

        if row.Bills == 0 and distinct_decks_per_group.get(row.Group, 0) > 1:
            return potential_decks_by_homeplanet_no_bills[row.HomePlanet]

        return potential_decks_by_homeplanet[row.HomePlanet]

    dataframe['potential_decks'] = dataframe.apply(func_potential_decks_apply, axis=1)
    return dataframe
    
            

def fill_potential_sides(dataframe):
    """Add a ``potential_sides`` column listing candidate ship sides for rows without a cabin.

    For a row whose ``Cabin`` is missing the value is a one-element list with
    the first known ``Side`` found in the passenger's ``Group``, or
    ``['P', 'S']`` when no member of the group has a known side. Rows that
    already have a cabin get no value (missing). The column is written onto
    ``dataframe`` itself, which is also returned.
    """
    # First known side per group, computed once rather than by filtering the
    # whole frame for every row.
    rows_with_side = dataframe.dropna(subset=['Side'])
    first_side_by_group = rows_with_side.groupby('Group')['Side'].first().to_dict()

    def func_potential_sides_apply(row):
        if not pd.isna(row.Cabin):
            return None
        if row.Group in first_side_by_group:
            return [first_side_by_group[row.Group]]
        return ['P', 'S']

    dataframe['potential_sides'] = dataframe.apply(func_potential_sides_apply, axis=1)
    return dataframe

    
# Record, for every passenger without a cabin, which decks and sides are plausible.
df = df.pipe(fill_potential_decks).pipe(fill_potential_sides)



In [6]:
def impute_based_on_shared_features(dataframe, attribute, shared_feature):
    """Fill missing ``attribute`` values from rows that share ``shared_feature``.

    A row with a missing ``attribute`` receives the first known ``attribute``
    (in frame order) among the rows that have the same ``shared_feature``
    value. Rows whose ``shared_feature`` is itself missing, or whose peers
    have no known value, are left untouched.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to update; it is modified in place and returned.
    attribute : str
        Column whose missing values should be filled.
    shared_feature : str
        Column that identifies rows expected to share the same ``attribute``.

    Notes
    -----
    Rows are selected with boolean masks rather than index labels, so the
    result is correct even when the index contains duplicate labels (a
    label-based ``.loc[label, col] = value`` would overwrite every row that
    carries that label).
    """
    missing = dataframe[attribute].isna().to_numpy()
    if not missing.any():
        return dataframe

    # First known attribute value for each shared-feature value.
    known_rows = dataframe.loc[~missing, [shared_feature, attribute]]
    donor_by_feature = known_rows.groupby(shared_feature)[attribute].first()

    # Candidate value for every row (NaN where no donor exists).
    candidates = dataframe[shared_feature].map(donor_by_feature)
    fillable = missing & candidates.notna().to_numpy()

    if fillable.any():
        dataframe.loc[fillable, attribute] = candidates.to_numpy()[fillable]

    return dataframe

# Fill HomePlanet from travel-group companions first, then from passengers
# with the same last name.
for shared_column in ('Group', 'LastName'):
    df = impute_based_on_shared_features(df, 'HomePlanet', shared_column)

In [7]:
# Order passengers by group and position within the group, then renumber the
# rows 0..n-1 so that the index reflects that order.
df = (
    df.sort_values(by=['Group', 'GroupNumber'])
      .reset_index(drop=True)
)


# workings 2

In [8]:
def impute_from_cabin(dataframe,cabin,index):
    """Write ``cabin`` and its components into the row(s) selected by ``index``.

    ``cabin`` is a ``"deck/number/side"`` string. The ``Cabin``, ``Deck``,
    ``CabinNumber`` (as ``int``) and ``Side`` columns of ``dataframe`` are
    set in place; the same frame is returned.
    """
    parts = cabin.split("/")
    cabin_fields = [cabin, parts[0], int(parts[1]), parts[2]]
    dataframe.loc[index, ['Cabin', 'Deck', 'CabinNumber', 'Side']] = cabin_fields
    return dataframe

In [9]:
def room_number_constraints_for_passengers(dataframe):
    """List the unoccupied cabins each cabin-less passenger could belong to.

    For every row with a missing ``Cabin`` and for every combination of its
    ``potential_decks`` and ``potential_sides``, the known cabin numbers on
    that deck/side are split into those on rows with a smaller index label
    and those on rows with a larger one. Every number strictly between the
    largest "before" and the smallest "after" is reported as a candidate
    cabin ``"deck/number/side"``. Combinations with no known cabin on either
    side of the passenger are skipped.

    The comparison uses the index labels, so the frame is expected to be
    ordered with a matching integer index before calling this.

    Returns
    -------
    dict
        Maps the index label of each cabin-less passenger to a list of
        candidate cabin strings (possibly empty).
    """
    passenger_options = {}

    # The known cabin numbers of a deck/side do not depend on the passenger
    # being examined, so each combination is filtered only once.
    numbers_by_location = {}

    def known_numbers(deck, side):
        location = (deck, side)
        if location not in numbers_by_location:
            at_location = (dataframe['Deck'] == deck) & (dataframe['Side'] == side)
            numbers_by_location[location] = dataframe.loc[at_location, 'CabinNumber'].dropna()
        return numbers_by_location[location]

    passengers_without_cabin = dataframe[dataframe['Cabin'].isna()]

    for index, passenger in passengers_without_cabin.iterrows():
        candidates = []

        for deck in passenger.potential_decks:
            for side in passenger.potential_sides:
                numbers = known_numbers(deck, side)
                before = numbers[numbers.index < index]
                after = numbers[numbers.index > index]

                # Explicit emptiness checks instead of a -1 sentinel value.
                if before.empty or after.empty:
                    continue

                max_before = before.max()
                min_after = after.min()

                if max_before + 1 < min_after:
                    candidates += [f"{deck}/{i}/{side}" for i in range(max_before + 1, min_after)]

        passenger_options[index] = candidates

    return passenger_options





# solo group and only one room that fits

In [10]:
def solo_group_one_option(dataframe):
    """Assign a cabin to solo travellers that have exactly one candidate cabin.

    Candidate cabins come from ``room_number_constraints_for_passengers``.
    For each row of ``dataframe`` with a missing ``Cabin`` and
    ``GroupSize == 1``, the cabin is imputed when the candidate list has
    exactly one entry. The frame is modified in place and returned.
    """
    potential_options = room_number_constraints_for_passengers(dataframe)

    # Select from the frame that was passed in (not a notebook-level global),
    # so the function works on whichever frame the caller supplies.
    solo_without_cabin = dataframe[dataframe['Cabin'].isna() & (dataframe['GroupSize'] == 1)].index

    for index in list(solo_without_cabin):
        candidates = potential_options[index]
        if len(candidates) == 1:
            dataframe = impute_from_cabin(dataframe, candidates[0], index)

    return dataframe


# no free rooms so has to share

TODO: rename the variable `cabin` to `cabin_to_impute`?


In [11]:
def no_free_rooms_so_shares(dataframe):
    """Place passengers with no free candidate cabin into their group's cabin.

    For each cabin-less passenger whose candidate list from
    ``room_number_constraints_for_passengers`` is empty, the known cabins of
    the same ``Group`` that lie on one of the passenger's ``potential_decks``
    are collected. When exactly one such cabin exists it is imputed for the
    passenger. The frame is modified in place and returned.
    """
    options_by_passenger = room_number_constraints_for_passengers(dataframe)

    for index, passenger_cabin_options in options_by_passenger.items():
        if passenger_cabin_options:
            # A free cabin exists, so sharing is not forced.
            continue

        passenger_row = dataframe.loc[index]
        same_group = dataframe['Group'] == passenger_row['Group']
        on_allowed_deck = dataframe['Deck'].isin(passenger_row['potential_decks'])
        group_cabins = dataframe.loc[same_group & on_allowed_deck, 'Cabin'].dropna().unique()

        if len(group_cabins) != 1:
            continue

        dataframe = impute_from_cabin(dataframe, group_cabins[0], index)

    return dataframe
    


# only passenger that can take that cabin

In [14]:
def only_passenger_that_fits(dataframe):
    """Assign a free cabin when exactly one passenger could occupy it.

    The per-passenger candidate lists from
    ``room_number_constraints_for_passengers`` are inverted into a
    cabin -> candidate-passengers mapping; every cabin with a single
    candidate is imputed for that passenger. The frame is modified in place
    and returned.
    """
    options_by_passenger = room_number_constraints_for_passengers(dataframe)

    # Invert passenger -> cabins into cabin -> passengers.
    candidates_by_cabin = {}
    for passenger, free_rooms in options_by_passenger.items():
        for cabin in free_rooms:
            candidates_by_cabin.setdefault(cabin, []).append(passenger)

    for cabin, candidates in candidates_by_cabin.items():
        if len(candidates) == 1:
            # The one-element list is passed on as the row selector.
            dataframe = impute_from_cabin(dataframe, cabin, candidates)
    return dataframe

            

# all imputes

In [16]:
def all_imputes(dataframe):
    """Run the three cabin-imputation rules, in order, two times over.

    Each pass applies ``solo_group_one_option``, ``no_free_rooms_so_shares``
    and ``only_passenger_that_fits``; the second pass lets the rules act on
    cabins filled in during the first. Returns the updated frame.
    """
    for _ in range(2):
        for impute_step in (solo_group_one_option, no_free_rooms_so_shares, only_passenger_that_fits):
            dataframe = impute_step(dataframe)

    return dataframe
    
# Apply every cabin-imputation rule, then show how many values are still
# missing in each column.
df = df.pipe(all_imputes)
df.isna().sum()

PassengerId            0
HomePlanet            12
CryoSleep            310
Cabin                 41
Destination          274
Age                  270
VIP                  296
RoomService          263
FoodCourt            289
ShoppingMall         306
Spa                  284
VRDeck               268
Name                 294
Transported         4277
Set                    0
Group                  0
GroupNumber            0
Deck                  41
CabinNumber           41
Side                  41
FirstName            294
LastName             294
Bills                785
GroupSize              0
potential_decks    12671
potential_sides    12671
dtype: int64

In [17]:
# Per-column count of missing values in the combined frame.
df.isna().sum()

PassengerId            0
HomePlanet            12
CryoSleep            310
Cabin                 41
Destination          274
Age                  270
VIP                  296
RoomService          263
FoodCourt            289
ShoppingMall         306
Spa                  284
VRDeck               268
Name                 294
Transported         4277
Set                    0
Group                  0
GroupNumber            0
Deck                  41
CabinNumber           41
Side                  41
FirstName            294
LastName             294
Bills                785
GroupSize              0
potential_decks    12671
potential_sides    12671
dtype: int64

# workings


In [18]:
def all_cabin_options_for_each_row(dataframe):
    """Print the candidate cabins for every passenger that still lacks a cabin.

    For each entry returned by ``room_number_constraints_for_passengers`` the
    passenger's index label, ``GroupSize`` and list of candidate cabins are
    printed. Nothing is returned.
    """
    options = room_number_constraints_for_passengers(dataframe)
    for index, free_cabins in options.items():
        print()
        # The keys of `options` are index labels, so look the row up by label.
        print("Index:", index, "GroupSize:", dataframe.loc[index, 'GroupSize'])
        print("Free cabins that match:")
        print(free_cabins)
                
             


In [19]:
# Print the remaining candidate cabins for each passenger still without one.
all_cabin_options_for_each_row(df)


Index: 404 GroupSize: 1
Free cabins that match:
['B/13/P', 'C/13/S']

Index: 421 GroupSize: 1
Free cabins that match:
['B/13/P', 'C/13/S']

Index: 479 GroupSize: 2
Free cabins that match:
['E/20/P', 'E/21/P']

Index: 505 GroupSize: 2
Free cabins that match:
['E/20/P', 'E/21/P']

Index: 517 GroupSize: 2
Free cabins that match:
['E/20/P', 'E/21/P']

Index: 1429 GroupSize: 2
Free cabins that match:
['E/58/P']

Index: 1466 GroupSize: 1
Free cabins that match:
['C/40/S', 'D/36/S', 'E/58/P']

Index: 1543 GroupSize: 1
Free cabins that match:
['C/40/S', 'D/36/S']

Index: 2442 GroupSize: 7
Free cabins that match:
[]

Index: 2970 GroupSize: 5
Free cabins that match:
[]

Index: 3529 GroupSize: 1
Free cabins that match:
['E/150/P', 'F/519/P']

Index: 3530 GroupSize: 1
Free cabins that match:
['E/150/P', 'F/519/P']

Index: 4233 GroupSize: 1
Free cabins that match:
['B/98/P', 'B/99/P']

Index: 4254 GroupSize: 1
Free cabins that match:
['B/98/P', 'B/99/P']

Index: 4569 GroupSize: 3
Free cabins that 

In [20]:
def update_cabins(dataframe, cabin_list):
    """Apply a list of manual cabin assignments.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to update; modified in place and returned.
    cabin_list : iterable of (index, cabin)
        Row index label and ``"deck/number/side"`` cabin string to assign.

    Each assignment is delegated to ``impute_from_cabin`` so that the
    ``Cabin``, ``Deck``, ``CabinNumber`` and ``Side`` columns are filled by
    one shared implementation.
    """
    for index, cabin in cabin_list:
        dataframe = impute_from_cabin(dataframe, cabin, index)
    return dataframe

# Hand-picked (row index, cabin) assignments.
cabin_list = [
    (1429, 'E/58/P'),
    (8413, 'A/57/P'),
    (9265, 'F/1267/S'),
    (9267, 'F/1267/S'),
]

# maybe have to do

cabin_list2 = [
    (4233, 'B/98/P'),
    (4254, 'B/99/P'),
    (6493, 'E/300/S'),
    (6514, 'E/301/S'),
]

# Apply both batches in the order they are defined.
for manual_assignments in (cabin_list, cabin_list2):
    df = update_cabins(df, manual_assignments)

"""
df.loc[12892,'Cabin'] = 'F/1785/S' # maybe only one is from this room and the other is joined in the other room
df.loc[12893,'Cabin'] = 'F/1785/S'"""

"\ndf.loc[12892,'Cabin'] = 'F/1785/S' # maybe only one is from this room and the other is joined in the other room\ndf.loc[12893,'Cabin'] = 'F/1785/S'"

In [21]:
# Separate the combined frame back into its two source sets via the 'Set' tag.
traindata, testdata = (df[df['Set'] == set_label] for set_label in ('Train', 'Test'))

# End


In [22]:
# Load the comparison file, align its column name with this notebook's
# 'CabinNumber' and use the same nullable integer dtype.
df_to_comp = (
    pd.read_csv('data/31remaining.csv')
      .rename(columns={'Number': 'CabinNumber'})
      .astype({'CabinNumber': 'Int64'})
)


In [23]:
# Print every row whose cabin differs from the comparison frame; rows where
# the cabin is missing in both frames count as equal.
for index, row in df.iterrows():
    reference_cabin = df_to_comp.iloc[index].Cabin
    if pd.isna(row.Cabin) and pd.isna(reference_cabin):
        continue
    if row.Cabin != reference_cabin:
        print(index, row.Cabin, reference_cabin)

12892 nan F/1785/S
12893 nan F/1785/S
