# Feature engineering

In [1]:
import pandas as pd
from collections import defaultdict


# Load both splits and tag every row with its origin so the combined frame can
# be separated again once feature engineering is done.
training_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')
training_data['Set'] = 'Train'
test_data['Set'] = 'Test'
# ignore_index=True gives every row a unique label. Without it both frames keep
# their own 0..n-1 labels, so a label-based write such as
# `df.loc[index, column] = value` would hit one train row AND one test row.
df = pd.concat([training_data, test_data], ignore_index=True)


In [2]:
def column_splits(data_frame):
    """Split the composite string columns into their component columns.

    The frame is modified in place and also returned. Columns added:

    * ``Group`` / ``GroupNumber`` -- ``PassengerId`` split on ``_``.
    * ``CabinDeck`` / ``CabinNumber`` / ``CabinSide`` -- ``Cabin`` split on
      ``/``; ``CabinNumber`` is converted to the nullable ``Int64`` dtype.
    * ``FirstName`` / ``LastName`` -- ``Name`` split on the first space.
    """
    data_frame[['Group', 'GroupNumber']] = data_frame['PassengerId'].str.split('_', expand=True)

    data_frame[['CabinDeck', 'CabinNumber', 'CabinSide']] = data_frame['Cabin'].str.split("/", expand=True)
    data_frame['CabinNumber'] = data_frame['CabinNumber'].astype('Int64')

    # n=1 caps the split at two pieces. Without it a name with more than two
    # tokens expands to three or more columns and the two-column assignment
    # raises "Columns must be same length as key".
    data_frame[['FirstName', 'LastName']] = data_frame['Name'].str.split(" ", n=1, expand=True)

    return data_frame

# Add the Group/GroupNumber, cabin-component and name-component columns to the combined frame.
df = column_splits(df)


In [3]:
# Total spend across the five amenities; the sum stays NaN when any single amount is missing.
df['Bills'] = (
    df['RoomService']
    .add(df['FoodCourt'])
    .add(df['ShoppingMall'])
    .add(df['Spa'])
    .add(df['VRDeck'])
)
# Passengers under 13 and passengers in cryosleep get a bill of exactly 0,
# which also replaces a NaN total for those rows.
df.loc[(df['Age'] < 13) | (df['CryoSleep'] == True), 'Bills'] = 0
    

In [4]:
def calculate_group_size(dataframe):
    """Add a ``GroupSize`` column: the number of rows sharing each row's ``Group``.

    The frame is modified in place and also returned.
    """
    group_keys = dataframe['Group']
    # Counting the key column inside its own groups yields the per-group row
    # count, broadcast back onto every member row.
    dataframe['GroupSize'] = group_keys.groupby(group_keys).transform('count')
    return dataframe


# Attach the per-group passenger count to every row.
df = calculate_group_size(df)


In [5]:

def fill_potential_decks(dataframe):
    """Add a ``PotentialDecks`` column listing candidate decks for rows without a cabin.

    Rows whose ``Cabin`` is present get ``None``. For rows with a missing
    ``Cabin`` the candidates are chosen as follows:

    * ``HomePlanet`` missing -> every deck that occurs in ``CabinDeck``.
    * ``Bills == 0`` and the row's ``Group`` already spans more than one known
      deck -> the narrower "no bills" list for the home planet.
    * otherwise -> the general list for the home planet.

    The frame is modified in place and also returned. A ``HomePlanet`` value
    that is not a key of the lookup tables raises ``KeyError``.
    """
    potential_decks_by_homeplanet = {
        'Earth': ['E', 'F', 'G'],
        'Europa': ['A', 'B', 'C', 'D', 'E', 'T'],
        'Mars': ['D', 'E', 'F'],
    }

    potential_decks_by_homeplanet_no_bills = {
        'Earth': ['G'],
        'Europa': ['B'],
        'Mars': ['E', 'F'],
    }

    # Loop-invariant lookups, computed once instead of re-scanning the whole
    # frame for every row that lacks a cabin.
    all_known_decks = list(dataframe['CabinDeck'].dropna().unique())
    known_deck_count_by_group = dataframe.groupby('Group')['CabinDeck'].nunique().to_dict()

    def func_potential_decks_apply(row):
        if not pd.isna(row.Cabin):
            return None

        if pd.isna(row.HomePlanet):
            return list(all_known_decks)

        # A missing Group never matches any row, hence the default of 0 decks.
        if row.Bills == 0 and known_deck_count_by_group.get(row.Group, 0) > 1:
            return list(potential_decks_by_homeplanet_no_bills[row.HomePlanet])

        # Return a copy so rows never share one mutable list object.
        return list(potential_decks_by_homeplanet[row.HomePlanet])

    dataframe['PotentialDecks'] = dataframe.apply(func_potential_decks_apply, axis=1)
    return dataframe
    
            

def fill_potential_sides(dataframe):
    """Add a ``PotentialSides`` column listing candidate cabin sides for rows without a cabin.

    Rows whose ``Cabin`` is present get ``None``. A row with a missing
    ``Cabin`` gets a one-element list holding the first known ``CabinSide``
    found in its ``Group`` (in frame order), or ``['P', 'S']`` when no member
    of the group has a known side.

    The frame is modified in place and also returned.
    """
    # First known side per group, computed once rather than by filtering the
    # whole frame again for every row that lacks a cabin.
    first_known_side_by_group = (
        dataframe.dropna(subset=['CabinSide'])
        .groupby('Group', sort=False)['CabinSide']
        .first()
        .to_dict()
    )

    def func_potential_sides_apply(row):
        if not pd.isna(row.Cabin):
            return None
        if row.Group in first_known_side_by_group:
            return [first_known_side_by_group[row.Group]]
        return ['P', 'S']

    dataframe['PotentialSides'] = dataframe.apply(func_potential_sides_apply, axis=1)
    return dataframe

    
# Record the candidate decks and sides for every passenger whose cabin is missing.
df = fill_potential_decks(df)
df = fill_potential_sides(df)



In [6]:
def impute_based_on_shared_features(dataframe, attribute, shared_feature):
    """Fill missing ``attribute`` values from rows with the same ``shared_feature``.

    Each row whose ``attribute`` is missing receives the first known
    ``attribute`` value (in frame order) among rows that have the same
    ``shared_feature`` value. Rows with a missing ``shared_feature``, or with
    no donor row, are left unchanged.

    The frame is modified in place and also returned.
    """
    # One lookup of "first known value per key" replaces a full-frame filter
    # for every missing row.
    donors = dataframe.dropna(subset=[attribute, shared_feature])
    donor_value_by_key = donors.groupby(shared_feature, sort=False)[attribute].first().to_dict()

    attribute_position = dataframe.columns.get_loc(attribute)
    needs_value = dataframe[attribute].isna().tolist()
    keys = dataframe[shared_feature].tolist()

    for position, (is_missing, key) in enumerate(zip(needs_value, keys)):
        if is_missing and key in donor_value_by_key:
            # Positional write: a label-based `.loc[index, attribute] = ...`
            # would also overwrite every other row carrying the same index
            # label when labels are not unique.
            dataframe.iloc[position, attribute_position] = donor_value_by_key[key]

    return dataframe

# Fill missing HomePlanet values first from rows in the same group, then from rows with the same last name.
df = impute_based_on_shared_features(df,'HomePlanet','Group')
df = impute_based_on_shared_features(df,'HomePlanet','LastName')

In [7]:
# Order rows by group and by position within the group, then relabel them
# 0..n-1. Later cells compare index labels ("before"/"after" a passenger), so
# the labels must follow this ordering.
df = df.sort_values(by=['Group', 'GroupNumber']).reset_index(drop=True)


# workings 2

In [8]:
def impute_from_cabin(dataframe,cabin,index):
    """Write a ``deck/number/side`` cabin string and its parts into row ``index``.

    Sets ``Cabin``, ``CabinDeck``, ``CabinNumber`` (as an ``int``) and
    ``CabinSide`` on the row selected by ``index``. The frame is modified in
    place and also returned.
    """
    pieces = cabin.split("/")
    deck = pieces[0]
    number = int(pieces[1])
    side = pieces[2]

    target_columns = ['Cabin', 'CabinDeck', 'CabinNumber', 'CabinSide']
    dataframe.loc[index, target_columns] = [cabin, deck, number, side]
    return dataframe

In [9]:
def room_number_constraints_for_passengers(dataframe):
    """List the unoccupied cabins each cabin-less passenger could be in.

    For every row with a missing ``Cabin``, and every deck/side combination
    from that row's ``PotentialDecks`` and ``PotentialSides``, this looks at
    the cabin numbers already used on that deck and side by rows with a
    smaller index label and by rows with a larger index label. Every number
    strictly between the highest "before" number and the lowest "after" number
    is offered as a candidate.

    Returns a dict mapping the row's index label to a list of
    ``"deck/number/side"`` strings (possibly empty).
    """
    options_by_passenger = {}
    rows_missing_cabin = dataframe[dataframe['Cabin'].isna()]

    for row_label, passenger in rows_missing_cabin.iterrows():
        candidates = []
        options_by_passenger[row_label] = candidates

        for deck in passenger.PotentialDecks:
            for side in passenger.PotentialSides:
                # Rows already placed on this deck and side.
                on_deck_and_side = (dataframe['CabinDeck'] == deck) & (dataframe['CabinSide'] == side)
                occupied = dataframe[on_deck_and_side]

                numbers_before = occupied.loc[occupied.index < row_label, 'CabinNumber'].dropna().unique()
                highest_before = max(numbers_before, default=-1)
                numbers_after = occupied.loc[occupied.index > row_label, 'CabinNumber'].dropna().unique()
                lowest_after = min(numbers_after, default=-1)

                # -1 is the "nothing found" sentinel: without a bound on both
                # sides there is no gap to enumerate.
                if highest_before == -1 or lowest_after == -1:
                    continue

                if highest_before + 1 < lowest_after:
                    candidates.extend(
                        f"{deck}/{number}/{side}"
                        for number in range(highest_before + 1, lowest_after)
                    )

    return options_by_passenger





# Solo group with only one cabin that fits

In [10]:
def solo_group_one_option(dataframe):
    """Assign a cabin to solo travellers who have exactly one candidate cabin.

    Candidate cabins come from ``room_number_constraints_for_passengers`` and
    are computed once, before any assignment is made. Every row with a missing
    ``Cabin`` and ``GroupSize == 1`` whose candidate list has exactly one entry
    is filled via ``impute_from_cabin``.

    Returns the (in-place modified) frame.
    """
    potential_options = room_number_constraints_for_passengers(dataframe)

    # Select from the frame that was passed in. The previous version read the
    # notebook-level `df` here, which is only correct when the caller happens
    # to pass that same object.
    solo_without_cabin = dataframe[dataframe['Cabin'].isna() & (dataframe['GroupSize'] == 1)].index

    for index in list(solo_without_cabin):
        options = potential_options[index]
        if len(options) == 1:
            dataframe = impute_from_cabin(dataframe, options[0], index)

    return dataframe


# No free rooms, so the passenger has to share

TODO: should the variable be named `cabin_to_impute` instead of `cabin`?


In [11]:
def no_free_rooms_so_shares(dataframe):
    """Place passengers with no free candidate cabin into their group's cabin.

    For every cabin-less row whose candidate list from
    ``room_number_constraints_for_passengers`` is empty, collect the distinct
    known cabins used by rows of the same ``Group`` on one of the row's
    ``PotentialDecks``. If there is exactly one such cabin, assign it.

    Returns the (in-place modified) frame.
    """
    potential_options = room_number_constraints_for_passengers(dataframe)

    for index, passenger_cabin_options in potential_options.items():
        # Only passengers with no free cabin at all are handled here.
        if passenger_cabin_options:
            continue

        passenger_row = dataframe.loc[index]
        same_group = dataframe['Group'] == passenger_row['Group']
        on_candidate_deck = dataframe['CabinDeck'].isin(passenger_row['PotentialDecks'])
        group_cabins = dataframe.loc[same_group & on_candidate_deck, 'Cabin'].dropna().unique()

        if len(group_cabins) == 1:
            dataframe = impute_from_cabin(dataframe, group_cabins[0], index)

    return dataframe
    


# only passenger that can take that cabin

In [12]:
def only_passenger_that_fits(dataframe):
    """Assign each free cabin that exactly one cabin-less passenger could occupy.

    The candidate lists from ``room_number_constraints_for_passengers`` are
    inverted into a cabin -> passengers mapping; every cabin with a single
    candidate passenger is written to that passenger via ``impute_from_cabin``.

    Returns the (in-place modified) frame.
    """
    free_passengers = room_number_constraints_for_passengers(dataframe)

    # Invert passenger -> cabins into cabin -> passengers.
    rooms_to_fill = defaultdict(list)
    for passenger, free_rooms in free_passengers.items():
        for cabin in free_rooms:
            rooms_to_fill[cabin].append(passenger)

    for cabin, passengers in rooms_to_fill.items():
        if len(passengers) == 1:
            # Pass the row label itself. The previous version handed over the
            # one-element list, unlike every other caller of impute_from_cabin.
            # NOTE(review): a passenger who is the sole candidate for several
            # cabins is written once per cabin, so the last cabin wins.
            dataframe = impute_from_cabin(dataframe, cabin, passengers[0])

    return dataframe


# all imputes

In [13]:
def all_imputes(dataframe):
    """Run the three cabin-imputation passes, in order, twice over.

    The second round lets each pass work from the assignments made in the
    first round. Returns the resulting frame.
    """
    passes = (solo_group_one_option, no_free_rooms_so_shares, only_passenger_that_fits)

    for _ in range(2):
        for impute_pass in passes:
            dataframe = impute_pass(dataframe)

    return dataframe
    
# Run the cabin-imputation passes, then show how many values are still missing in each column.
df = all_imputes(df)
df.isna().sum()

PassengerId           0
HomePlanet           12
CryoSleep           310
Cabin                41
Destination         274
Age                 270
VIP                 296
RoomService         263
FoodCourt           289
ShoppingMall        306
Spa                 284
VRDeck              268
Name                294
Transported        4277
Set                   0
Group                 0
GroupNumber           0
CabinDeck            41
CabinNumber          41
CabinSide            41
FirstName           294
LastName            294
Bills               785
GroupSize             0
PotentialDecks    12671
PotentialSides    12671
dtype: int64

In [14]:
# Count of missing values in each column.
df.isna().sum()

PassengerId           0
HomePlanet           12
CryoSleep           310
Cabin                41
Destination         274
Age                 270
VIP                 296
RoomService         263
FoodCourt           289
ShoppingMall        306
Spa                 284
VRDeck              268
Name                294
Transported        4277
Set                   0
Group                 0
GroupNumber           0
CabinDeck            41
CabinNumber          41
CabinSide            41
FirstName           294
LastName            294
Bills               785
GroupSize             0
PotentialDecks    12671
PotentialSides    12671
dtype: int64

In [15]:
# Candidate free cabins for every passenger that still has no cabin, keyed by row label.
room_number_constraints_for_passengers(df)

{404: ['B/13/P', 'C/13/S'],
 421: ['B/13/P', 'C/13/S'],
 479: ['E/20/P', 'E/21/P'],
 505: ['E/20/P', 'E/21/P'],
 517: ['E/20/P', 'E/21/P'],
 1429: ['E/58/P'],
 1466: ['C/40/S', 'D/36/S', 'E/58/P'],
 1543: ['C/40/S', 'D/36/S'],
 2442: [],
 2970: [],
 3529: ['E/150/P', 'F/519/P'],
 3530: ['E/150/P', 'F/519/P'],
 4233: ['B/98/P', 'B/99/P'],
 4254: ['B/98/P', 'B/99/P'],
 4569: [],
 4751: [],
 5016: ['G/590/P', 'G/579/S'],
 5017: ['G/590/P', 'G/579/S'],
 6493: ['E/300/S', 'E/301/S'],
 6514: ['E/300/S', 'E/301/S'],
 8413: ['D/191/P'],
 8450: ['D/191/P', 'E/387/P'],
 8465: ['D/191/P', 'E/387/P'],
 9265: ['F/1267/S'],
 9267: ['F/1267/S'],
 10081: ['F/1489/P', 'G/1157/P'],
 10082: ['F/1489/P', 'G/1157/P'],
 10290: ['C/270/S'],
 10313: ['C/270/S', 'D/235/P'],
 10394: ['D/235/P', 'F/1424/S'],
 10408: ['F/1424/S', 'G/1206/S'],
 10411: ['F/1424/S', 'G/1206/S'],
 10434: ['F/1544/P', 'F/1424/S', 'G/1212/S'],
 10440: ['F/1544/P', 'G/1212/S', 'D/235/P'],
 11129: ['C/298/S', 'E/528/S'],
 11148: ['C/298/

# workings


In [16]:
def all_cabin_options_for_each_row(dataframe):
    """Print the group size and candidate free cabins of every cabin-less passenger.

    Output only; nothing is returned and the frame is not modified.
    """
    options = room_number_constraints_for_passengers(dataframe)
    for index, cabins in options.items():
        print()
        # The keys are index labels, so look the row up with .loc; positional
        # .iloc only agrees with them while the index is exactly 0..n-1.
        print("Index:", index, "GroupSize:", dataframe.loc[index, 'GroupSize'])
        print("Free cabins that match:")
        print(cabins)
                
             


In [17]:
# Print the remaining candidates together with each passenger's group size.
all_cabin_options_for_each_row(df)


Index: 404 GroupSize: 1
Free cabins that match:
['B/13/P', 'C/13/S']

Index: 421 GroupSize: 1
Free cabins that match:
['B/13/P', 'C/13/S']

Index: 479 GroupSize: 2
Free cabins that match:
['E/20/P', 'E/21/P']

Index: 505 GroupSize: 2
Free cabins that match:
['E/20/P', 'E/21/P']

Index: 517 GroupSize: 2
Free cabins that match:
['E/20/P', 'E/21/P']

Index: 1429 GroupSize: 2
Free cabins that match:
['E/58/P']

Index: 1466 GroupSize: 1
Free cabins that match:
['C/40/S', 'D/36/S', 'E/58/P']

Index: 1543 GroupSize: 1
Free cabins that match:
['C/40/S', 'D/36/S']

Index: 2442 GroupSize: 7
Free cabins that match:
[]

Index: 2970 GroupSize: 5
Free cabins that match:
[]

Index: 3529 GroupSize: 1
Free cabins that match:
['E/150/P', 'F/519/P']

Index: 3530 GroupSize: 1
Free cabins that match:
['E/150/P', 'F/519/P']

Index: 4233 GroupSize: 1
Free cabins that match:
['B/98/P', 'B/99/P']

Index: 4254 GroupSize: 1
Free cabins that match:
['B/98/P', 'B/99/P']

Index: 4569 GroupSize: 3
Free cabins that 

In [18]:




# Hand-picked (row label, cabin) assignments applied on top of the automatic passes.
cabin_list = [(1429,'E/58/P'),(8413,'A/57/P'),(9265,'F/1267/S'),(9267,'F/1267/S'),(4233,'B/98/P'),(4254,'B/99/P'),(6493,'E/300/S'),(6514,'E/301/S')]

# impute_from_cabin writes into df in place, so its return value is not needed here.
for index,cabin in cabin_list:
    impute_from_cabin(df,cabin,index)

# The string literal below is not executed as code: it parks two assignments
# for rows 12892 and 12893 that are still undecided.
"""
df.loc[12892,'Cabin'] = 'F/1785/S' # maybe only one is from this room and the other is joined in the other room
df.loc[12893,'Cabin'] = 'F/1785/S'"""

"\ndf.loc[12892,'Cabin'] = 'F/1785/S' # maybe only one is from this room and the other is joined in the other room\ndf.loc[12893,'Cabin'] = 'F/1785/S'"

In [19]:
# Separate the combined frame back into its two original splits using the Set tag.
traindata = df[df['Set'].eq('Train')]
testdata = df[df['Set'].eq('Test')]

# End


In [20]:
# Reference frame to compare against: rename its `Number` column to match
# this notebook and give it the same nullable integer dtype.
df_to_comp = (
    pd.read_csv('data/31remaining.csv')
    .rename(columns={'Number': 'CabinNumber'})
    .astype({'CabinNumber': 'Int64'})
)


In [21]:
# Print every row whose cabin differs from the reference frame. Rows are
# matched by position, and two missing cabins count as equal.
for index, row in df.iterrows():
    reference_cabin = df_to_comp.iloc[index].Cabin
    both_missing = pd.isna(row.Cabin) and pd.isna(reference_cabin)
    if not both_missing and row.Cabin != reference_cabin:
        print(index, row.Cabin, reference_cabin)

12892 nan F/1785/S
12893 nan F/1785/S
