In [1]:
import pandas as pd
import numpy as np

traindf = pd.read_csv('data/train.csv')
testdf = pd.read_csv('data/test.csv')
traindf['Set'] = 'Train'
testdf['Set'] = 'Test'
togeth = pd.concat([testdf,traindf])


In [2]:
homeplanets = ['Earth', 'Europa', 'Mars']

In [235]:
decks_by_planet = {
    'Earth':['E','F','G'],
    'Europa': ['A','B','C','D','E','T'],
    'Mars': ['D','E','F']
}

planet_by_deck = {
    'A':['Europa'],'B':['Europa'],'C':['Europa'],'D':['Europa','Mars'],'E':['Europa','Mars','Earth'],
    'F':['Earth','Mars'],'G':['Earth'],'T':['Europa']
}

cabin_sides = ['P','S']

## datasplits 

In [4]:
def splits(data_frame):
    split_df = data_frame['PassengerId'].str.split('_', expand=True)
    data_frame[['Group', 'GroupNumber']] = split_df

    split_df =  data_frame['Cabin'].str.split("/", expand = True)
    data_frame[['Deck', 'Number', 'Side']]= split_df
    data_frame.Number = data_frame.Number.astype('float')
    
    split_df = data_frame['Name'].str.split(" ",expand = True)
    data_frame[['FristName','LastName']] = split_df


    return data_frame

togeth = splits(togeth)
togeth = togeth.sort_values(by = ['Group','GroupNumber'])
togeth = togeth.reset_index()
togeth = togeth[['PassengerId','HomePlanet','Cabin','Destination','Group','GroupNumber','Deck','Number','Side','Set','LastName']]


In [5]:
def group_size(df):
    def group_size_apply(row):
        group = df[df.Group == row.Group]
        return len(group)
    df['GroupSize'] = df.apply(group_size_apply, axis = 1)
    return df
togeth = group_size(togeth)

In [6]:
togeth.head()

Unnamed: 0,PassengerId,HomePlanet,Cabin,Destination,Group,GroupNumber,Deck,Number,Side,Set,LastName,GroupSize
0,0001_01,Europa,B/0/P,TRAPPIST-1e,1,1,B,0.0,P,Train,Ofracculy,1
1,0002_01,Earth,F/0/S,TRAPPIST-1e,2,1,F,0.0,S,Train,Vines,1
2,0003_01,Europa,A/0/S,TRAPPIST-1e,3,1,A,0.0,S,Train,Susent,2
3,0003_02,Europa,A/0/S,TRAPPIST-1e,3,2,A,0.0,S,Train,Susent,2
4,0004_01,Earth,F/1/S,TRAPPIST-1e,4,1,F,1.0,S,Train,Santantines,1


In [7]:
def impute_attribute_based_on_shared_feature(df, attribute, feature):
    # Function to fill NaNs in a group based on the first available non-NaN value
    def fill_with_first_available(group_df):
        if not group_df[attribute].dropna().empty:
            first_available = group_df[attribute].dropna().iloc[0]
            group_df[attribute] = group_df[attribute].fillna(first_available)
        return group_df

    # Filter out rows where the feature is NaN, then apply the fill function to each group
    filtered_df = df.dropna(subset=[feature])
    imputed_df = filtered_df.groupby(feature, group_keys=False).apply(fill_with_first_available)
    
    # Merge back with original df rows where the feature is NaN to retain those rows unchanged
    df_with_nan_feature = df[df[feature].isna()]
    final_df = pd.concat([imputed_df, df_with_nan_feature], ignore_index=True)

    return final_df

## Certain imputes

In [8]:
togeth = impute_attribute_based_on_shared_feature(togeth,'HomePlanet','Group')

  imputed_df = filtered_df.groupby(feature, group_keys=False).apply(fill_with_first_available)


# by last name

In [9]:
togeth = impute_attribute_based_on_shared_feature(togeth,'HomePlanet','LastName')

  imputed_df = filtered_df.groupby(feature, group_keys=False).apply(fill_with_first_available)


In [10]:
togeth = togeth.sort_values(by = ['Group','GroupNumber'])
togeth = togeth.reset_index(drop = True)


In [11]:
def home_planet_from_deck_abc(df):
    df.loc[df['Deck'].isin(['A', 'B', 'C']), 'HomePlanet'] = df.loc[df['Deck'].isin(['A', 'B', 'C']), 'HomePlanet'].fillna('Europa')
    return df
def home_planet_from_deck_g(df):
    df.loc[df['Deck']== 'G', 'HomePlanet'] = df.loc[df['Deck'] == 'G', 'HomePlanet'].fillna('Earth')
    return df


## do i need these for my imputing?

In [12]:
togeth = home_planet_from_deck_abc(togeth)
togeth = home_planet_from_deck_g(togeth)

In [13]:
togeth[togeth.HomePlanet.isna()]

Unnamed: 0,PassengerId,HomePlanet,Cabin,Destination,Group,GroupNumber,Deck,Number,Side,Set,LastName,GroupSize
870,0616_01,,E/40/S,TRAPPIST-1e,616,1,E,40.0,S,Test,Spriney,1
3439,2443_01,,D/72/P,TRAPPIST-1e,2443,1,D,72.0,P,Train,,1
3946,2817_01,,F/584/P,TRAPPIST-1e,2817,1,F,584.0,P,Train,Sutty,1
4632,3331_01,,F/631/S,TRAPPIST-1e,3331,1,F,631.0,S,Train,,1
5687,4052_01,,D/136/P,TRAPPIST-1e,4052,1,D,136.0,P,Test,Coneveseng,1
6786,4840_01,,F/915/S,TRAPPIST-1e,4840,1,F,915.0,S,Train,,1
8528,6108_01,,F/1166/S,TRAPPIST-1e,6108,1,F,1166.0,S,Train,,1
9916,7065_01,,E/471/S,TRAPPIST-1e,7065,1,E,471.0,S,Test,Maltorted,1
11757,8435_01,,F/1730/P,TRAPPIST-1e,8435,1,F,1730.0,P,Test,,1
12244,8775_01,,D/275/P,TRAPPIST-1e,8775,1,D,275.0,P,Test,Maltorted,1


In [14]:
togeth.isna().sum()

PassengerId      0
HomePlanet      10
Cabin          299
Destination    274
Group            0
GroupNumber      0
Deck           299
Number         299
Side           299
Set              0
LastName       294
GroupSize        0
dtype: int64

# Filling missed Cabins


In [15]:
used_cabins = {}
for deck in list(togeth.dropna().Deck.unique()):
    cab = {}
    for side in list(togeth.dropna().Side.unique()):
        rooms = list(togeth[(togeth.Deck == deck) & (togeth.Side == side)].dropna().Number.unique())
        rooms.sort()
        cab[side] = rooms
    used_cabins[deck] = cab
    

#### fill people that must be sharing a room with people in the same group
ie, from their chome planet we know the decks they could be in, if the side and number below and above are one above and one below then they must be sharing with someone from their group. make sure that their group has only one cabin

need to make sure is only one person  with cabin being na in the group otherwise one na might be in another cabin 

for row in dataframe

if cabin.isna() and homeplanet is known,
 for potential decks in homeplanets remit ie europa check a, b , c in dataframe up to that point and over that point, see if any have a free space and if not then fill it with the one in the same group

In [149]:
def checks(df):
    for home in homeplanets:
        print("\n" + home)
        print(df[df.HomePlanet == home].Deck.value_counts())
    return df

In [263]:
df = togeth.copy()

In [264]:
df2 = togeth.copy()

Finding groups that have more than 1 member that all share the same cabin and that dont have other nans in group

In [293]:
df2.isna().sum()

PassengerId      0
HomePlanet      10
Cabin           89
Destination    274
Group            0
GroupNumber      0
Deck           145
Number         145
Side           145
Set              0
LastName       294
GroupSize        0
dtype: int64

# one room available and its alone in its group ( should be first in cabin fill)

In [290]:
for index, passenger in df2[df2.Cabin.isna()].iterrows():
    print("\nindex", index)
    print("passenger",passenger)
    options = []
    for deck in decks_by_planet[passenger.HomePlanet]:
        for side in cabin_sides:
            
            before_slice = df2.iloc[:index]
            after_slice = df2.iloc[index+1:] 
            top_room_number_before = np.max(before_slice[(before_slice.Deck == deck) & (before_slice.Side == side)].Number)
            smallest_room_number_after = np.min(after_slice[(after_slice.Deck == deck) & (after_slice.Side == side)].Number)
            
            if top_room_number_before + 1 != smallest_room_number_after:
                if pd.isna(top_room_number_before) and smallest_room_number_after == 0:
                    continue
                elif top_room_number_before == smallest_room_number_after:
                    continue
                else:
                    print('deck',deck,'side',side)
                    print(top_room_number_before,smallest_room_number_after)
                    options.append([deck,side,top_room_number_before,smallest_room_number_after])
    
    if len(options) == 1:
        option = options[0]
        print("groupsize",len(df2[df2.Group == passenger.Group]))
        if option[2] + 2 == option[3] and len(df2[df2.Group == passenger.Group]) == 1:
            print("only option")
            print("options",options)
            print("index",index)
            df2.loc[index,'Side'] = option[1]
            df2.loc[index,'Deck'] = option[0]
            df2.loc[index,'Number'] = int(option[2] + 1)
            df2.loc[index,'Cabin'] = str(option[0]) + "/" + str(option[1]) + "/" + str(int(option[2] + 1))

                
    
    

    


index 404
passenger PassengerId        0293_01
HomePlanet          Europa
Cabin                  NaN
Destination    TRAPPIST-1e
Group                 0293
GroupNumber             01
Deck                   NaN
Number                 NaN
Side                   NaN
Set                   Test
LastName         Suptibler
GroupSize                1
Name: 404, dtype: object
deck B side P
12.0 14.0
deck C side S
12.0 14.0

index 421
passenger PassengerId        0310_01
HomePlanet          Europa
Cabin                  NaN
Destination    TRAPPIST-1e
Group                 0310
GroupNumber             01
Deck                   NaN
Number                 NaN
Side                   NaN
Set                  Train
LastName          Coudered
GroupSize                1
Name: 421, dtype: object
deck B side P
12.0 14.0
deck C side S
12.0 14.0

index 479
passenger PassengerId        0348_02
HomePlanet            Mars
Cabin                  NaN
Destination    TRAPPIST-1e
Group                 0348
GroupNum

# there arent any free rooms for it so has to share


In [292]:

for index, passenger in df2[df2.Cabin.isna()].iterrows():
    print("\nindex", index)
    print("passenger",passenger)
    options = False
    for deck in decks_by_planet[passenger.HomePlanet]:
        for side in cabin_sides:
            
            before_slice = df2.iloc[:index]
            after_slice = df2.iloc[index+1:] 
            top_room_number_before = np.max(before_slice[(before_slice.Deck == deck) & (before_slice.Side == side)].Number)
            smallest_room_number_after = np.min(after_slice[(after_slice.Deck == deck) & (after_slice.Side == side)].Number)
            
            if top_room_number_before + 1 != smallest_room_number_after:
                if pd.isna(top_room_number_before) and smallest_room_number_after == 0:
                    continue
                elif top_room_number_before == smallest_room_number_after:
                    continue
                else:
                    print('deck',deck)
                    print('side',side)
                    print(top_room_number_before,smallest_room_number_after)
                    options = True
                    break
        if options:
            break
    if not options:
        print("no options")
        other_group_member = df2[(df2.Group == passenger.Group) & (~df2.Cabin.isna())]
        print("other_group_members",other_group_member)
        if len(other_group_member.Cabin.unique()) == 1:
            df2.loc[index,'Cabin'] = other_group_member.iloc[0].Cabin

            print("Cabin updated to", other_group_member.iloc[0].Cabin)

  
            # Slice the DataFrame first and then apply the boolean mask
        



index 404
passenger PassengerId        0293_01
HomePlanet          Europa
Cabin                  NaN
Destination    TRAPPIST-1e
Group                 0293
GroupNumber             01
Deck                   NaN
Number                 NaN
Side                   NaN
Set                   Test
LastName         Suptibler
GroupSize                1
Name: 404, dtype: object
deck B
side P
12.0 14.0

index 421
passenger PassengerId        0310_01
HomePlanet          Europa
Cabin                  NaN
Destination    TRAPPIST-1e
Group                 0310
GroupNumber             01
Deck                   NaN
Number                 NaN
Side                   NaN
Set                  Train
LastName          Coudered
GroupSize                1
Name: 421, dtype: object
deck B
side P
12.0 14.0

index 479
passenger PassengerId        0348_02
HomePlanet            Mars
Cabin                  NaN
Destination    TRAPPIST-1e
Group                 0348
GroupNumber             02
Deck                   NaN
Nu

In [268]:
df2[df2.Cabin.isna()]

Unnamed: 0,PassengerId,HomePlanet,Cabin,Destination,Group,GroupNumber,Deck,Number,Side,Set,LastName,GroupSize
315,0227_01,Earth,,TRAPPIST-1e,0227,01,,,,Test,Buckentry,1
404,0293_01,Europa,,TRAPPIST-1e,0293,01,,,,Test,Suptibler,1
421,0310_01,Europa,,TRAPPIST-1e,0310,01,,,,Train,Coudered,1
440,0323_01,Earth,,55 Cancri e,0323,01,,,,Test,Gaineyerson,1
479,0348_02,Mars,,TRAPPIST-1e,0348,02,,,,Train,Mane,2
...,...,...,...,...,...,...,...,...,...,...,...,...
12768,9138_01,Europa,,TRAPPIST-1e,9138,01,,,,Test,Trupistic,1
12892,9223_01,Mars,,TRAPPIST-1e,9223,01,,,,Test,Sun,2
12893,9223_02,Mars,,TRAPPIST-1e,9223,02,,,,Test,Sun,2
12918,9238_05,Earth,,TRAPPIST-1e,9238,05,,,,Test,Emenez,7


Cases
* there arent any free rooms for it so has to share
* its the only person that could fill that room 

In [271]:
df2.isna().sum()

PassengerId      0
HomePlanet      10
Cabin          139
Destination    274
Group            0
GroupNumber      0
Deck           195
Number         195
Side           195
Set              0
LastName       294
GroupSize        0
dtype: int64

# free room where only one person can take it

In [294]:
def rooms_to_fill(df):
    rooms = {}
    for deck in df.dropna(subset = ['Deck']).Deck.unique():
        for side in df.dropna(subset = ['Side']).Side.unique():
            rooms_seen = list(df[(df.Deck == deck) & (df.Side == side)].dropna(subset = ['Number']).Number.unique())
            largest_room_number = int(max(list(df[(df.Deck == deck) & (df.Side == side)].dropna(subset = ['Number']).Number.unique())))
            for i in range(largest_room_number):
                if i not in rooms_seen:
                    if deck not in rooms:
                        rooms[deck] = {'P':[],'S':[]}
                    rooms[deck][side].append(i)
    return rooms

rooms = rooms_to_fill(df2)
                    
                    
                    
            

In [295]:
def fill_by_empty_room(df):
    for deck in rooms.keys():
        for side in ['P','S']:
            for number in rooms[deck][side]:
                indices_matching = []
                print()
                print('deck',deck,'side',side,'number',number)
                temp = df[(df.Cabin.isna()) & (pd.isna(df.HomePlanet) | df.HomePlanet.isin(planet_by_deck[deck]))]
                for index,row in temp.iterrows():
                    before_slice = df.iloc[:index]
                    after_slice = df.iloc[index+1:] 
                    if max(list(before_slice[(before_slice.Deck == deck) & (before_slice.Side == side)].Number.unique())) < number:
                        if min(list(after_slice[(after_slice.Deck == deck) & (after_slice.Side == side)].Number.unique())) > number:
                            indices_matching.append(index)
                if len(indices_matching) == 1:
                    df.loc[indices_matching[0],'Cabin'] = deck + "/" + str(number) + "/" + side
                    df.loc[indices_matching[0],'Side'] = side
                    df.loc[indices_matching[0],'Deck'] = deck
                    df.loc[indices_matching[0],'Number'] = number            
fill_by_empty_room(df2)


deck B side P number 13

deck B side P number 98

deck B side P number 99

deck F side P number 519

deck F side P number 1489

deck F side P number 1544

deck F side S number 1267

deck F side S number 1424

deck F side S number 1785

deck A side P number 94

deck G side P number 590

deck G side P number 1157

deck G side P number 1286

deck G side P number 1474

deck G side S number 579

deck G side S number 1206

deck G side S number 1212

deck G side S number 1282

deck C side S number 13

deck C side S number 40

deck C side S number 193

deck C side S number 214

deck C side S number 270

deck C side S number 298

deck E side P number 20

deck E side P number 21

deck E side P number 58

deck E side P number 150

deck E side P number 387

deck E side S number 300

deck E side S number 301

deck E side S number 528

deck D side P number 191

deck D side P number 235

deck D side S number 36


In [296]:
df2[df2.Cabin.isna()]

Unnamed: 0,PassengerId,HomePlanet,Cabin,Destination,Group,GroupNumber,Deck,Number,Side,Set,LastName,GroupSize
404,0293_01,Europa,,TRAPPIST-1e,0293,01,,,,Test,Suptibler,1
421,0310_01,Europa,,TRAPPIST-1e,0310,01,,,,Train,Coudered,1
479,0348_02,Mars,,TRAPPIST-1e,0348,02,,,,Train,Mane,2
505,0364_02,Mars,,TRAPPIST-1e,0364,02,,,,Test,Chité,2
517,0374_02,Earth,,TRAPPIST-1e,0374,02,,,,Test,Sterreray,2
...,...,...,...,...,...,...,...,...,...,...,...,...
12671,9070_01,Earth,,TRAPPIST-1e,9070,01,,,,Test,Flynney,2
12694,9081_03,Earth,,TRAPPIST-1e,9081,03,,,,Train,Clemondsey,8
12892,9223_01,Mars,,TRAPPIST-1e,9223,01,,,,Test,Sun,2
12893,9223_02,Mars,,TRAPPIST-1e,9223,02,,,,Test,Sun,2


In [297]:
rooms

{'B': {'P': [13, 98, 99], 'S': []},
 'F': {'P': [519, 1489, 1544], 'S': [1267, 1424, 1785]},
 'A': {'P': [94], 'S': []},
 'G': {'P': [590, 1157, 1286, 1474], 'S': [579, 1206, 1212, 1282]},
 'C': {'P': [], 'S': [13, 40, 193, 214, 270, 298]},
 'E': {'P': [20, 21, 58, 150, 387], 'S': [300, 301, 528]},
 'D': {'P': [191, 235], 'S': [36]}}

# remaining cabins

# which cabin for empty people

In [304]:
def remaining_cabins(df):
    for index, passenger in df[df.Cabin.isna()].iterrows():
        print("\nindex", index)
        print("passenger",passenger.PassengerId)
        options = []
        for deck in decks_by_planet[passenger.HomePlanet]:
            for side in cabin_sides:
                
                before_slice = df.iloc[:index]
                after_slice = df.iloc[index+1:] 
                top_room_number_before = np.max(before_slice[(before_slice.Deck == deck) & (before_slice.Side == side)].Number)
                smallest_room_number_after = np.min(after_slice[(after_slice.Deck == deck) & (after_slice.Side == side)].Number)
                
                if top_room_number_before + 1 != smallest_room_number_after:
                    if pd.isna(top_room_number_before) and smallest_room_number_after == 0:
                        continue
                    elif top_room_number_before == smallest_room_number_after:
                        continue
                    else:
                        options.append([deck,side,top_room_number_before,smallest_room_number_after])
        print(options)
remaining_cabins(df2)


index 404
passenger 0293_01
[['B', 'P', 12.0, 14.0], ['C', 'S', 12.0, 14.0]]

index 421
passenger 0310_01
[['B', 'P', 12.0, 14.0], ['C', 'S', 12.0, 14.0]]

index 479
passenger 0348_02
[['E', 'P', 19.0, 22.0]]

index 505
passenger 0364_02
[['E', 'P', 19.0, 22.0]]

index 517
passenger 0374_02
[['E', 'P', 19.0, 22.0]]

index 1401
passenger 0992_04
[['E', 'P', 57.0, 59.0]]

index 1423
passenger 1006_03
[['E', 'P', 57.0, 59.0]]

index 1429
passenger 1011_01
[['E', 'P', 57.0, 59.0]]

index 1466
passenger 1041_01
[['C', 'S', 39.0, 41.0], ['D', 'S', 35.0, 37.0], ['E', 'P', 57.0, 59.0]]

index 1543
passenger 1095_01
[['C', 'S', 39.0, 41.0], ['D', 'S', 35.0, 37.0]]

index 2414
passenger 1688_02
[]

index 2442
passenger 1709_03
[]

index 2970
passenger 2092_03
[]

index 3416
passenger 2425_05
[]

index 3529
passenger 2513_01
[['E', 'P', 149.0, 151.0], ['F', 'P', 518.0, 520.0]]

index 3530
passenger 2514_01
[['E', 'P', 149.0, 151.0], ['F', 'P', 518.0, 520.0]]

index 4233
passenger 3034_01
[['B', 

In [None]:
for deck in cabin

# which people for empty cabin

In [302]:
def fill_by_empty_room(df):
    for deck in rooms.keys():
        for side in ['P','S']:
            for number in rooms[deck][side]:
                indices_matching = []
                print()
                print('deck',deck,'side',side,'number',number)
                temp = df[(df.Cabin.isna()) & (pd.isna(df.HomePlanet) | df.HomePlanet.isin(planet_by_deck[deck]))]
                for index,row in temp.iterrows():
                    before_slice = df.iloc[:index]
                    after_slice = df.iloc[index+1:] 
                    if max(list(before_slice[(before_slice.Deck == deck) & (before_slice.Side == side)].Number.unique())) < number:
                        if min(list(after_slice[(after_slice.Deck == deck) & (after_slice.Side == side)].Number.unique())) > number:
                            indices_matching.append(index)
                print(df.iloc[indices_matching].PassengerId)
                print()
                print()
fill_by_empty_room(df2)


deck B side P number 13
404    0293_01
421    0310_01
Name: PassengerId, dtype: object



deck B side P number 98
4233    3034_01
4254    3053_01
Name: PassengerId, dtype: object



deck B side P number 99
4233    3034_01
4254    3053_01
Name: PassengerId, dtype: object



deck F side P number 519
3529    2513_01
3530    2514_01
Name: PassengerId, dtype: object



deck F side P number 1489
10081    7182_01
10082    7183_01
Name: PassengerId, dtype: object



deck F side P number 1544
10434    7463_01
10440    7469_01
Name: PassengerId, dtype: object



deck F side S number 1267
9265    6612_03
9267    6612_05
Name: PassengerId, dtype: object



deck F side S number 1424
10394    7429_01
10408    7440_01
10411    7442_02
10434    7463_01
Name: PassengerId, dtype: object



deck F side S number 1785
12892    9223_01
12893    9223_02
Name: PassengerId, dtype: object



deck A side P number 94
12651    9057_01
12668    9069_03
Name: PassengerId, dtype: object



deck G side P number 590
5

In [306]:
for deck in df.Deck.unique():
    for side in df.Side.unique():
        max_size = 0
        for cabin in df[(df.Deck == deck) & (df.Side == side)].Cabin:
            max_size = max(len(df[df.Cabin == cabin]),max_size)
        print('deck',deck,'side',side,'max_size',max_size)
            
        

deck B side P max_size 7
deck B side S max_size 7
deck B side nan max_size 0
deck F side P max_size 7
deck F side S max_size 5
deck F side nan max_size 0
deck A side P max_size 6
deck A side S max_size 6
deck A side nan max_size 0
deck G side P max_size 8
deck G side S max_size 8
deck G side nan max_size 0
deck nan side P max_size 0
deck nan side S max_size 0
deck nan side nan max_size 0
deck C side P max_size 7
deck C side S max_size 7
deck C side nan max_size 0
deck E side P max_size 6
deck E side S max_size 7
deck E side nan max_size 0
deck D side P max_size 6
deck D side S max_size 7
deck D side nan max_size 0
deck T side P max_size 1
deck T side S max_size 3
deck T side nan max_size 0


## what can be deduced with these cabins now filled ?