In [796]:
import pandas as pd
import numpy as np

traindf = pd.read_csv('data/train.csv')
testdf = pd.read_csv('data/test.csv')
traindf['Set'] = 'Train'
testdf['Set'] = 'Test'
togeth = pd.concat([testdf,traindf])


In [797]:
homeplanets = ['Earth', 'Europa', 'Mars']

In [798]:
decks_by_planet = {
    'Earth':['E','F','G'],
    'Europa': ['A','B','C','D','E','T'],
    'Mars': ['D','E','F']
}

cabin_sides = ['P','S']

## datasplits 

In [799]:
def splits(data_frame):
    split_df = data_frame['PassengerId'].str.split('_', expand=True)
    data_frame[['Group', 'GroupNumber']] = split_df

    split_df =  data_frame['Cabin'].str.split("/", expand = True)
    data_frame[['Deck', 'Number', 'Side']]= split_df
    data_frame.Number = data_frame.Number.astype('float')
    
    split_df = data_frame['Name'].str.split(" ",expand = True)
    data_frame[['FristName','LastName']] = split_df


    return data_frame

togeth = splits(togeth)
togeth = togeth.sort_values(by = ['Group','GroupNumber'])
togeth = togeth.reset_index()
togeth = togeth[['PassengerId','HomePlanet','Cabin','Destination','Group','GroupNumber','Deck','Number','Side','Set','LastName']]


In [800]:
def group_size(df):
    def group_size_apply(row):
        group = df[df.Group == row.Group]
        return len(group)
    df['GroupSize'] = df.apply(group_size_apply, axis = 1)
    return df
togeth = group_size(togeth)

In [801]:
togeth.head()

Unnamed: 0,PassengerId,HomePlanet,Cabin,Destination,Group,GroupNumber,Deck,Number,Side,Set,LastName,GroupSize
0,0001_01,Europa,B/0/P,TRAPPIST-1e,1,1,B,0.0,P,Train,Ofracculy,1
1,0002_01,Earth,F/0/S,TRAPPIST-1e,2,1,F,0.0,S,Train,Vines,1
2,0003_01,Europa,A/0/S,TRAPPIST-1e,3,1,A,0.0,S,Train,Susent,2
3,0003_02,Europa,A/0/S,TRAPPIST-1e,3,2,A,0.0,S,Train,Susent,2
4,0004_01,Earth,F/1/S,TRAPPIST-1e,4,1,F,1.0,S,Train,Santantines,1


In [802]:
def impute_attribute_based_on_shared_feature(df, attribute, feature):
    # Function to fill NaNs in a group based on the first available non-NaN value
    def fill_with_first_available(group_df):
        if not group_df[attribute].dropna().empty:
            first_available = group_df[attribute].dropna().iloc[0]
            group_df[attribute] = group_df[attribute].fillna(first_available)
        return group_df

    # Filter out rows where the feature is NaN, then apply the fill function to each group
    filtered_df = df.dropna(subset=[feature])
    imputed_df = filtered_df.groupby(feature, group_keys=False).apply(fill_with_first_available)
    
    # Merge back with original df rows where the feature is NaN to retain those rows unchanged
    df_with_nan_feature = df[df[feature].isna()]
    final_df = pd.concat([imputed_df, df_with_nan_feature], ignore_index=True)

    return final_df

## Certain imputes

In [803]:
togeth = impute_attribute_based_on_shared_feature(togeth,'HomePlanet','Group')

  imputed_df = filtered_df.groupby(feature, group_keys=False).apply(fill_with_first_available)


# by last name

In [804]:
togeth = impute_attribute_based_on_shared_feature(togeth,'HomePlanet','LastName')

  imputed_df = filtered_df.groupby(feature, group_keys=False).apply(fill_with_first_available)


In [805]:
togeth = togeth.sort_values(by = ['Group','GroupNumber'])
togeth = togeth.reset_index(drop = True)


In [806]:
def home_planet_from_deck_abc(df):
    df.loc[df['Deck'].isin(['A', 'B', 'C']), 'HomePlanet'] = df.loc[df['Deck'].isin(['A', 'B', 'C']), 'HomePlanet'].fillna('Europa')
    return df
def home_planet_from_deck_g(df):
    df.loc[df['Deck']== 'G', 'HomePlanet'] = df.loc[df['Deck'] == 'G', 'HomePlanet'].fillna('Earth')
    return df


## do i need these for my imputing?

In [807]:
togeth = home_planet_from_deck_abc(togeth)
togeth = home_planet_from_deck_g(togeth)

In [808]:
togeth[togeth.HomePlanet.isna()]

Unnamed: 0,PassengerId,HomePlanet,Cabin,Destination,Group,GroupNumber,Deck,Number,Side,Set,LastName,GroupSize
870,0616_01,,E/40/S,TRAPPIST-1e,616,1,E,40.0,S,Test,Spriney,1
3439,2443_01,,D/72/P,TRAPPIST-1e,2443,1,D,72.0,P,Train,,1
3946,2817_01,,F/584/P,TRAPPIST-1e,2817,1,F,584.0,P,Train,Sutty,1
4632,3331_01,,F/631/S,TRAPPIST-1e,3331,1,F,631.0,S,Train,,1
5687,4052_01,,D/136/P,TRAPPIST-1e,4052,1,D,136.0,P,Test,Coneveseng,1
6786,4840_01,,F/915/S,TRAPPIST-1e,4840,1,F,915.0,S,Train,,1
8528,6108_01,,F/1166/S,TRAPPIST-1e,6108,1,F,1166.0,S,Train,,1
9916,7065_01,,E/471/S,TRAPPIST-1e,7065,1,E,471.0,S,Test,Maltorted,1
11757,8435_01,,F/1730/P,TRAPPIST-1e,8435,1,F,1730.0,P,Test,,1
12244,8775_01,,D/275/P,TRAPPIST-1e,8775,1,D,275.0,P,Test,Maltorted,1


In [809]:
togeth.isna().sum()

PassengerId      0
HomePlanet      10
Cabin          299
Destination    274
Group            0
GroupNumber      0
Deck           299
Number         299
Side           299
Set              0
LastName       294
GroupSize        0
dtype: int64

# Filling missed Cabins


In [810]:
used_cabins = {}
for deck in list(togeth.dropna().Deck.unique()):
    cab = {}
    for side in list(togeth.dropna().Side.unique()):
        rooms = list(togeth[(togeth.Deck == deck) & (togeth.Side == side)].dropna().Number.unique())
        rooms.sort()
        cab[side] = rooms
    used_cabins[deck] = cab
    

#### fill people that must be sharing a room with people in the same group
ie, from their chome planet we know the decks they could be in, if the side and number below and above are one above and one below then they must be sharing with someone from their group. make sure that their group has only one cabin

need to make sure is only one person  with cabin being na in the group otherwise one na might be in another cabin 

In [None]:
df = togeth.copy()

In [None]:
df2 = togeth.copy()

for row in dataframe

if cabin.isna() and homeplanet is known,
 for potential decks in homeplanets remit ie europa check a, b , c in dataframe up to that point and over that point, see if any have a free space and if not then fill it with the one in the same group

In [None]:
df = checks(df)


Earth
Deck
G    3781
F    2475
E     595
Name: count, dtype: int64

Europa
Deck
B    1141
C    1102
A     354
D     306
E     203
T      11
Name: count, dtype: int64

Mars
Deck
F    1759
E     523
D     411
Name: count, dtype: int64


Finding groups that have more than 1 member that all share the same cabin and that dont have other nans in group

In [None]:
for index,passenger in df[df.Cabin.isna()].iterrows():
    print()
    print("index",index)
    print("passenger",passenger)
    options = 0
    for deck in decks_by_planet[passenger.HomePlanet]:
        for side in cabin_sides:
            top_room_number_before = np.max(df[(df.Deck == deck) & (df.Side == side)].iloc[:index].Number)
            smallest_room_number_after = np.max(df[(df.Deck == deck) & (df.Side == side)].iloc[index + 1:].Number)
            if top_room_number_before + 1 != smallest_room_number_after:
                print(top_room_number_before,smallest_room_number_after)
                options += 1
                break
        if options > 1:
            break
    if options == 0:
        other_group_member = df[df.Group == passenger.Group].dropna(subset = 'Cabin')
        if len(other_group_member.Cabin.unique()) == 1:
            df.loc[index,'Cabin'] = other_group_member.iloc[0].Cabin

        
                
                
    
    

    

14.0 598.0
11.0 1894.0
40.0 98.0
36.0 301.0
102.0 297.0
129.0 598.0
89.0 98.0
86.0 301.0
298.0 598.0
261.0 1894.0
265.0 297.0
314.0 598.0
269.0 297.0
318.0 598.0
359.0 598.0
325.0 1894.0
98.0 nan
230.0 301.0
98.0 nan
236.0 301.0
388.0 598.0
353.0 1894.0
98.0 nan
245.0 301.0
415.0 598.0
381.0 1894.0
419.0 598.0
382.0 1894.0
297.0 nan
447.0 598.0
459.0 598.0
424.0 1894.0
297.0 nan
470.0 598.0
479.0 598.0
448.0 1894.0
98.0 nan
301.0 nan
98.0 nan
301.0 nan
598.0 nan
565.0 1894.0
98.0 nan
301.0 nan
297.0 nan
598.0 nan
598.0 nan
620.0 1894.0
598.0 nan
635.0 1894.0
297.0 nan
598.0 nan
98.0 nan
301.0 nan
98.0 nan
301.0 nan
98.0 nan
301.0 nan
598.0 nan
914.0 1894.0
598.0 nan
921.0 1894.0
98.0 nan
301.0 nan
98.0 nan
301.0 nan
598.0 nan
1025.0 1894.0
598.0 nan
1027.0 1894.0
98.0 nan
301.0 nan
598.0 nan
1085.0 1894.0
598.0 nan
1092.0 1894.0
297.0 nan
598.0 nan
98.0 nan
301.0 nan
598.0 nan
1236.0 1894.0
598.0 nan
1243.0 1894.0
598.0 nan
1255.0 1894.0
98.0 nan
301.0 nan
98.0 nan
301.0 nan
598.0 nan


In [995]:
# Analyze each group
def filter_group(group):
    # Check if all non-NaN Cabin values in the group are the same and no NaNs except for one
    if group['Cabin'].nunique(dropna=True) == 1 and group['Cabin'].isna().sum() == 1:
        return group[group['Cabin'].isna()].index  # Return index of NaN Cabin if conditions met

# Apply function and collect results
result_indices = df.groupby('Group').apply(filter_group)

# Drop NaNs from result (groups that did not meet criteria will produce NaN entries)
filtered_indices = result_indices.dropna()
filtered_indices = [row.item() for row in filtered_indices]


  result_indices = df.groupby('Group').apply(filter_group)


In [999]:
for index, passenger in df.loc[filtered_indices].iterrows():
    print()
    print("passenger",passenger)
    potentials = []
    for deck in decks_by_planet[passenger.HomePlanet]:
        for side in cabin_sides:
            first_member_of_group_index = df[df.Group == passenger.Group].index[0]                
            last_member_of_group_index = df[df.Group == passenger.Group].index[-1]
            
            before_passenger = df.iloc[:first_member_of_group_index]
            top_room_number_before = np.max(before_passenger[(before_passenger.Deck == deck) & (before_passenger.Side == side)].Number)
            after_passenger = df.iloc[last_member_of_group_index + 1:]
            smallest_room_number_after = np.min(after_passenger[(after_passenger.Deck == deck) & (after_passenger.Side == side)].Number)
        if smallest_room_number_after == 0:
            continue
        elif top_room_number_before + 1 != smallest_room_number_after:
            potentials.append([deck,side,top_room_number_before,smallest_room_number_after])
    print("potentials", potentials)
   
    if len(potentials) == 1:
        other_group_member = df[df.Group == passenger.Group].dropna(subset = 'Cabin').iloc[0]
        print("other_group_member",other_group_member)
        if other_group_member.Deck == potentials[0][0] and other_group_member.Side == potentials[0][1]:
            print("same deck")
            if other_group_member.Number + 1 == potentials[0][3]:
                print("upper good")
                if other_group_member.Number -1 == potentials[0][2] or pd.isna(potentials[0][2]):
                    print("lower good")
                    df.loc[index,['Cabin','Side','Number','Deck']] = other_group_member[['Cabin','Side','Number','Deck']]     
                    print("COMPLETE!!")
                    print(index,other_group_member.Cabin)               
        
                    
                

        
        
    



passenger PassengerId        0047_02
HomePlanet          Europa
Cabin                B/0/S
Destination    TRAPPIST-1e
Group                 0047
GroupNumber             02
Deck                     B
Number                 0.0
Side                     S
Set                   Test
LastName          Prucerod
GroupSize                3
Name: 66, dtype: object
potentials [['B', 'S', nan, 1.0]]
other_group_member PassengerId        0047_01
HomePlanet          Europa
Cabin                B/0/S
Destination    TRAPPIST-1e
Group                 0047
GroupNumber             01
Deck                     B
Number                 0.0
Side                     S
Set                   Test
LastName          Prucerod
GroupSize                3
Name: 65, dtype: object
same deck
upper good
lower good
COMPLETE!!
66 B/0/S

passenger PassengerId        0110_01
HomePlanet          Europa
Cabin                  NaN
Destination    TRAPPIST-1e
Group                 0110
GroupNumber             01
Deck           

Cases
* there arent any free rooms for it so has to share
* its the only person that could fill that room 

In [1000]:
df.isna().sum()

PassengerId      0
HomePlanet      10
Cabin          261
Destination    274
Group            0
GroupNumber      0
Deck           261
Number         261
Side           261
Set              0
LastName       294
GroupSize        0
dtype: int64

## what can be deduced with these cabins now filled ?