In [721]:
import pandas as pd
import numpy as np

traindf = pd.read_csv('data/train.csv')
testdf = pd.read_csv('data/test.csv')
traindf['Set'] = 'Train'
testdf['Set'] = 'Test'
togeth = pd.concat([testdf,traindf])


In [722]:
homeplanets = ['Earth', 'Europa', 'Mars']

In [723]:
decks_by_planet = {
    'earth':['E','F','G'],
    'europa': ['A','B','C','D','E','T'],
    'mars': ['D','E','F']
}

cabin_sides = ['P','S']

## datasplits 

In [724]:
def splits(dataframe):
    split_df = dataframe['PassengerId'].str.split('_', expand=True)
    dataframe[['Group', 'GroupNumber']] = split_df

    split_df =  dataframe['Cabin'].str.split("/", expand = True)
    dataframe[['Deck', 'Number', 'Side']]= split_df
    
    split_df = dataframe['Name'].str.split(" ",expand = True)
    dataframe[['FristName','LastName']] = split_df


    return dataframe

togeth = splits(togeth)
togeth = togeth.sort_values(by = ['Group','GroupNumber'])
togeth = togeth.reset_index()
togeth = togeth[['PassengerId','HomePlanet','Cabin','Destination','Group','GroupNumber','Deck','Number','Side','Set','LastName']]


In [725]:
def group_size(df):
    def group_size_apply(row):
        group = df[df.Group == row.Group]
        return len(group)
    df['GroupSize'] = df.apply(group_size_apply, axis = 1)
    return df
togeth = group_size(togeth)

In [726]:
togeth.head()

Unnamed: 0,PassengerId,HomePlanet,Cabin,Destination,Group,GroupNumber,Deck,Number,Side,Set,LastName,GroupSize
0,0001_01,Europa,B/0/P,TRAPPIST-1e,1,1,B,0,P,Train,Ofracculy,1
1,0002_01,Earth,F/0/S,TRAPPIST-1e,2,1,F,0,S,Train,Vines,1
2,0003_01,Europa,A/0/S,TRAPPIST-1e,3,1,A,0,S,Train,Susent,2
3,0003_02,Europa,A/0/S,TRAPPIST-1e,3,2,A,0,S,Train,Susent,2
4,0004_01,Earth,F/1/S,TRAPPIST-1e,4,1,F,1,S,Train,Santantines,1


In [727]:
def impute_attribute_based_on_shared_feature(df, attribute, feature):
    # Function to fill NaNs in a group based on the first available non-NaN value
    def fill_with_first_available(group_df):
        if not group_df[attribute].dropna().empty:
            first_available = group_df[attribute].dropna().iloc[0]
            group_df[attribute] = group_df[attribute].fillna(first_available)
        return group_df

    # Filter out rows where the feature is NaN, then apply the fill function to each group
    filtered_df = df.dropna(subset=[feature])
    imputed_df = filtered_df.groupby(feature, group_keys=False).apply(fill_with_first_available)
    
    # Merge back with original df rows where the feature is NaN to retain those rows unchanged
    df_with_nan_feature = df[df[feature].isna()]
    final_df = pd.concat([imputed_df, df_with_nan_feature], ignore_index=True)

    return final_df

## Certain imputes

In [728]:
togeth = impute_attribute_based_on_shared_feature(togeth,'HomePlanet','Group')

  imputed_df = filtered_df.groupby(feature, group_keys=False).apply(fill_with_first_available)


# by last name

In [729]:
togeth = impute_attribute_based_on_shared_feature(togeth,'HomePlanet','LastName')

  imputed_df = filtered_df.groupby(feature, group_keys=False).apply(fill_with_first_available)


In [730]:
togeth = togeth.sort_values(by = ['Group','GroupNumber'])
togeth = togeth.reset_index(drop = True)


In [731]:
def home_planet_from_deck_abc(df):
    df.loc[df['Deck'].isin(['A', 'B', 'C']), 'HomePlanet'] = df.loc[df['Deck'].isin(['A', 'B', 'C']), 'HomePlanet'].fillna('Europa')
    return df
def home_planet_from_deck_g(df):
    df.loc[df['Deck']== 'G', 'HomePlanet'] = df.loc[df['Deck'] == 'G', 'HomePlanet'].fillna('Earth')
    return df


## do i need these for my imputing?

In [732]:
togeth = home_planet_from_deck_abc(togeth)
togeth = home_planet_from_deck_g(togeth)

In [733]:
togeth[togeth.HomePlanet.isna()]

Unnamed: 0,PassengerId,HomePlanet,Cabin,Destination,Group,GroupNumber,Deck,Number,Side,Set,LastName,GroupSize
870,0616_01,,E/40/S,TRAPPIST-1e,616,1,E,40,S,Test,Spriney,1
3439,2443_01,,D/72/P,TRAPPIST-1e,2443,1,D,72,P,Train,,1
3946,2817_01,,F/584/P,TRAPPIST-1e,2817,1,F,584,P,Train,Sutty,1
4632,3331_01,,F/631/S,TRAPPIST-1e,3331,1,F,631,S,Train,,1
5687,4052_01,,D/136/P,TRAPPIST-1e,4052,1,D,136,P,Test,Coneveseng,1
6786,4840_01,,F/915/S,TRAPPIST-1e,4840,1,F,915,S,Train,,1
8528,6108_01,,F/1166/S,TRAPPIST-1e,6108,1,F,1166,S,Train,,1
9916,7065_01,,E/471/S,TRAPPIST-1e,7065,1,E,471,S,Test,Maltorted,1
11757,8435_01,,F/1730/P,TRAPPIST-1e,8435,1,F,1730,P,Test,,1
12244,8775_01,,D/275/P,TRAPPIST-1e,8775,1,D,275,P,Test,Maltorted,1


In [734]:
togeth.isna().sum()

PassengerId      0
HomePlanet      10
Cabin          299
Destination    274
Group            0
GroupNumber      0
Deck           299
Number         299
Side           299
Set              0
LastName       294
GroupSize        0
dtype: int64

# Filling missed Cabins


In [735]:
used_cabins = {}
for deck in list(togeth.dropna().Deck.unique()):
    cab = {}
    for side in list(togeth.dropna().Side.unique()):
        rooms = list(togeth[(togeth.Deck == deck) & (togeth.Side == side)].dropna().Number.unique())
        rooms.sort()
        cab[side] = rooms
    used_cabins[deck] = cab
    

#### fill people that must be sharing a room with people in the same group
ie, from their chome planet we know the decks they could be in, if the side and number below and above are one above and one below then they must be sharing with someone from their group. make sure that their group has only one cabin

need to make sure is only one person  with cabin being na in the group otherwise one na might be in another cabin 

In [736]:
df = togeth.copy()

for row in dataframe

if cabin.isna() and homeplanet is known,
 for potential decks in homeplanets remit ie europa check a, b , c in dataframe up to that point and over that point, see if any have a free space and if not then fill it with the one in the same group

In [737]:
df = checks(df)


Earth
Deck
G    3781
F    2475
E     595
Name: count, dtype: int64

Europa
Deck
B    1141
C    1102
A     354
D     306
E     203
T      11
Name: count, dtype: int64

Mars
Deck
F    1759
E     523
D     411
Name: count, dtype: int64


In [738]:
passenger_index = 1000
deck = 'A'

In [739]:
before_passenger = df.iloc[:passenger_index]
np.max((before_passenger[before_passenger.Deck == deck].Number))

'6'

Finding groups that have more than 1 member that all share the same cabin and that dont have other nans in group

In [740]:
# Analyze each group
def filter_group(group):
    # Check if all non-NaN Cabin values in the group are the same and no NaNs except for one
    if group['Cabin'].nunique(dropna=True) == 1 and group['Cabin'].isna().sum() == 1:
        return group[group['Cabin'].isna()].index  # Return index of NaN Cabin if conditions met

# Apply function and collect results
result_indices = df.groupby('Group').apply(filter_group)

# Drop NaNs from result (groups that did not meet criteria will produce NaN entries)
filtered_indices = result_indices.dropna()

  result_indices = df.groupby('Group').apply(filter_group)


In [744]:
type(filtered_indices)

pandas.core.series.Series

In [741]:
filtered_indices = [idx for sublist in result_indices for idx in sublist]


TypeError: 'NoneType' object is not iterable

In [720]:
for index, passenger in df.loc[filtered_indices].iterrows():
    print()
    print(passenger)
    if passenger.HomePlanet.isna():
        pass
    else:
        potentials = []
        for deck in decks_by_planet[passenger.HomePlanet]:
            for side in cabin_sides:
                first_member_of_group_index = df[df.Group == passenger.Group].index[0]                
                last_member_of_group_index = df[df.Group == passenger.Group].index[-1]
                
                before_passenger = df.iloc[:first_member_of_group_index]
                top_room_number_before = np.max(before_passenger[(before_passenger.Deck == deck) & (before_passenger.Side == side)].Number)
                after_passenger = df.iloc[last_member_of_group_index + 1:]
                smallest_room_number_after = np.min(after_passenger[(after_passenger.Deck == deck) & (after_passenger.Side == side)].Number)
                
                if top_room_number_before + 1 != smallest_room_number_after:
                    potentials.append([deck,side,top_room_number_before,smallest_room_number_after])
        print(potentials)
                    
        
                    
                

        
        
    


TypeError: unhashable type: 'Index'

In [684]:
df.iloc[62:72]

Unnamed: 0,PassengerId,HomePlanet,Cabin,Destination,Group,GroupNumber,Deck,Number,Side,Set,LastName,GroupSize
62,0046_01,Earth,G/4/P,TRAPPIST-1e,46,1,G,4.0,P,Test,Powery,3
63,0046_02,Earth,F/11/P,TRAPPIST-1e,46,2,F,11.0,P,Test,Powery,3
64,0046_03,Earth,F/11/P,TRAPPIST-1e,46,3,F,11.0,P,Test,Powery,3
65,0047_01,Europa,B/0/S,TRAPPIST-1e,47,1,B,0.0,S,Test,Prucerod,3
66,0047_02,Europa,,TRAPPIST-1e,47,2,,,,Test,Prucerod,3
67,0047_03,Europa,B/0/S,TRAPPIST-1e,47,3,B,0.0,S,Test,,3
68,0048_01,Earth,G/5/S,TRAPPIST-1e,48,1,G,5.0,S,Test,Deckerry,1
69,0049_01,Earth,E/1/P,TRAPPIST-1e,49,1,E,1.0,P,Test,Flynney,1
70,0050_01,Earth,E/1/S,55 Cancri e,50,1,E,1.0,S,Train,Lancis,1
71,0051_01,Earth,E/2/S,TRAPPIST-1e,51,1,E,2.0,S,Train,Johnshines,1


In [689]:
df[df.Group == '0047'].index[-1]

67

## what can be deduced with these cabins now filled ?