In [168]:
import pandas as pd

# Read both competition splits and tag every row with the split it came from.
traindf = pd.read_csv('data/train.csv').assign(Set='Train')
testdf = pd.read_csv('data/test.csv').assign(Set='Test')

# Test rows are stacked ahead of the train rows.
# NOTE: each frame keeps its own index here, so index labels in `togeth`
# are not unique (the same label can refer to a test row and a train row).
togeth = pd.concat([testdf, traindf])

# Home planets that appear in the data.
homeplanets = ['Earth', 'Europa', 'Mars']

## Splitting composite columns

In [169]:

def splits(dataframe):
    """Split the composite PassengerId, Cabin and Name columns into parts.

    Adds the following columns to ``dataframe`` (the frame is modified in
    place and also returned):

    * ``Group``, ``GroupNumber`` - the two halves of ``PassengerId``
      (split on ``"_"``), converted to float.
    * ``Deck``, ``Number``, ``Side`` - the three parts of ``Cabin``
      (split on ``"/"``); ``Number`` is converted to float.
    * ``FirstName``, ``LastName`` - ``Name`` split on the first space.

    Args:
        dataframe: frame with ``PassengerId``, ``Cabin`` and ``Name`` columns.

    Returns:
        The same ``dataframe`` object, with the new columns added.
    """
    # 'gggg_pp' -> group id and position within the group.
    dataframe[['Group', 'GroupNumber']] = dataframe['PassengerId'].str.split("_", expand=True)

    # 'deck/number/side' -> three separate columns.
    dataframe[['Deck', 'Number', 'Side']] = dataframe['Cabin'].str.split("/", expand=True)

    # Operate on the frame that was passed in; this used to read and write the
    # global `togeth`, which only worked when `togeth` itself was the argument.
    # NOTE: astype(str) is kept from the original; it turns a missing Name into
    # a string, so such rows end up with a FirstName but no LastName.
    dataframe['Name'] = dataframe['Name'].astype(str)
    # n=1 guarantees at most two output columns even if a name contains
    # more than one space (everything after the first space is the last name).
    dataframe[['FirstName', 'LastName']] = dataframe['Name'].str.split(" ", n=1, expand=True)

    numeric_columns = ['Group', 'GroupNumber', 'Number']
    dataframe[numeric_columns] = dataframe[numeric_columns].astype(float)

    return dataframe

# Add the Group/GroupNumber, Deck/Number/Side and FirstName/LastName columns.
togeth = splits(togeth)


## Certain imputes

#### Same name implies home planet

TODO: first report how many passengers share a last name with someone else, and can therefore have their home planet inferred from it.

In [170]:
# Only passengers whose home planet and last name are both known can be compared.
togeth_cleaned = togeth.dropna(subset=['HomePlanet', 'LastName'])


def is_uniform(homeplanets):
    """Return True when all entries collapse to exactly one distinct value."""
    distinct = set(homeplanets)
    return len(distinct) == 1


# One row per last name, carrying the list of home planets seen for that name.
grouped = (
    togeth_cleaned
    .groupby('LastName')['HomePlanet']
    .agg(list)
    .reset_index()
)
grouped['UniformPlanet'] = grouped['HomePlanet'].apply(is_uniform)

# Partition the last names by whether every member agrees on the planet.
uniform_groups = grouped.loc[grouped['UniformPlanet'] == True]
mixed_groups = grouped.loc[grouped['UniformPlanet'] == False]

# Head-count on each side of the partition (list length = passengers per name).
people_uniform = uniform_groups['HomePlanet'].apply(len).sum()
people_mixed = mixed_groups['HomePlanet'].apply(len).sum()

print(f"Number of people from groups where all share the same HomePlanet: {people_uniform}")
print(f"Number of people from groups where not all share the same HomePlanet: {people_mixed}")


Number of people from groups where all share the same HomePlanet: 12398
Number of people from groups where not all share the same HomePlanet: 0


#### Same group implies home planet

In [171]:
# Only passengers whose home planet and travel group are both known can be compared.
togeth_cleaned = togeth.dropna(subset=['HomePlanet', 'Group'])


def is_uniform(homeplanets):
    """Return True when all entries collapse to exactly one distinct value."""
    distinct = set(homeplanets)
    return len(distinct) == 1


# One row per travel group, carrying the list of home planets seen in that group.
grouped = (
    togeth_cleaned
    .groupby('Group')['HomePlanet']
    .agg(list)
    .reset_index()
)
grouped['UniformPlanet'] = grouped['HomePlanet'].apply(is_uniform)

# Partition the travel groups by whether every member agrees on the planet.
uniform_groups = grouped.loc[grouped['UniformPlanet'] == True]
mixed_groups = grouped.loc[grouped['UniformPlanet'] == False]

# Head-count on each side of the partition (list length = passengers per group).
people_uniform = uniform_groups['HomePlanet'].apply(len).sum()
people_mixed = mixed_groups['HomePlanet'].apply(len).sum()

print(f"Number of people from groups where all share the same HomePlanet: {people_uniform}")
print(f"Number of people from groups where not all share the same HomePlanet: {people_mixed}")


Number of people from groups where all share the same HomePlanet: 12682
Number of people from groups where not all share the same HomePlanet: 0


#### impute findings

In [180]:
def fill_home_planet_by_last_name(df, min_matches=1):
    """Fill missing ``HomePlanet`` values from passengers with the same last name.

    A row whose ``HomePlanet`` is missing receives the planet of the first row
    (in frame order) that has the same ``LastName`` and a known planet. Rows
    with a missing ``LastName`` are never filled.

    The fill is done by row position, not by index label: the previous
    ``df.at[index, ...]`` assignment wrote to *every* row sharing that label,
    which overwrote unrelated passengers when the index contains duplicates.

    Args:
        df: frame with ``LastName`` and ``HomePlanet`` columns. Modified in place.
        min_matches: minimum number of same-name rows with a known planet
            required before the name is used as a source (default 1).

    Returns:
        The same ``df`` object.
    """
    key = 'LastName'
    known = df.dropna(subset=[key, 'HomePlanet'])

    # Last names backed by enough known planets to be trusted as a source.
    support = known[key].value_counts()
    trusted = support[support >= min_matches].index

    # First known planet per last name, in original row order.
    first_planet = known.drop_duplicates(subset=key).set_index(key)['HomePlanet']
    first_planet = first_planet[first_planet.index.isin(trusted)]

    candidate = df[key].map(first_planet)
    # Plain boolean array -> positional selection, safe with repeated labels.
    fill_here = df['HomePlanet'].isna().to_numpy() & candidate.notna().to_numpy()
    if fill_here.any():
        df.loc[fill_here, 'HomePlanet'] = candidate.to_numpy()[fill_here]
    return df


def fill_home_planet_by_group(df):
    """Fill missing ``HomePlanet`` values from other members of the same group.

    A row whose ``HomePlanet`` is missing receives the planet of the first row
    (in frame order) that has the same ``Group`` and a known planet. Rows with
    a missing ``Group`` are never filled.

    The fill is done by row position, not by index label: the previous
    ``df.at[index, ...]`` assignment wrote to *every* row sharing that label,
    which overwrote unrelated passengers when the index contains duplicates.

    Args:
        df: frame with ``Group`` and ``HomePlanet`` columns. Modified in place.

    Returns:
        The same ``df`` object.
    """
    key = 'Group'
    known = df.dropna(subset=[key, 'HomePlanet'])

    # First known planet per group, in original row order.
    first_planet = known.drop_duplicates(subset=key).set_index(key)['HomePlanet']

    candidate = df[key].map(first_planet)
    # Plain boolean array -> positional selection, safe with repeated labels.
    fill_here = df['HomePlanet'].isna().to_numpy() & candidate.notna().to_numpy()
    if fill_here.any():
        df.loc[fill_here, 'HomePlanet'] = candidate.to_numpy()[fill_here]
    return df



## Home planet implies deck

In [176]:
# Number of missing values in each column.
togeth.isna().sum()

PassengerId        0
HomePlanet       283
CryoSleep        310
Cabin            299
Destination      274
Age              270
VIP              296
RoomService      263
FoodCourt        289
ShoppingMall     306
Spa              284
VRDeck           268
Name               0
Set                0
Transported     4277
Group              0
GroupNumber        0
Deck             299
Number           299
Side             299
FirstName          0
LastName         294
dtype: int64

In [178]:
# Fill missing home planets from other members of the same travel group.
togeth = fill_home_planet_by_group(togeth)

In [179]:
# For every home planet, show how its passengers are distributed over decks.
for home in homeplanets:
    deck_counts = togeth.loc[togeth.HomePlanet == home, 'Deck'].value_counts()
    print("\n" + home)
    print(deck_counts)


Earth
Deck
G    3719
F    2439
E     589
C       3
B       2
A       2
D       1
Name: count, dtype: int64

Europa
Deck
B    1133
C    1091
A     349
D     300
E     200
T      10
F       8
G       5
Name: count, dtype: int64

Mars
Deck
F    1736
E     514
D     410
G       8
B       2
C       2
A       1
Name: count, dtype: int64


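Apart from a handful of outliers (at most 8 passengers per deck in the counts above):

- People from Earth are only in decks G, F, E
- People from Europa are only in decks B, C, A, D, E, T
- People from Mars are only in decks F, E, D

The outliers should be checked: they may be artefacts of the imputation step rather than real passengers.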









In [145]:
def fill_home_planet_by_last_name(df, min_matches=2):
    """Fill missing ``HomePlanet`` values from passengers with the same last name.

    A row whose ``HomePlanet`` is missing receives the planet of the first row
    (in frame order) that has the same ``LastName`` and a known planet, but
    only when at least ``min_matches`` such rows exist. The default of 2
    reproduces the original ``len(non_nan_planet) > 1`` rule of this cell: a
    single namesake is not treated as enough evidence. Rows with a missing
    ``LastName`` are never filled.

    The fill is done by row position, not by index label: the previous
    ``df.at[index, ...]`` assignment wrote to *every* row sharing that label,
    which overwrote unrelated passengers when the index contains duplicates.

    Args:
        df: frame with ``LastName`` and ``HomePlanet`` columns. Modified in place.
        min_matches: minimum number of same-name rows with a known planet
            required before the name is used as a source (default 2).

    Returns:
        The same ``df`` object.
    """
    key = 'LastName'
    known = df.dropna(subset=[key, 'HomePlanet'])

    # Last names backed by enough known planets to be trusted as a source.
    support = known[key].value_counts()
    trusted = support[support >= min_matches].index

    # First known planet per last name, in original row order.
    first_planet = known.drop_duplicates(subset=key).set_index(key)['HomePlanet']
    first_planet = first_planet[first_planet.index.isin(trusted)]

    candidate = df[key].map(first_planet)
    # Plain boolean array -> positional selection, safe with repeated labels.
    fill_here = df['HomePlanet'].isna().to_numpy() & candidate.notna().to_numpy()
    if fill_here.any():
        df.loc[fill_here, 'HomePlanet'] = candidate.to_numpy()[fill_here]
    return df

In [155]:
# Fill home planets implied by deck (A/B/C, then G), then by shared last name.
# NOTE(review): home_planet_from_deck_abc and home_planet_from_deck_g are
# defined in a cell further down; no earlier definition is visible in this
# part of the notebook, so a fresh top-to-bottom run would presumably fail
# here with NameError - confirm and move the definitions above this cell.
togeth = home_planet_from_deck_abc(togeth)
togeth = home_planet_from_deck_g(togeth)
togeth = fill_home_planet_by_last_name(togeth)

In [115]:
def home_planet_from_deck_abc(df):
    """Set a missing ``HomePlanet`` to 'Europa' for passengers on deck A, B or C.

    Known home planets are left untouched. ``df`` is modified in place and
    returned.
    """
    on_abc = df['Deck'].isin(['A', 'B', 'C'])
    planet_unknown = df['HomePlanet'].isna()
    df.loc[on_abc & planet_unknown, 'HomePlanet'] = 'Europa'
    return df
def home_planet_from_deck_g(df):
    """Set a missing ``HomePlanet`` to 'Earth' for passengers on deck G.

    Known home planets are left untouched. ``df`` is modified in place and
    returned.
    """
    on_g = df['Deck'].eq('G')
    planet_unknown = df['HomePlanet'].isna()
    df.loc[on_g & planet_unknown, 'HomePlanet'] = 'Earth'
    return df


In [116]:
def impute(df):
    """Run every HomePlanet imputation step on ``df`` and return the result.

    Steps are applied in this order: same travel group, same last name,
    deck A/B/C, deck G.
    """
    steps = (
        fill_home_planet_by_group,
        fill_home_planet_by_last_name,
        home_planet_from_deck_abc,
        home_planet_from_deck_g,
    )
    for step in steps:
        df = step(df)
    return df

# Run the full HomePlanet imputation chain, then order the rows by travel
# group and by position within the group.
togeth = impute(togeth)
togeth = togeth.sort_values(by = ['Group','GroupNumber'])


In [68]:
# Number of missing values in each column after imputation.
togeth.isna().sum()

PassengerId        0
HomePlanet         9
CryoSleep        310
Cabin            299
Destination      274
Age              270
VIP              296
RoomService      263
FoodCourt        289
ShoppingMall     306
Spa              284
VRDeck           268
Name               0
Set                0
Transported     4277
Group              0
GroupNumber        0
Deck             299
Number           299
Side             299
FirstName          0
LastName         294
dtype: int64

In [69]:
# Total number of missing cells across all columns.
togeth.isna().sum().sum()

8336

Passengers with a missing cabin: 299

In [70]:
# Keep only the identity, home-planet, destination and cabin-related columns.
# NOTE: this rebinds `togeth`, so the dropped columns (e.g. Name, LastName,
# Transported) are no longer available to later cells.
togeth = togeth[['PassengerId','HomePlanet','Cabin','Destination','Group','GroupNumber','Deck','Number','Side','Set']]

In [99]:
# Display the reduced frame.
togeth

Unnamed: 0,PassengerId,HomePlanet,Cabin,Destination,Group,GroupNumber,Deck,Number,Side,Set
0,0001_01,Europa,B/0/P,TRAPPIST-1e,1.0,1.0,B,0.0,P,Train
1,0002_01,Earth,F/0/S,TRAPPIST-1e,2.0,1.0,F,0.0,S,Train
2,0003_01,Europa,A/0/S,TRAPPIST-1e,3.0,1.0,A,0.0,S,Train
3,0003_02,Europa,A/0/S,TRAPPIST-1e,3.0,2.0,A,0.0,S,Train
4,0004_01,Earth,F/1/S,TRAPPIST-1e,4.0,1.0,F,1.0,S,Train
...,...,...,...,...,...,...,...,...,...,...
4276,9277_01,Earth,G/1498/S,PSO J318.5-22,9277.0,1.0,G,1498.0,S,Test
8689,9278_01,Earth,G/1499/S,PSO J318.5-22,9278.0,1.0,G,1499.0,S,Train
8690,9279_01,Earth,G/1500/S,TRAPPIST-1e,9279.0,1.0,G,1500.0,S,Train
8691,9280_01,Europa,E/608/S,55 Cancri e,9280.0,1.0,E,608.0,S,Train


In [72]:
# Rows belonging to travel group 1 (Group was cast to float in `splits`).
togeth[togeth.Group == 1]

Unnamed: 0,PassengerId,HomePlanet,Cabin,Destination,Group,GroupNumber,Deck,Number,Side,Set
0,0001_01,Europa,B/0/P,TRAPPIST-1e,1.0,1.0,B,0.0,P,Train


In [73]:
# Distinct cabin labels recorded on deck B.
list(togeth[togeth.Deck == 'B'].Cabin.unique())

['B/0/P',
 'B/1/P',
 'B/2/P',
 'B/3/P',
 'B/0/S',
 'B/4/P',
 'B/5/P',
 'B/1/S',
 'B/6/P',
 'B/7/P',
 'B/8/P',
 'B/2/S',
 'B/3/S',
 'B/9/P',
 'B/10/P',
 'B/4/S',
 'B/11/P',
 'B/5/S',
 'B/6/S',
 'B/12/P',
 'B/7/S',
 'B/8/S',
 'B/9/S',
 'B/14/P',
 'B/10/S',
 'B/15/P',
 'B/11/S',
 'B/16/P',
 'B/17/P',
 'B/18/P',
 'B/19/P',
 'B/20/P',
 'B/13/S',
 'B/21/P',
 'B/14/S',
 'B/15/S',
 'B/22/P',
 'B/16/S',
 'B/17/S',
 'B/18/S',
 'B/19/S',
 'B/20/S',
 'B/21/S',
 'B/22/S',
 'B/23/P',
 'B/23/S',
 'B/24/P',
 'B/25/P',
 'B/26/P',
 'B/27/P',
 'B/24/S',
 'B/28/P',
 'B/25/S',
 'B/29/P',
 'B/31/P',
 'B/26/S',
 'B/27/S',
 'B/32/P',
 'B/28/S',
 'B/33/P',
 'B/29/S',
 'B/30/S',
 'B/31/S',
 'B/32/S',
 'B/33/S',
 'B/34/S',
 'B/35/S',
 'B/36/S',
 'B/34/P',
 'B/37/S',
 'B/35/P',
 'B/38/S',
 'B/39/S',
 'B/36/P',
 'B/40/S',
 'B/37/P',
 'B/38/P',
 'B/41/S',
 'B/42/S',
 'B/39/P',
 'B/40/P',
 'B/43/S',
 'B/41/P',
 'B/44/S',
 'B/45/S',
 'B/46/S',
 'B/42/P',
 'B/43/P',
 'B/47/S',
 'B/48/S',
 'B/44/P',
 'B/49/S',
 'B/50/S

In [86]:
# Build {deck: {side: sorted list of occupied cabin numbers}}.
#
# Only rows lacking the cabin parts themselves are excluded. A bare dropna()
# would also discard passengers whose cabin is known but who miss some other
# column (e.g. Destination), silently leaving their cabins out of the map.
cabin_rows = togeth.dropna(subset=['Deck', 'Side', 'Number'])
decks = list(cabin_rows.Deck.unique())
sides = list(cabin_rows.Side.unique())

used_cabins = {}
for deck in decks:
    on_deck = cabin_rows[cabin_rows.Deck == deck]
    cab = {}
    for side in sides:
        # Every deck gets an entry for every side, possibly an empty list.
        rooms = list(on_deck.loc[on_deck.Side == side, 'Number'].unique())
        rooms.sort()
        cab[side] = rooms
    used_cabins[deck] = cab
    

#### Same cabin means same group

In [97]:
# Only passengers whose travel group and cabin are both known can be compared.
togeth_cleaned = togeth.dropna(subset=['Group', 'Cabin'])


def is_uniform(Groups):
    """Return True when all entries collapse to exactly one distinct value."""
    distinct = set(Groups)
    return len(distinct) == 1


# One row per cabin, carrying the list of travel groups seen in that cabin.
Cabined = (
    togeth_cleaned
    .groupby('Cabin')['Group']
    .agg(list)
    .reset_index()
)
# The flag column keeps its historical name; here it marks cabins whose
# occupants all belong to one travel group.
Cabined['UniformPlanet'] = Cabined['Group'].apply(is_uniform)

# Partition the cabins by whether every occupant is in the same group.
uniform_Cabins = Cabined.loc[Cabined['UniformPlanet'] == True]
mixed_Cabins = Cabined.loc[Cabined['UniformPlanet'] == False]

# Head-count on each side of the partition (list length = passengers per cabin).
people_uniform = uniform_Cabins['Group'].apply(len).sum()
people_mixed = mixed_Cabins['Group'].apply(len).sum()

print(f"Number of people from Cabins where all share the same Group: {people_uniform}")
print(f"Number of people from Cabins where not all share the same Group: {people_mixed}")


Number of people from Cabins where all share the same Group: 12671
Number of people from Cabins where not all share the same Group: 0


#### Fill cabins of people who must be sharing a room with someone in their group
From a passenger's home planet we know which decks they could be on. If, on those decks, the cabin numbers just below and just above the ones around this passenger are already taken (one below and one above), then the passenger must be sharing a cabin with someone from their own group. Make sure that the group occupies only one cabin before filling.

In [None]:
def sharing_room_with_group_member(df):
    """Placeholder: fill missing cabins for passengers who must share with their group.

    The intended rule (see the notes above this cell) is not implemented yet.
    The previous draft of this cell was not valid Python, and its
    ``for row in df`` loop would have iterated over column names, not rows.

    Args:
        df: passenger frame; currently unused.

    Raises:
        NotImplementedError: always, until the inference is written.
    """
    # Decks observed for each home planet in the deck counts earlier in this
    # notebook; kept here as the starting point for the implementation.
    potential_decks = {
        'Earth': ['G', 'F', 'E'],
        'Europa': ['B', 'C', 'A', 'D', 'E', 'T'],
        'Mars': ['F', 'E', 'D'],
    }
    # TODO: iterate over rows (e.g. df.itertuples()) with a missing Cabin,
    # restrict to potential_decks[row.HomePlanet], and copy the cabin of the
    # row's group when that group occupies exactly one cabin.
    raise NotImplementedError(
        "sharing_room_with_group_member is not implemented yet"
    )

## What can be deduced now that these cabins are filled?