In [2]:
import os
import pandas as pd
from fastbook import *
from fastai.tabular.all import *
import sklearn

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


traindf = pd.read_csv('C:/Users/jsult/Desktop/spacetitanic3/train.csv',low_memory=False)
testdf = pd.read_csv('C:/Users/jsult/Desktop/spacetitanic3/test.csv',low_memory= False)
traindf['train'] = True
testdf['train'] = False
togeth = pd.concat([traindf,testdf])



In [3]:
originalcolumns = list(togeth.columns)
originalcolumns.remove('Name')
originalcolumns.remove('Transported')
originalcolumns.remove('train')
originalcolumns.remove('PassengerId')

In [4]:
togeth[originalcolumns].isna().sum()

HomePlanet      288
CryoSleep       310
Cabin           299
Destination     274
Age             270
VIP             296
RoomService     263
FoodCourt       289
ShoppingMall    306
Spa             284
VRDeck          268
dtype: int64

In [5]:
togeth[originalcolumns].isna().sum().sum()

3147

## Imputes before splits

Impute cabin

In [34]:
with open('C:/Users/jsult/Desktop/spacetitanic3/FINALtransferred/idtocabin.pkl', 'rb') as fp:
    idtocabin = pickle.load(fp)

def cabinimpute(df):
    for i,row in df.iterrows():
        if pd.isna(row.Cabin):
            if row.PassengerId in idtocabin:
                df.at[i,'Cabin'] = idtocabin[row.PassengerId]
    return df



In [35]:
togeth = cabinimpute(togeth)

In [36]:
togeth[originalcolumns].isna().sum().sum()

1661

Impute spending


In [37]:
cryodict = {'RoomService':0,'FoodCourt' : 0, 'ShoppingMall' : 0, 'Spa' : 0, 'VRDeck' : 0}
spending = ['RoomService','FoodCourt','ShoppingMall','Spa', 'VRDeck']

def fillspending0(df):
    df.loc[df.CryoSleep == True, list(cryodict.keys())] = df.loc[df.CryoSleep == True, list(cryodict.keys())].fillna(value=cryodict)
    return df

def spending_under_13(df):
    df.loc[df.Age < 13, spending] = df.loc[df.Age < 13, spending].fillna(0)
    return df
def spending_impute(df):
    df = fillspending0(df)
    df = spending_under_13(df)
    return df


In [38]:
togeth = spending_impute(togeth)

In [39]:
togeth[originalcolumns].isna().sum().sum()

1661

Impute CryoSleep

In [40]:
def not_cryosleep_if_spending(df):
    for index, row in df[df['CryoSleep'].isna()].iterrows():
        if (row[spending] > 0).any():
            df.at[index, 'CryoSleep'] = False
    return df


In [41]:
togeth = not_cryosleep_if_spending(togeth)

In [42]:
togeth[originalcolumns].isna().sum().sum()

1661

Impute VIP

In [43]:
def not_vip_earth(df):
    df.loc[df.HomePlanet == 'Earth','VIP'] = df.loc[df.HomePlanet == 'Earth','VIP'].fillna(False)
    return df

def not_vip_underage(df):
    df.loc[df.Age < 18,'VIP'] = df.loc[df.Age < 18, 'VIP'].fillna(False)
    return df

def not_vip_mars_cryo(df):
    df.loc[(df.HomePlanet == 'Mars') & (df.CryoSleep == True),'VIP'] = df.loc[(df.HomePlanet == 'Mars') & (df.CryoSleep == True),'VIP'].fillna(False)
    return df

def vip_impute(df):
    df = not_vip_underage(df)
    df = not_vip_earth(df)
    df = not_vip_mars_cryo(df)
    return df


In [44]:
togeth = vip_impute(togeth)

In [45]:
togeth[originalcolumns].isna().sum().sum()

1660

In [46]:
togeth[originalcolumns].isna().sum()

HomePlanet       10
CryoSleep       135
Cabin            55
Destination     274
Age             270
VIP             104
RoomService     162
FoodCourt       171
ShoppingMall    163
Spa             166
VRDeck          150
dtype: int64

# Recommended feature engineering

In [47]:
def split_group_number(df):
    df['Group'] = df['PassengerId'].apply(lambda x: x.split("_")[0] if pd.notna(x) else pd.NA)
    df.Group = df.Group.astype('float')
    df['GroupNumber'] = df['PassengerId'].apply(lambda x: x.split("_")[1] if pd.notna(x) else pd.NA)
    df.GroupNumber = df.GroupNumber.astype('float')
    return df

def split_names(df):
    # Create new columns for first name and last name
    df['FirstName'] = df['Name'].apply(lambda x: x.split()[0] if pd.notna(x) else pd.NA)
    df['LastName'] = df['Name'].apply(lambda x: x.split()[1] if pd.notna(x) else pd.NA)
    return df

def sort_df_group(df):
    df = df.sort_values(by = ['Group','GroupNumber'])
    df.reset_index(inplace = True)
    df.drop(columns = 'index',inplace = True)
    return df
    
def split_cabin(df):
    df['CabinDeck'] = df['Cabin'].apply(lambda x: x.split("/" )[0] if pd.notna(x) else pd.NA)
    custom_order = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T']
    df['CabinDeck'] = pd.Categorical(df['CabinDeck'], categories=custom_order, ordered=True)
    df['CabinSide'] = df['Cabin'].apply(lambda x: x.split("/")[2] if pd.notna(x) else pd.NA)
    df['CabinNum'] = df['Cabin'].apply(lambda x: int(x.split("/")[1]) if pd.notna(x) else pd.NA)
    return df

def initial_splits(df):
    df = split_group_number(df)
    df = split_names(df)
    df = sort_df_group(df)
    df = split_cabin(df)
    return df


In [48]:
togeth = initial_splits(togeth)

# More certain imputing

Cabinside

In [49]:
def fill_cabin_side_by_group(df):
    for index, row in df.iterrows():
        if pd.isna(row['CabinSide']):
            matching_rows = togeth[togeth['Group'] == row['Group']]
            non_nan_deck = matching_rows.dropna(subset=['CabinSide'])
            if not non_nan_deck.empty:
                side_to_fill = non_nan_deck['CabinSide'].values[0]
                df.at[index, 'CabinSide'] = side_to_fill
    return df

In [50]:
togeth = fill_cabin_side_by_group(togeth)

In [51]:
togeth[originalcolumns].isna().sum().sum()

1660

HomePlanet

In [52]:
def fill_home_planet_by_last_name(df):
    for index, row in df.iterrows():
        if pd.isna(row['HomePlanet']):
            matching_rows = df[df['LastName'] == row['LastName']]
            non_nan_planet = matching_rows.dropna(subset=['HomePlanet'])
            if not non_nan_planet.empty:
                planet_to_fill = non_nan_planet['HomePlanet'].values[0]
                df.at[index, 'HomePlanet'] = planet_to_fill
    return df

def home_planet_from_cabindeck_abc(df):
    df.loc[df['CabinDeck'].isin(['A', 'B', 'C']), 'HomePlanet'] = df.loc[df['CabinDeck'].isin(['A', 'B', 'C']), 'HomePlanet'].fillna('Europa')
    return df

def home_planet_from_cabindeck_g(df):
    df.loc[df['CabinDeck']== 'G', 'HomePlanet'] = df.loc[df['CabinDeck'] == 'G', 'HomePlanet'].fillna('Earth')
    return df

def fill_home_planet_by_group(df):
    for index, row in df.iterrows():
        if pd.isna(row['HomePlanet']):
            matching_rows = df[df['Group'] == row['Group']]
            non_nan_planet = matching_rows.dropna(subset=['HomePlanet'])
            if not non_nan_planet.empty:
                planet_to_fill = non_nan_planet['HomePlanet'].values[0]
                df.at[index, 'HomePlanet'] = planet_to_fill
    return df

def homeplanet_impute(df):
    df = fill_home_planet_by_last_name(df)
    df = home_planet_from_cabindeck_abc(df)
    df = home_planet_from_cabindeck_g(df)
    df = fill_home_planet_by_group(df)
    return df


In [53]:
togeth = homeplanet_impute(togeth)

In [54]:
togeth[originalcolumns].isna().sum().sum()

1660

VIP

In [55]:
def not_vip_g(df):
    df.loc[df.CabinDeck == 'G','VIP'] = df.loc[df.CabinDeck == 'G','VIP'].fillna(False)
    return df


In [56]:
togeth = not_vip_g(togeth)

In [57]:
togeth[originalcolumns].isna().sum().sum()

1660

In [58]:
togeth[originalcolumns].isna().sum()

HomePlanet       10
CryoSleep       135
Cabin            55
Destination     274
Age             270
VIP             104
RoomService     162
FoodCourt       171
ShoppingMall    163
Spa             166
VRDeck          150
dtype: int64

## Feature Engineering

spending

In [59]:
def total_ammenities(df):
    df['TotalAmmenities'] = df[['Spa','VRDeck','FoodCourt','ShoppingMall']].sum(axis = 1)
    return df


def total_cabin_ammenities(df):
    def total_cabin_ammenities_apply(row):
        if pd.isna(row.Cabin):
            return pd.NA
        samecabin = df[df.Cabin == row.Cabin]
        return np.sum(samecabin.TotalAmmenities)
    df['TotalCabinAmmenities'] = df.apply(total_cabin_ammenities_apply, axis = 1)
    return df


def total_spending(df):
    def total_spending_apply(row):
        if pd.isna(row.RoomService):
            return pd.NA
        else:
            totalspending = 0
            for item in spending:
                if not pd.isna(row[item]):
                    totalspending += row[item]
            return totalspending
    df['TotalSpending'] = df.apply(total_spending_apply, axis = 1)
    return df

def total_cabin_spending(df):
    def total_cabin_spending_apply(row):
        if pd.isna(row.Cabin):
            return pd.NA
        samecabin = df[df.Cabin == row.Cabin]
        return np.sum(samecabin.TotalSpending)
    df['TotalCabinSpending'] = df.apply(total_cabin_spending_apply, axis = 1)
    return df

def total_cabin_room_service(df):
    def total_cabin_room_service_apply(row):
        if pd.isna(row.Cabin):
            return pd.NA
        samecabin = df[df.Cabin == row.Cabin]
        return np.sum(samecabin.RoomService)
    df['TotalCabinRoomService'] = df.apply(total_cabin_room_service_apply, axis = 1)
    return df

def calculate_family_spending(df):
    # Define the columns related to spending
    spending_columns = ['VRDeck', 'Spa', 'RoomService', 'FoodCourt', 'ShoppingMall']

    # Group the DataFrame by last name and group, and calculate the total spending for each family
    family_spending = df.groupby(['LastName', 'Group'])[spending_columns].transform('sum')

    # Rename the family spending columns to distinguish them from the original columns
    family_spending.columns = [col + '_Family' for col in spending_columns]

    # Add the total family spending and total family amenities columns to the DataFrame
    df = pd.concat([df, family_spending], axis=1)

    famamenities = ['VRDeck_Family', 'Spa_Family', 'FoodCourt_Family', 'ShoppingMall_Family']
    famspending = ['VRDeck_Family', 'Spa_Family', 'RoomService_Family', 'FoodCourt_Family', 'ShoppingMall_Family']
    df['TotalFamilyAmmenities'] = df[famamenities].sum(axis=1)
    df['TotalFamilySpending'] = df[famspending].sum(axis=1)

    return df

def spendingFE(df):
    df = total_ammenities(df)
    df = total_cabin_ammenities(df)
    df = total_spending(df)
    df = total_cabin_spending(df)
    df = total_cabin_room_service(df)
    df = calculate_family_spending(df)
    return df

In [60]:
togeth = spendingFE(togeth)

GroupSize, CabinSize, Family Size, Solo Cabin

In [61]:
def group_size(df):
    def group_size_apply(row):
        group = df[df.Group == row.Group]
        return len(group)
    df['GroupSize'] = df.apply(group_size_apply, axis = 1)
    return df

def family_size(df):
    df['FamilySize'] = df.apply(lambda row: pd.NA if pd.isna(row['LastName']) else df[(df['LastName'] == row['LastName']) & (df['Group'] == row['Group'])].shape[0], axis=1)
    return df

def cabin_size(df):
    def cabin_size_apply(row):
        if pd.isna(row.Cabin):
            if row.GroupSize == 1:
                return 1
            else:
                return pd.NA
        else:
            samecabin = df[df.Cabin == row.Cabin]
            return len(samecabin)
    df['CabinSize'] = df.apply(cabin_size_apply, axis = 1)
    return df

def solo_cabin(df):
    def solo_cabin_apply(row):
        if pd.isna(row.Cabin):
            if row.GroupSize == 1:
                return True
            else:
                return pd.NA
        size = len(df[df.Cabin == row.Cabin])
        return size == 1
    df['SoloCabin'] = df.apply(solo_cabin_apply, axis = 1)
    return df

def sizeFE(df):
    df = group_size(df)
    df = family_size(df)
    df = cabin_size(df)
    df = solo_cabin(df)
    return df


In [62]:
togeth = sizeFE(togeth)

Age

In [63]:
def under_13(df):
    df['Under13'] = df['Age'].apply(lambda x: True if x < 13 else False if x >= 13 else pd.NA)
    return df

def under_18(df):
    df['Under18'] = df['Age'].apply(lambda x: True if x < 18 else False if x >= 18 else pd.NA)
    return df

def under_19(df):
    df['Under19'] = df['Age'].apply(lambda x: True if x < 19 else False if x >= 19 else pd.NA)
    return df

def ageFE(df):
    df = under_13(df)
    df = under_18(df)
    df = under_19(df)
    return df

In [64]:
togeth['Agebin'] = pd.qcut(togeth['Age'], 4)
togeth = ageFE(togeth)

# Additional Imputes Certainty

In [65]:
def not_under_18_if_vip(df):
    df['Under18'] = df.apply(lambda row: False if pd.isna(row['Age']) and row['VIP'] == True else row['Under18'], axis=1)
    return df

def not_under_13_if_spending(df):
    for index, row in df[df['Under13'].isna()].iterrows():
        if (row[spending] > 0).any():
            df.at[index, 'Under13'] = False
    return df

def age_impute(df):
    df = not_under_13_if_spending(df)
    df = not_under_18_if_vip(df)
    return df


In [66]:
togeth = age_impute(togeth)

# export certainty csv

In [67]:
togeth.to_csv('certainty_imputed_data')

# splits for nas or cryo

In [5]:
nas = list(testdf.isna().sum(axis = 1))
indexes = list(testdf.index)
keepinds = []
dropinds = []
for i in range(len(indexes)):
    if nas[i] == 0:
        keepinds.append(i)
    else:
        dropinds.append(i)
droptest = testdf.loc[dropinds].copy()
testdf = testdf.loc[keepinds].copy()

KeyError: '[6, 11, 15, 18, 19, 21, 38, 42, 48, 68, 73, 78, 92, 93, 99, 101, 105, 114, 117, 127, 128, 134, 135, 136, 140, 141, 146, 147, 148, 150, 152, 157, 161, 164, 165, 171, 172, 177, 180, 181, 183, 184, 185, 197, 198, 199, 201, 203, 209, 211, 219, 225, 227, 234, 236, 241, 242, 248, 253, 263, 265, 276, 277, 278, 279, 285, 292, 307, 312, 314, 327, 328, 331, 332, 334, 339, 340, 344, 345, 348, 354, 358, 378, 383, 385, 387, 388, 389, 405, 410, 413, 419, 422, 427, 433, 435, 436, 446, 450, 454, 457, 460, 464, 472, 481, 484, 488, 489, 499, 500, 504, 509, 512, 527, 532, 535, 543, 544, 547, 556, 557, 559, 561, 565, 566, 573, 578, 582, 584, 585, 586, 587, 589, 595, 598, 599, 601, 602, 617, 621, 626, 627, 628, 629, 647, 650, 651, 654, 656, 657, 669, 678, 685, 686, 699, 701, 711, 712, 713, 729, 732, 737, 738, 752, 754, 755, 756, 761, 763, 766, 776, 777, 778, 781, 783, 785, 786, 788, 792, 794, 795, 799, 800, 801, 803, 812, 815, 817, 825, 826, 830, 832, 843, 846, 850, 853, 859, 861, 863, 873, 877, 894, 898, 899, 907, 911, 912, 913, 920, 922, 925, 934, 938, 961, 963, 966, 977, 980, 991, 995, 1004, 1007, 1013, 1020, 1023, 1030, 1032, 1033, 1040, 1041, 1042, 1046, 1052, 1054, 1056, 1057, 1063, 1065, 1072, 1073, 1074, 1079, 1090, 1093, 1098, 1099, 1100, 1105, 1106, 1115, 1121, 1124, 1125, 1131, 1135, 1137, 1144, 1151, 1152, 1155, 1157, 1159, 1160, 1165, 1170, 1176, 1177, 1180, 1183, 1187, 1191, 1192, 1196, 1199, 1202, 1204, 1206, 1209, 1216, 1221, 1222, 1241, 1243, 1249, 1253, 1257, 1261, 1264, 1269, 1270, 1271, 1275, 1280, 1281, 1282, 1285, 1297, 1298, 1300, 1323, 1325, 1329, 1335, 1339, 1341, 1342, 1350, 1356, 1365, 1366, 1367, 1374, 1376, 1379, 1381, 1394, 1403, 1404, 1413, 1414, 1420, 1421, 1422, 1427, 1443, 1446, 1448, 1450, 1454, 1457, 1462, 1465, 1468, 1474, 1475, 1476, 1477, 1479, 1482, 1485, 1489, 1490, 1496, 1513, 1515, 1524, 1528, 1535, 1536, 1545, 1548, 1549, 1551, 1552, 1555, 1559, 1564, 1567, 1569, 1570, 1578, 1580, 1584, 1588, 1591, 1601, 1603, 1604, 1607, 1615, 1625, 1627, 1640, 1647, 1649, 1655, 1658, 1663, 1665, 1666, 1671, 1681, 1688, 1695, 1708, 1718, 1719, 1723, 1727, 1735, 1737, 1740, 1743, 1745, 1761, 1766, 1769, 1776, 1781, 1793, 1794, 1800, 1803, 1809, 1811, 1816, 1817, 1820, 1826, 1833, 1844, 1845, 1854, 1859, 1860, 1864, 1870, 1872, 1879, 1888, 1890, 1896, 1898, 1901, 1908, 1909, 1916, 1918, 1920, 1921, 1924, 1925, 1929, 1937, 1938, 1942, 1947, 1949, 1954, 1959, 1964, 1965, 1968, 1973, 1977, 1981, 1982, 1984, 1985, 1987, 1989, 1992, 1993, 1994, 1998, 1999, 2004, 2005, 2014, 2019, 2024, 2026, 2029, 2030, 2036, 2038, 2042, 2043, 2046, 2048, 2050, 2055, 2056, 2057, 2060, 2061, 2063, 2065, 2066, 2067, 2082, 2086, 2091, 2095, 2102, 2105, 2107, 2114, 2120, 2121, 2129, 2133, 2134, 2136, 2139, 2141, 2146, 2150, 2153, 2156, 2168, 2169, 2174, 2177, 2191, 2202, 2204, 2209, 2211, 2216, 2220, 2221, 2226, 2234, 2239, 2240, 2243, 2245, 2251, 2252, 2256, 2259, 2261, 2267, 2275, 2278, 2279, 2280, 2283, 2284, 2287, 2288, 2291, 2292, 2296, 2300, 2302, 2313, 2315, 2325, 2331, 2332, 2334, 2340, 2346, 2348, 2356, 2357, 2358, 2359, 2361, 2362, 2365, 2374, 2376, 2377, 2378, 2383, 2392, 2395, 2397, 2405, 2410, 2414, 2419, 2427, 2433, 2435, 2440, 2441, 2450, 2456, 2462, 2466, 2471, 2476, 2483, 2484, 2486, 2488, 2492, 2495, 2500, 2506, 2509, 2510, 2517, 2518, 2519, 2523, 2526, 2527, 2537, 2538, 2539, 2546, 2547, 2549, 2551, 2557, 2560, 2571, 2580, 2588, 2589, 2591, 2600, 2602, 2605, 2608, 2609, 2612, 2615, 2631, 2632, 2635, 2642, 2643, 2647, 2650, 2652, 2657, 2658, 2660, 2684, 2688, 2691, 2712, 2713, 2715, 2716, 2729, 2742, 2746, 2752, 2754, 2759, 2760, 2762, 2764, 2769, 2773, 2774, 2777, 2779, 2780, 2784, 2787, 2791, 2795, 2800, 2803, 2805, 2806, 2808, 2810, 2822, 2833, 2839, 2854, 2855, 2859, 2866, 2867, 2870, 2873, 2878, 2880, 2889, 2900, 2901, 2902, 2903, 2904, 2908, 2918, 2919, 2922, 2923, 2932, 2934, 2939, 2942, 2947, 2951, 2952, 2959, 2961, 2966, 2968, 2972, 2975, 2983, 2989, 2992, 2996, 3000, 3006, 3007, 3009, 3012, 3014, 3016, 3018, 3021, 3034, 3035, 3036, 3037, 3040, 3042, 3043, 3047, 3054, 3058, 3061, 3072, 3073, 3083, 3091, 3093, 3097, 3100, 3105, 3107, 3109, 3113, 3114, 3118, 3125, 3126, 3128, 3133, 3148, 3149, 3150, 3153, 3160, 3163, 3165, 3166, 3168, 3170, 3177, 3183, 3184, 3187, 3189, 3199, 3203, 3205, 3206, 3208, 3210, 3211, 3216, 3220, 3223, 3225, 3238, 3243, 3246, 3247, 3249, 3251, 3253, 3266, 3267, 3270, 3276] not in index'

In [7]:
traindf = togeth[togeth.train]
testdf = togeth[togeth.train == False]
cryotrain = traindf[traindf.CryoSleep == True].copy()
cryotest = testdf[testdf.CryoSleep == True].copy()

SyntaxError: invalid syntax (3115374223.py, line 2)

## Guessing Imputations

Seems very rare for earth and F to be in CryoSleep

In [41]:
def cryosleep_if_f_and_earth(df):
    df.loc[(df['HomePlanet'] == 'Earth') & (df['CabinDeck'] == 'F'), 'CryoSleep'] = False
    return df


In [42]:
togeth = cryosleep_if_f_and_earth(togeth)

In [8]:
togeth[togeth.HomePlanet == 'Earth'].CryoSleep.value_counts()

CryoSleep
False    4632
True     2070
Name: count, dtype: int64

## Really Guessing Imputes

## FillerFunction

In [118]:
def fillerfunc(df,conds,target):
    for i,row in df[df[target].isna()].iterrows():
        go = True
        for nas in conds:
            if pd.isna(row[nas]):
                go = False
        if go:
            first = conds[0]
            a = (df[first] == row[first])
            for item in conds[1:]:
                a = a & (df[item] == row[item])
            valcounts = df[a][target].value_counts()

            if valcounts[0] > 25 and len(valcounts) == 1:
                df.loc[df.PassengerId == row.PassengerId,target] = df.loc[df.PassengerId == row.PassengerId,target].fillna(valcounts.index[0])
            elif valcounts[0] > 25 and len(valcounts) > 1 and valcounts[0] / valcounts[1] > 1.5:
                df.loc[df.PassengerId == row.PassengerId,target] = df.loc[df.PassengerId == row.PassengerId,target].fillna(valcounts.index[0])
            else:
                print("help")
                print("i",i)
                print("row",row)
                print("valcounts",valcounts.index[0])
    return df



homeplanet  - not so informed but somewhat likely

In [97]:
def fillerfunc_homeplanet_dest(df):
    return fillerfunc(df,['Destination'],'HomePlanet')


Destination

In [98]:
def fillerfunc_dest_homeplanet(df):
    return fillerfunc(df,['HomePlanet'],'Destination')

Cryosleep

if earth and E or F then not cryosleep!! or mars and d
if earth and g and trappist then cryosleep

In [None]:
def fillerfunc_cryosleep_cabin_homeplanet_VIP(df):
    return fillerfunc(df,['HomePlanet','Cabin','VIP'],'CryoSleep')