# Spaceship Titanic

Le but ici est de déterminer si les passagers ont été transportés dans une autre dimension

In [1372]:
import pandas as pd
import numpy as np

In [1373]:
train = pd.read_csv('spaceship-titanic/train.csv')
test = pd.read_csv('spaceship-titanic/test.csv')

Data initiale :

In [1374]:
print('Nombre de lignes dans le dataset train :', len(train))

train.head(5)


Nombre de lignes dans le dataset train : 8693


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


### Déterminer les valeurs manquantes au dataset

In [1375]:
# Tableau entre la l'intiutlé de la table et le nombre de données manquantes pour chaque colonne
missing_data = train.isnull().sum()

# Créer un DataFrame avec les résultats
missing_data_table = pd.DataFrame({
    'Colonne': missing_data.index,
    'Nombre de données manquantes': missing_data.values
})

# Afficher le tableau
print("Tableau des données manquantes par colonne :")
missing_data_table

Tableau des données manquantes par colonne :


Unnamed: 0,Colonne,Nombre de données manquantes
0,PassengerId,0
1,HomePlanet,201
2,CryoSleep,217
3,Cabin,199
4,Destination,182
5,Age,179
6,VIP,203
7,RoomService,181
8,FoodCourt,183
9,ShoppingMall,208


# Preprocessing

### Clean des données

Groupe d'ages

In [1376]:
# Créer des groupes d'âge numériques (0-5)
# Train dataset
train.loc[train['Age'] <= 12, 'Age_group'] = 0
train.loc[(train['Age'] > 12) & (train['Age'] < 18), 'Age_group'] = 1
train.loc[(train['Age'] >= 18) & (train['Age'] <= 25), 'Age_group'] = 2
train.loc[(train['Age'] > 25) & (train['Age'] <= 30), 'Age_group'] = 3
train.loc[(train['Age'] > 30) & (train['Age'] <= 50), 'Age_group'] = 4
train.loc[train['Age'] > 50, 'Age_group'] = 5

# Test dataset
test.loc[test['Age'] <= 12, 'Age_group'] = 0
test.loc[(test['Age'] > 12) & (test['Age'] < 18), 'Age_group'] = 1
test.loc[(test['Age'] >= 18) & (test['Age'] <= 25), 'Age_group'] = 2
test.loc[(test['Age'] > 25) & (test['Age'] <= 30), 'Age_group'] = 3
test.loc[(test['Age'] > 30) & (test['Age'] <= 50), 'Age_group'] = 4
test.loc[test['Age'] > 50, 'Age_group'] = 5

# Remplir les NaN avec le mode (valeur la plus fréquente) avant conversion
train_mode = train['Age_group'].mode()[0] if not train['Age_group'].mode().empty else 2
test_mode = test['Age_group'].mode()[0] if not test['Age_group'].mode().empty else 2

train['Age_group'] = train['Age_group'].fillna(train_mode)
test['Age_group'] = test['Age_group'].fillna(test_mode)

# S'assurer que Age_group est de type entier
train['Age_group'] = train['Age_group'].astype(int)
test['Age_group'] = test['Age_group'].astype(int)

train.drop(columns=['Age'], inplace=True)
test.drop(columns=['Age'], inplace=True)

train.head(10)


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Age_group
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,4
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,2
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,5
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,4
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,1
5,0005_01,Earth,False,F/0/P,PSO J318.5-22,False,0.0,483.0,0.0,291.0,0.0,Sandie Hinetthews,True,4
6,0006_01,Earth,False,F/2/S,TRAPPIST-1e,False,42.0,1539.0,3.0,0.0,0.0,Billex Jacostaffey,True,3
7,0006_02,Earth,True,G/0/S,TRAPPIST-1e,False,0.0,0.0,0.0,0.0,,Candra Jacostaffey,True,3
8,0007_01,Earth,False,F/3/S,TRAPPIST-1e,False,0.0,785.0,17.0,216.0,0.0,Andona Beston,True,4
9,0008_01,Europa,True,B/1/P,55 Cancri e,False,0.0,0.0,0.0,0.0,0.0,Erraiam Flatic,True,1


Planète d'origine

In [1377]:
# Créer des groupes de planètes numériques (0-2)
# Train dataset
train.loc[train['HomePlanet'] == 'Europa', 'HomePlanet_group'] = 0
train.loc[train['HomePlanet'] == 'Earth', 'HomePlanet_group'] = 1
train.loc[train['HomePlanet'] == 'Mars', 'HomePlanet_group'] = 2

# Test dataset
test.loc[test['HomePlanet'] == 'Europa', 'HomePlanet_group'] = 0
test.loc[test['HomePlanet'] == 'Earth', 'HomePlanet_group'] = 1
test.loc[test['HomePlanet'] == 'Mars', 'HomePlanet_group'] = 2

# Remplir les valeurs manquantes selon le groupe (première partie du PassengerId)
# Extraire le groupe (première partie avant le '_')
train['Group'] = train['PassengerId'].str.split('_').str[0]
test['Group'] = test['PassengerId'].str.split('_').str[0]

# Créer un mapping groupe -> HomePlanet_group
group_homeplanet_mapping = {}

# Analyser les groupes dans le train
for group in train['Group'].unique():
    group_data = train[train['Group'] == group]
    # Si le groupe a des valeurs non-nulles de HomePlanet_group
    if group_data['HomePlanet_group'].notna().any():
        # Prendre la valeur la plus fréquente dans ce groupe
        most_common_homeplanet = group_data['HomePlanet_group'].mode()[0]
        group_homeplanet_mapping[group] = most_common_homeplanet

# Remplir les valeurs manquantes dans le train
train['HomePlanet_group'] = train['HomePlanet_group'].fillna(train['Group'].map(group_homeplanet_mapping))

# Remplir les valeurs manquantes dans le test
test['HomePlanet_group'] = test['HomePlanet_group'].fillna(test['Group'].map(group_homeplanet_mapping))

# Remplir les valeurs restantes avec le mode global
if train['HomePlanet_group'].isnull().any():
    global_mode = train['HomePlanet_group'].mode()[0]
    train['HomePlanet_group'] = train['HomePlanet_group'].fillna(global_mode)

if test['HomePlanet_group'].isnull().any():
    global_mode = test['HomePlanet_group'].mode()[0]
    test['HomePlanet_group'] = test['HomePlanet_group'].fillna(global_mode)

# S'assurer que HomePlanet_group est de type entier
train['HomePlanet_group'] = train['HomePlanet_group'].astype(int)
test['HomePlanet_group'] = test['HomePlanet_group'].astype(int)

train.drop(columns=['HomePlanet'], inplace=True)
test.drop(columns=['HomePlanet'], inplace=True)

train.tail(10)



Unnamed: 0,PassengerId,CryoSleep,Cabin,Destination,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Age_group,HomePlanet_group,Group
8683,9272_02,False,F/1894/P,TRAPPIST-1e,False,86.0,3.0,149.0,208.0,329.0,Gordo Simson,False,2,1,9272
8684,9274_01,True,G/1508/P,TRAPPIST-1e,False,0.0,0.0,0.0,0.0,0.0,Chelsa Bullisey,True,2,1,9274
8685,9275_01,False,A/97/P,TRAPPIST-1e,False,0.0,0.0,0.0,0.0,0.0,Polaton Conable,True,0,0,9275
8686,9275_02,False,A/97/P,TRAPPIST-1e,False,1.0,1146.0,0.0,50.0,34.0,Diram Conable,False,4,0,9275
8687,9275_03,,A/97/P,TRAPPIST-1e,False,0.0,3208.0,0.0,2.0,330.0,Atlasym Conable,True,3,0,9275
8688,9276_01,False,A/98/P,55 Cancri e,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False,4,0,9276
8689,9278_01,True,G/1499/S,PSO J318.5-22,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False,2,1,9278
8690,9279_01,False,G/1500/S,TRAPPIST-1e,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True,3,1,9279
8691,9280_01,False,E/608/S,55 Cancri e,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False,4,0,9280
8692,9280_02,False,E/608/S,TRAPPIST-1e,False,126.0,4688.0,0.0,0.0,12.0,Propsh Hontichre,True,4,0,9280


Dépenses globales

In [1378]:
# Créer une colonne binaire pour les dépenses

# Calculer le total des dépenses pour chaque passager
train['TotalSpent'] = train['RoomService'] + train['FoodCourt'] + train['ShoppingMall'] + train['Spa'] + train['VRDeck']
test['TotalSpent'] = test['RoomService'] + test['FoodCourt'] + test['ShoppingMall'] + test['Spa'] + test['VRDeck']

# Créer la colonne binaire HasSpent
train['HasSpent'] = (train['TotalSpent'] > 0).astype(int)
test['HasSpent'] = (test['TotalSpent'] > 0).astype(int)

train.drop(columns=['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'VIP', 'TotalSpent'], inplace=True)
test.drop(columns=['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'VIP', 'TotalSpent'], inplace=True)

train.head(10)


Unnamed: 0,PassengerId,CryoSleep,Cabin,Destination,Name,Transported,Age_group,HomePlanet_group,Group,HasSpent
0,0001_01,False,B/0/P,TRAPPIST-1e,Maham Ofracculy,False,4,0,1,0
1,0002_01,False,F/0/S,TRAPPIST-1e,Juanna Vines,True,2,1,2,1
2,0003_01,False,A/0/S,TRAPPIST-1e,Altark Susent,False,5,0,3,1
3,0003_02,False,A/0/S,TRAPPIST-1e,Solam Susent,False,4,0,3,1
4,0004_01,False,F/1/S,TRAPPIST-1e,Willy Santantines,True,1,1,4,1
5,0005_01,False,F/0/P,PSO J318.5-22,Sandie Hinetthews,True,4,1,5,1
6,0006_01,False,F/2/S,TRAPPIST-1e,Billex Jacostaffey,True,3,1,6,1
7,0006_02,True,G/0/S,TRAPPIST-1e,Candra Jacostaffey,True,3,1,6,0
8,0007_01,False,F/3/S,TRAPPIST-1e,Andona Beston,True,4,1,7,1
9,0008_01,True,B/1/P,55 Cancri e,Erraiam Flatic,True,1,0,8,0


Destination

In [1379]:
# Créer des groupes de destinations numériques (0-2)
# Train dataset
train.loc[train['Destination'] == 'TRAPPIST-1e', 'Destination_group'] = 0
train.loc[train['Destination'] == 'PSO J318.5-22', 'Destination_group'] = 1
train.loc[train['Destination'] == '55 Cancri e', 'Destination_group'] = 2

# Test dataset
test.loc[test['Destination'] == 'TRAPPIST-1e', 'Destination_group'] = 0
test.loc[test['Destination'] == 'PSO J318.5-22', 'Destination_group'] = 1
test.loc[test['Destination'] == '55 Cancri e', 'Destination_group'] = 2

# Remplir les NaN avec le mode (valeur la plus fréquente) avant conversion
train_mode = train['Destination_group'].mode()[0] if not train['Destination_group'].mode().empty else 0
test_mode = test['Destination_group'].mode()[0] if not test['Destination_group'].mode().empty else 0

train['Destination_group'] = train['Destination_group'].fillna(train_mode)
test['Destination_group'] = test['Destination_group'].fillna(test_mode)

# S'assurer que Destination_group est de type entier
train['Destination_group'] = train['Destination_group'].astype(int)
test['Destination_group'] = test['Destination_group'].astype(int)

train.drop(columns=['Destination'], inplace=True)
test.drop(columns=['Destination'], inplace=True)

train.head(10)




Unnamed: 0,PassengerId,CryoSleep,Cabin,Name,Transported,Age_group,HomePlanet_group,Group,HasSpent,Destination_group
0,0001_01,False,B/0/P,Maham Ofracculy,False,4,0,1,0,0
1,0002_01,False,F/0/S,Juanna Vines,True,2,1,2,1,0
2,0003_01,False,A/0/S,Altark Susent,False,5,0,3,1,0
3,0003_02,False,A/0/S,Solam Susent,False,4,0,3,1,0
4,0004_01,False,F/1/S,Willy Santantines,True,1,1,4,1,0
5,0005_01,False,F/0/P,Sandie Hinetthews,True,4,1,5,1,1
6,0006_01,False,F/2/S,Billex Jacostaffey,True,3,1,6,1,0
7,0006_02,True,G/0/S,Candra Jacostaffey,True,3,1,6,0,0
8,0007_01,False,F/3/S,Andona Beston,True,4,1,7,1,0
9,0008_01,True,B/1/P,Erraiam Flatic,True,1,0,8,0,2


Passager en groupe

In [1380]:
# Créer une colonne IsAlone basée sur le PassengerId

# Extraire le numéro du passager (après le _)
train['PassengerNumber'] = train['PassengerId'].str.split('_').str[1].astype(int)
test['PassengerNumber'] = test['PassengerId'].str.split('_').str[1].astype(int)

# Créer la colonne IsAlone
# 0 = seul (_01), 1 = accompagné (_02 et plus)
train['IsAlone'] = (train['PassengerNumber'] > 1).astype(int)
test['IsAlone'] = (test['PassengerNumber'] > 1).astype(int)


# Nettoyage - supprimer la colonne temporaire
train.drop('PassengerNumber', axis=1, inplace=True)
test.drop('PassengerNumber', axis=1, inplace=True)

train.head(10)


Unnamed: 0,PassengerId,CryoSleep,Cabin,Name,Transported,Age_group,HomePlanet_group,Group,HasSpent,Destination_group,IsAlone
0,0001_01,False,B/0/P,Maham Ofracculy,False,4,0,1,0,0,0
1,0002_01,False,F/0/S,Juanna Vines,True,2,1,2,1,0,0
2,0003_01,False,A/0/S,Altark Susent,False,5,0,3,1,0,0
3,0003_02,False,A/0/S,Solam Susent,False,4,0,3,1,0,1
4,0004_01,False,F/1/S,Willy Santantines,True,1,1,4,1,0,0
5,0005_01,False,F/0/P,Sandie Hinetthews,True,4,1,5,1,1,0
6,0006_01,False,F/2/S,Billex Jacostaffey,True,3,1,6,1,0,0
7,0006_02,True,G/0/S,Candra Jacostaffey,True,3,1,6,0,0,1
8,0007_01,False,F/3/S,Andona Beston,True,4,1,7,1,0,0
9,0008_01,True,B/1/P,Erraiam Flatic,True,1,0,8,0,2,0


Extraction des infos de cabine

In [1381]:

# Extraire le numéro de cabine en gérant les NaN
# Remplacer temporairement les NaN par une valeur par défaut
train['Cabin_temp'] = train['Cabin'].fillna('Z/9999/Z')
test['Cabin_temp'] = test['Cabin'].fillna('Z/9999/Z')

# Extraire le dock (lettres A, B, C, etc.)
train['Cabin_deck'] = train['Cabin_temp'].str.split('/').str[0]
test['Cabin_deck'] = test['Cabin_temp'].str.split('/').str[0]

# Convertir les lettres en chiffres (A=0, B=1, C=2, etc.)
dock_mapping = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'T': 7}
train['Cabin_deck'] = train['Cabin_deck'].map(dock_mapping).fillna(0).astype(int)
test['Cabin_deck'] = test['Cabin_deck'].map(dock_mapping).fillna(0).astype(int)

# Extraire le numéro de cabine
train['Cabin_number'] = train['Cabin_temp'].str.split('/').str[1].astype(int)
test['Cabin_number'] = test['Cabin_temp'].str.split('/').str[1].astype(int)

# Extraire le side (P ou S)
train['Cabin_side'] = train['Cabin_temp'].str.split('/').str[2]
test['Cabin_side'] = test['Cabin_temp'].str.split('/').str[2]

# Convertir P=0, S=1
side_mapping = {'P': 0, 'S': 1}
train['Cabin_side'] = train['Cabin_side'].map(side_mapping).fillna(0).astype(int)
test['Cabin_side'] = test['Cabin_side'].map(side_mapping).fillna(0).astype(int)


# Remettre les NaN pour les valeurs temporaires
train.loc[train['Cabin_temp'] == 'Z/9999/Z', 'Cabin_number'] = np.nan
test.loc[test['Cabin_temp'] == 'Z/9999/Z', 'Cabin_number'] = np.nan

# Supprimer la colonne temporaire
train.drop('Cabin_temp', axis=1, inplace=True)
test.drop('Cabin_temp', axis=1, inplace=True)

# Convertir Cabin_number en entier (gérer les NaN)
train['Cabin_number'] = train['Cabin_number'].fillna(0).astype(int)
test['Cabin_number'] = test['Cabin_number'].fillna(0).astype(int)

train.tail(10)

Unnamed: 0,PassengerId,CryoSleep,Cabin,Name,Transported,Age_group,HomePlanet_group,Group,HasSpent,Destination_group,IsAlone,Cabin_deck,Cabin_number,Cabin_side
8683,9272_02,False,F/1894/P,Gordo Simson,False,2,1,9272,1,0,1,5,1894,0
8684,9274_01,True,G/1508/P,Chelsa Bullisey,True,2,1,9274,0,0,0,6,1508,0
8685,9275_01,False,A/97/P,Polaton Conable,True,0,0,9275,0,0,0,0,97,0
8686,9275_02,False,A/97/P,Diram Conable,False,4,0,9275,1,0,1,0,97,0
8687,9275_03,,A/97/P,Atlasym Conable,True,3,0,9275,1,0,1,0,97,0
8688,9276_01,False,A/98/P,Gravior Noxnuther,False,4,0,9276,1,2,0,0,98,0
8689,9278_01,True,G/1499/S,Kurta Mondalley,False,2,1,9278,0,1,0,6,1499,1
8690,9279_01,False,G/1500/S,Fayey Connon,True,3,1,9279,1,0,0,6,1500,1
8691,9280_01,False,E/608/S,Celeon Hontichre,False,4,0,9280,1,2,0,4,608,1
8692,9280_02,False,E/608/S,Propsh Hontichre,True,4,0,9280,1,0,1,4,608,1


In [1382]:
# Créer une seule colonne Cabin_number avec des valeurs 0-7
def create_single_cabin_number(df):
    # Assigner les valeurs 0-7 selon les chunks de 300
    df.loc[df['Cabin_number'] < 300, 'Cabin_number'] = 0
    df.loc[(df['Cabin_number'] >= 300) & (df['Cabin_number'] < 600), 'Cabin_number'] = 1
    df.loc[(df['Cabin_number'] >= 600) & (df['Cabin_number'] < 900), 'Cabin_number'] = 2
    df.loc[(df['Cabin_number'] >= 900) & (df['Cabin_number'] < 1200), 'Cabin_number'] = 3
    df.loc[(df['Cabin_number'] >= 1200) & (df['Cabin_number'] < 1500), 'Cabin_number'] = 4
    df.loc[(df['Cabin_number'] >= 1500) & (df['Cabin_number'] < 1800), 'Cabin_number'] = 5
    df.loc[df['Cabin_number'] >= 1800, 'Cabin_number'] = 6
    
    return df

# Appliquer aux deux datasets
train = create_single_cabin_number(train)
test = create_single_cabin_number(test)

train.drop(columns=['Cabin'], inplace=True)
test.drop(columns=['Cabin'], inplace=True)


train.tail(10)



Unnamed: 0,PassengerId,CryoSleep,Name,Transported,Age_group,HomePlanet_group,Group,HasSpent,Destination_group,IsAlone,Cabin_deck,Cabin_number,Cabin_side
8683,9272_02,False,Gordo Simson,False,2,1,9272,1,0,1,5,6,0
8684,9274_01,True,Chelsa Bullisey,True,2,1,9274,0,0,0,6,5,0
8685,9275_01,False,Polaton Conable,True,0,0,9275,0,0,0,0,0,0
8686,9275_02,False,Diram Conable,False,4,0,9275,1,0,1,0,0,0
8687,9275_03,,Atlasym Conable,True,3,0,9275,1,0,1,0,0,0
8688,9276_01,False,Gravior Noxnuther,False,4,0,9276,1,2,0,0,0,0
8689,9278_01,True,Kurta Mondalley,False,2,1,9278,0,1,0,6,4,1
8690,9279_01,False,Fayey Connon,True,3,1,9279,1,0,0,6,5,1
8691,9280_01,False,Celeon Hontichre,False,4,0,9280,1,2,0,4,2,1
8692,9280_02,False,Propsh Hontichre,True,4,0,9280,1,0,1,4,2,1


Taille de la famille (en fonction du nom)

In [1383]:
# Calculate family size from last name
# Extract last name from Name column
train['LastName'] = train['Name'].str.split(' ').str[-1]
test['LastName'] = test['Name'].str.split(' ').str[-1]

# Calculate family size by counting occurrences of each last name
train_family_sizes = train['LastName'].value_counts()
test_family_sizes = test['LastName'].value_counts()

# Map family sizes to each passenger
train['FamilySize'] = train['LastName'].map(train_family_sizes)
test['FamilySize'] = test['LastName'].map(test_family_sizes)

# Fill NaN values with 1 (individual passenger)
train['FamilySize'] = train['FamilySize'].fillna(1)
test['FamilySize'] = test['FamilySize'].fillna(1)

# Convert to integer
train['FamilySize'] = train['FamilySize'].astype(int)
test['FamilySize'] = test['FamilySize'].astype(int)

train.drop(columns=['LastName', 'Name'], inplace=True)
test.drop(columns=['LastName', 'Name'], inplace=True)


train.head(10)


Unnamed: 0,PassengerId,CryoSleep,Transported,Age_group,HomePlanet_group,Group,HasSpent,Destination_group,IsAlone,Cabin_deck,Cabin_number,Cabin_side,FamilySize
0,0001_01,False,False,4,0,1,0,0,0,1,0,0,1
1,0002_01,False,True,2,1,2,1,0,0,5,0,1,4
2,0003_01,False,False,5,0,3,1,0,0,0,0,1,6
3,0003_02,False,False,4,0,3,1,0,1,0,0,1,6
4,0004_01,False,True,1,1,4,1,0,0,5,0,1,6
5,0005_01,False,True,4,1,5,1,1,0,5,0,0,7
6,0006_01,False,True,3,1,6,1,0,0,5,0,1,7
7,0006_02,True,True,3,1,6,0,0,1,6,0,1,7
8,0007_01,False,True,4,1,7,1,0,0,5,0,1,5
9,0008_01,True,True,1,0,8,0,2,0,1,0,0,3


### Remplissage des données manquantes

In [1384]:
# Remplir CryoSleep avec False
train['CryoSleep'] = train['CryoSleep'].fillna(False)
test['CryoSleep'] = test['CryoSleep'].fillna(False)


  train['CryoSleep'] = train['CryoSleep'].fillna(False)
  test['CryoSleep'] = test['CryoSleep'].fillna(False)


In [1385]:
# Tableau entre la l'intiutlé de la table et le nombre de données manquantes pour chaque colonne
missing_data = train.isnull().sum()

# Créer un DataFrame avec les résultats
missing_data_table = pd.DataFrame({
    'Colonne': missing_data.index,
    'Nombre de données manquantes': missing_data.values
})

# Afficher le tableau
print("Tableau des données manquantes par colonne :")
missing_data_table

Tableau des données manquantes par colonne :


Unnamed: 0,Colonne,Nombre de données manquantes
0,PassengerId,0
1,CryoSleep,0
2,Transported,0
3,Age_group,0
4,HomePlanet_group,0
5,Group,0
6,HasSpent,0
7,Destination_group,0
8,IsAlone,0
9,Cabin_deck,0


Sortie propre

In [1386]:
train.head(10)

Unnamed: 0,PassengerId,CryoSleep,Transported,Age_group,HomePlanet_group,Group,HasSpent,Destination_group,IsAlone,Cabin_deck,Cabin_number,Cabin_side,FamilySize
0,0001_01,False,False,4,0,1,0,0,0,1,0,0,1
1,0002_01,False,True,2,1,2,1,0,0,5,0,1,4
2,0003_01,False,False,5,0,3,1,0,0,0,0,1,6
3,0003_02,False,False,4,0,3,1,0,1,0,0,1,6
4,0004_01,False,True,1,1,4,1,0,0,5,0,1,6
5,0005_01,False,True,4,1,5,1,1,0,5,0,0,7
6,0006_01,False,True,3,1,6,1,0,0,5,0,1,7
7,0006_02,True,True,3,1,6,0,0,1,6,0,1,7
8,0007_01,False,True,4,1,7,1,0,0,5,0,1,5
9,0008_01,True,True,1,0,8,0,2,0,1,0,0,3


### Prédiction

In [1387]:

from sklearn.ensemble import RandomForestClassifier

y = train["Transported"]

features = ["CryoSleep", "Age_group", "HomePlanet_group", "Destination_group", "HasSpent", "Cabin_deck", "Cabin_side"]

# Préparer les données
X = train[features]
X_test = test[features]

# Entraîner le modèle
model = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2, 
    max_features='sqrt',
    random_state=1
)
model.fit(X, y)

# Faire les prédictions
predictions = model.predict(X_test)

# Créer le fichier de soumission
output = pd.DataFrame({'PassengerId': test.PassengerId, 'Transported': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

# Afficher l'importance des features
feature_importance = pd.DataFrame({
    'Feature': features,
    'Importance': model.feature_importances_
}).sort_values('Importance', ascending=False)

print("\nImportance des features :")
print(feature_importance)

Your submission was successfully saved!

Importance des features :
             Feature  Importance
0          CryoSleep    0.348285
4           HasSpent    0.241155
5         Cabin_deck    0.137344
1          Age_group    0.110937
2   HomePlanet_group    0.089401
6         Cabin_side    0.037494
3  Destination_group    0.035384
