In [1]:
import pandas as pd

# Load the Titanic dataset
df = pd.read_csv("train.csv")


num_rows_before = df.shape[0]
print(f"Rows before: {num_rows_before}")
duplicate_count = df.duplicated().sum()
print(f"Duplicate rows: {duplicate_count}")
df_cleaned = df.drop_duplicates()
num_rows_after = df_cleaned.shape[0]
print(f"Rows after removing duplicates: {num_rows_after}")

Rows before: 891
Duplicate rows: 0
Rows after removing duplicates: 891


In [2]:
import pandas as pd
from sklearn.impute import SimpleImputer

df = pd.read_csv("train.csv")

missing_values = df.isnull().sum()
print("Valeurs manquantes par colonne :\n", missing_values[missing_values > 0])

# Suppression des lignes avec des valeurs manquantes (Embarked : seulement 2 valeurs manquantes)
df_cleaned = df.dropna(subset=["Embarked"])  

# Remplissage des valeurs manquantes (Cabin : trop de NaN, on met 'Unknown')
df_cleaned["Cabin"].fillna("Unknown", inplace=True)

# Imputation des valeurs numériques avec la médiane (Age)
imputer = SimpleImputer(strategy="median")
df_cleaned["Age"] = imputer.fit_transform(df_cleaned[["Age"]])

print("\nValeurs manquantes après traitement :\n", df_cleaned.isnull().sum())

Valeurs manquantes par colonne :
 Age         177
Cabin       687
Embarked      2
dtype: int64

Valeurs manquantes après traitement :
 PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [4]:
# refaire le code avec les indications ci-dessus
import pandas as pd
from sklearn.impute import SimpleImputer

df = pd.read_csv("train.csv")

missing_values = df.isnull().sum()
print("Valeurs manquantes par colonne :\n", missing_values[missing_values > 0])

# Suppression des lignes avec des valeurs manquantes (Embarked : seulement 2 valeurs manquantes)
df_cleaned = df.dropna(subset=["Embarked"]).copy()

# Remplissage des valeurs manquantes (Cabin : trop de NaN, on met 'Unknown')
df_cleaned.loc[:, "Cabin"] = df_cleaned["Cabin"].fillna("Unknown") 

# Imputation des valeurs numériques avec la médiane (Age)
imputer = SimpleImputer(strategy="median")
df_cleaned.loc[:, "Age"] = imputer.fit_transform(df_cleaned[["Age"]])

print("\nValeurs manquantes après traitement :\n", df_cleaned.isnull().sum())

Valeurs manquantes par colonne :
 Age         177
Cabin       687
Embarked      2
dtype: int64

Valeurs manquantes après traitement :
 PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64


In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

df = pd.read_csv("train.csv")

df["FamilySize"] = df["SibSp"] + df["Parch"] + 1  # +1 pour inclure la personne elle-même
df["Title"] = df["Name"].str.extract(r' ([A-Za-z]+)\.', expand=False)

label_encoder = LabelEncoder()
df["Title"] = label_encoder.fit_transform(df["Title"])

df = pd.get_dummies(df, columns=["Sex", "Embarked"], drop_first=True)  # drop_first=True pour éviter la colinéarité

scaler = MinMaxScaler()
df[["Age", "Fare", "FamilySize"]] = scaler.fit_transform(df[["Age", "Fare", "FamilySize"]])

print(df[["Age", "Fare", "FamilySize", "Title", "Sex_male", "Embarked_Q", "Embarked_S"]].head())

        Age      Fare  FamilySize  Title  Sex_male  Embarked_Q  Embarked_S
0  0.271174  0.014151         0.1     12         1           0           1
1  0.472229  0.139136         0.1     13         0           0           0
2  0.321438  0.015469         0.0      9         0           0           1
3  0.434531  0.103644         0.1     13         0           0           1
4  0.434531  0.015713         0.0     12         1           0           1


In [None]:
import numpy as np

def detect_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
    return outliers, lower_bound, upper_bound

outliers_age, lower_b_age, upper_b_age = detect_outliers_iqr(df, "Age")
outliers_fare, lower_b_fare, upper_b_fare = detect_outliers_iqr(df, "Fare")

print(f"Outliers Age:  {len(outliers_age)}")
print(f"Outliers Fare: {len(outliers_fare)}")

df = df[(df["Age"] >= lower_b_age) & (df["Age"] <= upper_b_age)]
df = df[(df["Fare"] >= lower_b_fare) & (df["Fare"] <= upper_b_fare)]

Outliers Age: 11
Outliers Fare: 116


In [11]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[["Age", "Fare", "FamilySize"]] = scaler.fit_transform(df[["Age", "Fare", "FamilySize"]])

print(df[["Age", "Fare", "FamilySize"]].head())

        Age      Fare  FamilySize
0  0.339415  0.111538    0.142857
2  0.402328  0.121923    0.000000
3  0.543882  0.816923    0.142857
4  0.543882  0.123846    0.000000
6  0.842718  0.797885    0.000000


In [None]:
from sklearn.preprocessing import LabelEncoder

cat_cols = df.select_dtypes(include=["object"]).columns
non_cols = df.select_dtypes(exclude=['object']).columns
print(cat_cols)
df = pd.get_dummies(df, columns=[cat_cols], drop_first=True)
label_encoder = LabelEncoder()
df["Pclass"] = label_encoder.fit_transform(df["Pclass"])

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare,Cabin,...,Age_62.0,Age_63.0,Age_64.0,Age_65.0,Age_66.0,Age_70.0,Age_70.5,Age_71.0,Age_74.0,Age_80.0
0,1,0,2,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,7.25,,...,0,0,0,0,0,0,0,0,0,0
1,2,1,0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,71.2833,C85,...,0,0,0,0,0,0,0,0,0,0
2,3,1,2,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,7.925,,...,0,0,0,0,0,0,0,0,0,0
3,4,1,0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,53.1,C123,...,0,0,0,0,0,0,0,0,0,0
4,5,0,2,"Allen, Mr. William Henry",male,0,0,373450,8.05,,...,0,0,0,0,0,0,0,0,0,0


In [22]:
df= pd.read_csv("train.csv")
bins = [0, 18, 30, 40, 50, 65, 100]  # Définir les bins
labels = ['0-18', '19-30', '31-40', '41-50', '51-65', '66+']  # Définir les labels pour les bins
df['AgeGroup'] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)

# Étape 2 : Appliquer l'encodage one-hot
df_encoded = pd.get_dummies(df, columns=['AgeGroup'], drop_first=True)

# Afficher le résultat
print("\nDonnées avec groupes d'âge et encodage one-hot :")
df_encoded.head()


Données avec groupes d'âge et encodage one-hot :


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeGroup_19-30,AgeGroup_31-40,AgeGroup_41-50,AgeGroup_51-65,AgeGroup_66+
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1,0,0,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0,1,0,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,0,0,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0,1,0,0,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,1,0,0,0
