In [1]:
import os
from pathlib import Path

import pandas as pd

In [9]:
# Folder that holds the house-prices CSV files. Set the HOUSE_PRICES_DIR
# environment variable to run this notebook on another machine; when it is
# unset, the original local download folder is used, so existing behaviour is
# unchanged.
DATA_DIR = Path(
    os.environ.get("HOUSE_PRICES_DIR", "/Users/macbook/Downloads/house-prices-dataset")
)

# Load the raw training set. With pandas' default parsing, cells containing
# "NA" are read as missing values (NaN); the cleaning step relies on that.
df_train = pd.read_csv(DATA_DIR / "train.csv")

In [3]:
# Preview the first five rows of the freshly loaded training data.
df_train.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Per-column summary (non-null counts and dtypes) of the raw data; columns
# whose non-null count is below the row count contain missing values.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [5]:
### 1. Missing-value handling, following the data description ###

# Categorical columns in which a missing value means "feature not present"
# (no alley access, no basement, no fireplace, no garage, no pool, no fence,
# no miscellaneous feature). They receive the explicit level "NA".
na_categorical = dict.fromkeys(
    [
        "Alley",
        "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
        "FireplaceQu",
        "GarageType", "GarageFinish", "GarageQual", "GarageCond",
        "PoolQC",
        "Fence",
        "MiscFeature",
    ],
    "NA",
)

# Columns for which "None" is a valid category (masonry veneer type).
none_categorical = {"MasVnrType": "None"}

# NOTE: "NA" (and, in pandas 2.x, "None") are among the strings pd.read_csv
# treats as missing by default, so these placeholders only survive a CSV round
# trip when the file is read back with keep_default_na=False.
for col, placeholder in {**na_categorical, **none_categorical}.items():
    df_train[col] = df_train[col].fillna(placeholder)

# Numeric columns.
# Lot frontage: impute with the median frontage of the same neighbourhood.
# If a neighbourhood has no observed frontage at all its median is NaN, so we
# fall back to the overall median instead of silently leaving NaN behind.
lot_frontage = df_train["LotFrontage"]
neighborhood_median = df_train.groupby("Neighborhood")["LotFrontage"].transform("median")
df_train["LotFrontage"] = lot_frontage.fillna(neighborhood_median).fillna(lot_frontage.median())

# No masonry veneer -> area 0; no garage -> year 0.
# NOTE(review): 0 is far outside the range of real construction years; confirm
# that downstream use of GarageYrBlt tolerates this placeholder.
for col in ("MasVnrArea", "GarageYrBlt"):
    df_train[col] = df_train[col].fillna(0)

# Electrical: the original author notes a single missing value; fill it with
# the most frequent category.
most_common_electrical = df_train["Electrical"].mode()[0]
df_train["Electrical"] = df_train["Electrical"].fillna(most_common_electrical)

### 2. Consistency checks between related columns ###

# No basement -> all basement surface areas are 0.
bsmt_cols = ["BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF"]
no_basement = df_train["BsmtQual"] == "NA"
df_train.loc[no_basement, bsmt_cols] = 0

# No garage -> garage capacity and area are 0.
no_garage = df_train["GarageType"] == "NA"
df_train.loc[no_garage, ["GarageCars", "GarageArea"]] = 0

# No fireplace -> fireplace quality is "NA".
df_train.loc[df_train["Fireplaces"] == 0, "FireplaceQu"] = "NA"

# No pool -> pool quality is "NA".
df_train.loc[df_train["PoolArea"] == 0, "PoolQC"] = "NA"

### 3. Final check ###
# Total number of missing cells left in the whole frame (0 expected).
print("Valeurs manquantes restantes:")
print(df_train.isnull().sum().sum())

Valeurs manquantes restantes:
0


In [6]:
# Re-run the per-column summary after imputation to confirm that every column
# now reports the full non-null count.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1460 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          1460 non-null   object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [7]:
# Preview the first five rows of the cleaned training data.
df_train.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [8]:
# Persist the cleaned training set, without the index column, in the current
# working directory.
# NOTE: the "NA" placeholders written here (and, in pandas 2.x, "None") are
# parsed as missing values by a default pd.read_csv; reload the file with
# keep_default_na=False to keep them as strings.
cleaned_csv_path = 'train_cleaned.csv'
df_train.to_csv(cleaned_csv_path, index=False)