In [1]:
from ydata_profiling import ProfileReport
import os
from sklearn.preprocessing import LabelEncoder
import wandb
import pandas as pd 
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error , r2_score
import math  
import numpy as np 



In [2]:
run = wandb.init(project="predict_house_price", job_type="load_data")

artifact = run.use_artifact('clean_data_csv:latest', type='processed_data')

artifact_dir = artifact.download()

file_path = os.path.join(artifact_dir, "clean_data.csv")

clean_data = pd.read_csv(file_path, keep_default_na=False)

run.finish()


[34m[1mwandb[0m: Currently logged in as: [33mkngocun[0m ([33mkngocun-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


In [None]:
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1316 entries, 0 to 1315
Data columns (total 84 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       1316 non-null   int64  
 1   Id               1316 non-null   int64  
 2   MSSubClass       1316 non-null   int64  
 3   MSZoning         1316 non-null   object 
 4   LotFrontage      1316 non-null   float64
 5   LotArea          1316 non-null   int64  
 6   Street           1316 non-null   object 
 7   Alley            1316 non-null   object 
 8   LotShape         1316 non-null   object 
 9   LandContour      1316 non-null   object 
 10  Utilities        1316 non-null   object 
 11  LotConfig        1316 non-null   object 
 12  LandSlope        1316 non-null   object 
 13  Neighborhood     1316 non-null   object 
 14  Condition1       1316 non-null   object 
 15  Condition2       1316 non-null   object 
 16  BldgType         1316 non-null   object 
 17  HouseStyle    

In [None]:
drop_cols = [
    "Id", "LowQualFinSF", "PoolArea", "PoolQC", "MiscVal"
]
clean_data = clean_data.drop(columns=drop_cols, errors="ignore")

In [None]:

# Gom nhóm các giá trị hiếm thành "Other"

# 1. Exterior1st (nhóm < 10 mẫu)
count_exterior1 = clean_data['Exterior1st'].value_counts()
rare_exterior1 = count_exterior1[count_exterior1 < 10].index.tolist()
clean_data['Exterior1st'] = clean_data['Exterior1st'].replace(rare_exterior1, 'Other')

# 2. Exterior2nd (nhóm < 10 mẫu)
count_exterior2 = clean_data['Exterior2nd'].value_counts()
rare_exterior2 = count_exterior2[count_exterior2 < 10].index.tolist()
clean_data['Exterior2nd'] = clean_data['Exterior2nd'].replace(rare_exterior2, 'Other')

# 3. BsmtFinType2 (nhóm < 100 mẫu)
count_bsmt = clean_data['BsmtFinType2'].value_counts()
rare_bsmt = count_bsmt[count_bsmt < 100].index.tolist()
clean_data['BsmtFinType2'] = clean_data['BsmtFinType2'].replace(rare_bsmt, 'Other')

# 4. GarageType (nhóm < 30 mẫu)
count_garage = clean_data['GarageType'].value_counts()
rare_garage = count_garage[count_garage < 30].index.tolist()
clean_data['GarageType'] = clean_data['GarageType'].replace(rare_garage, 'Other')

# 5. Fence (nhóm < 60 mẫu)
count_fence = clean_data['Fence'].value_counts()
rare_fence = count_fence[count_fence < 60].index.tolist()
clean_data['Fence'] = clean_data['Fence'].replace(rare_fence, 'Other')



In [None]:
from sklearn.preprocessing import OrdinalEncoder

# 1. Encode ordinal features
ordinal_cols = ["ExterQual", "KitchenQual", "BsmtQual"]
quality_order = ["None", "Po", "Fa", "TA", "Gd", "Ex"]

encoder = OrdinalEncoder(categories=[quality_order] * len(ordinal_cols))
clean_data[ordinal_cols] = encoder.fit_transform(clean_data[ordinal_cols])

# 2. Encode nominal features (còn lại sau khi xử lý rare)
ordinal_encoded = set(ordinal_cols)
all_cat_cols = clean_data.select_dtypes(include="object").columns.tolist()
nominal_cols = [col for col in all_cat_cols if col not in ordinal_encoded]

# Chỉ giữ các cột có số nhóm hợp lý (≤ 20)
nominal_cols_to_encode = [col for col in nominal_cols if clean_data[col].nunique() <= 20]

# 3. One-hot encoding cho các cột này
clean_data = pd.get_dummies(clean_data, columns=nominal_cols_to_encode, drop_first=True)

# 4. Frequency encoding cho biến nhiều nhóm (ví dụ: Neighborhood)
if "Neighborhood" in clean_data.columns:
    freq_map = clean_data["Neighborhood"].value_counts(normalize=True)
    clean_data["Neighborhood_FE"] = clean_data["Neighborhood"].map(freq_map)
    clean_data.drop(columns=["Neighborhood"], inplace=True)


In [7]:
clean_data.head()

Unnamed: 0.1,Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,BsmtQual,BsmtFinSF1,TotalBsmtSF,1stFlrSF,2ndFlrSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,MoSold,YrSold,SalePrice,LotArea_log,TotalBsmtSF_log,GrLivArea_log,1stFlrSF_log,...,Functional_Sev,Functional_Typ,FireplaceQu_Fa,FireplaceQu_Gd,FireplaceQu_None,FireplaceQu_Po,FireplaceQu_TA,GarageType_BuiltIn,GarageType_Detchd,GarageType_Other,GarageFinish_RFn,GarageFinish_Unf,GarageQual_Fa,GarageQual_Gd,GarageQual_Po,GarageQual_TA,GarageCond_Fa,GarageCond_Gd,GarageCond_Po,GarageCond_TA,PavedDrive_P,PavedDrive_Y,Fence_None,Fence_Other,MiscFeature_Othr,MiscFeature_Shed,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,Neighborhood_FE
0,0,60,65.0,8450,7,5,2003,2003,196.0,4.0,4.0,706,856,856,854,1710,1,0,2,1,3,1,4.0,8,0,2003.0,2,548,0,61,0,0,0,2,2008,208500,9.04204,6.753438,7.444833,6.753438,...,False,True,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,True,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,0.111702
1,2,60,68.0,11250,7,5,2001,2002,162.0,4.0,4.0,486,920,920,866,1786,1,0,2,1,3,1,4.0,6,1,2001.0,2,608,0,42,0,0,0,9,2008,223500,9.328212,6.82546,7.488294,6.82546,...,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,True,False,False,False,True,False,True,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,0.111702
2,3,70,60.0,9550,7,5,1915,1970,0.0,3.0,3.0,216,756,961,756,1717,1,0,1,0,3,1,4.0,7,1,1998.0,3,642,0,35,272,0,0,2,2006,140000,9.164401,6.629363,7.448916,6.869014,...,False,True,False,True,False,False,False,False,True,False,False,True,False,False,False,True,False,False,False,True,False,True,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,0.032675
3,4,60,84.0,14260,8,5,2000,2000,350.0,4.0,4.0,655,1145,1145,1053,2198,1,0,2,1,4,1,4.0,9,1,2000.0,3,836,192,84,0,0,0,12,2008,250000,9.565284,7.044033,7.695758,7.044033,...,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,True,False,False,False,True,False,True,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,0.026596
4,5,50,85.0,14115,5,5,1993,1995,0.0,3.0,4.0,732,796,796,566,1362,1,0,1,1,1,1,3.0,5,0,1993.0,2,480,40,30,0,1,0,10,2009,143000,9.555064,6.680855,7.217443,6.680855,...,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,True,False,False,False,True,False,True,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False,0.032675


In [10]:
run = wandb.init(project="predict_house_price", job_type="upload_preprocessing")

os.makedirs("preprocessed_data", exist_ok=True)

clean_data.to_csv("preprocessed_data/preprocessed_data.csv", index=False)

artifact = wandb.Artifact(
    name="preprocessed_data",       # tên artifact Kun muốn
    type="processed_data",           # kiểu dữ liệu
    description="Preprocessed dataset before training models"
)

artifact.add_file("preprocessed_data/preprocessed_data.csv")

run.log_artifact(artifact)

run.finish()

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


In [None]:
#ProfileReport(clean_data, title='kun', explorative=True)

In [12]:
# 2. Tách X và y
X = clean_data.drop(columns=["SalePrice"])
y = np.log1p(clean_data["SalePrice"])

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
run = wandb.init(project="predict_house_price", job_type='upload_pca_data')

output_dir = "pca_data"
os.makedirs(output_dir, exist_ok=True)


X_train.to_csv(os.path.join(output_dir, "X_train.csv"), index=True)
X_test.to_csv(os.path.join(output_dir, "X_test.csv"), index=True)
y_train.to_csv(os.path.join(output_dir, "y_train.csv"), index=True)
y_test.to_csv(os.path.join(output_dir, "y_test.csv"), index=True)

# Tạo artifact chung
artifact = wandb.Artifact(
    name="pca_X_y_train_test",
    type="processed_data",
    description="Dữ liệu X và y đã giảm chiều bằng PCA cho tập train/test"
)

# Add file vào artifact
artifact.add_file(os.path.join(output_dir, "X_train.csv"))
artifact.add_file(os.path.join(output_dir, "X_test.csv"))
artifact.add_file(os.path.join(output_dir, "y_train.csv"))
artifact.add_file(os.path.join(output_dir, "y_test.csv"))

# Upload lên wandb
run.log_artifact(artifact)

run.finish()


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.
