In [4]:
from ydata_profiling import ProfileReport
import os
from sklearn.preprocessing import LabelEncoder
import wandb
import pandas as pd 
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [5]:
run = wandb.init(project="predict_house_price", job_type="load_data")

artifact = run.use_artifact('clean_data_csv:latest', type='processed_data')

artifact_dir = artifact.download()

file_path = os.path.join(artifact_dir, "clean_data.csv")

clean_data = pd.read_csv(file_path, keep_default_na=False)

run.finish()

[34m[1mwandb[0m:   1 of 1 files downloaded.  


In [6]:
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 75 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     1460 non-null   int64  
 1   Id             1460 non-null   int64  
 2   MSSubClass     1460 non-null   int64  
 3   MSZoning       1460 non-null   object 
 4   LotFrontage    1460 non-null   float64
 5   LotArea        1460 non-null   float64
 6   Street         1460 non-null   object 
 7   Alley          1460 non-null   object 
 8   LotShape       1460 non-null   object 
 9   LandContour    1460 non-null   object 
 10  Utilities      1460 non-null   object 
 11  LotConfig      1460 non-null   object 
 12  LandSlope      1460 non-null   object 
 13  Neighborhood   1460 non-null   object 
 14  Condition1     1460 non-null   object 
 15  Condition2     1460 non-null   object 
 16  BldgType       1460 non-null   object 
 17  HouseStyle     1460 non-null   object 
 18  OverallQ

In [7]:
clean_data = clean_data.drop(columns='Id')
clean_data = clean_data.drop(columns='Unnamed: 0')
clean_data = clean_data.drop(columns='Street')
clean_data = clean_data.drop(columns='Utilities')
cols_to_drop = ['Heating', 'CentralAir', 'Electrical', 'KitchenAbvGr', 'Functional',
                'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'MiscFeature',
                'SaleType', 'SaleCondition', 'LandSlope', 'LandContour','Alley']
clean_data.drop(columns=cols_to_drop, inplace=True)


In [8]:
count_exterior = clean_data['Exterior1st'].value_counts()

rare_types = count_exterior[count_exterior < 10].index.tolist()

clean_data['Exterior1st'] = clean_data['Exterior1st'].replace(rare_types, 'Other')

count_exterior = clean_data['Exterior2nd'].value_counts()

rare_types = count_exterior[count_exterior < 10].index.tolist()

clean_data['Exterior2nd'] = clean_data['Exterior2nd'].replace(rare_types, 'Other')
count_BsmtFinType2 = clean_data['BsmtFinType2'].value_counts()
rare_type = count_BsmtFinType2[count_BsmtFinType2 < 100].index.tolist()
clean_data['BsmtFinType2'] = clean_data['BsmtFinType2'].replace(rare_type, 'Other')
count_GarageType = clean_data['GarageType'].value_counts()
rare_type = count_GarageType[count_GarageType < 30].index.tolist()
clean_data['GarageType'] = clean_data['GarageType'].replace(rare_type, 'Other')
count = clean_data['Fence'].value_counts()
rare_type = count[count < 60].index.tolist()
clean_data['Fence'] = clean_data['Fence'].replace(rare_type, 'Other')

In [9]:
clean_data['GarageYrBlt'] = clean_data['GarageYrBlt'].replace('None', 0)
clean_data['GarageYrBlt'] = clean_data['GarageYrBlt'].astype(str)

In [10]:
label_encoder = LabelEncoder()

clean_data['MSSubClass'] = label_encoder.fit_transform(clean_data['MSSubClass'])
clean_data['LotShape'] = label_encoder.fit_transform(clean_data['LotShape'])
clean_data['Neighborhood'] = label_encoder.fit_transform(clean_data['Neighborhood'])
clean_data['Condition1'] = label_encoder.fit_transform(clean_data['Condition1'])
clean_data['Condition2'] = label_encoder.fit_transform(clean_data['Condition2'])
clean_data['HouseStyle'] = label_encoder.fit_transform(clean_data['HouseStyle'])
clean_data['RoofMatl'] = label_encoder.fit_transform(clean_data['RoofMatl'])
clean_data['Exterior1st'] = label_encoder.fit_transform(clean_data['Exterior1st'])
clean_data['MasVnrType'] = label_encoder.fit_transform(clean_data['MasVnrType'])
cols_to_encode = [
    'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 
    'BsmtExposure', 'BsmtFinType1', 'KitchenQual','BsmtFinType2',
    'FireplaceQu', 'GarageFinish','HeatingQC','BsmtFullBath','Exterior2nd'
]

for col in cols_to_encode:
    clean_data[col] = label_encoder.fit_transform(clean_data[col])


In [11]:

clean_data = pd.get_dummies(clean_data, columns=["MSZoning"],drop_first=True)
clean_data = pd.get_dummies(clean_data, columns=["LotConfig"],drop_first=True)
clean_data = pd.get_dummies(clean_data, columns=["BldgType"],drop_first=True)
clean_data = pd.get_dummies(clean_data, columns=["YearBuilt"],drop_first=True)
clean_data = pd.get_dummies(clean_data, columns=["YearRemodAdd"],drop_first=True)
clean_data = pd.get_dummies(clean_data, columns=["RoofStyle"],drop_first=True)
cols_to_onehot = ['Foundation', 'YrSold', 'MoSold', 'GarageType','GarageYrBlt','Fence']

clean_data = pd.get_dummies(clean_data, columns=cols_to_onehot, drop_first=True)



In [12]:
clean_data['LotShape'] = clean_data['LotShape'].astype(int)

In [13]:
clean_data.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,LotShape,Neighborhood,Condition1,Condition2,HouseStyle,OverallQual,OverallCond,...,GarageYrBlt_2003.0,GarageYrBlt_2004.0,GarageYrBlt_2005.0,GarageYrBlt_2006.0,GarageYrBlt_2007.0,GarageYrBlt_2008.0,GarageYrBlt_2009.0,GarageYrBlt_2010.0,Fence_None,Fence_Other
0,5,65.0,8450.0,3,5,2,2,5,7,5,...,True,False,False,False,False,False,False,False,True,False
1,0,80.0,9600.0,3,24,1,2,2,6,8,...,False,False,False,False,False,False,False,False,True,False
2,5,68.0,11250.0,0,5,2,2,5,7,5,...,False,False,False,False,False,False,False,False,True,False
3,6,60.0,9550.0,0,6,2,2,5,7,5,...,False,False,False,False,False,False,False,False,True,False
4,5,84.0,14260.0,0,15,2,2,5,8,5,...,False,False,False,False,False,False,False,False,True,False


In [None]:
ProfileReport(clean_data, title='kun', explorative=True)

In [15]:
clean_data.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,LotShape,Neighborhood,Condition1,Condition2,HouseStyle,OverallQual,OverallCond,...,GarageYrBlt_2003.0,GarageYrBlt_2004.0,GarageYrBlt_2005.0,GarageYrBlt_2006.0,GarageYrBlt_2007.0,GarageYrBlt_2008.0,GarageYrBlt_2009.0,GarageYrBlt_2010.0,Fence_None,Fence_Other
0,5,65.0,8450.0,3,5,2,2,5,7,5,...,True,False,False,False,False,False,False,False,True,False
1,0,80.0,9600.0,3,24,1,2,2,6,8,...,False,False,False,False,False,False,False,False,True,False
2,5,68.0,11250.0,0,5,2,2,5,7,5,...,False,False,False,False,False,False,False,False,True,False
3,6,60.0,9550.0,0,6,2,2,5,7,5,...,False,False,False,False,False,False,False,False,True,False
4,5,84.0,14260.0,0,15,2,2,5,8,5,...,False,False,False,False,False,False,False,False,True,False


In [16]:
run = wandb.init(project="predict_house_price", job_type="upload_encoded_data")

artifact = wandb.Artifact(
    name="encoded_data_csv",
    type="processed_data",
    description="Data đã encode xong"
)

clean_data.to_csv("encoded_data.csv", index=False)

artifact.add_file("encoded_data.csv")

run.log_artifact(artifact)
run.finish()


In [17]:
run = wandb.init(project="predict_house_price", job_type="load_encoded_data")

artifact = run.use_artifact('encoded_data_csv:latest', type='processed_data')

artifact_dir = artifact.download()

file_path = os.path.join(artifact_dir, "encoded_data.csv")
encode_data = pd.read_csv(file_path, keep_default_na=False)

run.finish()

encode_data.info()

[34m[1mwandb[0m:   1 of 1 files downloaded.  


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Columns: 355 entries, MSSubClass to Fence_Other
dtypes: bool(311), float64(12), int64(32)
memory usage: 945.4 KB


In [18]:
train_data, test_data = train_test_split(encode_data, test_size=0.2, random_state=42)
train_price = train_data['SalePrice']
test_price = test_data['SalePrice']

In [19]:
scaler = StandardScaler()
train_data = train_data.drop(columns='SalePrice')
test_data = test_data.drop(columns='SalePrice')


scaled_train = scaler.fit_transform(train_data)

pca = PCA(n_components=0.9) 
train_pca = pca.fit_transform(scaled_train)

scaled_test = scaler.transform(test_data) 
test_pca = pca.transform(scaled_test)

In [27]:
run = wandb.init(project="predict_house_price", job_type='upload_pca_data')

output_dir = "pca_data"
os.makedirs(output_dir, exist_ok=True)

train = pd.DataFrame(train_pca)
test = pd.DataFrame(test_pca)

train = pd.concat([train, train_price], axis=1)
test = pd.concat([test, test_price], axis=1)

# Lưu các tệp CSV
train.to_csv(os.path.join(output_dir, "train_pca.csv"), index=True)
test.to_csv(os.path.join(output_dir, "test_pca.csv"), index=True)

# Tạo artifact chung cho cả train và test data
artifact = wandb.Artifact(
    name="pca_train_test_data",
    type="processed_data",
    description="Dữ liệu đã giảm chiều bằng PCA cho tập huấn luyện và kiểm tra"
)
artifact.add_file(os.path.join(output_dir, "train_pca.csv"))
artifact.add_file(os.path.join(output_dir, "test_pca.csv"))

run.log_artifact(artifact)

run.finish()

In [28]:
train 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,172,173,174,175,176,177,178,179,180,SalePrice
0,-3.465046,2.003614,-2.337112,0.625203,1.749804,-1.644305,0.848225,-0.079741,0.489529,-0.012128,...,2.388997,-0.387202,-0.625418,-0.124685,0.058783,1.469777,-1.112187,0.146271,0.277965,35.571272
1,1.156931,-1.923973,3.162625,0.384509,1.834586,-1.454244,1.347688,-0.061673,-1.408657,-0.728276,...,0.076233,0.921250,-0.081282,-0.526487,0.448888,-1.109097,0.088652,0.420363,0.273267,34.695145
2,-4.181091,-1.874748,-1.647798,1.056050,1.396762,-0.753138,-0.485954,0.727332,-0.307305,0.171797,...,-0.360859,0.895835,-0.161059,0.144963,0.936472,-0.313252,0.907100,-0.196673,0.369800,36.017124
3,-1.920495,-0.731728,1.313737,0.716718,0.805871,-1.232299,0.484163,-1.219782,-0.335806,-0.317952,...,0.818367,-0.075121,1.414949,-1.267170,-0.166323,1.095086,0.862464,0.836658,-1.881756,33.103676
4,-2.773087,-1.568648,-0.636241,0.751213,-0.441466,-0.682491,0.635083,0.323721,0.044335,0.540443,...,0.306696,-0.242849,-1.288103,0.487185,0.045176,-0.819731,0.366608,-0.526518,1.173175,36.746176
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1437,,,,,,,,,,,...,,,,,,,,,,38.812289
1332,,,,,,,,,,,...,,,,,,,,,,31.131946
1396,,,,,,,,,,,...,,,,,,,,,,33.914432
1238,,,,,,,,,,,...,,,,,,,,,,33.210201
