In [158]:
import hashlib
import os
import tarfile
import zipfile
import requests

#@save
# Registry of downloadable datasets: name -> (url, sha1 hex digest).
DATA_HUB = {}
# Base URL that dataset file names are appended to.
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'

In [159]:
def download(name, cache_dir=os.path.join('..', 'data')):  #@save
    """Download a file registered in DATA_HUB and return its local path.

    Args:
        name: Key into DATA_HUB; the entry is a ``(url, sha1_hash)`` pair.
        cache_dir: Directory holding downloaded files (created if missing).

    Returns:
        The local file name: ``cache_dir`` joined with the last URL segment.

    Raises:
        AssertionError: If ``name`` is not registered in DATA_HUB.
        requests.HTTPError: If the server responds with an error status.
    """
    assert name in DATA_HUB, f"{name} 不存在于 {DATA_HUB}"
    url, sha1_hash = DATA_HUB[name]
    os.makedirs(cache_dir, exist_ok=True)
    fname = os.path.join(cache_dir, url.split('/')[-1])
    chunk_size = 1048576  # 1 MiB per read/write, keeps memory use bounded
    if os.path.exists(fname):
        # Hash the cached copy in chunks and reuse it only if it matches.
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:
            for data in iter(lambda: f.read(chunk_size), b''):
                sha1.update(data)
        if sha1.hexdigest() == sha1_hash:
            return fname  # cache hit
    print(f'正在从{url}下载{fname}...')
    # Use the response as a context manager so the connection is released.
    with requests.get(url, stream=True, verify=True) as r:
        # Fail before touching the cache file: an HTTP error body must not be
        # saved under the dataset's name (nor truncate an existing copy).
        r.raise_for_status()
        with open(fname, 'wb') as f:
            # Stream to disk instead of materialising the body via r.content.
            for data in r.iter_content(chunk_size=chunk_size):
                f.write(data)
    return fname

def download_extract(name, folder=None):  #@save
    """Download a zip/tar archive registered in DATA_HUB and extract it.

    The archive is extracted into the directory that contains the downloaded
    file.

    Args:
        name: Key into DATA_HUB, forwarded to ``download``.
        folder: Optional sub-directory name; when given, the return value is
            that folder joined onto the extraction directory.

    Returns:
        ``os.path.join(base_dir, folder)`` if ``folder`` is truthy, otherwise
        the archive path with its last extension removed.

    Raises:
        AssertionError: If the file extension is not .zip, .tar or .gz.
    """
    fname = download(name)
    base_dir = os.path.dirname(fname)
    # NOTE(review): splitext strips only the last suffix, so for 'x.tar.gz'
    # data_dir is 'x.tar' -- pass `folder` for such archives.
    data_dir, ext = os.path.splitext(fname)
    if ext == '.zip':
        fp = zipfile.ZipFile(fname, 'r')
    elif ext in ('.tar', '.gz'):
        fp = tarfile.open(fname, 'r')
    else:
        # Explicit raise (same exception type as before) so the check is not
        # stripped under `python -O`, which would leave `fp` unbound.
        raise AssertionError('只有zip/tar文件可以被解压缩')
    # Close the archive handle once extraction finishes (or fails).
    # NOTE(review): extractall trusts the member paths stored in the archive.
    with fp:
        fp.extractall(base_dir)
    return os.path.join(base_dir, folder) if folder else data_dir

def download_all():  #@save
    """Fetch every dataset currently registered in DATA_HUB."""
    for dataset_name in DATA_HUB.keys():
        download(dataset_name)

In [160]:
%matplotlib inline
import numpy as np
import pandas as pd
import torch
from torch import nn
from d2l import torch as d2l

In [161]:
# Register the Kaggle house-price train/test CSVs as (url, sha1) pairs.
DATA_HUB.update({  #@save
    'kaggle_house_train': (
        DATA_URL + 'kaggle_house_pred_train.csv',
        '585e9cc93e70b39160e7921475f9bcd7d31219ce'),
    'kaggle_house_test': (
        DATA_URL + 'kaggle_house_pred_test.csv',
        'fa19780a7b011d9b009e8bff8e99922a8ee2eb90'),
})

In [162]:
# Download (or reuse the cached copy of) both CSVs and load them, train first.
train_data, test_data = (
    pd.read_csv(download(dataset_name))
    for dataset_name in ('kaggle_house_train', 'kaggle_house_test')
)
# Number of training rows; used later to split the stacked feature frame.
train_num = len(train_data)

In [163]:
# Stack train and test features: skip the first column of both frames and
# additionally the last column of the training frame.
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))

# Columns whose dtype is not `object` are treated as numeric.
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index

# Standardise each numeric column, then replace missing values with 0.
all_features[numeric_features] = (
    all_features[numeric_features]
    .apply(lambda col: (col - col.mean()) / (col.std()))
    .fillna(0)
)

# One-hot encode the remaining columns; dummy_na=True adds a NaN indicator.
all_features = pd.get_dummies(all_features, dummy_na=True)
all_features

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_Oth,SaleType_WD,SaleType_nan,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SaleCondition_nan
0,0.067320,-0.184443,-0.217841,0.646073,-0.507197,1.046078,0.896679,0.523038,0.580708,-0.29303,...,0,1,0,0,0,0,0,1,0,0
1,-0.873466,0.458096,-0.072032,-0.063174,2.187904,0.154737,-0.395536,-0.569893,1.177709,-0.29303,...,0,1,0,0,0,0,0,1,0,0
2,0.067320,-0.055935,0.137173,0.646073,-0.507197,0.980053,0.848819,0.333448,0.097840,-0.29303,...,0,1,0,0,0,0,0,1,0,0
3,0.302516,-0.398622,-0.078371,0.646073,-0.507197,-1.859033,-0.682695,-0.569893,-0.494771,-0.29303,...,0,1,0,1,0,0,0,0,0,0
4,0.067320,0.629439,0.518814,1.355319,-0.507197,0.947040,0.753100,1.381770,0.468770,-0.29303,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2.419286,-2.069222,-1.043758,-1.481667,1.289537,-0.043338,-0.682695,-0.569893,-0.968860,-0.29303,...,0,1,0,0,0,0,0,1,0,0
1455,2.419286,-2.069222,-1.049083,-1.481667,-0.507197,-0.043338,-0.682695,-0.569893,-0.415757,-0.29303,...,0,1,0,1,0,0,0,0,0,0
1456,-0.873466,3.884968,1.246594,-0.772420,1.289537,-0.373465,0.561660,-0.569893,1.717643,-0.29303,...,0,1,0,1,0,0,0,0,0,0
1457,0.655311,-0.312950,0.034599,-0.772420,-0.507197,0.682939,0.370221,-0.569893,-0.229194,-0.29303,...,0,1,0,0,0,0,0,1,0,0


In [164]:
from autogluon.tabular import TabularDataset, TabularPredictor
# Re-attach Id (and SalePrice for the training rows) to the processed features.
# The first train_num rows of all_features are the training rows.
train_data = pd.concat(
    [train_data['Id'], all_features.iloc[:train_num], train_data['SalePrice']],
    axis='columns',
)
test_data = pd.concat(
    [test_data['Id'], all_features.iloc[train_num:]],
    axis='columns',
)

In [165]:
# Display the reassembled training frame (Id, processed features, SalePrice).
train_data

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_WD,SaleType_nan,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SaleCondition_nan,SalePrice
0,1,0.067320,-0.184443,-0.217841,0.646073,-0.507197,1.046078,0.896679,0.523038,0.580708,...,1,0,0,0,0,0,1,0,0,208500
1,2,-0.873466,0.458096,-0.072032,-0.063174,2.187904,0.154737,-0.395536,-0.569893,1.177709,...,1,0,0,0,0,0,1,0,0,181500
2,3,0.067320,-0.055935,0.137173,0.646073,-0.507197,0.980053,0.848819,0.333448,0.097840,...,1,0,0,0,0,0,1,0,0,223500
3,4,0.302516,-0.398622,-0.078371,0.646073,-0.507197,-1.859033,-0.682695,-0.569893,-0.494771,...,1,0,1,0,0,0,0,0,0,140000
4,5,0.067320,0.629439,0.518814,1.355319,-0.507197,0.947040,0.753100,1.381770,0.468770,...,1,0,0,0,0,0,1,0,0,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,0.067320,-0.312950,-0.285421,-0.063174,-0.507197,0.914028,0.753100,-0.569893,-0.968860,...,1,0,0,0,0,0,1,0,0,175000
1456,1457,-0.873466,0.672275,0.381246,-0.063174,0.391170,0.220763,0.178782,0.093673,0.765076,...,1,0,0,0,0,0,1,0,0,210000
1457,1458,0.302516,-0.141607,-0.142781,0.646073,3.086271,-1.000704,1.040259,-0.569893,-0.365275,...,1,0,0,0,0,0,1,0,0,266500
1458,1459,-0.873466,-0.055935,-0.057197,-0.772420,0.391170,-0.703591,0.561660,-0.569893,-0.861312,...,1,0,0,0,0,0,1,0,0,142125


In [166]:
# Wrap both frames as AutoGluon TabularDataset objects (train first, then test).
train_data_TD, test_data_TD = (
    TabularDataset(frame) for frame in (train_data, test_data)
)

In [167]:
# Display the TabularDataset built from train_data.
train_data_TD

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_WD,SaleType_nan,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SaleCondition_nan,SalePrice
0,1,0.067320,-0.184443,-0.217841,0.646073,-0.507197,1.046078,0.896679,0.523038,0.580708,...,1,0,0,0,0,0,1,0,0,208500
1,2,-0.873466,0.458096,-0.072032,-0.063174,2.187904,0.154737,-0.395536,-0.569893,1.177709,...,1,0,0,0,0,0,1,0,0,181500
2,3,0.067320,-0.055935,0.137173,0.646073,-0.507197,0.980053,0.848819,0.333448,0.097840,...,1,0,0,0,0,0,1,0,0,223500
3,4,0.302516,-0.398622,-0.078371,0.646073,-0.507197,-1.859033,-0.682695,-0.569893,-0.494771,...,1,0,1,0,0,0,0,0,0,140000
4,5,0.067320,0.629439,0.518814,1.355319,-0.507197,0.947040,0.753100,1.381770,0.468770,...,1,0,0,0,0,0,1,0,0,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,0.067320,-0.312950,-0.285421,-0.063174,-0.507197,0.914028,0.753100,-0.569893,-0.968860,...,1,0,0,0,0,0,1,0,0,175000
1456,1457,-0.873466,0.672275,0.381246,-0.063174,0.391170,0.220763,0.178782,0.093673,0.765076,...,1,0,0,0,0,0,1,0,0,210000
1457,1458,0.302516,-0.141607,-0.142781,0.646073,3.086271,-1.000704,1.040259,-0.569893,-0.365275,...,1,0,0,0,0,0,1,0,0,266500
1458,1459,-0.873466,-0.055935,-0.057197,-0.772420,0.391170,-0.703591,0.561660,-0.569893,-0.861312,...,1,0,0,0,0,0,1,0,0,142125


In [168]:
target_label = 'SalePrice'
id = 'Id'  # NOTE(review): this name shadows the builtin id()

# Train on every column except Id; models are written under ./automl_model/.
predictor = TabularPredictor(
    label=target_label,
    path='./automl_model/',
).fit(
    train_data_TD.drop(columns=[id]),
    presets='best_quality',
)

Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "./automl_model/"
AutoGluon Version:  0.8.2
Python Version:     3.9.17
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Fri Jan 27 02:56:13 UTC 2023
Disk Space Avail:   993.96 GB / 1081.10 GB (91.9%)
Train Data Rows:    1460
Train Data Columns: 331
Label Column: SalePrice
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == int and many unique label-values observed).
	Label info (max, min, mean, stddev): (755000, 34900, 180921.19589, 79442.50288)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting Au

In [169]:
# Predict on the test rows, dropping the Id column as was done for training.
sale_predict = predictor.predict(
    test_data_TD.drop('Id', axis=1)
)

In [170]:
# Display the predictions returned by predictor.predict for the test rows.
sale_predict

0       127640.117188
1       160349.515625
2       187352.484375
3       193601.171875
4       189786.781250
            ...      
1454     86373.156250
1455     84214.257812
1456    166373.062500
1457    120168.976562
1458    224862.140625
Name: SalePrice, Length: 1459, dtype: float32

In [171]:
# Pair each test Id with its predicted SalePrice.
submission = pd.DataFrame({
    'Id': test_data['Id'],
    'SalePrice': sale_predict,
})

In [172]:
# Write the submission to CSV in the working directory, omitting the index.
submission.to_csv('house-automl-submission.csv', index=False)