In [119]:
import os
import pandas as pd
import torch

## 数据集读取
---

In [120]:
# Build the target path first, then make sure its parent directory exists.
data_file = os.path.join('..', 'data', 'house_tiny.csv')
os.makedirs(os.path.dirname(data_file), exist_ok=True)

# Contents of the tiny housing dataset; missing entries are spelled 'NA'.
csv_rows = (
    'NumRooms,Alley,Price\n',   # header row (column names)
    'NA,Pave,127500\n',         # from here on, one sample per line
    '2,NA,106000\n',
    '4,NA,178100\n',
    'NA,NA,140000\n',
)
with open(data_file, 'w') as f:
    f.writelines(csv_rows)

In [121]:
# Load the CSV from data_file into a DataFrame and print it as plain text.
data = pd.read_csv(data_file)
print(data)

# Print the container type of the frame and of several positional selections:
# slice selectors keep a DataFrame, scalar selectors yield a Series.
selections = (
    data,              # the whole frame
    data.iloc[1:],     # row slice
    data.iloc[1],      # single row
    data.iloc[:, 2:],  # column slice
    data.iloc[:, 2],   # single column
)
for selection in selections:
    print(type(selection))

# Last expression of the cell: display every row after the first one.
data.iloc[1:]

   NumRooms Alley   Price
0       NaN  Pave  127500
1       2.0   NaN  106000
2       4.0   NaN  178100
3       NaN   NaN  140000
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


Unnamed: 0,NumRooms,Alley,Price
1,2.0,,106000
2,4.0,,178100
3,,,140000


## 缺失值处理
---

In [122]:
# Split the frame by column position: columns 0-1 (NumRooms, Alley) become
# `inputs`; everything from column 2 on (Price) becomes `outputs`. Slice
# selectors are used, so both results are DataFrames.
inputs = data.iloc[:, :2]
outputs = data.iloc[:, 2:]

In [123]:
# Display the one-column DataFrame produced by the data.iloc[:, 2:] selection.
outputs

Unnamed: 0,Price
0,127500
1,106000
2,178100
3,140000


In [124]:
# Pull each column out by position as its own Series (scalar column selector).
# `outputs` is rebound here to a Series.
# NOTE(review): the original comment states that inputs_0, inputs_1 and
# outputs are independent of `data`; whether iloc hands back a copy or a view
# depends on the pandas version / copy-on-write setting -- confirm before
# mutating any of them in place.
inputs_0 = data.iloc[:, 0]
inputs_1 = data.iloc[:, 1]
outputs = data.iloc[:, 2]

In [125]:
# Impute missing entries of the numeric column with the mean of the observed
# values (Series.mean skips NaN by default), then display the result.
mean_rooms = inputs_0.mean()
inputs_0 = inputs_0.fillna(value=mean_rooms)
inputs_0

0    3.0
1    2.0
2    4.0
3    3.0
Name: NumRooms, dtype: float64

In [126]:
# One-hot encode the categorical column. dummy_na=True adds an extra indicator
# column for missing values, so NaN is treated as its own category.
inputs_1 = pd.get_dummies(
    data=inputs_1,
    dummy_na=True,
)
inputs_1

Unnamed: 0,Pave,NaN
0,True,False
1,False,True
2,False,True
3,False,True


## To Tensor
---

In [127]:
# Convert the pandas object to a NumPy array first, then copy it into a tensor.
# `outputs` is rebound to the tensor, so this cell cannot be re-run without
# re-running the cell that defines `outputs` as a pandas object.
price_array = outputs.to_numpy()
outputs = torch.tensor(price_array)
outputs

tensor([127500, 106000, 178100, 140000])

## Exercises
---

1. 删除缺失值最多的列

In [128]:
# Count missing values per column and drop the column that has the most.
# The counts live in `na_counts` rather than `sum`, so the built-in sum() is
# not shadowed for the remainder of the kernel session.
na_counts = data.isna().sum()

# idxmax() returns the label of the first column holding the maximum count.
# NOTE: `data` is reassigned, so re-running this cell drops a further column.
data = data.drop(columns=na_counts.idxmax())
data

Unnamed: 0,NumRooms,Price
0,,127500
1,2.0,106000
2,4.0,178100
3,,140000


2. 将预处理后的数据集转换为张量格式

In [129]:
# Fill the missing NumRooms entries with the column mean, producing a new
# frame instead of assigning into the existing one.
rooms_mean = data['NumRooms'].mean()
data = data.assign(NumRooms=data['NumRooms'].fillna(rooms_mean))

# Convert the fully numeric frame to a tensor via NumPy and display it.
# `data` is rebound from a DataFrame to a tensor here.
data = torch.tensor(data.to_numpy())
data

tensor([[3.0000e+00, 1.2750e+05],
        [2.0000e+00, 1.0600e+05],
        [4.0000e+00, 1.7810e+05],
        [3.0000e+00, 1.4000e+05]], dtype=torch.float64)