In [6]:
import torch
import os
import pandas as pd

### 1.读取数据集 

In [7]:
# Make sure ./data exists; exist_ok=True keeps this cell safe to re-run.
data_dir = os.path.join('.', 'data')
os.makedirs(data_dir, exist_ok=True)

In [8]:
# Path of the small CSV file that the next cell writes and later cells read back.
data_file = os.path.join('.', 'data', 'house_tiny.csv')

In [9]:
# Lines of the toy housing dataset; the text 'NA' stands for a missing value.
rows = [
    'NumRooms,Alley,Price\n',  # column names
    'NA,Pave,127500\n',  # every following row is one data sample
    '2,NA,106000\n',
    '4,NA,178100\n',
    'NA,NA,140000\n',
]
with open(data_file, 'w') as f:
    f.writelines(rows)

In [12]:
# Load the CSV just written; the 'NA' fields come back as missing values (NaN).
data = pd.read_csv(data_file)
data

Unnamed: 0,NumRooms,Alley,Price
0,,Pave,127500
1,2.0,,106000
2,4.0,,178100
3,,,140000


### 2.处理缺失值

In [40]:
# The first two columns are the features; the third column (Price) is the target.
inputs = data.iloc[:, 0:2]
outputs = data.iloc[:, 2]

In [41]:
# Show the feature columns before any missing values are filled.
inputs

Unnamed: 0,NumRooms,Alley
0,,Pave
1,2.0,
2,4.0,
3,,


In [42]:
# Replace missing numeric entries with their column mean. numeric_only=True
# restricts the mean to numeric columns: without it, pandas >= 2.0 raises a
# TypeError when it tries to average the string-valued Alley column. Missing
# Alley entries stay NaN here and are handled by the one-hot encoding below.
inputs = inputs.fillna(inputs.mean(numeric_only=True))

In [43]:
# NumRooms gaps are now filled with the column mean; Alley is still missing where it was.
inputs

Unnamed: 0,NumRooms,Alley
0,3.0,Pave
1,2.0,
2,4.0,
3,3.0,


In [44]:
# One-hot encode the categorical Alley column; dummy_na=True gives missing
# values their own indicator column (Alley_nan). dtype=int pins the indicators
# to 0/1 integers: pandas >= 2.0 defaults to bool, and mixing bool with the
# float NumRooms column makes `inputs.values` an object array that
# torch.tensor() cannot convert.
inputs = pd.get_dummies(inputs, dummy_na=True, dtype=int)

In [45]:
# Every column is numeric now: NumRooms plus the Alley_Pave / Alley_nan indicators.
inputs

Unnamed: 0,NumRooms,Alley_Pave,Alley_nan
0,3.0,1,0
1,2.0,0,1
2,4.0,0,1
3,3.0,0,1


In [51]:
# The target column (Price), which the preprocessing cells above never modify.
outputs

0    127500
1    106000
2    178100
3    140000
Name: Price, dtype: int64

### 3.转换为张量格式

方式一：使用 `to_numpy()` 方法

In [47]:
# Export the features as a float64 NumPy array, then copy it into a tensor.
inputs_array = inputs.to_numpy(dtype=float)
x = torch.tensor(inputs_array)

In [48]:
# Inspect the feature tensor.
x

tensor([[3., 1., 0.],
        [2., 0., 1.],
        [4., 0., 1.],
        [3., 0., 1.]], dtype=torch.float64)

In [49]:
# Export the targets as a float64 NumPy array, then copy it into a tensor.
outputs_array = outputs.to_numpy(dtype=float)
y = torch.tensor(outputs_array)

In [50]:
# Inspect the target tensor.
y

tensor([127500., 106000., 178100., 140000.], dtype=torch.float64)

方式二：使用 `.values` 属性

In [77]:
# Same conversion via the `.values` accessor. The explicit float cast keeps
# this working when the indicator columns are bool (the pandas >= 2.0 default
# for get_dummies): a float/bool mix makes `.values` an object array, which
# torch.tensor() rejects.
torch.tensor(inputs.values.astype(float))

tensor([[3., 1., 0.],
        [2., 0., 1.],
        [4., 0., 1.],
        [3., 0., 1.]], dtype=torch.float64)

In [78]:
# `.values` keeps the Series' int64 dtype, so this tensor is integer-typed,
# unlike the float64 tensor built with to_numpy(dtype=float) above.
torch.tensor(outputs.values)

tensor([127500, 106000, 178100, 140000])

### 练习
1. 删除缺失值最多的列。
2. 将预处理后的数据集转换为张量格式。

#### 1.删除缺失值最多的列。

In [68]:
# Reload the raw CSV so the exercise starts from the unprocessed data.
data = pd.read_csv(data_file)
data

Unnamed: 0,NumRooms,Alley,Price
0,,Pave,127500
1,2.0,,106000
2,4.0,,178100
3,,,140000


In [69]:
# Count the missing values in each column.
# NOTE(review): this rebinds `x`, which held the feature tensor earlier in the notebook.
x = data.isna().sum()
x

NumRooms    2
Alley       3
Price       0
dtype: int64

In [70]:
# Column labels of the missing-value counts.
x.index

Index(['NumRooms', 'Alley', 'Price'], dtype='object')

In [71]:
# Integer position of the largest missing-value count.
x.argmax()

1

In [72]:
# Map that position back to its column label.
x.index[x.argmax()]

'Alley'

In [76]:
# Drop the column with the most missing values. idxmax() returns the label of
# the largest count directly (same result as x.index[x.argmax()]).
# errors='ignore' makes the cell safe to re-run: `x` still names the
# already-removed column, so a second run would otherwise raise KeyError.
data = data.drop(columns=x.idxmax(), errors='ignore')
data

Unnamed: 0,NumRooms,Price
0,,127500
1,2.0,106000
2,4.0,178100
3,,140000


#### 2.将预处理后的数据集转换为张量格式。

略（做法与上文“3.转换为张量格式”相同：先填充剩余的缺失值，再用 `torch.tensor(data.to_numpy(dtype=float))` 转换为张量）。