In [None]:
import os

# Runtime switches for the native libraries. They are set BEFORE numpy / torch
# are imported because the libraries consult them while loading/initialising;
# in particular the MPS fallback flag is ignored if torch is already imported.
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"      # tolerate a duplicate OpenMP runtime
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"  # run ops unsupported on MPS on the CPU

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from scipy import stats
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the two CSV splits from the current working directory.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Regression target: the sale price of each training row.
target = df_train['SalePrice']

# Remove the target from the training frame (in place) so that only input
# features remain in df_train.
del df_train['SalePrice']

# Stack train on top of test so both splits go through the same feature
# engineering; ignore_index gives the result a fresh 0..n-1 index, and the
# 'Id' column is discarded.
combined = pd.concat([df_train, df_test], ignore_index=True).drop(columns=['Id'])

In [None]:
## Split columns into numeric / non-numeric and keep only those without missing values.
def exclude_nans(df, col_type):
    '''
    Return the names of the columns of one kind that contain no missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    col_type : str
        'num' : consider only non-object (numerical) columns
        'str' : consider only object (non-numerical) columns

    Returns
    -------
    list
        Column names of the requested kind, in their original order, for which
        no value is null.

    Raises
    ------
    ValueError
        If col_type is neither 'num' nor 'str'.
    '''
    if col_type == 'num':
        predictors = df.select_dtypes(exclude=['object'])
    elif col_type == 'str':
        predictors = df.select_dtypes(include=['object'])
    else:
        # Previously this fell through and failed later with an unhelpful
        # UnboundLocalError on `predictors`.
        raise ValueError(
            "col_type must be 'num' or 'str', got %r" % (col_type,))

    # One vectorised pass: has_nan holds one boolean per selected column,
    # in the same order as predictors.columns.
    has_nan = predictors.isnull().any()
    return [col for col, missing in zip(predictors.columns, has_nan)
            if not missing]

In [None]:
# Names of the fully populated columns, by kind.
num_cols = exclude_nans(df=combined, col_type='num')
cat_cols = exclude_nans(df=combined, col_type='str')

# Keep only those columns: numerical ones first, then categorical ones.
combined2 = combined[[*num_cols, *cat_cols]]

In [None]:
# Standardise every non-object column to zero mean and unit variance.
# The statistics are computed over all rows of combined2.

# Work on an explicit copy: combined2 comes from a column selection, and
# assigning into such a selection triggers pandas' SettingWithCopyWarning.
combined2 = combined2.copy()
numeric_features = combined2.dtypes[combined2.dtypes != 'object'].index


def _standardize(column):
    """Centre `column` on its mean and divide by its std when the std is positive."""
    centered = column - column.mean()
    spread = column.std()
    # A constant column has std == 0; dividing would turn it into NaN and
    # every loss computed from it would be NaN too, so only centre it.
    return centered / spread if spread > 0 else centered


combined2[numeric_features] = combined2[numeric_features].apply(_standardize)

In [None]:
# One-hot encode the categorical columns into indicator features.
# dummy_na=True additionally emits a "missing value" indicator column for
# each encoded feature.
combined3 = pd.get_dummies(data=combined2, dummy_na=True)

In [None]:
### Convert the encoded frame and the target into float tensors.
# The first n_train rows of combined3 are the training rows; the rest are test rows.
n_train = len(df_train.index)

# Cast every column to float so the frame converts to one numeric array.
combined3 = combined3.astype(float)

train_features = torch.tensor(
    combined3.iloc[:n_train].to_numpy(), dtype=torch.float)
test_features = torch.tensor(
    combined3.iloc[n_train:].to_numpy(), dtype=torch.float)

# Labels become a column vector of shape (n_train, 1).
train_labels = torch.tensor(
    target.to_numpy(), dtype=torch.float).reshape(-1, 1)

In [None]:
# Split the training data into shuffled mini-batches.
batch_size = 32
dataset = torch.utils.data.TensorDataset(train_features, train_labels)
train_loader = torch.utils.data.DataLoader(
    dataset,                  # (features, labels) pairs
    batch_size=batch_size,    # samples per batch
    shuffle=True,             # reshuffle the data every epoch
    num_workers=0,            # load in the main process
    # Pinned host memory only speeds up host-to-CUDA copies; requesting it
    # unconditionally just produces a warning on MPS / CPU-only machines.
    pin_memory=torch.cuda.is_available(),
)
# Report the size of one batch and the number of batches per epoch.
print(f"每一批{len(next(iter(train_loader))[0])}个，一共{len(train_loader)}批")

In [None]:
# Use the Apple-silicon GPU backend (MPS) when it is available, otherwise the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

In [None]:
# Network definition
class Net(torch.nn.Module):
    """Fully connected network: three linear layers with ReLU after the first two.

    Parameters
    ----------
    in_put : int
        Number of input features.
    hidden : int
        Width of the first hidden layer.
    hidden1 : int
        Width of the second hidden layer.
    out_put : int
        Number of output values.
    """

    def __init__(self, in_put, hidden, hidden1, out_put):
        super().__init__()
        self.linear1 = torch.nn.Linear(in_features=in_put, out_features=hidden)
        self.linear2 = torch.nn.Linear(in_features=hidden, out_features=hidden1)
        self.linear3 = torch.nn.Linear(in_features=hidden1, out_features=out_put)

    def forward(self, data):
        """Map `data` through linear1 -> ReLU -> linear2 -> ReLU -> linear3."""
        first_activation = torch.relu(self.linear1(data))
        second_activation = torch.relu(self.linear2(first_activation))
        # No activation on the last layer: the raw linear output is returned.
        return self.linear3(second_activation)

In [None]:
# Layer sizes: the input width is the number of feature columns.
in_features = train_features.shape[1]
hidden, hidden1, out_put = 200, 100, 1

# Seed torch's global RNG so weight initialisation (and the DataLoader
# shuffling that follows) is reproducible from one run to the next.
torch.manual_seed(42)

model = Net(in_features, hidden, hidden1, out_put)
model = model.to(device)

# Loss: mean squared error, loss(x_i, y_i) = (x_i - y_i)^2 averaged over the batch.
criterion = torch.nn.MSELoss()

# Optimiser: Adam over all model parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=0.05)

print("in_features:", in_features)
# This line prints the full tensor shape, so label it as such (it used to
# repeat the "in_features:" label).
print("train_features shape:", train_features.shape)
print(model)

In [None]:
# Mini-batch training loop.
losses = []      # one entry per epoch: mean per-sample training loss
accuracies = []  # kept for compatibility; nothing is appended (regression task)
epochs = 200

model.train()
for epoch in range(epochs):
    loss_sum = 0.0      # sum of per-batch mean losses weighted by batch size
    samples_seen = 0    # number of samples that contributed to loss_sum
    diverged = False

    for train_batch, labels_batch in train_loader:
        train_batch, labels_batch = train_batch.to(device), labels_batch.to(device)

        # Forward pass on the current batch, which lives on `device`.
        # (Feeding the full CPU tensors here would ignore the batching and
        # mismatch devices when the model is on MPS.)
        y_pred = model(train_batch)
        loss = criterion(y_pred, labels_batch)

        if torch.isnan(loss):
            # Training has diverged; further updates cannot recover.
            diverged = True
            break

        # backward() accumulates into existing gradients, so clear them first.
        optimizer.zero_grad()
        # Back-propagate: compute the gradient of the loss w.r.t. each parameter.
        loss.backward()
        # Update the parameters from the gradients just computed.
        optimizer.step()

        # Accumulate inside the batch loop so every batch counts, weighting by
        # batch size because the last batch may be smaller.
        batch_len = train_batch.shape[0]
        loss_sum += loss.item() * batch_len
        samples_seen += batch_len

    if diverged or samples_seen == 0:
        epoch_loss = float("nan")
    else:
        epoch_loss = loss_sum / samples_seen

    print("epoch:%d ,loss:%.6f" % (epoch, epoch_loss))
    losses.append(epoch_loss)

    if diverged:
        # Stop training, but pad with NaN so len(losses) == epochs still holds
        # for code that plots losses against range(epochs).
        losses.extend([float("nan")] * (epochs - epoch - 1))
        break
    

In [None]:
# Plot the training loss per epoch.
# Only the loss is drawn: no accuracy is computed for this regression model,
# and the previous second curve was the same loss data labelled "Accuracy".
fig, ax = plt.subplots()
# Use len(losses) for the x axis so x and y always have matching lengths.
ax.plot(range(len(losses)), losses, label='Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Training Loss')
ax.legend()

plt.show()