In [None]:
#invite people for the Kaggle party
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import torch

In [None]:
## When using PyTorch you may hit a common error:
## "RuntimeError: Placeholder storage has not been allocated on MPS device".
## That error is usually caused by using multiprocessing on the GPU.
## NOTE(review): the variable set below, KMP_DUPLICATE_LIB_OK, looks like the OpenMP switch that
## tolerates a duplicate OpenMP runtime ("OMP: Error #15"), not an MPS setting -- confirm which
## error this cell is really meant to work around.
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

In [None]:
# Load the training and test splits; both paths are relative to the working directory.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Per-column dtype and non-null summary of the training frame.
df_train.info()

In [None]:
#descriptive statistics summary
df_train['SalePrice'].describe()

In [None]:
# Distribution plot of the target column.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases -- confirm the installed
# version still provides it before relying on this cell.
sns.distplot(df_train['SalePrice']);

In [None]:
# Shape statistics of the target column: skewness first, then kurtosis.
target_skew = df_train['SalePrice'].skew()
print(f"Skewness: {target_skew:f}")
target_kurt = df_train['SalePrice'].kurt()
print(f"Kurtosis: {target_kurt:f}")

In [None]:
#scatter plot grlivarea/saleprice
var = 'GrLivArea'
data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1)
data.plot.scatter(x=var, y='SalePrice', ylim=(0,800000));

In [None]:
#box plot overallqual/saleprice
var = 'OverallQual'
data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1)
f, ax = plt.subplots(figsize=(8, 6))
fig = sns.boxplot(x=var, y="SalePrice", data=data)
fig.axis(ymin=0, ymax=800000)

In [None]:
var = 'YearBuilt'
data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1)
f, ax = plt.subplots(figsize=(16, 8))
fig = sns.boxplot(x=var, y="SalePrice", data=data)
fig.axis(ymin=0, ymax=800000);
plt.xticks(rotation=90);

In [None]:
#correlation matrix
corrmat = df_train.corr(method='pearson',numeric_only=True)
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True);

In [None]:
#saleprice correlation matrix
k = 10 #number of variables for heatmap
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(df_train[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()

In [None]:
# Pairwise scatter plots of the target and six selected predictors.
sns.set()
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
# `height` is the current name of the per-facet size argument; the old `size` spelling is deprecated.
sns.pairplot(df_train[cols], height=2.5)
plt.show();

# 缺失数据处理

In [None]:
#missing data
total = df_train.isnull().sum().sort_values(ascending=False)
percent = (df_train.isnull().sum()/df_train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

In [None]:
missing_data[missing_data['Total']>1].index

In [None]:
#dealing with missing data
df_train = df_train.drop((missing_data[missing_data['Total'] > 1]).index,axis=1)
df_train = df_train.drop(df_train.loc[df_train['Electrical'].isnull()].index)
df_train.isnull().sum().max() #just checking that there's no missing data missing...

# 标准化

In [None]:
np.array(df_train['SalePrice'])[:,np.newaxis]

In [None]:
#standardizing data
saleprice_scaled = StandardScaler().fit_transform(np.array(df_train['SalePrice'])[:,np.newaxis])
low_range = saleprice_scaled[saleprice_scaled[:,0].argsort()][:10]
high_range= saleprice_scaled[saleprice_scaled[:,0].argsort()][-10:]
print('outer range (low) of the distribution:')
print(low_range)
print('\nouter range (high) of the distribution:')
print(high_range)

# Bivariate analysis 双变量分析

In [None]:
#bivariate analysis saleprice/grlivarea
var = 'GrLivArea'
data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1)
data.plot.scatter(x=var, y='SalePrice', ylim=(0,800000))

In [None]:
#deleting points
df_train.sort_values(by = 'GrLivArea', ascending = False)[:5]

In [None]:
df_train = df_train.drop(df_train[df_train['Id'] == 1299].index)
df_train = df_train.drop(df_train[df_train['Id'] == 524].index)

In [None]:
#bivariate analysis saleprice/grlivarea
var = 'TotalBsmtSF'
data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1)
data.plot.scatter(x=var, y='SalePrice', ylim=(0,800000));

# In the search for normality 检查正态性

In [None]:
#histogram and normal probability plot
# Distribution of SalePrice with a fitted normal curve overlaid (fit=norm).
sns.distplot(df_train['SalePrice'], fit=norm);
# Start a new figure so the probability plot is not drawn over the histogram.
fig = plt.figure()
# Probability plot of SalePrice, rendered through pyplot.
res = stats.probplot(df_train['SalePrice'], plot=plt)

# 由于没有呈现出正态分布，因此通常使用 log 转换来解决这个问题

In [None]:
#applying log transformation
# Overwrites SalePrice in df_train with its natural logarithm.
# NOTE(review): the column is replaced in place, so running this cell twice applies the log twice.
df_train['SalePrice'] = np.log(df_train['SalePrice'])

In [None]:
#transformed histogram and normal probability plot
sns.distplot(df_train['SalePrice'], fit=norm);
fig = plt.figure()
res = stats.probplot(df_train['SalePrice'], plot=plt)

In [None]:
#histogram and normal probability plot
# Same pair of diagnostics for GrLivArea, before it is transformed.
sns.distplot(df_train['GrLivArea'], fit=norm);
fig = plt.figure()
res = stats.probplot(df_train['GrLivArea'], plot=plt)

In [None]:
#data transformation
# Overwrites GrLivArea in df_train with its natural logarithm (same re-run caveat as SalePrice).
# Only df_train is transformed in this cell; df_test is not touched here.
df_train['GrLivArea'] = np.log(df_train['GrLivArea'])
#transformed histogram and normal probability plot
sns.distplot(df_train['GrLivArea'], fit=norm);
fig = plt.figure()
res = stats.probplot(df_train['GrLivArea'], plot=plt)

# 带有 0 的数据无法使用 log transformation 

In [None]:
#histogram and normal probability plot
# Distribution of TotalBsmtSF with a fitted normal curve overlaid (fit=norm).
sns.distplot(df_train['TotalBsmtSF'], fit=norm);
# Start a new figure so the probability plot is not drawn over the histogram.
fig = plt.figure()
# Probability plot of TotalBsmtSF, rendered through pyplot.
res = stats.probplot(df_train['TotalBsmtSF'], plot=plt)

In [None]:
#create column for new variable (one is enough because it's a binary categorical feature)
#if area>0 it gets 1, for area==0 it gets 0
# New indicator column: 1 when the house has a basement (TotalBsmtSF > 0), 0 otherwise.
has_bsmt = df_train['TotalBsmtSF'] > 0
df_train['HasBsmt'] = has_bsmt.astype(int)
#transform data
# Cast to float first so the log values are not written into an integer column
# (presumably how the CSV reader typed it -- the cast is harmless if it is already float).
df_train['TotalBsmtSF'] = df_train['TotalBsmtSF'].astype(float)
# Take the log of the positive areas only; the zero areas are left at 0 and never reach np.log.
df_train.loc[has_bsmt, 'TotalBsmtSF'] = np.log(df_train.loc[has_bsmt, 'TotalBsmtSF'])

# 画出有地下室的数据的分布

In [None]:
# Histogram and probability plot of TotalBsmtSF, restricted to rows where it is above zero.
bsmt_sf = df_train.loc[df_train['TotalBsmtSF'] > 0, 'TotalBsmtSF']
sns.distplot(bsmt_sf, fit=norm);
fig = plt.figure()
res = stats.probplot(bsmt_sf, plot=plt)

# homoscedasticity 同方差性

In [None]:
#scatter plot
plt.scatter(df_train['GrLivArea'], df_train['SalePrice']);

In [None]:
plt.scatter(df_train[df_train['TotalBsmtSF']>0]['TotalBsmtSF'], df_train[df_train['TotalBsmtSF']>0]['SalePrice']);

# dummy variables

In [None]:
#convert categorical variable into dummy
df_train2 = pd.get_dummies(df_train)

In [None]:
%pwd

In [None]:
df_train2.to_csv('/Users/lucc/Desktop/lcc_new_script/House_Prices/df_train2.txt', sep='\t', index=False)

#  预测训练

In [None]:
# Summary statistics of SalePrice as it stands after the transformations applied above.
df_train['SalePrice'].describe()

In [None]:
# Summary statistics of the test frame.
df_test.describe()

In [None]:
# 房价，要拟合的目标值
target = df_train['SalePrice']
# test_target = df_test['SalePrice']

# 输入特征，可以将SalePrice列扔掉
df_train.drop(['SalePrice'],axis = 1 , inplace = True)

# 将train和test合并到一起，一块进行特征工程，方便预测test的房价
combined = pd.concat([df_train,df_test])
combined.reset_index(inplace=True)
combined.drop(['index', 'Id'], inplace=True, axis=1)

In [None]:
combined.head()

## 区分 num,str 可以去除 含有缺失值的特征？

In [None]:
def exclude_nans(df,col_type):
    '''
    Return the names of the columns of one kind that contain no missing values.

    col_type:
        'num' : only numerical (non-object dtype) columns with no nans
        'str' : only non-numerical (object dtype) columns with no nans

    Returns a list of column names in the frame's column order.
    Raises ValueError when col_type is neither 'num' nor 'str'.
    '''
    if col_type == 'num':
        predictors = df.select_dtypes(exclude=['object'])
    elif col_type == 'str':
        predictors = df.select_dtypes(include=['object'])
    else:
        # Without this branch an unknown kind fell through to an unbound local variable.
        raise ValueError("col_type must be 'num' or 'str', got %r" % (col_type,))

    return [col for col in predictors.columns if not df[col].isnull().any()]

In [None]:
num_cols = exclude_nans(combined, 'num')
cat_cols = exclude_nans(combined, 'str')

In [None]:
combined[num_cols + cat_cols]

In [None]:
combined2 = combined[num_cols + cat_cols]

In [None]:
combined2.shape

In [None]:
print(num_cols[:5])
print ('Number of numerical columns with no nan values: ',len(num_cols))
print(cat_cols[:5])
print ('Number of non-numerical columns with no nan values: ',len(cat_cols))

In [None]:
# One-hot encode the categorical features.
def oneHotEncode(df,colNames):
    """Replace every object-dtype column named in colNames with its dummy columns.

    Columns in colNames that are not of object dtype are left as they are. The dummy
    columns are named '<column>_<value>' and are appended after the remaining columns,
    in the order the encoded columns appear in colNames. The input frame is not modified;
    when nothing needs encoding it is returned unchanged.
    """
    to_encode = [col for col in colNames if df[col].dtype == np.dtype('object')]
    if not to_encode:
        return df

    # Build all dummy blocks first and concatenate once, instead of copying the whole
    # frame with a concat and a drop for every encoded column.
    dummy_blocks = [pd.get_dummies(df[col], prefix=col) for col in to_encode]
    return pd.concat([df.drop(columns=to_encode), *dummy_blocks], axis=1)

In [None]:
combined2 = oneHotEncode(combined2,num_cols + cat_cols)

In [None]:
num_of_train_data = df_train.shape[0]
combined2 = combined2.astype(float)  # numpy强制类型转换

# 训练数据集特征
X_train= torch.tensor(combined2[:num_of_train_data].values, dtype=torch.float)
# 训练数据集目标
y_train = torch.tensor(target.values, dtype=torch.float).view(-1, 1)
# 测试数据集特征
X_test = torch.tensor(combined2[num_of_train_data:].values, dtype=torch.float)
# 测试数据集目标
# X_test = torch.tensor(test_target.values, dtype=torch.float).view(-1, 1)

print("train data size: ", X_train.shape)
print("label data size: ", y_train.shape)
print("test data size: ", X_test.shape)

# 构建神经网络

In [None]:
# Device the model is moved to when it is created below; fixed to the CPU.
device = torch.device("cpu")

In [None]:
# Network definition
class Net(torch.nn.Module):
    """Feed-forward network: three linear layers with a ReLU after the first two."""

    def __init__(self, in_put, hidden, hidden1, out_put):
        super().__init__()
        nn = torch.nn
        self.linear1 = nn.Linear(in_features=in_put, out_features=hidden)
        self.linear2 = nn.Linear(in_features=hidden, out_features=hidden1)
        self.linear3 = nn.Linear(in_features=hidden1, out_features=out_put)

    def forward(self, data):
        """Run data through the three layers; the last layer's output is returned as is."""
        first = torch.relu(self.linear1(data))
        second = torch.relu(self.linear2(first))
        return self.linear3(second)

In [None]:
# Number of input features = number of columns of the training matrix
in_features = X_train.shape[1]
hidden, hidden1 ,out_put = 128, 64, 1

# Seed PyTorch's random generator so the weight initialisation, and with it the
# training run, is repeatable.
torch.manual_seed(42)
model = Net(in_features, hidden, hidden1, out_put).to(device)

# Loss function: mean squared error, loss(xi, yi) = (xi - yi)^2
criterion = torch.nn.MSELoss()

# Gradient-based optimiser
optimizer = torch.optim.Adam(model.parameters(), lr=0.05)

print("in_features:",in_features)
# This line prints the full shape of X_train, so it is labelled as such.
print("X_train shape:",X_train.shape)
print(model)

In [None]:
losses = []
accuracies = []  # never filled in this loop; kept because the disabled accuracy plot refers to it
epochs = 2000
log_every = 100  # print the loss every this many epochs instead of on every epoch

for epoch in range(epochs):
    # Forward pass over the whole training set and the loss against the targets.
    y_pred = model(X_train)
    loss = criterion(y_pred, y_train)

    # Record the loss of every epoch; print only a sample so the output stays readable.
    loss_value = loss.item()
    losses.append(loss_value)
    if epoch % log_every == 0 or epoch == epochs - 1:
        print("epoch:%d ,loss:%.6f" %(epoch,loss_value))

    # Stop as soon as the loss has become NaN.
    if torch.isnan(loss):
        print("epoch:%d ,loss is NaN, stopping early" % epoch)
        break

    # Clear the gradients of all parameters. backward() adds the new gradients to the
    # ones already stored, so they must be reset before each backward pass.
    optimizer.zero_grad()

    # Back-propagation: compute the gradient of the loss with respect to each parameter.
    loss.backward()

    # Update the parameters from the gradients just computed.
    optimizer.step()

In [None]:
# Plot the training-loss curve.
plt.figure()
# Use the number of recorded losses for the x axis: the training loop can stop early,
# in which case there are fewer losses than `epochs` and range(epochs) would not match.
plt.plot(range(len(losses)), losses, label='Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training Loss')
plt.legend()

# Accuracy curve (disabled)
# plt.figure()
# plt.plot(range(epochs), accuracies, label='Accuracy')
# plt.xlabel('Epochs')
# plt.ylabel('Accuracy')
# plt.title('Training Accuracy')
# plt.legend()

plt.show()