# 引入对应的包

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# 导入数据

In [None]:
# Read the training CSV and the test CSV from the working directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('./train.csv', './test_nolabel.csv')
)

# 查看数据整体信息

In [None]:
# Keep the test-set ID column aside; it is re-attached to the predictions
# when the submission frame is built at the end of the notebook.
ID = test_data['ID']

In [None]:
# Summarise the training frame: column names, non-null counts and dtypes.
train_data.info()

# 合并数据

In [None]:
# Tag every row with its origin so the two sets can be separated again
# after the shared preprocessing.
for frame, origin in ((train_data, 'train'), (test_data, 'test')):
    frame['label'] = origin

# Stack the rows of both frames; ignore_index is not used, so each frame
# keeps its own row labels.
data = pd.concat([train_data, test_data], axis=0)

In [None]:
#data['sale_date'].astype(str).str[0:4].astype(int)

# 数据预处理部分

In [None]:
# Sale year: the first four characters of sale_date rendered as text.
sale_date_text = data['sale_date'].astype(str)
data['sale_date_1'] = sale_date_text.str.slice(0, 4).astype(int)

In [None]:
# House age at sale time: sale year minus construction year.
data['house_age'] = data['sale_date_1'].sub(data['year_built'])

In [None]:
# Binary "was repaired" flag: 0 where year_repair equals 0, 1 otherwise.
data['is_repair'] = data['year_repair'].ne(0).astype('int64')

In [None]:
# Sale period: the first six characters of sale_date rendered as text
# (presumably YYYYMM -- confirm against the raw date format).
sale_period_text = data['sale_date'].astype(str)
data['sale_date_2'] = sale_period_text.str.slice(0, 6).astype(int)

In [None]:
# Outlier handling: parking areas above the threshold are replaced by the
# column mean. The mean is computed once up front rather than once per
# outlier row inside a lambda, and the replacement is vectorised.
# NOTE(review): the mean is taken over the whole column, outliers included,
# exactly as the original cell did -- confirm whether it should be computed
# from the non-outlier rows only.
AREA_PARKING_MAX = 500000
area_parking_mean = data['area_parking'].mean()
data['area_parking'] = data['area_parking'].mask(
    data['area_parking'] > AREA_PARKING_MAX, area_parking_mean
)

In [None]:
# Columns to remove from `data` (the raw date/year columns and the
# intermediate sale-year column).
drop = ['year_repair','sale_date','sale_date_1','year_built']

In [None]:
# Remove the listed columns from the combined frame, in place.
data.drop(columns=drop, inplace=True)

# 特征重要程度分析

In [None]:
#,'is_repair','house_age'

In [None]:
# Candidate predictors for the feature-importance check, and the regression target.
feature = [
    'num_bedroom',
    'num_bathroom',
    'area_house',
    'area_parking',
    'floor',
    'rating',
    'floorage',
    'area_basement',
    'latitude',
    'longitude',
]
target = ['price']

In [None]:
from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr


def pearson_scores(X, y):
    """Score each feature column by its Pearson correlation with the target.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Target vector.

    Returns
    -------
    tuple of (scores, pvalues)
        Two 1-D arrays with one entry per feature. A tuple is returned
        (not a stacked 2-D array) so that SelectKBest unpacks it into
        scores and p-values. Scores are absolute correlations so that a
        strong negative relationship ranks as high as a strong positive one.
    """
    X = np.asarray(X, dtype=float)
    y = np.ravel(y)
    results = [pearsonr(column, y) for column in X.T]
    scores = np.array([abs(r) for r, _ in results])
    pvalues = np.array([p for _, p in results])
    return scores, pvalues


# Keep the k best features and return the reduced feature matrix.
# Fixes over the original cell: the DataFrame is no longer called like a
# function, the columns are selected with a list (a bare tuple of names
# raises KeyError), and the lazy `map` is replaced by an explicit list.
selection_columns = ['num_bedroom', 'num_bathroom', 'area_house', 'area_parking', 'floor',
                     'rating', 'floorage', 'area_basement', 'latitude', 'longitude']
SelectKBest(pearson_scores, k=6).fit_transform(train_data[selection_columns], train_data['price'])

In [None]:
from sklearn.preprocessing import LabelEncoder

# Variable transformation: map each distinct sale period to an integer code,
# then expand the coded column into one indicator column per code.
le = LabelEncoder()
var_to_encode = ['sale_date_2']
for col in var_to_encode:
    codes = le.fit_transform(data[col])
    data[col] = codes
data = pd.get_dummies(data, columns=var_to_encode)

In [None]:
# Show the column index of the combined frame after encoding.
data.columns

In [None]:
# Numeric columns that get min-max scaled.
columns = [
    'area_basement',
    'area_house',
    'area_parking',
    'floorage',
    'latitude',
    'longitude',
    'num_bathroom',
    'num_bedroom',
    'rating',
    'house_age',
    'floor',
]

# 标准化,归一化

In [None]:
# Display the area_basement column (inspection before scaling).
data['area_basement']

In [None]:
# Min-max scale every listed column using that column's own minimum and
# maximum, both taken over the combined train+test frame.
for col_name in columns:
    col_min = data[col_name].min()
    col_range = data[col_name].max() - col_min
    data[col_name] = (data[col_name] - col_min) / col_range

In [None]:
# Display the combined frame after scaling.
data

In [None]:
# Display the training frame as loaded (plus the 'label' column added before the concat).
train_data

# 数据特征关系

In [None]:
# Scatter plot of price against the is_repair flag.
x = data['is_repair']
y = data['price']
plt.scatter(x, y)
plt.xlabel('is_repair')
plt.ylabel('price')
plt.show()  # the original referenced plt.show without calling it

In [None]:
# Other features noted for inspection: num_bedroom, num_bathroom, floor, rating, floorage.
# Scatter plot of price against longitude.
x = data['longitude']
y = data['price']
plt.scatter(x, y)
plt.xlabel('longitude')
plt.ylabel('price')
plt.show()  # the original referenced plt.show without calling it

In [None]:
# Other feature noted for inspection: num_bathroom.
# Scatter plot of price against the number of bedrooms.
x = data['num_bedroom']
y = data['price']
plt.scatter(x, y)
plt.xlabel('num_bedroom')
plt.ylabel('price')
plt.show()  # the original referenced plt.show without calling it

In [None]:
# Relationship between parking area and house price.
x = data['area_parking']
y = data['price']
plt.scatter(x, y)
plt.xlabel('area_parking')
plt.ylabel('price')
plt.show()  # the original referenced plt.show without calling it

# 训练数据和测试数据拆分

In [None]:
# Split the preprocessed frame back into its two origins. Explicit copies
# are taken because both results are modified afterwards (columns dropped,
# columns added); modifying a filter result directly is what triggers
# pandas' SettingWithCopyWarning.
train_data_re = data[data['label'] == 'train'].copy()
test_data_re = data[data['label'] == 'test'].copy()

In [None]:
# Remove the bookkeeping columns. The test rows also lose 'price', which
# the unlabelled test file does not provide. The result is rebound instead
# of dropping in place, so no frame derived from a filter of `data` is
# mutated and the cell does not emit SettingWithCopyWarning.
train_data_re = train_data_re.drop(columns=['label', 'ID'])
test_data_re = test_data_re.drop(columns=['label', 'ID', 'price'])

In [None]:
#from sklearn.preprocessing import StandardScaler
#sc = StandardScaler()
#train_data_re = sc.fit_transform(train_data_re)
#test_data_re = sc.transform(test_data_re)

In [None]:
# Display the training rows after the split and column clean-up.
train_data_re

# 模型选择

In [None]:
# Model inputs: the numeric/derived features followed by the thirteen
# sale-period indicator columns sale_date_2_0 ... sale_date_2_12.
base_features = [
    'area_basement', 'area_house', 'area_parking', 'floor',
    'floorage', 'num_bathroom', 'num_bedroom', 'latitude', 'longitude',
    'rating', 'house_age', 'is_repair',
]
sale_period_dummies = ['sale_date_2_{}'.format(k) for k in range(13)]
feature = base_features + sale_period_dummies

In [None]:
# Regression target. Kept as a one-element list, so selecting with it
# yields a single-column DataFrame rather than a Series.
target = ['price']

In [None]:
# Feature matrix: the training rows restricted to the model input columns, in `feature` order.
X = train_data_re[feature]

In [None]:
# Target values for the training rows (single-column DataFrame, since `target` is a list).
y = train_data_re[target]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the labelled rows for validation; the seed is fixed so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# 线性模型

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the training split.
LR = LinearRegression()
LR.fit(X_train, y_train)

# Predict with exactly the columns, in exactly the order, the model was
# fitted on. Passing test_data_re as-is uses whatever column order that
# frame happens to have, which either raises a feature-name mismatch or
# silently pairs coefficients with the wrong columns.
predict = LR.predict(test_data_re[feature])

# 模型性能评估

In [None]:
# Show the fitted linear coefficients.
LR.coef_

In [None]:
# Model score (R^2 for LinearRegression) on the training split.
LR.score(X_train,y_train)

In [None]:
# Model score (R^2 for LinearRegression) on the held-out validation split.
LR.score(X_test,y_test)

In [None]:
# Predict prices for the unlabelled test rows. Selecting `feature` keeps
# the column set and order identical to the fit, and keeps this cell
# re-runnable after 'price' and 'ID' have been added to test_data_re.
predict = LR.predict(test_data_re[feature])

In [None]:
# Plot the predicted prices in row order. The legend call is what makes
# the label passed to plot() visible.
plt.plot(range(len(predict)), predict, 'b', label="predict")
plt.legend()
plt.show()

In [None]:
# Attach the predictions and the saved IDs to the test rows.
# - np.ravel flattens the prediction, which is 2-D when the model was
#   fitted on a single-column DataFrame target.
# - ID is a Series and is aligned on the row index, as in the original cell.
# - assign() builds a new frame instead of setting columns on a frame that
#   was derived from a filter of `data`.
test_data_re = test_data_re.assign(price=np.ravel(predict), ID=ID)

In [None]:
# Write the submission file (ID, price) next to the input CSVs. The
# original absolute Windows path only exists on the author's machine;
# a relative path keeps the notebook runnable elsewhere.
test_data_re[['ID', 'price']].to_csv('./predict.csv', index=False)