In [617]:
import numpy as np
import pandas as pd
import os
import math
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.regularizers import l1, l2, activity_l2
from keras.optimizers import SGD
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
# Working directory that holds train.csv / test.csv. It can be overridden with
# the HOUSE_PRICE_DATA_DIR environment variable so the notebook is not tied to
# one machine; the default is the original author's local path.
DATA_DIR = os.environ.get('HOUSE_PRICE_DATA_DIR',
                          '/Users/alex/Desktop/ml/kaggle/house price')
os.chdir(DATA_DIR)

In [471]:
# Load the raw training and test tables from the current working directory.
trainFile, testFile = [pd.read_csv(csv_name)
                       for csv_name in ('train.csv', 'test.csv')]

In [472]:
# Fill missing values in the continuous columns with that column's mean.
# Each frame is imputed with its own means (train with train, test with test).
train_means = trainFile.mean()
test_means = testFile.mean()
train_df = trainFile.fillna(train_means)
test_df = testFile.fillna(test_means)

# Separate the regression target from the training features.
price_train_df = train_df.pop('SalePrice')

In [473]:
# Column labels of the training features, as a plain mutable list.
cols = train_df.columns.tolist()

In [474]:
# Move the continuous variables to the end of the column order.
continuous_variables = ['LotFrontage','LotArea','OverallQual','OverallCond','YearBuilt',
            'YearRemodAdd','TotalBsmtSF','1stFlrSF','2ndFlrSF','LowQualFinSF',
           'GrLivArea','BsmtFullBath','BsmtHalfBath','FullBath','HalfBath',
           'BedroomAbvGr','KitchenAbvGr','TotRmsAbvGrd','Fireplaces','GarageYrBlt','GarageCars',
           'GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch',
           'PoolArea']
for name in continuous_variables:
    # remove() raises ValueError if the column is absent, so a typo in the
    # list above fails loudly instead of being silently ignored.
    cols.remove(name)
    cols.append(name)

# Apply the same column order to both frames. The original read from an
# undefined name `test`; the imputed test frame is `test_df`. `.loc` is used
# instead of the deprecated `.ix` indexer.
train_df = train_df.loc[:, cols]
test_df = test_df.loc[:, cols]

In [475]:
# Paired features: each categorical "type" column has a companion numeric
# "amount" column. variable_dict holds the value used for each column when the
# type is missing; variable_list keeps the pairs adjacent as (type, amount).
variable_dict = {}
variable_dict['MasVnrType'] = 'None'
variable_dict['MasVnrArea'] = 0
variable_dict['BsmtFinType1'] = 'Unf'
variable_dict['BsmtFinSF1'] = 0
variable_dict['BsmtFinType2'] = 'Unf'
variable_dict['BsmtFinSF2'] = 0
variable_dict['MiscFeature'] = 'Na'
variable_dict['MiscVal'] = 0
variable_list = ['MasVnrType', 'MasVnrArea', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
                'BsmtFinSF2', 'MiscFeature', 'MiscVal']

# Snapshot each (Id, type, amount) triple before the columns are dropped from
# train_df. Explicit copies so the fills below never write into a slice.
temp_df0 = train_df[['Id','MasVnrType','MasVnrArea']].copy()
temp_df1 = train_df[['Id','BsmtFinType1','BsmtFinSF1']].copy()
temp_df2 = train_df[['Id','BsmtFinType2','BsmtFinSF2']].copy()
temp_df3 = train_df[['Id','MiscFeature','MiscVal']].copy()
paired_frames = [temp_df0, temp_df1, temp_df2, temp_df3]

# Drop the raw paired columns; they are re-added below in combined form.
for column in variable_list:
    del train_df[column]

# NOTE(review): only train_df is transformed in this cell; test_df would need
# the same steps before it could be fed to a model trained on these features.

# For each pair: fill rows whose type is missing, spread the amount into one
# column per type value, and merge the result back onto train_df by Id.
# `//` keeps the pair count an integer on both Python 2 and Python 3.
for k in range(len(variable_list) // 2):
    temp1 = variable_list[k * 2]        # type column
    temp2 = variable_list[k * 2 + 1]    # amount column
    temp3 = paired_frames[k]

    # Vectorised fill: where the type is missing, set both the type and the
    # amount to their defaults (no dependence on a 0..n-1 row index).
    missing = temp3[temp1].isnull()
    temp3.loc[missing, temp1] = variable_dict[temp1]
    temp3.loc[missing, temp2] = variable_dict[temp2]

    # The pivot yields (amount, type-value) column tuples; reversing each tuple
    # names the new columns combine_<type-value>_<amount-column>.
    temp3 = temp3.pivot_table(index = 'Id', columns = [temp1],
                              values=[temp2])
    temp3.columns = ['combine_%s_%s' %(col[::-1]) for col in temp3.columns]
    # Type values a row does not have come out as NaN; treat them as 0.
    temp3 = temp3.reset_index().fillna(0)
    train_df = pd.merge(train_df, temp3, how='inner', on='Id')

In [476]:
# Convert the construction-related year columns into "years before the sale"
# by subtracting each from the year the house was sold.
for year_col in ('YearBuilt', 'YearRemodAdd', 'GarageYrBlt'):
    train_df[year_col] = train_df['YrSold'] - train_df[year_col]

In [477]:
# Columns to be one-hot encoded.
# NOTE(review): 'BsmtUnfSF' looks like a numeric area measurement rather than
# a category; it is kept here so the encoded feature count is unchanged.
# Confirm whether it belongs in the continuous list instead.
categorical_variable = [u'MSSubClass', u'MSZoning', u'Street', u'Alley', u'LotShape',
       u'LandContour', u'Utilities', u'LotConfig', u'LandSlope',
       u'Neighborhood', u'Condition1', u'Condition2', u'BldgType',
       u'HouseStyle', u'RoofStyle', u'RoofMatl', u'Exterior1st',
       u'Exterior2nd', u'ExterQual', u'ExterCond', u'Foundation', u'BsmtQual',
       u'BsmtCond', u'BsmtExposure', u'BsmtUnfSF', u'Heating', u'HeatingQC',
       u'CentralAir', u'Electrical', u'KitchenQual', u'Functional',
       u'FireplaceQu', u'GarageType', u'GarageFinish', u'GarageQual',
       u'GarageCond', u'PavedDrive', u'PoolQC', u'Fence', u'MoSold', u'YrSold',
       u'SaleType', u'SaleCondition']

# Build the dummy columns for every categorical variable. The source variable
# is part of each new column name (categorical_<variable>_<value>), so two
# variables that share a value no longer produce duplicate column names.
dummy_frames = [
    pd.get_dummies(train_df[variable], prefix='categorical_%s' % variable)
    for variable in categorical_variable
]

# Drop the original categorical columns and append all dummies in one concat
# (same final column order as appending them one variable at a time).
train_df = pd.concat(
    [train_df.drop(categorical_variable, axis=1)] + dummy_frames, axis=1)

In [630]:
# Build the model: a stack of fully connected layers, each followed by a
# linear activation and 50% dropout, ending in a single-unit output layer.
# The input width comes from the prepared feature frame rather than a
# hard-coded column count, so it stays in sync with the preprocessing cells.
n_features = train_df.shape[1]
hidden_units = [800, 300, 100, 30, 10]

model = Sequential()

# NOTE(review): every activation is 'linear', so apart from dropout noise the
# stacked layers compose into a single linear map of the inputs. Consider a
# non-linear hidden activation if a non-linear model is intended.
for layer_idx, units in enumerate(hidden_units):
    if layer_idx == 0:
        # Only the first layer declares the input size and uses uniform init.
        model.add(Dense(units, input_dim = n_features, init = 'uniform'))
    else:
        model.add(Dense(units))
    model.add(Activation('linear'))
    model.add(Dropout(0.5))

# Output layer: one linear unit for the predicted price.
model.add(Dense(1))
model.add(Activation('linear'))

model.compile(optimizer = keras.optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=1e-06),
              loss = 'mean_squared_error',
              metrics =['mean_squared_error'])

In [631]:
# Convert the prepared features and the target to float32 NumPy arrays.
# `.values` replaces `as_matrix()`, which is deprecated and later removed from
# pandas; both return the underlying array.
x = train_df.values.astype("float32")
# Alternative left disabled by the author: standardise the features first.
# x = preprocessing.scale(train_df.values.astype("float32"))
y = price_train_df.values.astype("float32")



In [632]:
# Hold out 20% of the rows for evaluation. A fixed seed makes the split, and
# therefore the reported evaluation score, repeatable between runs.
feature_train, feature_test, target_train, target_test = train_test_split(
    x, y, test_size=0.2, random_state=42)

In [636]:
# Train for 500 epochs with a batch size of 1460, then score the model on the
# held-out split (the last expression is the cell's displayed output).
model.fit(
    feature_train,
    target_train,
    nb_epoch=500,
    batch_size=1460,
)
model.evaluate(feature_test, target_test)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500


KeyboardInterrupt: 