In [1]:
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [30]:
def factorized_object(col, df):
    df[col] = df[col].factorize()[0]+1
    
def prepare_feature(dataset):
    features = dataset.loc[:, 'MSSubClass':'SaleCondition']
    object_cols = list(filter(lambda i: features[i].dtype == 'object', features.columns))
    
    features = features.fillna(0)
    for col in object_cols:
        factorized_object(col, features)
        
    return features

def predict_result(model, dataset, saveto='data/submission.csv'): 
    features = prepare_feature(dataset)
    submission = pd.DataFrame({"Id": dataset["Id"], "SalePrice": model.predict(features)})
    submission.to_csv(saveto, index=False)
    print('saved to', saveto)
    return submission

def train_model(trainX,trainY):
    model = XGBRegressor()
    model.fit(trainX, trainY)
    print("MSE Training data: ", np.sqrt(mean_squared_error(trainY, model.predict(trainX))))
    print("MEA Training data: ", mean_absolute_error(trainY, model.predict(trainX)))
    return model

def eval(model, feat,label):
    pred = model.predict(feat)
    print("MSE : ", np.sqrt(mean_squared_error(label, pred)))
    print("MEA : ", mean_absolute_error(label, pred))

In [3]:
train_set = pd.read_csv('data/train.csv')
test_set = pd.read_csv('data/test.csv')

In [4]:
train_features = prepare_feature(train_set)
train_labels = train_set['SalePrice']

test_features = prepare_feature(test_set)

In [5]:
trainX,testX,trainY,testY = train_test_split(train_features, train_labels)

In [6]:
model = train_model(trainX, trainY)
model.fit(trainX, trainY)

MSE Training data:  14862.347743979835
MEA Training data:  10601.356949200914


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [14]:
eval(model, testX, testY)

MSE :  26853.030451970135
MEA :  16135.855115582192


In [21]:
final_model = train_model(train_features, train_labels)

MSE Training data:  15522.851069832584
MEA Training data:  11122.682272046233


In [29]:
eval(final_model, testX, testY)

MSE :  14641.574977059792
MEA :  10811.455211900686


In [33]:
predict_result(model, test_set, saveto='data/submission2.csv')

saved to data/submission2.csv


Unnamed: 0,Id,SalePrice
0,1461,133647.375000
1,1462,157663.000000
2,1463,176503.718750
3,1464,180361.218750
4,1465,197562.843750
5,1466,174271.546875
6,1467,177185.125000
7,1468,163792.671875
8,1469,177744.265625
9,1470,124027.710938


## Use neural network

In [34]:
import torch

In [243]:
nnmodel = torch.nn.Sequential(
    torch.nn.Linear(len(trainX.columns), 32),
    torch.nn.Linear(32, 16),
    torch.nn.Linear(16, 1)
) 
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(nnmodel.parameters(), lr=0.1)

tensorTrainX = torch.from_numpy(np.array(trainX)).float()
tensorTrainY = torch.from_numpy(np.array(trainY)).float().unsqueeze(dim=1)

tensorTestX = torch.from_numpy(np.array(testX)).float()
tensorTestY = torch.from_numpy(np.array(testY)).float().unsqueeze(dim=1)

In [244]:
# optimizer = torch.optim.SGD(nnmodel.parameters(), lr=0.00001)

In [245]:
e = 1
loss = 15305.782146551466**2
target = 15305.782146551466

while loss > target:
    nnpreds = nnmodel(tensorTrainX)
    loss = criterion(nnpreds, tensorTrainY)
    if e % 10000 == 0:
        print(f'Epoch {e}, train loss', loss.item())
    e += 1
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

Epoch 10000, train loss 1497819776.0
Epoch 20000, train loss 977285568.0
Epoch 30000, train loss 951975872.0
Epoch 40000, train loss 942873792.0
Epoch 50000, train loss 941021568.0
Epoch 60000, train loss 940151424.0
Epoch 70000, train loss 1061926016.0
Epoch 80000, train loss 1096227968.0


KeyboardInterrupt: 