In [1]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import norm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor, XGBRFRegressor
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LassoCV, RidgeCV, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from torch import nn, optim
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

from IPython.display import clear_output
import tqdm
from tqdm.auto import tqdm

class DataSetRegression():
    def __init__(self, X: pd.DataFrame, y: pd.Series):
        self.X = X.reset_index(drop=True)
        self.y = y
        self.X_transform = pd.DataFrame([])
        self.standart_scaler = StandardScaler()
        self.data = pd.concat([self.X, self.y], axis=1)
        self.ordEnc = OrdinalEncoder()
        
    def showMissData(self):
        miss_data = self.X.isnull().sum().sort_values()
        miss_data[miss_data > 0].plot.bar(color='green')
        return miss_data[miss_data > 0].index
        
    def fillNaMode(self, columns):
        for i in columns:
            self.X[i] = self.X[i].fillna(self.X[i].mode()[0])
            
    def ohe(self, columns, drop_first=True):
        self.X[columns] = self.X[columns].astype('str')
        self.X_transform = pd.concat([pd.get_dummies(self.X[columns], drop_first=drop_first), self.X_transform], axis=1)
        
    def notScale(self, columns):
        self.X_transform = pd.concat([self.X[columns], self.X_transform], axis=1)
    
    def standartSc(self, columns):
        sk = pd.DataFrame(self.standart_scaler.fit_transform(self.X[columns]), columns=columns)
        self.X_transform = pd.concat([sk, self.X_transform], axis=1)
    
    def train_test_split(self, *args, **kwargs):
        return train_test_split(self.X_transform[:len(self.y)], self.y, *args, **kwargs)
    
    def pred_data(self):
        return self.X_transform[len(self.y):]
    
    def all_train_data(self):
        return self.X_transform[:len(self.y)]
    
    def ordinalEnc(self, columns):
        self.X[columns] = self.X[columns].astype('str')
        gg = self.ordEnc.fit_transform(self.X[columns])
        oe = pd.DataFrame(gg, columns=columns)
        self.X_transform = pd.concat([self.X_transform, oe], axis=1)
    
    def corrMatrix(self, n2show=25):
        corrM = self.data.corr()
        fig, ax = plt.subplots(dpi = 150, figsize=(10, 8))
        cols = corrM.nlargest(n2show, self.y.name)[self.y.name].index
        sns.heatmap(corrM.loc[cols, cols], annot=True, cmap="YlGnBu", linewidths=0.1,)
        return cols
    
    def quant(self, col='label', q=0.99):
        if col == 'label':
            quant = np.quantile(self.y, 0.99)
            self.y[self.y > quant] = quant
        else:
            quant = np.quantile(self.X[col], q)
            # print('{:15s} {:10.3e}'.format(col, quant))
            self.X.loc[self.X[col] > quant, col] = quant
            
    def showq_quant(self, col):
        d = pd.DataFrame.quantile(self.X[col], [0.0, 0.25, 0.5, 0.75, 1])
        d = d - d.iloc[2]
        d = d.append(d.iloc[0]/d.iloc[1], ignore_index=True)
        d = d.append(d.iloc[4]/d.iloc[3], ignore_index=True)
        d = d.set_index(pd.Index(['0%', '25%', '50%', '75%', '100%', '0/25', '1/75']))
        d = d.replace([np.inf, -np.inf], np.nan)
        return d.dropna(axis=1)

from sklearn.linear_model import LassoCV,  BayesianRidge
from sklearn.svm import SVR
class HoueseData(DataSetRegression):
    def predict(self, models, params):
        X = self.all_train_data()
        kf = KFold(n_splits=10)
        self.ensemble = []
        errors = np.array([])
        for cnt, model in enumerate(models):
            for train, test in kf.split(X):
                if cnt == 0:
                    m = model(**params)
                else:
                    m = model()
                m.fit(X.iloc[train], y.iloc[train])
                self.ensemble.append(m)
                y_pred = m.predict(X.iloc[test])
                error = mean_squared_error(np.log(y_pred), np.log(y.iloc[test]), squared=False)
                errors = np.append(errors, error)
            print('{:5.3f}'.format(errors.mean()))
        print('Total: {:5.3f}'.format(errors.mean()))
    
    def scaleLabel(self):
        self.y = np.log(self.y)
        self.mean = dataset.y.mean()
        self.std = dataset.y.std()
        self.y = (self.y - self.mean)/self.std
    def labelBack(self):
        self.y = self.y*self.std + self.mean
        self.y = np.e**self.y
    
train_data = pd.read_csv('Htrain.csv')
test_data = pd.read_csv('Htest.csv')
all_data = pd.concat([train_data, test_data])
                
y = train_data.SalePrice

X = all_data.drop(columns=['Id', 'SalePrice'])
dataset = HoueseData(X, y)


dataset.X['GarageYrBlt'] = dataset.X['GarageYrBlt'].fillna(dataset.X.loc[dataset.X.GarageYrBlt.isnull()].YearBuilt)
dataset.X = dataset.X.fillna(0)
cols2cut = ['LotArea', 'MasVnrArea', 'TotalBsmtSF', '2ndFlrSF', 'LowQualFinSF', 'GarageYrBlt', 'label']
for i in cols2cut:
    q = 0.99
    dataset.quant(i, q)
na2no = ['MasVnrType', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual', 'BsmtExposure', 'BsmtCond', 'FireplaceQu', 'Fence', 'Alley', 'MiscFeature', 'PoolQC', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
na2mode = ['KitchenQual', 'Exterior1st', 'Exterior2nd', 'SaleType', 'Electrical', 'Functional', 'Utilities', 'MSZoning']
dataset.X[na2no] = dataset.X[na2no].fillna('No') # Nan = no this feature
dataset.fillNaMode(na2mode)    

dataset.ordinalEnc(dataset.X.select_dtypes('object').columns)
dataset.standartSc(dataset.X.select_dtypes(np.number).columns)
dataset.scaleLabel()

  from pandas import MultiIndex, Int64Index


In [3]:
n_splits=10
kf = KFold(n_splits=n_splits)
class Dataset(torch.utils.data.Dataset):
    def __init__(self, x, y):
        self.x=x
        self.y=y
        
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        return self.x[idx, :], self.y[idx]

device = "cuda" if torch.cuda.is_available() else "cpu"
class ModelBaseline(nn.Module):
    def __init__(self, n_feature=78, n_hidden=64, n_output=1):
        super(ModelBaseline, self).__init__()
        self.full1 = nn.Linear(79, 64)
        self.full2 = nn.Linear(64, 32)
        # self.full3 = nn.Linear(32, 8)
        # self.full4 = nn.Linear(8, 2)
        self.full5 = nn.Linear(32, 1)

    def forward(self, x):
        out = F.leaky_relu(self.full1(x))
        out = F.leaky_relu(self.full2(out))
        # out = F.leaky_relu(self.full3(out))
        # out = F.leaky_relu(self.full4(out))        
        out = self.full5(out)
        return out
    
X = dataset.all_train_data()
label = dataset.y
def checkpoint(model, path):
    torch.save({
            'model_state_dict': model.state_dict(),
            }, f'{path}.pt')
n = 0
total_loss = []
n_epoch = 50
with tqdm(total=n_splits, file=sys.stdout, leave=False) as prbar:
    for train, test in kf.split(X):
        X_train = torch.tensor(X.iloc[train].values, dtype=torch.float32)
        X_test = torch.tensor(X.iloc[test].values, dtype=torch.float32)
        y_train = torch.tensor(label.iloc[train].values, dtype=torch.float32)
        y_test = torch.tensor(label.iloc[test].values, dtype=torch.float32)

        training_data = Dataset(X_train, y_train)
        test_data = Dataset(X_test, y_test)
        bs = 128
        train_dataloader = DataLoader(training_data, batch_size=bs, shuffle=True)
        test_dataloader = DataLoader(test_data, batch_size=bs, shuffle=False)

        model = ModelBaseline(79, 64, 1).to(device)
        optimizer = optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)

        for epoch in range(n_epoch):
            model.train()
            for x, y in train_dataloader:
                y = torch.unsqueeze(y, 1)
                x, y = x.to(device), y.to(device)
                optimizer.zero_grad()
                y_pred = model(x)
                loss = nn.MSELoss()(y_pred, y)
                loss.backward()
                optimizer.step()
            model.eval()    
            mean_val_loss = []
            mean_val_coerr = []
            if (epoch+1)%n_epoch==0:
                with torch.no_grad():
                    for x_val, y_val in test_dataloader:
                        y_val = torch.unsqueeze(y_val, 1)
                        x_val, y_val = x_val.to(device), y_val.to(device)
                        y_pred = model(x_val)
                        loss1 = np.corrcoef(y_pred.cpu().detach().numpy().flatten(), y_val.cpu().flatten())[0, 1]
                        y_pred = y_pred
                        y_val = y_val
                        loss2 = nn.MSELoss()(y_pred, y_val).cpu().detach().numpy()
                        mean_val_loss.append(loss2) #.cpu().detach().numpy()
                        mean_val_coerr.append(loss1)
                print('model: {:5d}, loss: {:.3f}, coerr: {:.3f}'.format(n, np.mean(mean_val_loss), np.mean(mean_val_coerr)))
        total_loss.append(np.mean(mean_val_loss))        
        checkpoint(model, f'models/{n}')
        prbar.update(1)
        n+=1
print('Total loss: {:.3f}'.format(np.mean(total_loss)))

  0%|          | 0/10 [00:00<?, ?it/s]

model:     0, loss: 0.100, coerr: 0.918
model:     1, loss: 0.076, coerr: 0.955
model:     2, loss: 0.072, coerr: 0.969
model:     3, loss: 0.126, coerr: 0.944
model:     4, loss: 0.150, coerr: 0.922
model:     5, loss: 0.089, coerr: 0.953
model:     6, loss: 0.075, coerr: 0.958
model:     7, loss: 0.094, coerr: 0.933
model:     8, loss: 0.345, coerr: 0.808
model:     9, loss: 0.107, coerr: 0.945
Total loss: 0.124


In [40]:
class SimpleNN:
    def __init__(self, model, optimizer=(optim.SGD, {'lr': 1e-3, 'momentum': 0.9}), n_epoch=100, loss=nn.MSELoss(), device='cuda', batch_size=128, dataloader=DataLoader, dataset=Dataset):
        self.model = model
        self.model.to(device)
        self.n_epoch = n_epoch
        self.device = device
        self.loss = loss
        self.train_dataloader = None
        self.batch_size = batch_size
        self.dataloader = dataloader
        self.dataset = dataset
        self.optimizer = optimizer[0](self.model.parameters(), **optimizer[1])
        
    def fit(self, X, y):
        X = torch.tensor(X.values, dtype=torch.float32)
        y = torch.tensor(y.values, dtype=torch.float32)
        data = self.dataset(X, y)
        self.train_dataloader = self.dataloader(data, batch_size=bs, shuffle=True)
        for i in range(self.n_epoch):
            self.train_one_epoch()
        
    def train_one_epoch(self):
        self.model.train()
        for x, y in self.train_dataloader:
            y = torch.unsqueeze(y, 1)
            x, y = x.to(self.device), y.to(self.device)
            self.optimizer.zero_grad()
            y_pred = self.model(x)
            loss = self.loss(y_pred, y)
            loss.backward()
            self.optimizer.step()
    
    def predict(self, X):
        X = torch.tensor(X.values, dtype=torch.float32).to(self.device)
        self.y_pred = self.model(X).cpu().detach().numpy().flatten()
        return self.y_pred
    
    def score(self, y_true):
        label = y_true.values
        return nn.MSELoss()(torch.tensor(self.y_pred), torch.tensor(y_true)).numpy(), np.corrcoef(self.y_pred, label)[0, 1]
    

with tqdm(total=n_splits, file=sys.stdout, leave=False) as prbar:
    for train, test in kf.split(X):
        model = SimpleNN(ModelBaseline(79, 64, 1), n_epoch=100)
        model.fit(X.iloc[train], label.iloc[train])
        y_pred = model.predict(X.iloc[test])
        print( 'loss: {:8.3f}, coerr: {:8.3f}'.format(*model.score(label.iloc[test])) )
        break

  0%|          | 0/10 [00:00<?, ?it/s]

loss:    0.101, coerr:    0.946


In [3]:
preds = []
for i in range(10):
    model = ModelBaseline(79, 64, 1).to(device)
    chk = torch.load(f'models/{i}.pt')
    model.load_state_dict(chk['model_state_dict'])
    y_pred = model(torch.tensor(dataset.X_transform.values, dtype=torch.float32).to(device)).cpu().detach().numpy()
    preds.append(y_pred)

features = np.hstack(preds)

features  = np.e**(features*dataset.std + dataset.mean)

In [25]:
dataset = HoueseData(pd.DataFrame(features), train_data.SalePrice)
dataset.notScale(dataset.X.columns)

In [51]:
dataset.X_transform.to_csv('000NNFeatures.csv')

In [40]:
n_splits=10
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
total_error = []
for train, test in kf.split(dataset.all_train_data()):
    model = Lasso(alpha=1)
    model.fit(dataset.all_train_data().iloc[train].values, dataset.y.iloc[train].values)
    y_pred = model.predict(dataset.all_train_data().iloc[test].values)
    total_error.append(mean_squared_error(y_pred, dataset.y.iloc[test].values, squared=False))
print(np.mean(total_error))

17095.184861103877


In [20]:
X_train, X_test, y_train, y_test = dataset.train_test_split(test_size=0.25, random_state=42)

In [43]:
model = Lasso()
model.fit(dataset.all_train_data(), dataset.y)

Lasso()

In [46]:
predicted = model.predict(dataset.pred_data())

In [47]:
answers = pd.read_csv('HAnswers.csv')
answers['SalePrice'] = predicted

In [48]:
answers.to_csv('NNensembleAnswers.csv', index=False)

In [49]:
answers

Unnamed: 0,Id,SalePrice
0,1461,116483.054688
1,1462,143369.468750
2,1463,176476.687500
3,1464,194437.828125
4,1465,184371.546875
...,...,...
1454,2915,94270.500000
1455,2916,79970.484375
1456,2917,171321.687500
1457,2918,121432.015625


In [None]:
answers = pd.read_csv('HAnswers.csv')