In [1]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import norm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor, XGBRFRegressor
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from datasetRegression import DataSetRegression
from tqdm.auto import tqdm
from sklearn.ensemble import StackingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.base import BaseEstimator, RegressorMixin

from scipy import stats
from scipy.stats import norm
from torch import nn, optim
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
from copy import deepcopy

  from pandas import MultiIndex, Int64Index


In [2]:
class Hdata(DataSetRegression):
    """House-price dataset wrapper that adds a log + z-score transform of the target.

    Relies on attributes supplied by ``DataSetRegression`` (``y`` and
    ``all_train_data`` are read here); how they are built is not visible
    in this notebook.
    """

    def log_y(self):
        """Replace ``self.y`` with its standardised natural logarithm.

        The mean and standard deviation of the logged target are stored in
        ``self.mean`` / ``self.std`` so the transform can be inverted later.
        """
        self.y = np.log(self.y)
        self.mean = self.y.mean()
        self.std = self.y.std()
        self.y = (self.y - self.mean)/self.std

    def unlog_y(self):
        """Invert :meth:`log_y`: undo the standardisation, then exponentiate."""
        self.y = self.y*self.std + self.mean
        self.y = np.exp(self.y)

    def write_answers(self, out_file, predicted):
        """Write predictions, mapped back to the original price scale, to ``<out_file>.csv``.

        Parameters
        ----------
        out_file : str
            Output path without the ``.csv`` extension.
        predicted : array-like
            Predictions in the standardised-log space produced by :meth:`log_y`.
        """
        predicted = predicted*self.std + self.mean
        predicted = np.exp(predicted)
        # 'HAnswers.csv' is used as the submission template; only its
        # 'SalePrice' column is overwritten.
        answers = pd.read_csv('HAnswers.csv')
        answers['SalePrice'] = predicted
        answers.to_csv(f'{out_file}.csv', index=False)

    def predict(self, Model, params, n_splits=5):
        """Cross-validate ``Model(**params)`` and keep one fitted model per fold.

        Prints the RMSE of every fold (measured on the de-standardised log
        target) followed by the mean over folds.  The fitted models are stored
        in ``self.models`` and the per-fold errors in ``self.cv_errors``.

        Parameters
        ----------
        Model : callable
            Estimator class/factory exposing ``fit(X, y)`` and ``predict(X)``.
        params : dict
            Keyword arguments passed to ``Model``.
        n_splits : int, default 5
            Number of shuffled K-fold splits (fixed ``random_state=40``).
        """
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=40)
        errors = []
        self.models = []
        for train, test in kf.split(self.all_train_data):
            model = Model(**params)
            model.fit(self.all_train_data.iloc[train], self.y.iloc[train])
            # A fresh estimator is built for every fold, so it can be stored
            # directly.  The previous `model.copy()` only worked for estimators
            # that define their own `copy` method (scikit-learn / xgboost
            # estimators do not).
            self.models.append(model)
            y_pred = model.predict(self.all_train_data.iloc[test])
            # Undo the z-scoring so the error is an RMSE in log-target units.
            y_true_log = self.y.iloc[test]*self.std + self.mean
            y_pred_log = y_pred*self.std + self.mean
            # np.sqrt(MSE) instead of the `squared=False` keyword, which newer
            # scikit-learn releases no longer accept.
            error = np.sqrt(mean_squared_error(y_true_log, y_pred_log))
            print('{:5.3f}'.format(error))
            errors.append(error)
        self.cv_errors = errors
        print('total: {:5.3f}'.format(np.mean(errors)))

class DjStacking:
    """Stacking of scikit-learn style models."""

    def __init__(self, models, ens_model):
        """
        Set up the ensemble.

        models    - base models to be stacked (each needs fit / predict)
        ens_model - meta-model trained on the base models' predictions
        """
        self.models = models
        self.ens_model = ens_model
        self.n = len(models)
        self.valid = None

    def fit(self, X, y=None, p=0.1, cv=3, err=0.001, random_state=None):
        """
        Train the stack.

        p   - fraction of the data held out to train the meta-model;
              if p = 0 the whole training set is used for both levels
        cv  (only when p = 0) - number of folds for the out-of-fold predictions
        err (only when p = 0) - scale of the random noise added to the meta-features
        random_state - seed for the train/hold-out split (p > 0) and for the
                       noise (p = 0); with None the global numpy generator
                       supplies the noise

        Returns self.
        """
        if p > 0:
            # Hold-out scheme: base models see `train`, the meta-model sees
            # their predictions on `valid`.
            train, valid, y_train, y_valid = train_test_split(X, y, test_size=p, random_state=random_state)

            # Meta-feature matrix: one column per base model.
            self.valid = np.zeros((valid.shape[0], self.n))
            for t, clf in enumerate(self.models):
                clf.fit(train, y_train)
                self.valid[:, t] = clf.predict(valid)

            # Train the meta-model on the hold-out predictions.
            self.ens_model.fit(self.valid, y_valid)

        else:
            # Imported here because the notebook's import cell does not
            # provide this name.
            from sklearn.model_selection import cross_val_predict

            # Small random perturbation of the meta-features as regularisation.
            noise_source = np.random if random_state is None else np.random.RandomState(random_state)
            self.valid = err*noise_source.randn(X.shape[0], self.n)

            for t, clf in enumerate(self.models):
                # Out-of-fold predictions of the base model ...
                self.valid[:, t] += cross_val_predict(clf, X, y, cv=cv, n_jobs=-1, method='predict')
                # ... but the model itself still has to be fitted on all data.
                clf.fit(X, y)

            # Train the meta-model on the out-of-fold predictions.
            self.ens_model.fit(self.valid, y)

        return self

    def predict(self, X, y=None):
        """
        Predict with the stack: base-model predictions are fed to the meta-model.

        `y` is accepted for interface compatibility and is not used.
        """
        # Meta-feature matrix: one column per base model.
        X_meta = np.zeros((X.shape[0], self.n))

        for t, clf in enumerate(self.models):
            X_meta[:, t] = clf.predict(X)

        return self.ens_model.predict(X_meta)
        
class Dataset(torch.utils.data.Dataset):
    """Pairs a 2-D feature container with a target container for a DataLoader."""

    def __init__(self, x, y):
        # Stored as given; no copy or conversion is made.
        self.x, self.y = x, y

    def __len__(self):
        """Number of samples, taken from the feature container."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the ``(feature_row, target)`` pair at position ``idx``."""
        feature_row = self.x[idx, :]
        target = self.y[idx]
        return feature_row, target
        
# --- Load the raw data -------------------------------------------------------
# Train and test frames are stacked so the imputation / encoding below sees
# both.  NOTE(review): the concatenated frame keeps each file's own row index
# (no ignore_index) - confirm DataSetRegression does not rely on unique labels.
train_data = pd.read_csv('Htrain.csv')
test_data = pd.read_csv('Htest.csv')
all_data = pd.concat([train_data, test_data])

# Target comes from the training file only; features drop the id and the target.
y = train_data.SalePrice
X = all_data.drop(columns=['Id', 'SalePrice'])
dataset = Hdata(X, y)
# Missing garage build year: fall back to the house's own build year.
dataset.X['GarageYrBlt'] = dataset.X['GarageYrBlt'].fillna(dataset.X.loc[dataset.X.GarageYrBlt.isnull()].YearBuilt)
# Column groups by imputation strategy: literal 'No', column mode, or zero.
na2no = ['MasVnrType', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual', 'BsmtExposure', 'BsmtCond', 'FireplaceQu', 'Fence', 'Alley', 'MiscFeature', 'PoolQC', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
na2mode = ['KitchenQual', 'Exterior1st', 'Exterior2nd', 'SaleType', 'Electrical', 'Functional', 'Utilities', 'MSZoning']
na2zero = ['BsmtFinSF2', 'BsmtFinSF1', 'TotalBsmtSF', 'GarageArea', 'GarageCars', 'BsmtUnfSF', 'BsmtHalfBath', 'BsmtFullBath', 'MasVnrArea', 'LotFrontage']
dataset.X[na2no] = dataset.X[na2no].fillna('No') # NaN in these columns means the house lacks the feature
dataset.X[na2zero] = dataset.X[na2zero].fillna(0)
# Cap garage build years above 2010 at 2010.
dataset.X.loc[dataset.X.GarageYrBlt > 2010, ['GarageYrBlt']] = 2010
# fill_na_mode / ordinal_encoder / standart_scale are DataSetRegression helpers
# not shown in this notebook; presumably mode imputation, ordinal encoding of
# the object columns and standard scaling of the numeric ones - TODO confirm.
dataset.fill_na_mode(na2mode)
# Standardised log transform of the target (see Hdata.log_y).
dataset.log_y()
dataset.ordinal_encoder(dataset.X.select_dtypes('object').columns)
dataset.standart_scale(dataset.X.select_dtypes(np.number).columns)

In [12]:
# XGBoost hyper-parameters shared by the stacking candidate and the CV run below.
params0 = {'n_estimators': 4096,
'max_depth': 3,
'eta': 0.05,
'subsample': 0.7,
'colsample_bytree': 0.8,
'lambda': 1,
'alpha': 1,
'random_state': 42,
'tree_method': 'gpu_hist'}

# NOTE(review): SimpleNN and ModelBaseline are defined in a later cell; that
# cell has to be executed before this one on a fresh kernel.
# SimpleNN builds the network itself by calling `model()`, so it must receive
# the ModelBaseline class, not an already constructed instance.
models = [SimpleNN(ModelBaseline, n_epoch=100), XGBRegressor(**params0)]
par = {'models' : models, 'ens_model': ElasticNetCV()}
# 2-fold cross-validation of plain XGBoost with the parameters above.
dataset.predict(XGBRegressor, params0, 2)

KeyboardInterrupt: 

In [14]:
class SimpleNN(BaseEstimator, RegressorMixin):
    """Minimal scikit-learn style wrapper that trains a torch module for regression.

    Parameters
    ----------
    model : callable
        Zero-argument factory (typically an ``nn.Module`` subclass).  It is
        called once in ``__init__`` to build the network.
    optimizer : tuple
        ``(optimizer_class, kwargs)``; instantiated on the network parameters.
    n_epoch : int
        Number of passes over the training data in :meth:`fit`.
    loss : callable
        Loss applied to ``(prediction, target)`` batches.
    device : str
        Torch device the network and the batches are moved to.
    batch_size : int
        Mini-batch size handed to ``dataloader``.
    dataloader, dataset : callables
        Factories used to wrap the training tensors.
    """

    def __init__(self, model, optimizer=(optim.SGD, {'lr': 1e-3, 'momentum': 0.9}), n_epoch=100, loss=nn.MSELoss(), device='cuda', batch_size=128, dataloader=DataLoader, dataset=Dataset):
        self.model = model()
        self.model.to(device)
        self.n_epoch = n_epoch
        self.device = device
        self.loss = loss
        self.train_dataloader = None
        self.batch_size = batch_size
        self.dataloader = dataloader
        self.dataset = dataset
        self.optimizer = optimizer[0](self.model.parameters(), **optimizer[1])

    @staticmethod
    def _as_float_tensor(data):
        """Convert a DataFrame / Series / array-like into a float32 tensor."""
        return torch.tensor(np.asarray(data), dtype=torch.float32)

    def fit(self, X, y):
        """Train the network for ``n_epoch`` epochs on ``(X, y)`` and return ``self``.

        Training continues from the current weights and optimizer state; the
        network is not re-initialised between calls.
        """
        X = self._as_float_tensor(X)
        y = self._as_float_tensor(y)
        data = self.dataset(X, y)
        self.train_dataloader = self.dataloader(data, batch_size=self.batch_size, shuffle=True)
        for _ in range(self.n_epoch):
            self.train_one_epoch()
        # scikit-learn estimators return self from fit so calls can be chained.
        return self

    def train_one_epoch(self):
        """Run one optimisation pass over ``self.train_dataloader``."""
        self.model.train()
        for x, y in self.train_dataloader:
            # Targets arrive as (batch,); the network outputs (batch, 1).
            y = torch.unsqueeze(y, 1)
            x, y = x.to(self.device), y.to(self.device)
            self.optimizer.zero_grad()
            y_pred = self.model(x)
            loss = self.loss(y_pred, y)
            loss.backward()
            self.optimizer.step()

    def predict(self, X):
        """Return flat numpy predictions for ``X``; also cached in ``self.y_pred``."""
        self.model.to(self.device)
        # Inference: switch to eval mode and skip autograd bookkeeping.
        self.model.eval()
        X = self._as_float_tensor(X).to(self.device)
        with torch.no_grad():
            self.y_pred = self.model(X).cpu().numpy().flatten()
        return self.y_pred

    def score(self, y_true):
        """Return ``(mse, pearson_r)`` of the last :meth:`predict` output against ``y_true``.

        :meth:`predict` must have been called first; ``self.y_pred`` is reused.
        """
        label = np.asarray(y_true)
        mse = nn.MSELoss()(torch.tensor(self.y_pred), torch.tensor(label)).numpy()
        return mse, np.corrcoef(self.y_pred, label)[0, 1]

    def copy(self):
        """Return a deep copy of the estimator, including network and optimizer."""
        return deepcopy(self)

class ModelBaseline(nn.Module):
    """Small fully connected regressor: in_features -> 64 -> 32 -> 1.

    Parameters
    ----------
    in_features : int, default 79
        Width of the input feature vector.  The default matches the value
        that was previously hard-coded.
    """

    def __init__(self, in_features=79):
        super().__init__()
        # Attribute names are kept (full1 / full2 / full5) so existing
        # state_dict keys remain valid.
        self.full1 = nn.Linear(in_features, 64)
        self.full2 = nn.Linear(64, 32)
        self.full5 = nn.Linear(32, 1)

    def forward(self, x):
        """Map a ``(batch, in_features)`` tensor to ``(batch, 1)`` predictions."""
        hidden = F.leaky_relu(self.full1(x))
        hidden = F.leaky_relu(self.full2(hidden))
        # No activation on the output layer: plain regression output.
        return self.full5(hidden)

# 10-fold cross-validation of the baseline network; the per-fold models are
# kept on `dataset.models`.
dataset.predict(
    SimpleNN,
    params={'model': ModelBaseline, 'n_epoch': 100},
    n_splits=10,
)

0.104
0.130
0.116
0.129
0.135
0.261
0.123
0.144
0.138
0.123
total: 0.140


In [19]:
# Accumulator for the weighted per-fold predictions.
predicted = []
# Inverse of the per-fold errors (values copied by hand from the
# cross-validation printout), normalised so the weights sum to one.
a = np.array([
    0.104, 0.130, 0.116, 0.129, 0.135,
    0.261, 0.123, 0.144, 0.138, 0.123,
])**-1
a /= a.sum()

In [20]:
# Rebuild the list from scratch so that re-running this cell cannot append a
# second set of predictions to the first (which would double the ensemble sum).
# Each fold model's prediction becomes a column vector scaled by its weight.
predicted = [
    model.predict(dataset.submission_data)[:, np.newaxis] * a[cnt]
    for cnt, model in enumerate(dataset.models)
]

In [22]:
# Sum the weighted fold predictions and write the submission to 'nn10.csv'.
dataset.write_answers(
    out_file='nn10',
    predicted=np.sum(predicted, axis=0),
)