In [1]:
import numpy as np 
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import PowerTransformer, LabelEncoder

In [2]:
import warnings
from scipy.stats import skew, norm
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
warnings.filterwarnings('ignore')

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import MinMaxScaler

In [4]:
class DataPreprocessing(BaseEstimator, TransformerMixin ):   
    
    def __init__( self, encoder_type ): 
        
        self._encoder_type = encoder_type
    
    def handle_missing(self, features):
        
        features['Functional'] = features['Functional'].fillna('Typ')
        features['Electrical'] = features['Electrical'].fillna("SBrkr")
        features['KitchenQual'] = features['KitchenQual'].fillna("TA")
        features['Exterior1st'] = features['Exterior1st'].fillna(features['Exterior1st'].mode()[0])
        features['Exterior2nd'] = features['Exterior2nd'].fillna(features['Exterior2nd'].mode()[0])
        features['SaleType'] = features['SaleType'].fillna(features['SaleType'].mode()[0])
        features['MSZoning'] = features.groupby('MSSubClass')['MSZoning'].transform(lambda x: x.fillna(x.mode()[0]))

        features["PoolQC"] = features["PoolQC"].fillna("None")
        for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
            features[col] = features[col].fillna(0)
        for col in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']:
            features[col] = features[col].fillna('None')
        for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
            features[col] = features[col].fillna('None')

        features['LotFrontage'] = features.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
        objects = []
        for i in features.columns:
            if features[i].dtype == object:
                objects.append(i)
        features.update(features[objects].fillna('None'))

        numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        numeric = []
        for i in features.columns:
            if features[i].dtype in numeric_dtypes:
                numeric.append(i)
        features.update(features[numeric].fillna(0))    
        return features
    
    def fit( self, X, y= None):
        return self
    
    def transform ( self, X, y= None):
        return self
    
    def fit_transform(self, train, test):
        
        train.drop(train[(train['OverallQual']<5) & (train['SalePrice']>200000)].index, inplace=True)
        train.drop(train[(train['GrLivArea']>4500) & (train['SalePrice']<300000)].index, inplace=True)
        train.reset_index(drop=True, inplace=True)
        
        train['SalePrice'] = np.log1p(train['SalePrice'])
        
        train_labels = train['SalePrice'].reset_index(drop=True)
        train_features = train.drop(['SalePrice'], axis=1)
        test_features = test

        # Объединяем тестовую и тренировочную выборку
        all_features = pd.concat([train_features, test_features]).reset_index(drop=True)
        
        all_features['MSSubClass'] = all_features['MSSubClass'].apply(str)
        all_features['YrSold'] = all_features['YrSold'].astype(str)
        all_features['MoSold'] = all_features['MoSold'].astype(str)
        all_features['YearRemodAdd'] = all_features['YearRemodAdd'].astype(str)
        all_features['YearBuilt'] = all_features['YearBuilt'].astype(str)
        
        all_features = self.handle_missing(all_features)
        
        numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        numeric = []
        for i in all_features.columns:
            if all_features[i].dtype in numeric_dtypes:
                numeric.append(i)
                
        skew_features = all_features[numeric].apply(lambda x: skew(x)).sort_values(ascending=False)

        high_skew = skew_features[skew_features > 0.5]
        skew_index = high_skew.index
        skewness = pd.DataFrame({'Skew' :high_skew})
        
        for i in skew_index:
            all_features[i] = boxcox1p(all_features[i], boxcox_normmax(all_features[i] + 1))
        
        for col in all_features.columns[all_features.dtypes != 'O']:
            all_features[col] = MinMaxScaler().fit_transform(all_features[col].values.reshape(-1,1))
        
        all_features['MSSubClass'] = all_features['MSSubClass'].apply(float)
        all_features['YrSold'] = all_features['YrSold'].astype(float)
        all_features['MoSold'] = all_features['MoSold'].astype(float)
        all_features['YearRemodAdd'] = all_features['YearRemodAdd'].astype(float)
        all_features['YearBuilt'] = all_features['YearBuilt'].astype(float)
        
        if self._encoder_type == 'dummy':
            all_features = pd.get_dummies(all_features).reset_index(drop=True)
        else:
            categorical = []
            for col in all_features.columns:
                if all_features[col].dtype=='object':
                    categorical.append(col)

            le = LabelEncoder()
            for x in categorical:
                all_features[x] = le.fit_transform(all_features[x])
                
        X = all_features.iloc[:len(train_labels), :]
        XX = all_features.iloc[len(train_labels):, :]
        
        X_train, X_test, y_train, y_test = train_test_split(X,train_labels, test_size=0.33, random_state=42)
        
        return X_train, X_test, y_train, y_test, XX

In [5]:
train = pd.read_csv('train.csv').drop('Id',axis = 1 )
test = pd.read_csv('test.csv').drop('Id',axis = 1 )

In [6]:
full_pipeline_dummy = Pipeline(steps = [('Data_Preprocessing', DataPreprocessing('dummy'))])
full_pipeline_LE = Pipeline(steps = [('Data_Preprocessing', DataPreprocessing('LE'))])

In [7]:
X_train, X_test, y_train, y_test, Kaggle_X_test = full_pipeline_dummy.fit_transform(train,test)
X_train_l, X_test_l, y_train_l, y_test_l, Kaggle_X_test_l = full_pipeline_dummy.fit_transform(train,test)

In [8]:
LR = LinearRegression()
LR_l = LinearRegression()
LR.fit(X_train,y_train)
LR_l.fit(X_train_l, y_train_l)
y_pred = LR.predict(X_test)
y_pred_l = LR_l.predict(X_test_l)
print(f'MAE: Dummy: {metrics.mean_absolute_error(y_test,y_pred)}, Label Encoder: {metrics.mean_absolute_error(y_test_l,y_pred_l)}')
print(f'MSE: Dummy: {metrics.mean_squared_error(y_test,y_pred)},  Label Encoder: {metrics.mean_squared_error(y_test_l,y_pred_l)}')
print(f'RMSE: Dummy: {np.sqrt(metrics.mean_squared_error(y_test,y_pred))}, Label Encoder: {metrics.mean_squared_error(y_test_l,y_pred_l)}')
print(f'R2 score: Dummy: {metrics.r2_score(y_test,y_pred)}, Label Encoder: {metrics.r2_score(y_test_l,y_pred_l)}')

MAE: Dummy: 0.10239942968652947, Label Encoder: 0.007821042023419888
MSE: Dummy: 0.08104667016454403,  Label Encoder: 0.000402033305511057
RMSE: Dummy: 0.28468696872976823, Label Encoder: 0.000402033305511057
R2 score: Dummy: 0.4566287161855723, Label Encoder: 0.5456116786590393


In [9]:
RF = RandomForestRegressor(n_estimators=100)
RF_l = RandomForestRegressor(n_estimators=100)
RF.fit(X_train, y_train)
RF_l.fit(X_train_l, y_train_l)
y_pred = RF.predict(X_test)
y_pred_l = RF_l.predict(X_test_l)
print(f'MAE: Dummy: {metrics.mean_absolute_error(y_test,y_pred)}, Label Encoder: {metrics.mean_absolute_error(y_test_l,y_pred_l)}')
print(f'MSE: Dummy: {metrics.mean_squared_error(y_test,y_pred)},  Label Encoder: {metrics.mean_squared_error(y_test_l,y_pred_l)}')
print(f'RMSE: Dummy: {np.sqrt(metrics.mean_squared_error(y_test,y_pred))}, Label Encoder: {metrics.mean_squared_error(y_test_l,y_pred_l)}')
print(f'R2 score: Dummy: {metrics.r2_score(y_test,y_pred)}, Label Encoder: {metrics.r2_score(y_test_l,y_pred_l)}')

MAE: Dummy: 0.09279277975661594, Label Encoder: 0.0073676960135850905
MSE: Dummy: 0.01926152477131421,  Label Encoder: 0.00012382324760692667
RMSE: Dummy: 0.13878589543362901, Label Encoder: 0.00012382324760692667
R2 score: Dummy: 0.870862560769448, Label Encoder: 0.8600517995602975


In [10]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

feature_sel_model = SelectFromModel(Lasso(alpha=0.005, random_state=0)) 
feature_sel_model.fit(X_train, y_train)
selected_feat = X_train.columns[(feature_sel_model.get_support())]
X_train_lasso = X_train[selected_feat].reset_index(drop=True)
X_test_lasso =X_test[selected_feat]


In [11]:
LassoLR = LinearRegression().fit(X_train_lasso,y_train)
y_pred_lasso = LassoLR.predict(X_test_lasso)
print("MAE:", metrics.mean_absolute_error(y_test, y_pred_lasso))
print('MSE:', metrics.mean_squared_error(y_test, y_pred_lasso))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_lasso)))
print('R2',LassoLR.score(X_test_lasso,y_test))


MAE: 0.09563063195847692
MSE: 0.01872941157935132
RMSE: 0.13685544044484063
R2 0.8744300735082743


In [12]:
LassoRF = RandomForestRegressor().fit(X_train_lasso,y_train)
y_pred_lasso = LassoRF.predict(X_test_lasso)
print("MAE:", metrics.mean_absolute_error(y_test, y_pred_lasso))
print('MSE:', metrics.mean_squared_error(y_test, y_pred_lasso))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_lasso)))
print('R2',LassoLR.score(X_test_lasso,y_test))


MAE: 0.10573710274718146
MSE: 0.02345792806783773
RMSE: 0.15315981218269278
R2 0.8744300735082743


In [13]:
from sklearn.ensemble import BaggingRegressor
from sklearn.datasets import make_regression

In [14]:
regr = BaggingRegressor(base_estimator=RandomForestRegressor(), n_estimators=100,
                        random_state=0).fit(X_train,y_train)
y_pred_bagg = regr.predict(X_test)

In [15]:
print("MAE:", metrics.mean_absolute_error(y_test, y_pred_bagg))
print('MSE:', metrics.mean_squared_error(y_test, y_pred_bagg))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_bagg)))
print('R2',regr.score(X_test,y_test))

MAE: 0.09319732338125553
MSE: 0.01950693090022022
RMSE: 0.13966721483662592
R2 0.8692172538981248


In [16]:
import tensorflow as tf
from tensorflow import keras

In [17]:
print(tf.__version__)

2.3.0


In [18]:
model = keras.Sequential()

In [22]:
X_train.shape[1]

302

In [47]:
model.add(keras.layers.Dense(512, activation = 'relu',
                             input_shape = (X_train.shape[1],)))
model.add(keras.layers.Dense(1))
model.compile(optimizer='adam',loss='mse',metrics = ['mae'])

In [48]:
model.fit(X_train,y_train, epochs=80, batch_size=1, verbose= True)

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


<tensorflow.python.keras.callbacks.History at 0x254caf36da0>

In [49]:
pred = model.predict(X_test)

In [50]:
metrics.r2_score(y_test, pred)

0.8410342733272993