In [399]:
import numpy as np 
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder, MinMaxScaler , OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline 
from sklearn.preprocessing import PowerTransformer

In [400]:
from sklearn.experimental import enable_iterative_imputer

from sklearn.impute import IterativeImputer

In [401]:
train = pd.read_csv('train.csv').drop('Id',axis = 1 )
t = train.isnull().sum()
to_drop = t[t>700].index.values
train.drop(to_drop, axis = 1, inplace = True)

In [402]:
test_X = pd.read_csv('test.csv').drop('Id',axis = 1 )
test_X.drop(to_drop, axis = 1,inplace = True)
test_Y = pd.read_csv('sample_submission.csv').drop("Id", axis = 1)


In [403]:
# Выбор переменных: при создании переменной данного типа с атрибутом 'obj' 
# метод transform выдаст только категориальные столбцы
class FeatureSelector( BaseEstimator, TransformerMixin ):
     
    def __init__( self, feature_type ):
        self._feature_type = feature_type 
    
    #Return self nothing else to do here    
    def fit( self, X, y = None ):
        return self 
    
    #Method that describes what we need this transformer to do
    def transform( self, X, y = None ):
        feature_names = []
        if self._feature_type == 'obj':
            for col in X.columns[X.dtypes == 'object']:
                feature_names.append(col)
        else:
            for col in X.columns[X.dtypes != 'object']:
                feature_names.append(col)
        return X[feature_names]  

In [404]:
# Метод transform нормализует смещенные данные
# Проверил PowerTransform(), в данном случае работает хуже чем обычный np.log(), поэтому не стал его использовать
class PrepareNumeric(BaseEstimator, TransformerMixin):

    def fit( self, X, y= None):
        return self
    
    def transform(self,df, y = None):

        date=['YearBuilt','YearRemodAdd','GarageYrBlt',
                  'MoSold','YrSold']
       
        for col in df.columns[df.dtypes != 'object']:
            df[col].fillna(df[col].median(),inplace=True)
        
        num=[]
        
        for col in df.columns:
            if df[col].dtype!='O' and len(df[col].unique()) >15 and col not in date:
                num.append(col)

        skewed_features=[]
       
        for col in num:
            if df[col].skew()>0 or df[col].skew()<0:
                skewed_features.append(col)
        
        
        for col in skewed_features:
            if 0 not in df[col].unique():
                df[col]=np.log(df[col])
            
        return df.values

In [405]:
# Не смог понять, как сделать так, чтобы Pipeline изменял по разному X и  y
# Поэтому пока что, нормализация у - просто в виде функции
def transformY(y):
    y = np.log(y)
    return y.values

In [406]:
categorical_pipeline = Pipeline( steps = [ ( 'cat_selector', FeatureSelector('obj') ),
                                                                                    
                                          ('imputer', SimpleImputer(strategy = 'constant', fill_value = "None")),
                                          
                                          ( 'one_hot_encoder', OneHotEncoder(sparse = False, handle_unknown='ignore') ) ] )
    
#Defining the steps in the numerical pipeline     
numerical_pipeline = Pipeline( steps = [ ( 'num_selector', FeatureSelector('num') ),
                                  
                                        ( 'num_transformer', PrepareNumeric() ),
                                  
                                        ( 'mmx_scaler', MinMaxScaler() ) ] )

#Combining numerical and categorical piepline into one full big pipeline horizontally 
#using FeatureUnion
full_pipeline = FeatureUnion( transformer_list = [ ( 'categorical_pipeline', categorical_pipeline ), 
                                                  
                                                  ( 'numerical_pipeline', numerical_pipeline ) ] )



In [407]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler

### Использование функций без пайплайна

У меня было предположение, почему не работает Pipeline. Оно заключалось в том, что из-за того, что Pipline сперва выполняет метод fit() для всех своих шагов, а затем уже transform() - OneHotEncoder производит fit на данных, в которых есть пропущенные значения из-за чего в свою очередь энкодер работает неверно.

Однако, сделав всё вручную получил не сильно отличающийся результат

In [408]:
X_train = train.drop(columns=['SalePrice'],axis=1)
y_train = transformY(train['SalePrice'])
X_test = test_X
y_test = transformY(test_Y['SalePrice'])

In [409]:
#Заполняю пропущенные категориальные значения в тренировочной выборке

X_tr_cat =  SimpleImputer(strategy = 'constant', fill_value = "None").fit_transform(
                          FeatureSelector('obj').transform(X_train))

In [410]:
OHE = OneHotEncoder(sparse= False, handle_unknown='ignore').fit(X_tr_cat)

In [411]:
# Удаляю первоначальные категориальные столбцы данных и вместо них вставляю матрицу OneHotEncoder-a

X_train.drop(FeatureSelector('obj').transform(X_train).columns, axis = 1, inplace= True)
X_train = pd.concat([X_train, pd.DataFrame(OHE.transform(X_tr_cat))], axis = 1)

In [412]:
# Исправляю переменные int и float
X_train = PrepareNumeric().transform(FeatureSelector('num').transform(X_train))

In [413]:
MMX = MinMaxScaler().fit(X_train)

In [414]:
X_train = pd.DataFrame(MMX.transform(X_train))

In [415]:
# Аналогично для тестовой выборки. Энкодер обучен на тренировочной.

X_te_cat = OHE.transform(SimpleImputer(strategy = 'constant', fill_value = "None").fit_transform(
                         FeatureSelector('obj').transform(X_test)))

X_test.drop(FeatureSelector('obj').transform(X_train).columns, axis = 1, inplace= True)
X_test = pd.concat([X_test, pd.DataFrame(X_te_cat)], axis = 1)

In [416]:
X_test = pd.DataFrame(MMX.transform((PrepareNumeric().transform(FeatureSelector('num').transform(X_test)))))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [417]:
LR = LinearRegression()

In [418]:
LR.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [419]:
y_pred = LR.predict(X_test)
print("MAE:", metrics.mean_absolute_error(y_test.reshape(-1,1), y_pred.reshape(-1,1)))
print('MSE:', metrics.mean_squared_error(y_test.reshape(-1,1), y_pred.reshape(-1,1)))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test.reshape(-1,1), y_pred.reshape(-1,1))))
print('R2',LR.score(X_test,y_test.reshape(-1,1)))

MAE: 294829294.3291251
MSE: 1.0895365209042987e+19
RMSE: 3300812810.360955
R2 -1.3444492723257832e+21


### Pipline 

In [420]:
X_train = train.drop(columns=['SalePrice'],axis=1)
y_train = transformY(train['SalePrice'])
X_test = test_X
y_test =  transformY(test_Y['SalePrice'])

In [421]:
full_pipeline_m = Pipeline( steps = [ ( 'full_pipeline', full_pipeline),
                                  
                                  ( 'model', LinearRegression() ) ] )

In [422]:
full_pipeline_m.fit(X_train,y_train)
y_pred = full_pipeline_m.predict(X_test)
print("MAE:", metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R2',full_pipeline_m.score(X_test,y_test))

MAE: 20885085348.27173
MSE: 1.3760719979826312e+23
RMSE: 370954444370.5495
R2 -1.6980238485444383e+25


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-c