In [1]:
import numpy as np 
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline 
from sklearn.preprocessing import PowerTransformer

In [2]:
train = pd.read_csv('train.csv')

In [3]:
test_X = pd.read_csv('test.csv')
test_Y = pd.read_csv('sample_submission.csv').drop("Id", axis = 1)

In [4]:
class FixNone(BaseEstimator, TransformerMixin):

    def fit( self, X, y= None):
        return self
    
    def drop(self,df):
        if 'Id' in df.columns:
            df.drop('Id',axis=1,inplace=True)
        threshold = df.shape[0]*0.82
        df.dropna(axis = 1, thresh = threshold, inplace = True)
        return df
    
    def transform(self,df, y = None):
        df = self.drop(df)
        for col in df.columns[df.dtypes == 'object']:
            df[col].fillna('NoData',inplace=True)
        for col in df.columns[df.dtypes != 'object']:
            df[col].fillna(df[col].median(), inplace=True)
            
        return df

In [5]:
class PrepareCat(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y = None):
        return self
    
    def transform(self, df, y = None):
        le = LabelEncoder()
        for col in df.columns[df.dtypes == 'object']:
            df[col] = le.fit_transform(df[col])
        return df

In [6]:
class PrepareNumeric(BaseEstimator, TransformerMixin):

    def fit( self, X, y= None):
        return self
    
    def transform(self,df, y = None):

        date=['YearBuilt','YearRemodAdd','GarageYrBlt',
                  'MoSold','YrSold']
       
        num=[]
        for col in df.columns:
            if df[col].dtype!='O' and len(df[col].unique()) >15 and col not in date:
                num.append(col)

        skewed_features=[]
       
        for col in num:
            if df[col].skew()>0 or df[col].skew()<0:
                skewed_features.append(col)
        
        
        for col in skewed_features:
            if 0 not in df[col].unique():
                df[col]=np.log(df[col])
            
        return df

In [41]:
def transformY(y):
    y = np.log(y)
    return y.values

In [21]:
main_Pipe = Pipeline( steps = [('FixNone',FixNone()),
                                ('FixCat', PrepareCat()),
                                ('FixNum',PrepareNumeric()),
                                ('Scaler', MinMaxScaler())])

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler


In [52]:
X = train.drop(columns=['SalePrice'],axis=1)

In [33]:
X_train = main_Pipe.fit(X).transform(X)

In [42]:
y_train = transformY(train['SalePrice'])

In [34]:
X_test = main_Pipe.fit(test_X).transform(test_X)

In [43]:
y_test = transformY(test_Y['SalePrice'])

In [45]:
print(f'X_train имеет размерность {X_train.shape}, y_test - {y_train.shape}')
print(f'X_test имеет размерность {X_test.shape}, y_test - {y_test.shape}')

X_train имеет размерность (1460, 74), y_test - (1460,)
X_test имеет размерность (1459, 74), y_test - (1459,)


In [25]:
pd.DataFrame([pd.Series(y_train),pd.Series(y_test)]).T

Unnamed: 0,SalePrice,SalePrice.1
0,12.247694,12.039292
1,12.109011,12.142911
2,12.317167,12.120426
3,11.849398,12.096913
4,12.429216,11.923246
...,...,...
1455,12.072541,12.012420
1456,12.254863,12.297842
1457,12.493130,12.127702
1458,11.864462,12.142823


In [47]:
 # X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)

In [48]:
LR = LinearRegression().fit(X_train,y_train)
y_pred = LR.predict(X_test)
print("MAE:", metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R2',LR.score(X_test,y_test))

MAE: 22.014003274390213
MSE: 485.1931150130167
RMSE: 22.027099559701835
R2 -59870.10279472933


In [49]:
KNN = KNeighborsRegressor(n_neighbors=10)
KNN.fit(X_train, y_train)
y_pred = KNN.predict(X_test)
print("MAE:", metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R2',KNN.score(X_test,y_test))

MAE: 0.29490166106226795
MSE: 0.12140812370576229
RMSE: 0.3484366853615765
R2 -13.981330174703535


In [50]:
RF = RandomForestRegressor(n_estimators=100)
RF.fit(X_train, y_train)
y_pred = RF.predict(X_test)
print("MAE:", metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R2',RF.score(X_test,y_test))

MAE: 0.3751038452541447
MSE: 0.197061000244176
RMSE: 0.44391553278092893
R2 -23.31662576690669


In [51]:
GB = GradientBoostingRegressor()
GB.fit(X_train,y_train)
y_pred = RF.predict(X_test)
print("MAE:", metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R2',GB.score(X_test,y_test))

MAE: 0.3751038452541447
MSE: 0.197061000244176
RMSE: 0.44391553278092893
R2 -26.946086396658988


#### Для сравнения - разбиение train.csv на train и test

In [53]:
XX = train.drop('SalePrice', axis = 1)
YY = train['SalePrice']

In [54]:
XX_train, XX_test, yy_train, yy_test = train_test_split(main_Pipe.fit(XX).transform(XX), transformY(YY),test_size=0.3,random_state=0)

In [55]:
LR = LinearRegression().fit(XX_train,yy_train)
yy_pred = LR.predict(XX_test)
print("MAE:", metrics.mean_absolute_error(yy_test, yy_pred))
print('MSE:', metrics.mean_squared_error(yy_test, yy_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(yy_test, yy_pred)))
print('R2',LR.score(XX_test,yy_test))

MAE: 0.09579567390360567
MSE: 0.030598521112347513
RMSE: 0.1749243296752842
R2 0.8019655920916235
