In [10]:
import numpy as np 
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder, MinMaxScaler , OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline 
from sklearn.preprocessing import PowerTransformer

In [3]:
train = pd.read_csv('train.csv')

In [4]:
test_X = pd.read_csv('test.csv')
test_Y = pd.read_csv('sample_submission.csv').drop("Id", axis = 1)

In [34]:
class FeatureSelector( BaseEstimator, TransformerMixin ):
    #Class Constructor 
    def __init__( self, feature_type ):
        self._feature_type = feature_type 
    
    #Return self nothing else to do here    
    def fit( self, X, y = None ):
        return self 
    
    #Method that describes what we need this transformer to do
    def transform( self, X, y = None ):
        feature_names = []
        if self._feature_type == 'obj':
            for col in X.columns[X.dtypes == 'object']:
                feature_names.append(col)
        else:
            for col in X.columns[X.dtypes != 'object']:
                feature_names.append(col)
        return X[feature_names]  

In [109]:
class PrepareCat(BaseEstimator, TransformerMixin):
    
    def fit( self, X, y= None):
        return self
    
    def transform(self,df, y = None):
        threshold = df.shape[0]*0.82
        df.dropna(axis = 1, thresh = threshold, inplace = True)
            
        return df.values

In [77]:
class PrepareNumeric(BaseEstimator, TransformerMixin):

    def fit( self, X, y= None):
        return self
    
    def transform(self,df, y = None):
        
        if 'Id' in df.columns:
            df.drop('Id',axis=1,inplace=True)
        threshold = df.shape[0]*0.82
        df.dropna(axis = 1, thresh = threshold, inplace = True)
        
        date=['YearBuilt','YearRemodAdd','GarageYrBlt',
                  'MoSold','YrSold']
       
        num=[]
        for col in df.columns:
            if df[col].dtype!='O' and len(df[col].unique()) >15 and col not in date:
                num.append(col)

        skewed_features=[]
       
        for col in num:
            if df[col].skew()>0 or df[col].skew()<0:
                skewed_features.append(col)
        
        
        for col in skewed_features:
            if 0 not in df[col].unique():
                df[col]=np.log(df[col])
            
        return df.values

In [78]:
def transformY(y):
    y = np.log(y)
    return y.values

In [115]:
categorical_pipeline = Pipeline( steps = [ ( 'cat_selector', FeatureSelector('obj') ),
                                          
                                           ('cat_transform', PrepareCat()),
                                                                                    
                                          ('imputer', SimpleImputer(strategy = 'constant', fill_value = "None")),
                                          
                                           ( 'one_hot_encoder', OneHotEncoder(sparse = False, handle_unknown='ignore') ) ] )
    
#Defining the steps in the numerical pipeline     
numerical_pipeline = Pipeline( steps = [ ( 'num_selector', FeatureSelector('num') ),
                                  
                                  ( 'num_transformer', PrepareNumeric() ),
                                  
                                  ('imputer', SimpleImputer(strategy = 'median') ),
                                  
                                  ( 'std_scaler', MinMaxScaler() ) ] )

#Combining numerical and categorical piepline into one full big pipeline horizontally 
#using FeatureUnion
full_pipeline = FeatureUnion( transformer_list = [ ( 'categorical_pipeline', categorical_pipeline ), 
                                                  
                                                  ( 'numerical_pipeline', numerical_pipeline ) ] )



In [116]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler


In [117]:
X_train = train.drop(columns=['SalePrice'],axis=1)
y_train = transformY(train['SalePrice'])
X_test = test_X
y_test = transformY(test_Y['SalePrice'])

In [118]:
full_pipeline_m = Pipeline( steps = [ ( 'full_pipeline', full_pipeline),
                                  
                                  ( 'model', LinearRegression() ) ] )

In [119]:
full_pipeline_m.fit(X_train,y_train)
y_pred = full_pipeline_m.predict(X_test)
print("MAE:", metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R2',full_pipeline_m.score(X_test,y_test))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of 

MAE: 21305439359.659668
MSE: 1.3192210609924075e+23
RMSE: 363210828719.68555
R2 -1.6278718165555455e+25


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
