In [1]:
import numpy as np 
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline 
from sklearn.preprocessing import PowerTransformer

In [2]:
train = pd.read_csv('train.csv')

In [3]:
test_X = pd.read_csv('test.csv')
test_Y = pd.read_csv('sample_submission.csv').drop("Id", axis = 1)

In [7]:
class FixNone(BaseEstimator, TransformerMixin):

    def fit( self, X, y= None):
        return self
    
    def drop(self,df):
        if 'Id' in df.columns:
            df.drop('Id',axis=1,inplace=True)
        threshold = df.shape[0]*0.82
        df.dropna(axis = 1, thresh = threshold, inplace = True)
        return df
    
    def transform(self,df):
        df = self.drop(df)
        for col in df.columns[df.dtypes == 'object']:
            df[col].fillna('NoData',inplace=True)
        for col in df.columns[df.dtypes != 'object']:
            df[col].fillna(df[col].median(), inplace=True)
            
        return df

In [66]:
class PrepareCat(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y = None):
        return self
    
    def transform(self, X):
        le = LabelEncoder()
        for col in X.columns[X.dtypes == 'object']:
            X[col] = le.fit_transform(X[col])
        return X

In [72]:
class PrepareNumeric(BaseEstimator, TransformerMixin):

    def fit( self, X, y= None):
        return self
    
    def transform(self,df):

        date=['YearBuilt','YearRemodAdd','GarageYrBlt',
                  'MoSold','YrSold']
       
        num=[]
        for col in df.columns:
            if df[col].dtype!='O' and len(df[col].unique()) >15 and col not in date:
                num.append(col)

        skewed_features=[]
       
        for col in num:
            if df[col].skew()>0 or df[col].skew()<0:
                skewed_features.append(col)
        
        
        for col in skewed_features:
            if 0 not in df[col].unique():
                df[col]=np.log(df[col])
            
        return df

In [73]:
def transformY(y):
    y = np.log(y)
    return y

In [75]:
main_Pipe = Pipeline( steps = [('FixNone',FixNone()),
                                ('FixCat', PrepareCat()),
                                ('FixNum',PrepareNumeric()),
                                ('Scaler', MinMaxScaler())])

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler


In [108]:
X=train.drop(columns=['SalePrice'],axis=1)
Y=train['SalePrice']
X_cols=X.columns
full_pipline = Pipeline(steps = [('preprocessing', main_Pipe ), ('model', LinearRegression())])

In [104]:
Y = transformY(Y)
test_Y = transformY(test_Y)

In [109]:
full_pipline.fit(X,Y)

Pipeline(memory=None,
         steps=[('preprocessing',
                 Pipeline(memory=None,
                          steps=[('FixNone', FixNone()),
                                 ('FixCat', PrepareCat()),
                                 ('FixNum', PrepareNumeric()),
                                 ('Scaler',
                                  MinMaxScaler(copy=True,
                                               feature_range=(0, 1)))],
                          verbose=False)),
                ('model',
                 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                  normalize=False))],
         verbose=False)

In [110]:
y_pred = full_pipline.predict(test_X)

ValueError: operands could not be broadcast together with shapes (1459,69) (74,) (1459,69) 

In [107]:
full_pipline.score(test_X,test_Y)

ValueError: operands could not be broadcast together with shapes (1459,69) (74,) (1459,69) 

In [100]:
print("MAE:", metrics.mean_absolute_error(test_Y, y_pred))
print('MSE:', metrics.mean_squared_error(test_Y, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(test_Y, y_pred)))
print('R2',full_pipline.score(test_X,test_Y))

MAE: 124941393.25629988
MSE: 2.277550294852425e+19
RMSE: 4772368693.691241


ValueError: operands could not be broadcast together with shapes (1459,69) (74,) (1459,69) 