In [1]:
# Data-handling and plotting imports.
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from matplotlib.pyplot import figure
# Apply seaborn's default theme to figures drawn after this point.
sns.set()

In [2]:
# Read the training CSV (path is relative to the working directory) and
# preview its first rows.
data = pd.read_csv("data/train.csv")
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
from sklearn.model_selection import train_test_split
# Separate the target column from the predictors.
y = data['SalePrice']
X = data.drop(columns='SalePrice')
# Hold out a validation set; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=16)
# Training predictors and their target side by side in a single frame.
full_X_train = pd.concat((X_train, y_train), axis=1)

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer

class PandasTransform(BaseEstimator, TransformerMixin):
    """Stateless pandas clean-up step usable as a scikit-learn transformer.

    ``transform`` drops a fixed set of columns, lower-cases the categorical
    columns, ordinal-encodes the quality columns, replaces grouped category
    values by integer codes and moves ``Id`` into the index.

    The input frame is never modified: every step works on a new frame, so the
    same frame can be transformed repeatedly (the previous in-place version
    raised ``KeyError`` on a second call because the columns were already gone).
    """

    # Columns removed at the start of ``transform``.
    columns_dropped = [
        'LotFrontage', 'Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature',
        'YearRemodAdd', 'YearBuilt', 'MoSold', 'YrSold', 'GarageType',
        'GarageYrBlt', 'GarageFinish', 'GarageArea', 'GarageCond', 'Exterior2nd',
    ]
    # String columns whose values are lower-cased.
    cat_col = [
        'MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
        'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType',
        'HouseStyle', 'RoofStyle', 'RoofMatl', 'MasVnrType', 'ExterQual',
        'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
        'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir',
        'Electrical', 'KitchenQual', 'Functional', 'GarageQual', 'PavedDrive',
        'SaleType', 'SaleCondition',
    ]
    # Lower-cased quality code -> integer rank; "na" is the fill value used for
    # missing entries in ``col_to_ordinal``.
    qc_order = {"na": 0, "po": 1, "fa": 2, "ta": 3, 'gd': 4, 'ex': 5}
    # Columns encoded with ``qc_order``.
    col_to_ordinal = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
                      'HeatingQC', 'KitchenQual', 'GarageQual']
    # ``cat_col`` without the ordinal columns. Not read by any method of this
    # class; kept as a public attribute.
    catcol_wout_ordinal = [
        'MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
        'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType',
        'HouseStyle', 'RoofStyle', 'RoofMatl', 'MasVnrType', 'Foundation',
        'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'CentralAir',
        'Electrical', 'Functional', 'PavedDrive', 'SaleType', 'SaleCondition',
    ]
    # Value groups replaced by their position in this list (0..4). The rendered
    # output of ``transform`` shows these codes in the Neighborhood column.
    g1 = [
        ['MeadowV', 'BrDale', 'IDOTRR'],
        ['OldTown', 'BrkSide', 'Edwards', 'Sawyer', 'NPkVill', 'SWISU',
         'NAmes', 'Blueste', 'Mitchel', 'SawyerW'],
        ['NWAmes', 'Gilbert', 'Blmngtn', 'CollgCr', 'Crawfor', 'ClearCr',
         'Somerst', 'Veenker'],
        ['Timber', 'StoneBr'],
        ['NridgHt', 'NoRidge'],
    ]
    # Value groups replaced by their position in this list (0..5).
    # NOTE(review): these look like exterior-covering categories (presumably
    # for Exterior1st) - confirm. The first entry is a bare string rather than
    # a list; ``DataFrame.replace`` accepts both.
    g2 = [
        'BrkComm',
        ['AsphShn', 'CBlock', 'AsbShng'],
        ['Wd Sdng', 'MetalSd', 'WdShing', 'HdBoard'],
        ['Plywood', 'Stucco', 'BrkFace'],
        ['VinylSd', 'CemntBd'],
        ['ImStucc', 'Stone'],
    ]

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer learns no state."""
        return self

    def transform(self, X):
        """Return a cleaned and encoded copy of ``X``, indexed by ``Id``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing ``Id`` and every column named in
            ``columns_dropped`` and ``cat_col``. It is left unmodified.

        Returns
        -------
        pandas.DataFrame
            A new frame with the listed columns dropped, categorical strings
            lower-cased, quality columns mapped through ``qc_order``, the
            ``g1``/``g2`` values replaced by their group index, and ``Id``
            moved from the columns into the index.

        Raises
        ------
        KeyError
            If an expected column is missing from ``X``.
        """
        # ``drop`` without ``inplace`` returns a new frame, so every later
        # assignment touches that frame rather than the caller's.
        X = X.drop(self.columns_dropped, axis=1)

        for col in self.cat_col:
            X[col] = X[col].str.lower()

        # Missing quality entries become "na", which ``qc_order`` maps to 0.
        X[self.col_to_ordinal] = X[self.col_to_ordinal].fillna("na")
        X[self.col_to_ordinal] = X[self.col_to_ordinal].replace(self.qc_order)

        # As before, the replacement is applied across the whole frame; each
        # group's values become that group's position in the list.
        for code, group in enumerate(self.g1):
            X = X.replace(group, code)
        for code, group in enumerate(self.g2):
            X = X.replace(group, code)

        # Moves ``Id`` into the index and removes it from the columns.
        return X.set_index('Id')

In [5]:
def pandas_transform(data):
    """Clean, encode and impute ``data``, returning only its numeric columns.

    Steps: drop a fixed set of columns, lower-case the categorical columns,
    ordinal-encode the quality columns, one-hot encode the remaining
    categorical columns, replace the ``g1``/``g2`` value groups by their
    position in those lists, index by ``Id`` and run the int64/float64 columns
    through the imputer.

    NOTE(review): this function reads ``ohe``, ``imputer``, ``g1`` and ``g2``
    from the notebook's global namespace; none of them is defined in this
    function, so they must exist (and ``ohe``/``imputer`` presumably be fitted)
    before it is called - confirm against the cells that call it.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame containing ``Id`` and the columns listed below. The caller's
        frame is not modified (the first step creates a new frame).

    Returns
    -------
    pandas.DataFrame
        ``imputer.transform`` applied to the numeric columns, wrapped in a
        DataFrame. No index or column labels are passed to the constructor.
    """
    columns_dropped = ['LotFrontage', 'Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature',
                       'YearRemodAdd', 'YearBuilt', 'MoSold', 'YrSold', 'GarageType',
                       'GarageYrBlt', 'GarageFinish', 'GarageArea', 'GarageCond']
    data = data.drop(columns_dropped, axis=1)

    cat_col = ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
               'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType',
               'HouseStyle', 'RoofStyle', 'RoofMatl', 'MasVnrType', 'ExterQual',
               'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
               'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir',
               'Electrical', 'KitchenQual', 'Functional', 'GarageQual', 'PavedDrive',
               'SaleType', 'SaleCondition']
    for col in cat_col:
        data[col] = data[col].str.lower()

    # Lower-cased quality code -> integer rank; missing entries are filled
    # with "na", which maps to 0.
    qc_order = {"na": 0, "po": 1, "fa": 2, "ta": 3, 'gd': 4, 'ex': 5}
    col_to_ordinal = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'GarageQual']
    data[col_to_ordinal] = data[col_to_ordinal].fillna("na")
    data[col_to_ordinal] = data[col_to_ordinal].replace(qc_order)

    # Categorical columns that are not ordinal-encoded. This is derived from
    # the local ``cat_col``: the previous code read an undefined ``cat_cols``
    # and removed entries from that list in place, mutating shared state.
    catcol_wout_ordinal = [col for col in cat_col if col not in col_to_ordinal]

    # One-hot encode with the notebook-level encoder and swap the encoded
    # columns in for the originals, keeping rows aligned by index.
    ohed_col = pd.DataFrame(ohe.transform(data[catcol_wout_ordinal]))
    ohed_col.index = data.index
    data = data.drop(catcol_wout_ordinal, axis=1)
    data = pd.concat([data, ohed_col], axis=1)

    # Replace every value in a group by that group's position in the list.
    # The replacement is applied across the whole frame.
    for code, group in enumerate(g1):
        data.replace(group, code, inplace=True)
    for code, group in enumerate(g2):
        data.replace(group, code, inplace=True)

    data = data.set_index('Id')
    num_cols = [cname for cname in data.columns
                if data[cname].dtype in ['int64', 'float64']]

    imputed_data = pd.DataFrame(imputer.transform(data[num_cols]))
    return imputed_data

In [6]:
# Instantiate the transformer and apply it to the training predictors.
# ``transform`` is called directly; this class's ``fit`` only returns ``self``.
a = PandasTransform()
example = a.transform(X_train)

In [7]:
# Display the transformed training frame.
example

Unnamed: 0_level_0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
679,20,rl,11844,pave,ir1,lvl,allpub,inside,gtl,3,...,y,322,82,0,0,0,0,0,new,partial
1341,20,rl,8294,pave,reg,lvl,allpub,inside,gtl,1,...,y,0,0,0,0,0,0,0,wd,normal
1076,70,rl,13125,pave,reg,lvl,allpub,inside,gtl,2,...,y,0,0,0,0,0,0,0,cwd,normal
1131,50,rl,7804,pave,reg,lvl,allpub,inside,gtl,1,...,y,431,44,0,0,0,0,0,wd,normal
1211,60,rl,11218,pave,reg,lvl,allpub,inside,gtl,1,...,y,635,104,0,0,0,0,400,wd,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1346,30,rm,6000,pave,reg,lvl,allpub,inside,gtl,1,...,y,0,0,168,0,0,0,0,wd,normal
582,20,rl,12704,pave,reg,lvl,allpub,inside,gtl,4,...,y,0,90,0,0,0,0,0,new,partial
122,50,rm,6060,pave,reg,lvl,allpub,inside,gtl,0,...,p,0,0,140,0,0,0,0,wd,normal
1263,50,rl,11250,pave,reg,lvl,allpub,inside,gtl,2,...,y,120,0,0,0,0,0,0,wd,normal
