In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

# Global seaborn appearance for the notebook. sns.set() runs after
# set_style(), so its style="ticks" is the most recent style request
# at the end of this cell.
sns.set_style('darkgrid')
sns.set(style="ticks", color_codes=True, font_scale=1.5)

In [4]:
# Set pandas' chained-assignment mode to None (no warning/raise on chained assignment).
pd.set_option('mode.chained_assignment', None)
# Switch the seaborn style once more and set matplotlib's default font size.
sns.set_style('whitegrid')
plt.rcParams['font.size']=16

In [5]:
# IPython magic: select matplotlib's inline backend for this notebook.
%matplotlib inline

In [100]:
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, _fit_transform_one, _transform_one, _name_estimators
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder, StandardScaler

class FeatureUnion_pd(FeatureUnion):
    """FeatureUnion whose dense result is a column-wise pandas concatenation.

    Each sub-transformer is run through scikit-learn's private
    ``_fit_transform_one`` / ``_transform_one`` helpers. If any output is a
    scipy sparse matrix the outputs are stacked into a CSR matrix; otherwise
    they are joined with ``pd.concat`` along the columns.
    """

    def fit_transform(self, X, y=None, **fit_params):
        """Fit every transformer on ``X`` and return the combined output."""
        self._validate_transformers()
        jobs = (
            delayed(_fit_transform_one)(
                transformer=trans, X=X, y=y, weight=weight, **fit_params)
            for name, trans, weight in self._iter()
        )
        fitted = Parallel(n_jobs=self.n_jobs)(jobs)

        if not fitted:
            # All transformers are None
            return np.zeros((X.shape[0], 0))

        Xs, transformers = zip(*fitted)
        self._update_transformer_list(transformers)
        return self._stack(Xs)

    def merge_dataframes_by_column(self, Xs):
        """Concatenate the given frames side by side."""
        return pd.concat(Xs, axis="columns", copy=False)

    def transform(self, X):
        """Apply every (already fitted) transformer and return the combined output."""
        jobs = (
            delayed(_transform_one)(
                transformer=trans, X=X, y=None, weight=weight)
            for name, trans, weight in self._iter()
        )
        Xs = Parallel(n_jobs=self.n_jobs)(jobs)

        if not Xs:
            # All transformers are None
            return np.zeros((X.shape[0], 0))
        return self._stack(Xs)

    def _stack(self, Xs):
        """Combine transformer outputs: CSR matrix if any is sparse, else DataFrame."""
        if any(sparse.issparse(block) for block in Xs):
            return sparse.hstack(Xs).tocsr()
        return self.merge_dataframes_by_column(Xs)
    
def make_union_pd(transformers):
    """Build a FeatureUnion_pd from a list of transformers.

    Step names are generated by scikit-learn's ``_name_estimators``.
    """
    named_steps = _name_estimators(transformers)
    return FeatureUnion_pd(named_steps)

class ColumnsSelector_pd(BaseEstimator, TransformerMixin):
    """Select DataFrame columns by explicit name or by a group rule.

    Parameters
    ----------
    columns : list of column labels, optional
        Columns to keep when ``group`` is None, or columns to leave out when
        ``group == 'exclude'`` (``None`` then means "exclude nothing").
    reset_index : bool, default True
        When true, the returned frame gets a fresh 0..n-1 index.
    group : str, optional
        One of ``'numeric'``, ``'float'``, ``'int'``, ``'categorical'``,
        ``'ordered categorical'``, ``'unordered categorical'``, ``'with NaN'``,
        ``'without NaN'`` or ``'exclude'``. The matching columns are determined
        in ``fit`` and reused unchanged in ``transform``.

    Raises
    ------
    ValueError
        From ``fit`` when ``group`` is not None and not one of the names above.
    """

    def __init__(self, columns=None, reset_index=True, group=None):
        self.columns = columns
        self.reset_index = reset_index
        self.group = group

    def fit(self, X, y = None):
        """Resolve the column set for the configured ``group`` from ``X``."""
        group = self.group
        if group is None:
            # Explicit column list: nothing to learn.
            return self

        if group == 'numeric':
            self._group_columns = X.select_dtypes(exclude=['object','category']).columns
        elif group == 'float':
            self._group_columns = X.select_dtypes(include='float64').columns
        elif group == 'int':
            self._group_columns = X.select_dtypes(include='int64').columns
        elif group == 'categorical':
            self._group_columns = X.select_dtypes(include=['object','category']).columns
        elif group in ('ordered categorical', 'unordered categorical'):
            # Compute the categorical sub-frame once instead of once per column.
            cat_frame = X.select_dtypes(include='category')
            want_ordered = group == 'ordered categorical'
            self._group_columns = [col for col in cat_frame
                                   if cat_frame[col].cat.ordered == want_ordered]
        elif group == 'with NaN':
            self._group_columns = X.columns[X.isnull().any()]
        elif group == 'without NaN':
            self._group_columns = X.columns[~X.isnull().any()]
        elif group == 'exclude':
            excluded = [] if self.columns is None else self.columns
            self._group_columns = [col for col in X.columns if col not in excluded]
        else:
            # Fail here with a clear message rather than later in transform
            # with a missing-attribute error.
            raise ValueError('Unknown column group: {!r}'.format(group))
        return self

    def transform(self, X, y = None):
        """Return the selected columns of ``X`` as a new object."""
        wanted = self.columns if self.group is None else self._group_columns
        selected = X[wanted]
        if self.reset_index:
            return selected.reset_index(drop=True)
        # Copy so the caller can modify the result without touching X.
        return selected.copy()
    
class drop_pd(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes the given columns from a DataFrame.

    Parameters
    ----------
    columns : label or list of labels
        Columns passed to ``DataFrame.drop``.
    reset_index : bool, default True
        When true, the result gets a fresh 0..n-1 index.
    """

    def __init__(self, columns, reset_index=True):
        self.columns = columns
        self.reset_index = reset_index

    def fit(self, X, y = None):
        """Nothing to learn; returns self."""
        return self

    def transform(self, X, y = None):
        """Return ``X`` without the configured columns."""
        remaining = X.drop(columns=self.columns)
        if not self.reset_index:
            return remaining
        return remaining.reset_index(drop=True)
    
class SimpleImputer_pd(BaseEstimator, TransformerMixin):
    """DataFrame-in / DataFrame-out wrapper around sklearn's SimpleImputer.

    Parameters mirror ``SimpleImputer`` and are forwarded to it. The output
    keeps the input column names; when the fitted imputer has a missing
    indicator, one ``<column>_missing`` column is appended per flagged feature.
    The returned frame has a default integer index.
    """

    def __init__(self, strategy='most_frequent', missing_values=np.nan, fill_value=np.nan, add_indicator=True):
        self.strategy = strategy
        self.missing_values = missing_values
        self.fill_value = fill_value
        self.add_indicator = add_indicator
        self.transformer = self._build_transformer()

    def _build_transformer(self):
        """Create the wrapped imputer from the current parameter values."""
        return SimpleImputer(missing_values=self.missing_values, strategy=self.strategy,
                             fill_value=self.fill_value, add_indicator=self.add_indicator)

    def fit(self, X, y = None):
        """Fit the wrapped imputer on ``X``."""
        # Rebuild from the current parameters so that changes made through
        # set_params() (e.g. by a grid search) are not silently ignored.
        self.transformer = self._build_transformer()
        self.transformer.fit(X)
        return self

    def transform(self, X, y = None):
        """Impute ``X`` and return the result as a DataFrame."""
        imputed = self.transformer.transform(X)
        column_names = X.columns.tolist()
        indicator = self.transformer.indicator_
        if indicator is not None:
            # features_ holds positions of the input columns that got an indicator.
            column_names = column_names + [column_names[i] + '_missing'
                                           for i in indicator.features_]
        return pd.DataFrame(imputed, columns=column_names)
    
class OneHotEncoder_pd(BaseEstimator, TransformerMixin):
    """DataFrame-in / DataFrame-out wrapper around sklearn's OneHotEncoder.

    Parameters
    ----------
    categories, drop, sparse, handle_unknown
        Forwarded to ``OneHotEncoder``.
    get_categories : bool, default True
        When true, ``fit`` reads the category list of every column from its
        pandas categorical dtype (``X[col].cat.categories``) and uses those
        instead of ``categories``; the input columns must then be categorical.

    Output columns are named ``<column>_<category>``, built from the fitted
    encoder's ``categories_`` and ``drop_idx_`` so that the names always match
    the columns the encoder actually produced.
    """

    def __init__(self, categories='auto', drop='first', sparse=False, handle_unknown='error', get_categories=True):
        self.categories = categories
        self.drop = drop
        self.sparse = sparse
        self.handle_unknown = handle_unknown
        self.get_categories = get_categories
        self.transformer = OneHotEncoder(categories=categories, drop=drop, sparse=sparse, handle_unknown=handle_unknown)

    def fit(self, X, y = None):
        """Fit the wrapped encoder on ``X``."""
        categories = self.categories
        if self.get_categories == True:
            categories = [X[col].cat.categories.tolist() for col in X.columns]
            # Kept for backward compatibility: the learned lists stay readable
            # on the wrapper after fitting.
            self.categories = categories
        # Rebuild from the current parameters so set_params() changes apply.
        self.transformer = OneHotEncoder(categories=categories, drop=self.drop,
                                         sparse=self.sparse, handle_unknown=self.handle_unknown)
        self.transformer.fit(X)
        return self

    def transform(self, X, y = None):
        """Encode ``X`` and return a DataFrame with one column per kept category."""
        encoded = self.transformer.transform(X)
        drop_idx = getattr(self.transformer, 'drop_idx_', None)

        final_columns = []
        for i, (column, categories) in enumerate(zip(X.columns.tolist(), self.transformer.categories_)):
            labels = list(categories)
            dropped = None if drop_idx is None else drop_idx[i]
            if dropped is not None:
                # Remove exactly the category the encoder dropped for this feature.
                del labels[int(dropped)]
            final_columns.extend(column + '_' + str(label) for label in labels)

        if sparse.issparse(encoded):
            # pd.DataFrame() cannot take a scipy sparse matrix directly.
            return pd.DataFrame.sparse.from_spmatrix(encoded, columns=final_columns)
        return pd.DataFrame(encoded, columns=final_columns)

class OrdinalEncoder_pd(BaseEstimator, TransformerMixin):
    """DataFrame-in / DataFrame-out wrapper around sklearn's OrdinalEncoder.

    With ``get_categories`` true, ``fit`` takes each column's category list
    from its pandas categorical dtype and hands those lists to the wrapped
    encoder. The output keeps the input column names and has a default
    integer index.
    """

    def __init__(self, categories='auto', get_categories=True):
        self.categories = categories
        self.get_categories = get_categories
        self.transformer = OrdinalEncoder(categories=categories)

    def fit(self, X, y = None):
        """Fit the wrapped encoder on ``X``."""
        if self.get_categories == True:
            per_column = [X[name].cat.categories.tolist() for name in X.columns]
            self.categories = per_column
            self.transformer.categories = per_column
        self.transformer.fit(X)
        return self

    def transform(self, X, y = None):
        """Encode ``X`` and return the codes as a DataFrame."""
        codes = self.transformer.transform(X)
        return pd.DataFrame(codes, columns=X.columns.tolist())
    
class StandardScaler_pd(BaseEstimator, TransformerMixin):
    """DataFrame-in / DataFrame-out wrapper around sklearn's StandardScaler.

    ``with_mean`` and ``with_std`` are forwarded to ``StandardScaler``. The
    output keeps the input column names and has a default integer index.
    """

    def __init__(self, with_mean=True, with_std=True):
        self.with_mean = with_mean
        # Store the argument itself; a hard-coded True here made get_params()
        # and clone() report and recreate the wrong setting.
        self.with_std = with_std
        self.transformer = StandardScaler(with_mean=with_mean, with_std=with_std)

    def fit(self, X, y = None):
        """Fit the wrapped scaler on ``X``."""
        # Rebuild from the current parameters so set_params() changes apply.
        self.transformer = StandardScaler(with_mean=self.with_mean, with_std=self.with_std)
        self.transformer.fit(X)
        return self

    def transform(self, X, y = None):
        """Scale ``X`` and return the result as a DataFrame."""
        column_names = X.columns.tolist()
        scaled = self.transformer.transform(X)
        return pd.DataFrame(scaled, columns=column_names)
    
class MinMaxScaler_pd(BaseEstimator, TransformerMixin):
    """DataFrame-in / DataFrame-out wrapper around sklearn's MinMaxScaler.

    ``feature_range`` and ``copy`` are forwarded to ``MinMaxScaler`` (which
    must be imported alongside the other sklearn preprocessing classes). The
    output keeps the input column names and has a default integer index.
    """

    def __init__(self, feature_range=(0, 1), copy=True):
        self.feature_range = feature_range
        self.copy = copy
        self.transformer = MinMaxScaler(feature_range=feature_range, copy=copy)

    def fit(self, X, y = None):
        """Fit the wrapped scaler on ``X``."""
        # Rebuild from the current parameters so set_params() changes apply.
        self.transformer = MinMaxScaler(feature_range=self.feature_range, copy=self.copy)
        self.transformer.fit(X)
        return self

    def transform(self, X, y = None):
        """Scale ``X`` and return the result as a DataFrame."""
        column_names = X.columns.tolist()
        scaled = self.transformer.transform(X)
        return pd.DataFrame(scaled, columns=column_names)

In [7]:
def missing_values(df):
    '''Summarise missing data per column.

    Returns a DataFrame indexed by the columns of ``df`` that contain at least
    one null, with the null count ('Count') and the null share formatted as a
    string such as "12.5 %" ('Fraction (%)'), sorted by count, largest first.
    '''
    null_flags = df.isnull()
    # Restrict to the columns that have at least one missing value.
    flagged = null_flags.loc[:, null_flags.any()]

    count = flagged.sum().rename('Count')
    share = (flagged.mean() * 100).rename('Fraction (%)').round(1).astype(str) + ' %'

    summary = pd.concat([count, share], axis=1)
    return summary.sort_values('Count', ascending=False)

# Import data from data_description.txt file

In [8]:
import re

def data_description_parser(ddFileName):
    """Parse a data-description text file into ``{feature: [categories]}``.

    A feature starts on a line whose first space-delimited word contains ':'.
    It is treated as categorical when the line two below it exists and is not
    itself such a header; the category lines then run from there until the
    first blank (whitespace-only) line or the end of the file. Each category
    line is ``<category>\\t<description>``; only the category is kept.

    If every category is a digit string the list is converted to ints and
    sorted ascending; otherwise the file order is reversed.

    Parameters
    ----------
    ddFileName : str
        Path of the description file.

    Returns
    -------
    dict
        Feature name (header word without its last character, i.e. the ':')
        mapped to its category list. Features without categories are omitted.

    Raises
    ------
    ValueError
        If a category line contains no tab separator.
    """
    categories_dict = {}
    with open(ddFileName, 'r') as f:
        lines = f.readlines()

    last = len(lines) - 1
    ln = 0
    while ln <= last:  # also covers an empty file
        firstWord = lines[ln].split(' ', 1)[0]

        # The look-ahead is bounds-checked: a header within the last two lines
        # of the file cannot have a category block.
        has_categories = (
            ':' in firstWord
            and ln + 2 <= last
            and ':' not in lines[ln + 2].split(' ', 1)[0]
        )
        if has_categories:
            categories = []
            ln += 2

            while re.search('[^ \t\n]', lines[ln]):
                # Remove left white spaces, right \n and tabs and split at first tab
                category, _description = lines[ln].lstrip().rstrip('\n').rstrip('\t').split('\t', 1)
                categories.append(category.rstrip())

                if ln == last:
                    break
                ln += 1

            if all(cat.isdigit() for cat in categories):
                categories = sorted(map(int, categories))
            else:
                categories.reverse()
            categories_dict[firstWord[:-1]] = categories

        ln += 1

    return categories_dict

def import_data_description(ddFileName, datasets):
    """Parse the description file and report categories it does not list.

    Parameters
    ----------
    ddFileName : str
        Path handed to ``data_description_parser``.
    datasets : iterable of DataFrame
        Frames whose columns are checked; each must contain every feature
        found in the description file.

    Returns
    -------
    dict
        The ``{feature: [categories]}`` mapping from the parser.

    Side effects: prints either a warning listing, per feature, the values
    present in the data but absent from the description, or a confirmation
    that everything matched. Missing values are never reported.
    """
    categories_dict = data_description_parser(ddFileName)

    missmatches = {}
    for feature, known in categories_dict.items():
        for data in datasets:
            for category in data[feature].unique().tolist():
                # pd.isnull instead of an identity test against np.nan: the
                # NaN objects coming out of .tolist() need not be np.nan itself.
                if pd.isnull(category) or category in known:
                    continue
                seen = missmatches.setdefault(feature, [])
                if category not in seen:
                    seen.append(category)

    if missmatches:
        print('Warning: the following categories are not contained in the data description file\n')
        for feature, categories in missmatches.items():
            print(feature+':', ', '.join(map(str, categories)))
    else:
        print('All categories in the dataset are contained in the data description file')

    return categories_dict

# Import and create a copy of the dataset

In [9]:
# Load the raw train/test CSVs from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# Exploration below works on a copy so `train` stays as loaded.
train_prep = train.copy()
# Parse the category definitions and print any category found in the data
# that the description file does not list.
categories_dict = import_data_description('data_description.txt', [train,test])
# Name of the column used as the prediction target further down.
pred = 'SalePrice'

All categories in the dataset are contained in the data description file


# First look at the dataset

In [10]:
# print(train_prep.info())
# Display the first rows of the working copy.
train_prep.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# Explore the data

### Replacing NaNs that are independent categories

In [11]:
# Columns in which a missing value is replaced by the literal category 'NA'
# (see the markdown heading above: NaNs that are independent categories).
NaNToMissing = ['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2',
                'FireplaceQu','GarageFinish','GarageQual','GarageCond','PoolQC','Fence',
                'Alley','GarageType'
               ]

# In-place update of the working copy; `train` is not touched.
train_prep.loc[:,NaNToMissing] = train_prep.loc[:,NaNToMissing].fillna('NA')

### Defining categorical features

In [12]:
from pandas.api.types import CategoricalDtype

# Features whose categories carry a natural order; every other feature in
# categories_dict becomes an unordered categorical.
orderedCatFeatures = [
    'OverallQual', 'LotShape', 'LandContour', 'Utilities', 'LandSlope', 'ExterQual', 'ExterCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC',
    'CentralAir', 'KitchenQual', 'FireplaceQu', 'GarageFinish', 'GarageQual',
    'GarageCond', 'PavedDrive', 'PoolQC', 'Fence',
]

# Cast each described feature to a pandas categorical with the parsed categories.
for col, categories in categories_dict.items():
    train_prep[col] = train_prep[col].astype(
        CategoricalDtype(categories=categories, ordered=col in orderedCatFeatures))

In [13]:
# Bar chart of each column's correlation with the target, sorted ascending.
# NOTE(review): `data_cleaning` is only created in the "Preprocessing
# pipelines" cell further down, so on a fresh kernel this cell fails with the
# NameError recorded in its output. Run that cell first or move it above.
sns.reset_defaults()
with sns.axes_style("white"):
    ax = data_cleaning.fit_transform(train).corr()[pred].sort_values().plot.bar()
    ax.yaxis.grid(True)
    sns.despine(left=True, bottom=True)
    plt.gcf().set_size_inches(16,5)

NameError: name 'data_cleaning' is not defined

In [None]:
# Box plot of the target per OverallQual level, drawn with a 10-colour Blues palette.
# NOTE(review): depends on `data_cleaning`, which is defined in a later cell.
# sns.set_palette(sns.color_palette("Blues"))
sns.set_palette(sns.color_palette("Blues", n_colors=10))
sns.catplot(kind='box', x='OverallQual', y=pred, data=data_cleaning.fit_transform(train))
plt.gcf().set_size_inches(16,8)
# sns.set_palette(sns.color_palette("Blues", n_colors=10))

In [None]:
# Names of the columns that are categorical after the cleaning pipeline.
# NOTE(review): depends on `data_cleaning`, which is defined in a later cell.
data_cleaning.fit_transform(train).select_dtypes(include='category').columns

In [None]:
# Integer columns with fewer than 20 distinct values are candidates for being
# treated as categorical; show their sorted unique values for inspection.
# The cleaned frame is computed once and reused. The original cell referred to
# an undefined name `cat_int` instead of `possible_cat_int`.
# NOTE(review): depends on `data_cleaning`, which is defined in a later cell.
cleaned_train = data_cleaning.fit_transform(train)
possible_cat_int = cleaned_train.select_dtypes(include='int64')

cat_int_cols = [col for col in possible_cat_int if len(possible_cat_int[col].unique()) < 20]

cleaned_train[cat_int_cols].apply(lambda x: [sorted(x.unique())])

In [None]:
# Leave PoolArea out of the candidates. A filter is used rather than
# list.remove() so the cell can be re-run: after the reassignment below the
# list no longer contains 'PoolArea' and remove() would raise ValueError.
cat_int_cols = [col for col in cat_int_cols if col != 'PoolArea']
print(cat_int_cols)
# Hand-picked final list of integer columns considered categorical.
cat_int_cols = ['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
                'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'MoSold']

In [None]:
# Remove the id, raw year columns and MiscFeature from the working copy, in place.
# NOTE(review): because of inplace=True, re-running this cell raises KeyError
# once the columns are gone.
train_prep.drop(['Id','YearRemodAdd','YearBuilt','YrSold','GarageYrBlt','MiscFeature'], axis=1, inplace=True)

In [None]:
# Turn the year columns into ages. The source columns are read from `train`
# because they were dropped from train_prep in the previous cell.
# One reference instant is taken so all features are measured from the same
# "now". The values still depend on the day the notebook is run.
reference_time = pd.Timestamp('now')
train_prep['AgeRemodAdd'] = reference_time.year - train['YearRemodAdd']
train_prep['AgeBuilding'] = reference_time.year - train['YearBuilt']
train_prep['TimeSold'] = pd.to_datetime(train['YrSold'].astype(str)+ '-' + train['MoSold'].astype(str), format='%Y-%m')
train_prep['AgeSold'] = (reference_time - train_prep['TimeSold']).dt.days
train_prep['AgeGarage'] = reference_time.year - train['GarageYrBlt']

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Variance of every non-categorical column after min-max scaling, sorted from
# largest to smallest and shown as a bar chart.
# NOTE(review): depends on `data_cleaning`, which is defined in a later cell.
variance = MinMaxScaler_pd().fit_transform(
    data_cleaning.fit_transform(train).select_dtypes(exclude='category')).var().sort_values(ascending=False)
variance.plot.bar(figsize=(16,6))
sns.despine()

In [None]:
# Column names from the start of the sorted variance series up to the label 'TotRmsAbvGrd'.
variance.loc[:'TotRmsAbvGrd'].index.tolist()

In [None]:
# Heatmap of the pairwise correlations with the upper triangle (including the
# diagonal) masked out. The matrix is computed once and reused.
corr_matrix = train_prep.corr()
# Built-in bool: the np.bool alias is deprecated and absent from recent NumPy.
mask = np.zeros_like(corr_matrix, dtype=bool)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr_matrix, vmax=1, vmin=-1, cmap='bwr', mask=mask, xticklabels=True, yticklabels=True)
plt.gcf().set_size_inches(16,10)

In [None]:
# Names of the categorical columns in the working copy.
train_prep.select_dtypes(include='category').columns

In [None]:
# Scatter plot of SalePrice against LotFrontage.
train_prep[['LotFrontage','SalePrice']].plot(kind='scatter', x='LotFrontage', y='SalePrice')

In [None]:
# Histogram of LotFrontage with vertical markers at its mean and median.
ax = train_prep.hist(column='LotFrontage', bins=20)
lot_frontage = train_prep['LotFrontage']
for value, color, linestyle, label in [
        (lot_frontage.mean(), 'red', '--', 'mean'),
        (lot_frontage.median(), 'lime', '-.', 'median')]:
    ax[0,0].axvline(value, c=color, ls=linestyle, zorder=100, label=label)
ax[0,0].legend()

In [None]:
# Histogram of SalePrice with vertical markers at its mean and median.
ax = train_prep.hist(column='SalePrice', bins=20)
sale_price = train_prep['SalePrice']
for value, color, linestyle, label in [
        (sale_price.mean(), 'red', '--', 'mean'),
        (sale_price.median(), 'lime', '-.', 'median')]:
    ax[0,0].axvline(value, c=color, ls=linestyle, zorder=100, label=label)
ax[0,0].legend()

# Custom transformers

In [63]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

# Grid-search every entry of `models` and print its score on both splits.
# NOTE(review): this cell sits above the cells that define `models`,
# X_train/X_test and y_train/y_test, and the same code appears again at the
# end of the notebook; it cannot run here on a fresh kernel.
gs = {}
for model, [estimator, param_grid] in models.items():
    gs[model] = GridSearchCV(estimator, param_grid=param_grid, n_jobs=-2,
                             cv=RepeatedStratifiedKFold(n_splits=3, n_repeats=10, random_state=0), iid=False, verbose=0)
    gs[model].fit(X_train, y_train)
    print('{} training score: {:.3f}'.format(model, gs[model].score(X_train, y_train)))
    print('{} test score: {:.3f}'.format(model, gs[model].score(X_test, y_test)))

RuntimeError: Cannot clone object define_categories(combine_lowfreq_cats=True, new_categories_dict=None,
                  ordered_cat_features=['LotShape', 'LandContour', 'Utilities',
                                        'LandSlope', 'ExterQual', 'ExterCond',
                                        'BsmtQual', 'BsmtCond', 'BsmtExposure',
                                        'BsmtFinType1', 'BsmtFinType2',
                                        'HeatingQC', 'CentralAir',
                                        'KitchenQual', 'FireplaceQu',
                                        'GarageFinish', 'GarageQual',
                                        'GarageCond', 'PavedDrive', 'PoolQC',
                                        'Fence', 'OverallQual', 'OverallCond']), as the constructor either does not set or modifies parameter ordered_cat_features

In [114]:
from sklearn.base import BaseEstimator, TransformerMixin
from pandas.api.types import CategoricalDtype

class define_categories(BaseEstimator, TransformerMixin):
    '''Cast described features to pandas categoricals, optionally merging rare categories.

    Parameters
    ----------
    categories_dict : dict, optional
        ``{feature: [categories]}``. Features absent from the fitted frame are
        ignored. ``None`` is treated as an empty mapping.
    ordered_cat_features : list, optional
        Features to cast as ordered categoricals, using the category order
        given in ``categories_dict``. ``None`` is treated as an empty list.
    combine_lowfreq_cats : bool, default True
        For unordered features, merge every category seen at most 10 times in
        the fitted data (or never seen at all) into one new category named
        ``'<cat1>-<cat2>-..._combined'``.

    Notes
    -----
    Categories of unordered features are always handled as strings; integer
    and float columns of such features are converted with ``astype(str)`` in
    ``transform``. Missing values are not counted as a category.

    Raises
    ------
    Exception
        From ``fit`` when a single category would have to be merged but the
        feature has at most two observed categories.
    '''

    def __init__(self, categories_dict=None, ordered_cat_features=None, combine_lowfreq_cats=True):
        self.combine_lowfreq_cats = combine_lowfreq_cats
        self.categories_dict = categories_dict
        self.ordered_cat_features = ordered_cat_features

    def fit(self, X, y = None):
        self._cats_to_combine_dict = {}
        columns = X.columns.tolist()

        # The constructor defaults are None; treat them as "nothing configured".
        source_categories = self.categories_dict if self.categories_dict is not None else {}
        source_ordered = self.ordered_cat_features if self.ordered_cat_features is not None else []

        # Keep only the features that exist in the dataset.
        self._new_categories_dict = {feature: categories
                                     for feature, categories in source_categories.items()
                                     if feature in columns}
        self._new_ordered_cat_features = [feature for feature in source_ordered if feature in columns]
        self._unordered_cat_features = [feature for feature in self._new_categories_dict
                                        if feature not in self._new_ordered_cat_features]

        # All unordered categories have to be strings
        for feature in self._unordered_cat_features:
            self._new_categories_dict[feature] = list(map(str, self._new_categories_dict[feature]))

        if self.combine_lowfreq_cats:
            for feature in self._unordered_cat_features:
                # Drop NaN first so missing values are not counted as a 'nan' category.
                s = X[feature].dropna().astype(str).value_counts()
                observed = s.index.tolist()
                # Rare categories plus described categories that never occur.
                cats_to_combine = s[s<=10].index.tolist() + [cat for cat in self._new_categories_dict[feature]
                                                             if cat not in observed]
                if not cats_to_combine:
                    continue

                if len(cats_to_combine) == 1:
                    if len(s) <= 2:
                        raise Exception('The feature "{}" is too unbalanced:\n{}'.format(feature, s))
                    # A lone category cannot be "combined": pair it with the
                    # second-to-last entry of the frequency ranking.
                    cats_to_combine.append(s.index[-2])

                self._cats_to_combine_dict[feature] = cats_to_combine
                # Make a new category and apply it to the corresponding old categories
                new_cat = '-'.join(cats_to_combine)+'_combined'
                self._new_categories_dict[feature] = [cat for cat in observed if cat not in cats_to_combine] +[new_cat]
        return self

    def transform(self, X, y = None):
        df = X.copy()

        # Unordered categories are strings, so numeric columns must be strings
        # too; otherwise the categorical cast below would turn every value into NaN.
        for feature in self._unordered_cat_features:
            if df[feature].dtype == 'int64' or df[feature].dtype == 'float64':
                df[feature] = df[feature].astype(str)

        for feature, cats_to_combine in self._cats_to_combine_dict.items():
            # Same name as built in fit for the merged category.
            new_cat = '-'.join(cats_to_combine)+'_combined'
            df.loc[df[feature].isin(cats_to_combine), feature] = new_cat

        for col, categories in self._new_categories_dict.items():
            is_ordered = col in self._new_ordered_cat_features
            df[col] = df[col].astype(CategoricalDtype(categories=categories, ordered=is_ordered))

        return df

class year_to_age(BaseEstimator, TransformerMixin):
    '''Add age features derived from the year columns of the input frame.

    ``transform`` returns a copy of ``X`` with four extra columns:
    ``AgeRemodAdd``, ``AgeBuilding`` and ``AgeGarage`` (current year minus
    ``YearRemodAdd`` / ``YearBuilt`` / ``GarageYrBlt``) and ``AgeSold`` (days
    between now and the first day of the ``YrSold``-``MoSold`` month).
    The source columns are left in place.

    The features are computed from ``X`` itself, so the transformer gives
    row-consistent results on any frame (train, test, CV folds). Values
    depend on the date the code is run.
    '''

    def fit(self, X, y = None):
        return self

    def transform(self, X, y = None):
        df = X.copy()
        # One reference instant so all ages are measured from the same "now".
        now = pd.Timestamp('now')

        # Transforming time features to age features. Read from the frame
        # being transformed, never from a notebook-level global.
        df['AgeRemodAdd'] = now.year - X['YearRemodAdd']
        df['AgeBuilding'] = now.year - X['YearBuilt']
        time_sold = pd.to_datetime(X['YrSold'].astype(str)+ '-' + X['MoSold'].astype(str), format='%Y-%m')
        df['AgeSold'] = (now - time_sold).dt.days
        df['AgeGarage'] = now.year - X['GarageYrBlt']

        return df

# Imputation pipeline

In [115]:
from sklearn.pipeline import Pipeline, make_pipeline

# One (columns, SimpleImputer_pd keyword arguments) pair per imputation branch.
# The AgeGarage fill value is the largest garage age found in `train`.
_imputation_specs = [
    (['BsmtFullBath','BsmtHalfBath','GarageCars'],
     dict(strategy='median', add_indicator=False)),
    (['LotFrontage'],
     dict(strategy='median', add_indicator=True)),
    (['AgeGarage'],
     dict(strategy='constant', fill_value=year_to_age().fit_transform(train)['AgeGarage'].max(),
          add_indicator=False)),
    (['MasVnrArea'],
     dict(strategy='constant', fill_value=0, add_indicator=False)),
    (['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2',
      'FireplaceQu','GarageFinish','GarageQual','GarageCond','PoolQC','Fence',
      'Alley','GarageType'],
     dict(strategy='constant', fill_value='NA', add_indicator=False)),
    (['MasVnrType','MSZoning','Electrical','SaleType','KitchenQual','Exterior2nd','Exterior1st','Functional'],
     dict(strategy='most_frequent', add_indicator=False)),
    (['BsmtFinSF2','BsmtFinSF1','BsmtUnfSF','GarageArea','TotalBsmtSF'],
     dict(strategy='mean', add_indicator=False)),
]

# Each branch selects its columns and imputes them with the given settings.
imputation_pipes = [
    make_pipeline(ColumnsSelector_pd(selected), SimpleImputer_pd(**imputer_kwargs))
    for selected, imputer_kwargs in _imputation_specs
]

# Preprocessing pipelines

In [116]:
# Category definitions parsed straight from the description file (no
# mismatch report this time).
categories_dict = data_description_parser('data_description.txt')

# Features cast as ordered categoricals by define_categories.
ordered_cat_features = ['LotShape', 'LandContour', 'Utilities', 'LandSlope', 'ExterQual', 'ExterCond',
                        'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC',
                        'CentralAir', 'KitchenQual', 'FireplaceQu', 'GarageFinish', 'GarageQual',
                        'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'OverallQual', 'OverallCond']

# Columns removed after the age features have been derived from them.
# 'Utilities' is listed both here and in ordered_cat_features.
cols_to_drop = ['Id','YearRemodAdd','YearBuilt','YrSold','GarageYrBlt','MiscFeature','Street','Utilities']

# Every column handled by one of the imputation branches; the union below
# passes all remaining columns through unchanged via group='exclude'.
cols_to_impute = ['LotFrontage','BsmtFullBath','BsmtHalfBath','GarageCars','AgeGarage','MasVnrArea',
                  'MasVnrType','MSZoning','Electrical','SaleType','KitchenQual','Exterior2nd','Exterior1st',
                  'BsmtFinSF2','BsmtFinSF1','BsmtUnfSF','GarageArea','TotalBsmtSF','BsmtQual','BsmtCond',
                  'BsmtExposure','BsmtFinType1','BsmtFinType2','FireplaceQu','GarageFinish','GarageQual',
                  'GarageCond','PoolQC','Fence','Alley','GarageType','Functional']

# Stage 1: derive ages, drop raw columns, cast categoricals.
data_cleaning = make_pipeline(
    year_to_age(),
    drop_pd(cols_to_drop),
    define_categories(categories_dict, ordered_cat_features))

# Stage 2: impute the listed columns and pass the others through.
imputation = make_union_pd(
    imputation_pipes
    + [ColumnsSelector_pd(columns=cols_to_impute, group='exclude')]
)

# Stage 3: one-hot encode unordered categoricals, ordinal-encode ordered
# ones, and keep the columns selected by the 'numeric' group as they are.
encoding = make_union_pd([
    make_pipeline(
        ColumnsSelector_pd(group='unordered categorical'),
        OneHotEncoder_pd(get_categories=True)),
    make_pipeline(
        ColumnsSelector_pd(group='ordered categorical'),
        OrdinalEncoder_pd(get_categories=True)),
    ColumnsSelector_pd(group='numeric')
])

In [117]:
# Full preprocessing chain: cleaning -> imputation -> encoding -> standard scaling.
# preprocessing = make_pipeline(data_cleaning, imputation, encoding, StandardScaler_pd())
preprocessing = make_pipeline(data_cleaning, imputation, encoding, StandardScaler_pd())

# Shorter chain (cleaning + imputation only) for inspecting intermediate output.
testing = make_pipeline(data_cleaning, imputation)

In [122]:
# Fit the preprocessing chain on the training features (target removed) and
# apply it to the test set.
test_t = preprocessing.fit(train.drop(pred, axis=1)).transform(test)
# train_t = preprocessing.fit(train.drop(pred, axis=1)).transform(train)
# print([col for col in test.select_dtypes(include='category')
#                             if test.select_dtypes(include='category')[col].cat.ordered == True])
# print([col for col in train.select_dtypes(include='category')
#                             if train.select_dtypes(include='category')[col].cat.ordered == True])

In [33]:
# Distinct values of BsmtFullBath in the raw training data.
train['BsmtFullBath'].unique()

array([1, 0, 2, 3], dtype=int64)

In [110]:
# Check for columns that still contain missing values after preprocessing.
missing_values(test_t)

Unnamed: 0,Count,Fraction (%)


In [35]:
# Rows of the transformed test set in which GarageCars is null.
test_t.loc[test_t['GarageCars'].isnull()]

Unnamed: 0,MSSubClass_60,MSSubClass_50,MSSubClass_120,MSSubClass_30,MSSubClass_160,MSSubClass_70,MSSubClass_80,MSSubClass_90,MSSubClass_190,MSSubClass_85,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,AgeRemodAdd,AgeBuilding,AgeSold


In [64]:
# Missing-value summary of the raw test set.
missing_values(test)

Unnamed: 0,Count,Fraction (%)
PoolQC,1456,99.8 %
MiscFeature,1408,96.5 %
Alley,1352,92.7 %
Fence,1169,80.1 %
FireplaceQu,730,50.0 %
LotFrontage,227,15.6 %
GarageCond,78,5.3 %
GarageYrBlt,78,5.3 %
GarageQual,78,5.3 %
GarageFinish,78,5.3 %


In [1409]:
# Compare which columns are unordered categoricals after transforming the
# test set (first print) and the training set (second print). The pipeline is
# re-fitted on the training features for each call.
print(ColumnsSelector_pd(group='unordered categorical').fit_transform(preprocessing.fit(
    train.drop(pred, axis=1)).transform(test)).columns.tolist())
print(ColumnsSelector_pd(group='unordered categorical').fit_transform(preprocessing.fit(
    train.drop(pred, axis=1)).transform(train)).columns.tolist())

LotConfig ['FR3', 'FR2']
LotConfig ['FR3', 'FR2']
['MSSubClass', 'LotConfig', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Foundation', 'Heating', 'Electrical', 'Electrical', 'SaleCondition']
LotConfig ['FR3', 'FR2']
LotConfig ['FR3', 'FR2']
['MSSubClass', 'MSZoning', 'LotConfig', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'Foundation', 'Heating', 'Functional', 'SaleType', 'SaleCondition']


In [1412]:
# Train and test column names side by side (column 0 = train, column 1 = test).
pd.DataFrame([train.columns.tolist(),test.columns.tolist()]).T

Unnamed: 0,0,1
0,Id,Id
1,MSSubClass,MSSubClass
2,MSZoning,MSZoning
3,LotFrontage,LotFrontage
4,LotArea,LotArea
5,Street,Street
6,Alley,Alley
7,LotShape,LotShape
8,LandContour,LandContour
9,Utilities,Utilities


In [1405]:
# Distinct LotConfig values after preprocessing the test set. The recorded
# run of this cell ended in a ValueError about unknown categories.
preprocessing.fit(train.drop(pred, axis=1)).transform(test)['LotConfig'].unique()

LotConfig ['FR3', 'FR2']
LotConfig ['FR3', 'FR2']


ValueError: Found unknown categories ['Gtl', 'Sev', 'Mod'] in column 2 during transform

In [None]:
# Distinct LotConfig values in the raw test set.
test['LotConfig'].unique()

In [None]:
# Fit on the training features, then list the distinct MSSubClass values of
# the transformed training set in descending order.
p = preprocessing.fit(train.drop(pred, axis=1))
sorted(p.transform(train)['MSSubClass'].unique().tolist(), reverse=True)

# Models

In [126]:
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

# Registry of candidate models: name -> [pipeline, parameter grid for GridSearchCV].
models = {}

# Plain linear regression on the preprocessed features; empty grid = nothing to tune.
models['Linear Regression'] = [
    make_pipeline(preprocessing, LinearRegression()),
    {}
]

# models['Logistic Regression'] = [
#     make_pipeline(preprocessing, LogisticRegression(solver='lbfgs')),
#     {
#         'logisticregression__C': np.linspace(0.1,2,20)
#     }
# ]

In [127]:
from sklearn.model_selection import train_test_split

# Separate features and target, then hold out 30 % of the rows for testing
# (fixed random_state for a repeatable split).
X = train.drop(columns=pred)
y = train[pred]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# The four parts of the split, addressable by name.
train_split = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
}

In [128]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn.model_selection import RepeatedKFold

# Grid-search every entry of `models` on the training split and print its
# score on both splits.
# The target is fitted with a regression model, so plain repeated K-fold is
# used: stratified folds would treat every distinct price as a class label.
gs = {}
for model, [estimator, param_grid] in models.items():
    gs[model] = GridSearchCV(estimator, param_grid=param_grid, n_jobs=-2,
                             cv=RepeatedKFold(n_splits=3, n_repeats=10, random_state=0), iid=False, verbose=0)
    gs[model].fit(X_train, y_train)
    print('{} training score: {:.3f}'.format(model, gs[model].score(X_train, y_train)))
    print('{} test score: {:.3f}'.format(model, gs[model].score(X_test, y_test)))



Linear Regression training score: 0.891
Linear Regression test score: 0.751
