In [1]:
import os
import warnings
# Silence warnings for the rest of the session: register an 'ignore' filter
# for FutureWarning, then a blanket 'ignore' filter for every category.
warnings.simplefilter(action = 'ignore', category=FutureWarning)
warnings.filterwarnings('ignore')
def ignore_warn(*args, **kwargs):
    """Accept any arguments and do nothing (no-op stand-in for warnings.warn)."""
    pass

# Replace warnings.warn with the no-op above, so anything that calls
# warnings.warn(...) through the module from here on emits nothing.
warnings.warn = ignore_warn # original intent: hide noisy warnings from sklearn and seaborn

import numpy as np
import pandas as pd
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x)) #Limiting floats output to 3 decimal points
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style="ticks", color_codes=True, font_scale=1.5)
color = sns.color_palette()
sns.set_style('darkgrid')
import mpl_toolkits
from mpl_toolkits.mplot3d import Axes3D
import pylab 

from scipy import stats
from scipy.stats import skew, norm, probplot, boxcox
from scipy.special import boxcox1p
from patsy import dmatrices
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import RobustScaler, PolynomialFeatures, StandardScaler, LabelEncoder
from sklearn.feature_selection import f_regression, mutual_info_regression, SelectKBest, RFECV, SelectFromModel
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.feature_extraction import FeatureHasher
from sklearn.decomposition import PCA, KernelPCA
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression, OrthogonalMatchingPursuit, Lasso, LassoLarsIC, ElasticNet, ElasticNetCV
from sklearn.linear_model import SGDRegressor, PassiveAggressiveRegressor, HuberRegressor, BayesianRidge
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor
import xgboost as xgb
from xgboost import XGBRegressor, plot_importance
import lightgbm as lgb

from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, cross_val_predict, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

ModuleNotFoundError: No module named 'mlxtend'

In [2]:
# Load the dataset from data.csv (the path is relative to the working directory).
data = pd.read_csv('data.csv')

In [3]:
# Remove the "Id" column. The frame is rebound rather than mutated in place
# and a missing column is tolerated, so re-running this cell is harmless
# (the in-place version raised KeyError on the second run).
data = data.drop(columns="Id", errors="ignore")

In [4]:
# Per-column summary of a DataFrame
def rstr(df, pred=None):
    """Summarise every column of ``df`` in one table and print a short overview.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.
    pred : column label, optional
        When given, a ``'corr <pred>'`` column is appended that holds the
        correlation (``DataFrame.corr`` with its default method) of each
        numeric column with ``df[pred]``.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``types``, ``counts``,
        ``distincts``, ``nulls``, ``missing_ration`` (percentage of missing
        rows), ``uniques`` (a one-element list wrapping the array of unique
        values), ``skewness``, ``kurtosis`` and, when ``pred`` is set,
        ``'corr <pred>'``. Statistics that are only computed for numeric
        columns are NaN on the other rows.

    Raises
    ------
    KeyError
        If ``pred`` is given but is not a numeric column of ``df``.
    """
    obs = df.shape[0]
    n_cols = df.shape[1]
    types = df.dtypes
    counts = df.count()
    nulls = df.isnull().sum()
    missing_ration = (nulls / obs) * 100

    # Build the unique-value cells by hand: each cell is a one-element list
    # wrapping the array of unique values. Going through DataFrame.apply with
    # a list-returning function does not reliably give one cell per column
    # across pandas versions.
    unique_cells = np.empty(n_cols, dtype=object)
    distinct_counts = []
    for pos in range(n_cols):
        values = df.iloc[:, pos].unique()
        unique_cells[pos] = [values]
        distinct_counts.append(len(values))
    uniques = pd.Series(unique_cells, index=df.columns)
    distincts = pd.Series(distinct_counts, index=df.columns, dtype='int64')

    # Moments and correlations are only defined for numeric data; restricting
    # the frame explicitly avoids the TypeError newer pandas raises when
    # object columns are present.
    numeric = df.select_dtypes(include=['number', 'bool'])
    skewness = numeric.skew()
    kurtosis = numeric.kurt()
    print('Data shape:', df.shape)

    parts = [types, counts, distincts, nulls, missing_ration, uniques, skewness, kurtosis]
    # Same labels whether or not ``pred`` is given (one branch used to emit
    # 'missing ration' with a space).
    cols = ['types', 'counts', 'distincts', 'nulls', 'missing_ration', 'uniques', 'skewness', 'kurtosis']

    if pred is not None:
        if pred not in numeric.columns:
            raise KeyError('{!r} is not a numeric column of the frame'.format(pred))
        parts.append(numeric.corr()[pred])
        cols.append('corr {}'.format(pred))

    # keys= names the columns directly; sort=False keeps the rows in the
    # frame's own column order.
    summary = pd.concat(parts, axis=1, keys=cols, sort=False)

    print('___________________________\nData types:\n', summary['types'].value_counts())
    print('___________________________')
    return summary

In [6]:
# Summarise every column together with its correlation to SalePrice, then
# show the table ordered from the highest correlation downwards.
details = rstr(data, pred='SalePrice')
display(details.sort_values('corr SalePrice', ascending=False))

Data shape: (1460, 80)
___________________________
Data types:
 object     43
int64      34
float64     3
Name: types, dtype: int64
___________________________


Unnamed: 0,types,counts,distincts,nulls,missing_ration,uniques,skewness,kurtosis,corr SalePrice
SalePrice,int64,1460,663,0,0.000,"[[208500, 181500, 223500, 140000, 250000, 1430...",1.883,6.536,1.000
OverallQual,int64,1460,10,0,0.000,"[[7, 6, 8, 5, 9, 4, 10, 3, 1, 2]]",0.217,0.096,0.791
GrLivArea,int64,1460,861,0,0.000,"[[1710, 1262, 1786, 1717, 2198, 1362, 1694, 20...",1.367,4.895,0.709
GarageCars,int64,1460,5,0,0.000,"[[2, 3, 1, 0, 4]]",-0.343,0.221,0.640
GarageArea,int64,1460,441,0,0.000,"[[548, 460, 608, 642, 836, 480, 636, 484, 468,...",0.180,0.917,0.623
TotalBsmtSF,int64,1460,721,0,0.000,"[[856, 1262, 920, 756, 1145, 796, 1686, 1107, ...",1.524,13.250,0.614
1stFlrSF,int64,1460,753,0,0.000,"[[856, 1262, 920, 961, 1145, 796, 1694, 1107, ...",1.377,5.746,0.606
FullBath,int64,1460,4,0,0.000,"[[2, 1, 3, 0]]",0.037,-0.857,0.561
TotRmsAbvGrd,int64,1460,12,0,0.000,"[[8, 6, 7, 9, 5, 11, 4, 10, 12, 3, 2, 14]]",0.676,0.881,0.534
YearBuilt,int64,1460,112,0,0.000,"[[2003, 1976, 2001, 1915, 2000, 1993, 2004, 19...",-0.613,-0.440,0.523


In [8]:
display(data.describe().transpose())

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MSSubClass,1460.0,56.897,42.301,20.0,20.0,50.0,70.0,190.0
LotFrontage,1201.0,70.05,24.285,21.0,59.0,69.0,80.0,313.0
LotArea,1460.0,10516.828,9981.265,1300.0,7553.5,9478.5,11601.5,215245.0
OverallQual,1460.0,6.099,1.383,1.0,5.0,6.0,7.0,10.0
OverallCond,1460.0,5.575,1.113,1.0,5.0,5.0,6.0,9.0
YearBuilt,1460.0,1971.268,30.203,1872.0,1954.0,1973.0,2000.0,2010.0
YearRemodAdd,1460.0,1984.866,20.645,1950.0,1967.0,1994.0,2004.0,2010.0
MasVnrArea,1452.0,103.685,181.066,0.0,0.0,0.0,166.0,1600.0
BsmtFinSF1,1460.0,443.64,456.098,0.0,0.0,383.5,712.25,5644.0
BsmtFinSF2,1460.0,46.549,161.319,0.0,0.0,0.0,0.0,1474.0


In [10]:
data.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

In [13]:
# Print the name of every column stored with the object dtype.
# The test has to inspect the column's dtype: `i` is only the column label,
# so comparing type(i) with `object` is never true and nothing was printed.
for i in data.columns:
    if data[i].dtype == object:
        print(i)

In [14]:
# Check what kind of object a single-column selection returns.
type(data['MSSubClass'])

pandas.core.series.Series

In [17]:
# Inspect the storage dtype of one column.
data['Exterior2nd'].dtype

dtype('O')

In [27]:
# Gather the labels of all columns whose dtype is `object`, echo them one per
# line, then show the complete list.
cat_features = [col for col in data.columns if data[col].dtypes == object]
for col in cat_features:
    print(col)
print(cat_features)

MSZoning
Street
Alley
LotShape
LandContour
Utilities
LotConfig
LandSlope
Neighborhood
Condition1
Condition2
BldgType
HouseStyle
RoofStyle
RoofMatl
Exterior1st
Exterior2nd
MasVnrType
ExterQual
ExterCond
Foundation
BsmtQual
BsmtCond
BsmtExposure
BsmtFinType1
BsmtFinType2
Heating
HeatingQC
CentralAir
Electrical
KitchenQual
Functional
FireplaceQu
GarageType
GarageFinish
GarageQual
GarageCond
PavedDrive
PoolQC
Fence
MiscFeature
SaleType
SaleCondition
['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',

In [31]:
# New frame without the columns listed in cat_features; `data` itself is untouched.
data_1 = data.drop(columns=cat_features)

In [32]:
# Dimensions (rows, columns) of the frame left after dropping cat_features.
data_1.shape

(1460, 37)

In [30]:
# Rebuild the per-column summary of `data` with its SalePrice correlation
# column and display it sorted by that column, largest value first.
details = rstr(data, pred='SalePrice')
display(details.sort_values(by=['corr SalePrice'], ascending=[False]))

Data shape: (1460, 80)
___________________________
Data types:
 object     43
int64      34
float64     3
Name: types, dtype: int64
___________________________


Unnamed: 0,types,counts,distincts,nulls,missing_ration,uniques,skewness,kurtosis,corr SalePrice
SalePrice,int64,1460,663,0,0.000,"[[208500, 181500, 223500, 140000, 250000, 1430...",1.883,6.536,1.000
OverallQual,int64,1460,10,0,0.000,"[[7, 6, 8, 5, 9, 4, 10, 3, 1, 2]]",0.217,0.096,0.791
GrLivArea,int64,1460,861,0,0.000,"[[1710, 1262, 1786, 1717, 2198, 1362, 1694, 20...",1.367,4.895,0.709
GarageCars,int64,1460,5,0,0.000,"[[2, 3, 1, 0, 4]]",-0.343,0.221,0.640
GarageArea,int64,1460,441,0,0.000,"[[548, 460, 608, 642, 836, 480, 636, 484, 468,...",0.180,0.917,0.623
TotalBsmtSF,int64,1460,721,0,0.000,"[[856, 1262, 920, 756, 1145, 796, 1686, 1107, ...",1.524,13.250,0.614
1stFlrSF,int64,1460,753,0,0.000,"[[856, 1262, 920, 961, 1145, 796, 1694, 1107, ...",1.377,5.746,0.606
FullBath,int64,1460,4,0,0.000,"[[2, 1, 3, 0]]",0.037,-0.857,0.561
TotRmsAbvGrd,int64,1460,12,0,0.000,"[[8, 6, 7, 9, 5, 11, 4, 10, 12, 3, 2, 14]]",0.676,0.881,0.534
YearBuilt,int64,1460,112,0,0.000,"[[2003, 1976, 2001, 1915, 2000, 1993, 2004, 19...",-0.613,-0.440,0.523
