In [1]:
import os.path
import datetime
from mlsettings.settings import load_app_config, get_datafolder_path
from mltools.modelbuilder.supervised  import SupervisedDataLoader 
from datatools.customtransformers import LogTransformer
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV 
import warnings
warnings.filterwarnings('ignore')
% matplotlib inline 
np.set_printoptions(precision=4)

pd.set_option('display.width', 200)
pd.set_option('precision', 4)
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
sns.set_style("whitegrid")

In [2]:
load_app_config()
DIRECTORY="kaggle_housing"
TRAIN_FILE ='train.csv'
TEST_FILE = 'test.csv'
RESPONSE = 'SalePrice'
LOGRESPONSE = 'LogSalePrice'
input_path = get_datafolder_path()

train_file = os.path.join(input_path, DIRECTORY, TRAIN_FILE)
test_file = os.path.join(input_path, DIRECTORY, TEST_FILE)
print(train_file)
print(test_file)

Adding D:\DataSource  to system path
Adding D:\MachineLearning  to system path
D:\DataSource\kaggle_housing\train.csv
D:\DataSource\kaggle_housing\test.csv


In [3]:
#df_loader = DataFrameLoader(train_file, sep=',',header=0 ,encoding='utf8')
sm = SupervisedDataLoader(train_file=train_file,test_file=test_file,response =RESPONSE)
train_dataset ,test_dataset  = sm.train_dataset,sm.test_dataset


Loading train_file :D:\DataSource\kaggle_housing\train.csv
Loading test_file :D:\DataSource\kaggle_housing\test.csv


In [4]:
sm.describe_target()

count      1460.0000
mean     180921.1959
std       79442.5029
min       34900.0000
25%      129975.0000
50%      163000.0000
75%      214000.0000
max      755000.0000
Name: SalePrice, dtype: float64

In [26]:
missing_df =train_dataset.isnull().sum().to_frame().reset_index()
missing_df = missing_df.rename(columns= {0: 'Count','index':'Feature'})
missing_df[missing_df['Count']!=0]

Unnamed: 0,Feature,Count
3,LotFrontage,259
6,Alley,1369
25,MasVnrType,8
26,MasVnrArea,8
30,BsmtQual,37
31,BsmtCond,37
32,BsmtExposure,38
33,BsmtFinType1,37
35,BsmtFinType2,38
42,Electrical,1


In [27]:
missing_df =test_dataset.isnull().sum().to_frame().reset_index()
missing_df = missing_df.rename(columns= {0: 'Count','index':'Feature'})
missing_df[missing_df['Count']!=0]

Unnamed: 0,Feature,Count
2,MSZoning,4
3,LotFrontage,227
6,Alley,1352
9,Utilities,2
23,Exterior1st,1
24,Exterior2nd,1
25,MasVnrType,16
26,MasVnrArea,15
30,BsmtQual,44
31,BsmtCond,45


In [7]:
Checkpoint

NameError: name 'Checkpoint' is not defined

In [None]:
tgt_plot = sm.get_target_plot(continuous=True)
tgt_plot.show()

In [None]:
tgt_plot = sm.get_target_plot(continuous=True,convert_log = True)
tgt_plot.show()

In [None]:
continuous_features ,categorical_features  = sm.get_feature_groups(train_dataset)

In [None]:
# Grid of distribution plots of all numerical features
f = pd.melt(train_dataset, value_vars=sorted(continuous_features))
g = sns.FacetGrid(f, col='variable', col_wrap=4, sharex=False, sharey=False)
g = g.map(sns.distplot, 'value')

In [None]:
 train_dataset[['MSSubClass','MoSold','YrSold']].info()

In [None]:
f = pd.melt(train_dataset, value_vars=sorted(categorical_features))
g = sns.FacetGrid(f, col='variable', col_wrap=4, sharex=False, sharey=False)
plt.xticks(rotation='vertical')
g = g.map(sns.countplot, 'value')
[plt.setp(ax.get_xticklabels(), rotation=60) for ax in g.axes.flat]
g.fig.tight_layout()
plt.show()

In [None]:
correlation = train_dataset[continuous_features].corr()
fig = plt.figure(figsize=(16,15))
ax = fig.add_subplot(111)
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(correlation, 
           xticklabels=correlation.columns.values,
           yticklabels=correlation.index.values,
           cmap=cmap)
ax.xaxis.tick_top()
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()

In [None]:
correlation = correlation.sort_values('SalePrice', ascending=False)
plt.figure(figsize=(8,10))
sns.barplot( correlation['SalePrice'][1:], correlation.index[1:], orient='h')
plt.show()

In [None]:
train_dataset["TotBath"] = train_dataset["FullBath"] + 0.5*train_dataset["HalfBath"] + train_dataset["BsmtFullBath"] + 0.5*train_dataset["BsmtHalfBath"]
train_dataset["TotArea"] = train_dataset["GrLivArea"] + train_dataset["TotalBsmtSF"]
train_dataset['TotalFloorSF'] = train_dataset['1stFlrSF'] + train_dataset['2ndFlrSF']
train_dataset.drop(["FullBath","HalfBath","BsmtFullBath","BsmtHalfBath","1stFlrSF","2ndFlrSF"],axis=1,inplace=True)

test_dataset["TotBath"] = test_dataset["FullBath"] + 0.5*test_dataset["HalfBath"] + test_dataset["BsmtFullBath"] + 0.5*test_dataset["BsmtHalfBath"]
test_dataset["TotArea"] = test_dataset["GrLivArea"] + test_dataset["TotalBsmtSF"]
test_dataset['TotalFloorSF'] = test_dataset['1stFlrSF'] + test_dataset['2ndFlrSF']
test_dataset.drop(["FullBath","HalfBath","BsmtFullBath","BsmtHalfBath","1stFlrSF","2ndFlrSF"],axis=1,inplace=True)

In [None]:
continuous_features ,categorical_features  = sm.get_feature_groups(train_dataset)
count_features = ['TotBath','BedroomAbvGr','KitchenAbvGr','TotalRmsAbvGr','Fireplaces','GarageCars']
ordinal_features = ['OverallQual','OverallCond','YearBuilt','YearRemodAdd','TotRmsAbvGrd','GarageYrBlt','MSSubClass','MoSold','YrSold']
non_count_features = [f for f in continuous_features if f not in count_features + ordinal_features+ ['Id','SalePrice']]
non_count_features

In [None]:
log_transformer = LogTransformer(non_count_features)
train_dataset = log_transformer.fit_transform(train_dataset)
test_dataset  = log_transformer.fit_transform(test_dataset)

In [None]:
f = pd.melt(train_dataset, value_vars=sorted(non_count_features))
g = sns.FacetGrid(f, col='variable', col_wrap=4, sharex=False, sharey=False)
g = g.map(sns.distplot, 'value')

In [None]:
train_dataset[1000:1005]