In [None]:
%pylab inline
%config InlineBackend.figure_formats = ['retina']

import pandas as pd
import seaborn as sns
sns.set()

In [None]:
## Read the tab-separated Ames Housing data set into a DataFrame
datafile = "data/Ames_Housing_Data.tsv"
df = pd.read_csv(filepath_or_buffer=datafile, delimiter='\t')

In [None]:
# Preview the first rows of the freshly loaded frame
df.head()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Summary statistics for the frame's columns
df.describe()

In [None]:
# Per-column dtype and non-null count overview
df.info()

In [None]:
# Column labels as a pandas Index
df.columns

In [None]:
# Same column labels, as a plain Python list
df.columns.tolist()

In [None]:
# NOTE: this binds a second name to the SAME DataFrame object; no copy is made
data1 = df

In [None]:
# Display the frame through the alias
data1

In [None]:
# First rows through the alias
data1.head()

In [None]:
# Row count only
data1.shape[0]

In [None]:
# Display the frame through its original name
df

In [None]:
# First rows again, through the original name
df.head()

In [None]:
# This is recommended by the data set author to remove a few outliers

df = df.loc[df['Gr Liv Area'] <= 4000,:]
print("Number of rows in the data:", df.shape[0])
print("Number of columns in the data:", df.shape[1])
data = df.copy() # Keep a copy our original data

In [None]:
data.info()

In [None]:
# A quick look at the data:
df.head()

In [None]:
df.dtypes
#df.dtypes[df.dtypes == np.object]
# Get a Pd.Series consisting of all the string categoricals
one_hot_encode_cols = df.dtypes[df.dtypes == np.object]  # filtering by string categoricals
#print(one_hot_encode_cols)
one_hot_encode_cols = one_hot_encode_cols.index.tolist()  # list of categorical fields

df[one_hot_encode_cols].head()

In [None]:
df[one_hot_encode_cols].head().T

In [None]:
one_hot_encode_cols

In [None]:
x = data.dtypes[data.dtypes == np.object]
x = x.index.tolist()
x

In [None]:
df.loc

In [None]:
# Do the one hot encoding
df = pd.get_dummies(df, columns=one_hot_encode_cols, drop_first=True)
df.describe().T

In [None]:
df.head()

In [None]:
#data[['Order','Sale Type']]
data.loc[:, ['Order', 'Sale Type']]

In [None]:
data.head()

In [None]:
# Create a list of float colums to check for skewing
mask = data.dtypes == np.float
#mask
float_cols = data.columns[mask]
float_cols

In [None]:
skew_limit = 0.75 # define a limit above which we will log transform
skew_vals = data[float_cols].skew()

In [None]:
skew_vals

In [None]:
skew_cols = skew_vals.sort_values(ascending=False).to_frame().rename(columns={0:'Skew'}).query('abs(Skew) > {}'.format(skew_limit))
skew_cols

In [None]:
# Showing the skewed columns
skew_cols = (skew_vals
             .sort_values(ascending=False)
             .to_frame()
             .rename(columns={0:'Skew'})
             .query('abs(Skew) > {}'.format(skew_limit)))

skew_cols

In [None]:
# Label-based column slice: every column from 'Sale Type' through 'Sale Condition'
data.loc[:, 'Sale Type':'Sale Condition']

In [None]:
# Position-based selection of the columns at zero-based positions 2, 5 and 7,
# followed by summary statistics
data.iloc[:, [2,5,7]].describe()

In [None]:
# Visual check: histogram of one skewed feature before and after np.log1p.

# Feature to inspect
field = "BsmtFin SF 1"

# One figure with two side-by-side axes
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(10, 5))

# Raw values on the left, log1p-transformed values on the right
raw_values = df[field]
logged_values = raw_values.apply(np.log1p)
raw_values.hist(ax=ax_before)
logged_values.hist(ax=ax_after)

# Identical axis labelling for both panels; only the title differs
for axis, heading in ((ax_before, 'before np.log1p'), (ax_after, 'after np.log1p')):
    axis.set(title=heading, ylabel='frequency', xlabel='value')
fig.suptitle('Field "{}"'.format(field));

In [None]:
# Perform the skew transformation:

for col in skew_cols.index.values:
    if col == "SalePrice":
        continue
    df[col] = df[col].apply(np.log1p)

In [None]:
# We now have a larger set of potentially-useful features
df.shape

In [None]:
# There are a *lot* of variables. Let's go back to our saved original data and look at how many values are missing for each variable. 
df = data
data.isnull().sum().sort_values()

In [None]:
smaller_df= df.loc[:,['Lot Area', 'Overall Qual', 'Overall Cond', 
                      'Year Built', 'Year Remod/Add', 'Gr Liv Area', 
                      'Full Bath', 'Bedroom AbvGr', 'Fireplaces', 
                      'Garage Cars','SalePrice']]

In [None]:
smaller_df

In [None]:
smaller_df.describe().T

In [None]:
smaller_df.info()

In [None]:
# There appears to be one NA in Garage Cars - we will take a simple approach and fill it with 0
smaller_df = smaller_df.fillna(0)

In [None]:
smaller_df.info()

In [None]:
# pair plot of features
sns.pairplot(smaller_df, plot_kws=dict(alpha=.1, edgecolor='none'))

In [None]:
#Separate our features from our target

X = smaller_df.loc[:,['Lot Area', 'Overall Qual', 'Overall Cond', 
                      'Year Built', 'Year Remod/Add', 'Gr Liv Area', 
                      'Full Bath', 'Bedroom AbvGr', 'Fireplaces', 
                      'Garage Cars']]

y = smaller_df['SalePrice']

In [None]:
X.info()

In [None]:
X2 = X.copy()

X2['OQ2'] = X2['Overall Qual'] ** 2
X2['GLA2'] = X2['Gr Liv Area'] ** 2

In [None]:
X3 = X2.copy()

# multiplicative interaction
X3['OQ_x_YB'] = X3['Overall Qual'] * X3['Year Built']

# division interaction
X3['OQ_/_LA'] = X3['Overall Qual'] / X3['Lot Area']

In [None]:
data['House Style'].value_counts()

In [None]:
pd.get_dummies(df['House Style'], drop_first=True).head()

In [None]:
pd.get_dummies(df['House Style']).head()

In [None]:
nbh_counts = df.Neighborhood.value_counts()
nbh_counts

In [None]:
other_nbhs = list(nbh_counts[nbh_counts <= 8].index)

other_nbhs

In [None]:
X4 = X3.copy()

X4['Neighborhood'] = df['Neighborhood'].replace(other_nbhs, 'Other')

In [None]:
def add_deviation_feature(X, feature, category):
    """Add a within-category standardized deviation of `feature` to `X`, in place.

    For every row, the new value is the number of (sample) standard deviations
    by which `feature` differs from the mean of `feature` over all rows that
    share the same `category` value.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing both columns; it is modified in place.
    feature : str
        Name of the numeric column to standardize.
    category : str
        Name of the column whose values define the groups.

    Returns
    -------
    None
        The result is written to the new column ``feature + '_Dev_' + category``.

    Notes
    -----
    A group with a single row has an undefined sample standard deviation, so
    its deviation is NaN. A group whose values are all identical has a zero
    standard deviation, so its deviation is NaN (0/0).
    """
    # temp groupby object
    category_gb = X.groupby(category)[feature]

    # per-row category mean and standard deviation; the built-in aggregation
    # names compute the same values as per-group Python lambdas
    category_mean = category_gb.transform('mean')
    category_std = category_gb.transform('std')

    # stds from the category mean for each value, added to X as a new feature
    X[feature + '_Dev_' + category] = (X[feature] - category_mean) / category_std

In [None]:
X5 = X4.copy()
X5['House Style'] = df['House Style']
add_deviation_feature(X5, 'Year Built', 'House Style')
add_deviation_feature(X5, 'Overall Qual', 'Neighborhood')

In [None]:
X5

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
#Instantiate and provide desired degree; 
#   Note: degree=2 also includes intercept, degree 1 terms, and cross-terms

pf = PolynomialFeatures(degree=2)

In [None]:
features = ['Lot Area', 'Overall Qual']
pf.fit(df[features])

In [None]:
pf.get_feature_names()  #Must add input_features = features for appropriate names

In [None]:
feat_array = pf.transform(df[features])
pd.DataFrame(feat_array, columns = pf.get_feature_names(input_features=features))