In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats


In [None]:
# Load the training set and preview its first rows.
train_csv = '../inputs/train.csv'
df = pd.read_csv(train_csv)
df.head()

In [None]:
# Report the frame's dimensions, then list its column labels.
n_rows, n_cols = df.shape
print(f'DataFrame has {n_rows} of Rows, and {n_cols} of Columns')
print(df.columns)

In [None]:
# Drop the date, street and country features.
unused_features = ['date', 'street', 'country']
df = df.drop(columns=unused_features)

In [None]:
# Summary frame with one column per feature and two rows:
#   'datatype'        - the dtype of the feature
#   'pct_null_values' - the percentage of rows in which the feature is null
# Built with pd.concat because DataFrame.append was removed in pandas 2.0.
df_details = pd.concat([
    pd.DataFrame(df.dtypes).T.rename(index={0: 'datatype'}),
    pd.DataFrame(df.isnull().sum() / len(df) * 100).T.rename(index={0: 'pct_null_values'}),
])
df_details

In [None]:
# DATA CLEANING 

# set price as target values 
# fix yr_built and yr_renovated
# check city value counts and one-hot-encode
# clean state-zip and mean encode it 

In [None]:
# Replace the construction year with the number of years elapsed up to 2021.
df['yr_built'] = 2021 - df['yr_built']

In [None]:
# Turn the renovation year into "years since renovation" (relative to 2021).
# Any result above 2020 (i.e. an original year below 1) is reset to 0,
# which is how rows without a renovation year end up as 0.
years_since_renovation = 2021 - df.yr_renovated
df.yr_renovated = np.where(years_since_renovation > 2020, 0, years_since_renovation)

In [None]:
# Box plot stacked on top of a histogram of yr_built, sharing one x axis.
sns.set(style="ticks")
f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, figsize=(11.7, 8.27),
                                    gridspec_kw={"height_ratios": (.15, .85)})

# Pass the series as `x=` explicitly: newer seaborn releases treat the first
# positional argument as `data`, which would draw the box vertically and no
# longer line up with the histogram on the shared x axis.
sns.boxplot(x=df.yr_built, ax=ax_box)
sns.histplot(x=df.yr_built, ax=ax_hist)

# The box plot needs no y ticks or left spine; tidy both panels.
ax_box.set(yticks=[])
sns.despine(ax=ax_hist)
sns.despine(ax=ax_box, left=True)

In [None]:
# Box plot stacked on top of a histogram of price, sharing one x axis.
sns.set(style="ticks")
f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, figsize=(11.7, 8.27),
                                    gridspec_kw={"height_ratios": (.15, .85)})

# Pass the series as `x=` explicitly: newer seaborn releases treat the first
# positional argument as `data`, which would draw the box vertically and no
# longer line up with the histogram on the shared x axis.
sns.boxplot(x=df.price, ax=ax_box)
sns.histplot(x=df.price, ax=ax_hist)

# The box plot needs no y ticks or left spine; tidy both panels.
ax_box.set(yticks=[])
sns.despine(ax=ax_hist)
sns.despine(ax=ax_box, left=True)

In [None]:
# filter out outliers that are greater than 3 std from mean on target values 

def detect_outliers(col, threshold=3):
    """Return the values of `col` whose absolute z-score exceeds `threshold`.

    The z-score uses the population standard deviation (ddof=0) and ignores
    NaN entries when computing the mean and standard deviation.

    Parameters
    ----------
    col : array-like
        One-dimensional numeric values (list, ndarray, pandas Series, ...).
    threshold : float, default 3
        Number of standard deviations beyond which a value is an outlier.

    Returns
    -------
    list
        The outlying values, in their original order. A fresh list is built
        on every call, so repeated calls never accumulate earlier results.
        Empty when `col` is empty or has zero (or undefined) variance.
    """
    values = np.asarray(col)
    if values.size == 0:
        return []

    mu = np.nanmean(values)
    std = np.nanstd(values)
    # Zero or undefined spread: no value can be an outlier, and dividing
    # by it would only produce NaN/inf z-scores.
    if not np.isfinite(std) or std == 0:
        return []

    z_scores = (values - mu) / std
    return values[np.abs(z_scores) > threshold].tolist()
    

# Collect the outlying prices, then keep only the rows whose price is not
# among them; the shapes printed before and after show how much was removed.
outlier_pt = detect_outliers(df.price)
print(df.price.shape)
is_price_outlier = df.price.isin(outlier_pt)
df = df[~is_price_outlier]
print(df.shape)

In [None]:
# Box plot stacked on top of a histogram of price (after outlier removal),
# sharing one x axis.
sns.set(style="ticks")
f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, figsize=(11.7, 8.27),
                                    gridspec_kw={"height_ratios": (.15, .85)})

# Pass the series as `x=` explicitly: newer seaborn releases treat the first
# positional argument as `data`, which would draw the box vertically and no
# longer line up with the histogram on the shared x axis.
sns.boxplot(x=df.price, ax=ax_box)
sns.histplot(x=df.price, ax=ax_hist)

# The box plot needs no y ticks or left spine; tidy both panels.
ax_box.set(yticks=[])
sns.despine(ax=ax_hist)
sns.despine(ax=ax_box, left=True)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.style as style
import matplotlib.gridspec as gridspec

def plotting_3_chart(df, feature):
    """Draw a histogram, a probability (Q-Q) plot and a box plot of one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    feature : str
        Label of the column of `df` to visualise in all three charts.

    Returns
    -------
    None
        The figure is created through pyplot. Note that the matplotlib
        style is switched to 'fivethirtyeight' as a side effect.
    """
    style.use('fivethirtyeight')

    fig = plt.figure(constrained_layout=True, figsize=(15, 15))
    # Grid of 2 rows x 4 columns: histogram and Q-Q plot are stacked in the
    # first two columns, the box plot spans both rows of the third column.
    grid = gridspec.GridSpec(ncols=4, nrows=2, figure=fig)

    values = df.loc[:, feature]

    # Histogram with a KDE overlay, normalised to a density. This replaces
    # the deprecated sns.distplot(..., norm_hist=True).
    ax1 = fig.add_subplot(grid[0, :2])
    ax1.set_title('Histogram')
    sns.histplot(x=values, kde=True, stat='density', ax=ax1)

    # Probability plot produced by scipy.stats.probplot.
    ax2 = fig.add_subplot(grid[1, :2])
    ax2.set_title('QQ_plot')
    stats.probplot(values, plot=ax2)

    # Vertical box plot of the requested feature (previously hard-coded to
    # df.price, which ignored the `feature` argument).
    ax3 = fig.add_subplot(grid[:, 2])
    ax3.set_title('Box Plot')
    sns.boxplot(y=values, orient='v', ax=ax3)

# Render the three diagnostic charts for the price column.
plotting_3_chart(df, 'price')

In [None]:
# Strip plot of price (log scale) per bedroom count.
# A single axes is created: the original requested two rows of axes but only
# drew on the first, which left an empty panel in the figure.
f, ax1 = plt.subplots(figsize=(12, 5))
ax1.set(yscale="log")
sns.stripplot(x="bedrooms", y="price", data=df, ax=ax1, jitter=True, palette="Blues_d")

In [None]:
# 3D scatter of price against condition and building age; points are
# coloured by price using the 'hsv' colormap.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(projection="3d")

z_points = df['price']
x_points = df['condition']
y_points = df['yr_built']
ax.scatter3D(x_points, y_points, z_points, c=z_points, cmap='hsv')

ax.set_xlabel('Condition')
ax.set_ylabel('Number of Years since being built')
# Fixed the "Milions" typo in the axis label.
ax.set_zlabel('Price in Millions ($)')
plt.show()

In [None]:
# Strip every non-digit character (e.g. the "WA" prefix) from statezip and
# convert what is left to a numeric dtype.
# regex=True is required: since pandas 2.0 str.replace defaults to a literal
# match, so r'\D' would match nothing and to_numeric(errors='coerce') would
# then turn every zip code into NaN.
df['statezip'] = df['statezip'].str.replace(r'\D', '', regex=True)
df['statezip'] = pd.to_numeric(df['statezip'], errors='coerce')

In [None]:
# Count the distinct cities, then keep the labels of the ten most frequent
# ones (these are the ones that get one-hot encoded).
city_counts = df.city.value_counts()
print(len(city_counts))
top_10 = list(city_counts.sort_values(ascending=False).head(10).index)
print(top_10)

In [None]:
# Work on a copy so the one-hot columns are added to df_ohe_city rather than df.
df_ohe_city = df.copy()

def one_hot_top_x(df, variable, top_x_labels):
    """Add one 0/1 indicator column per label to `df`, in place.

    For every label in `top_x_labels` a column named '<variable>_<label>'
    is created, holding 1 where df[variable] equals the label and 0
    elsewhere. Nothing is returned; `df` itself is modified.
    """
    source = df[variable]
    for wanted in top_x_labels:
        indicator_name = variable + '_' + wanted
        df[indicator_name] = np.where(source == wanted, 1, 0)
        
        
# Add indicator columns for the ten most frequent cities, then discard the
# original city column and show the resulting column labels.
one_hot_top_x(df_ohe_city, 'city', top_10)

df_ohe_city = df_ohe_city.drop(columns='city')

df_ohe_city.columns

In [None]:
# Count how many rows fall under each statezip value.
df_ohe_city.statezip.value_counts()

In [None]:
# Target-encode statezip: every zip code is replaced by the mean price
# (the target) of the rows that share it.
mean_price_by_zip = df_ohe_city.groupby('statezip')['price'].mean()
city_target_encode = mean_price_by_zip.to_dict()
df_ohe_city['statezip'] = df_ohe_city['statezip'].map(city_target_encode)


In [None]:
# Separate copy of df that will have both city and statezip target-encoded.
df_target_encode = df.copy()

In [None]:
# Target-encode city first and statezip second: each value is replaced by
# the mean price of the rows sharing it.
for column in ('city', 'statezip'):
    city_target_encode = df_target_encode.groupby([column])['price'].mean().to_dict()
    df_target_encode[column] = df_target_encode[column].map(city_target_encode)

df_target_encode

In [None]:
# Target-encode city directly on df: each city becomes its mean price.
mean_price_by_city = df.groupby('city')['price'].mean()
city_target_encode = mean_price_by_city.to_dict()
df['city'] = df['city'].map(city_target_encode)
df

In [None]:
# Display df again (city now holds the per-city mean price).
df

In [None]:
# Correlation heatmap for df (city target-encoded).
# The upper triangle is masked so each pair of features appears only once.
corr = df.corr()
mask = np.triu(np.ones(corr.shape, dtype=bool))
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    ax=ax,
)



In [None]:
# Correlation heatmap for df_target_encode (city and statezip target-encoded).
# The upper triangle is masked so each pair of features appears only once.
corr = df_target_encode.corr()
mask = np.triu(np.ones(corr.shape, dtype=bool))
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    ax=ax,
)


In [None]:
# Correlation heatmap for df_ohe_city (top-10 cities one-hot encoded,
# statezip replaced by its mean price).
# The upper triangle is masked so each pair of features appears only once.
corr = df_ohe_city.corr()
mask = np.triu(np.ones(corr.shape, dtype=bool))
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    ax=ax,
)


In [None]:
# Persist the three prepared variants of the training data.
# NOTE(review): to_csv is called with its defaults, so the DataFrame index is
# written as the first column of every file.
for frame, path in (
    (df, '../inputs/train_clean.csv'),
    (df_ohe_city, '../inputs/train_ohe.csv'),
    (df_target_encode, '../inputs/train_target_encode.csv'),
):
    frame.to_csv(path)