# 1. Import Libraries

In [None]:
#install all needed libraries 
#!pip install -r requirements.txt

#import all libraries
import pandas as pd
import numpy as np 

#Figure export helper (pylab re-exports matplotlib's savefig)
from pylab import savefig

#visualization
import matplotlib.pyplot as plt 
import seaborn as sns 

%matplotlib inline

# 2. Load Dataset

In [None]:
#read the house price dataset from its CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer='datasets/house_price.csv')

In [None]:
#preview the first 5 rows (5 is also what head() uses when no count is given)
data.head(n=5)

In [None]:
#dimensions of the DataFrame as a (rows, columns) tuple
#nothing is printed explicitly: as the last expression of the cell its value is the cell output
data.shape

In [None]:
#print a summary of the DataFrame: column names, non-null counts, dtypes and memory usage
data.info()

In [None]:
#names of all columns (features) as a plain Python list
data.columns.tolist()

# 3. Descriptive analysis

In [None]:
#summary statistics per column (count, mean, std, min, quartiles, max)
#with default arguments describe() reports on the numeric columns when any are present
data.describe()

# 4. Numerical and Categorical Features

In [None]:
#separate numerical and categorical feature

#numerical features that the missing-value checks and plots below iterate over
numerical = [
    'TotalSqFeet',
    'TotBathroom',
    'OverallQual',
    'GarageCars',
    'Age',
]

#categorical
#categorical = ['']

# 5. Missing values

In [None]:
#number of missing values in each numerical field, largest count first
missing_flags = data[numerical].isnull()
count_missing = missing_flags.sum().sort_values(ascending=False)
count_missing.head()

In [None]:
#share of missing values per numerical field, in percent (two decimals), largest first
missing_share = data[numerical].isnull().sum() / len(data) * 100
percentage = missing_share.round(2).sort_values(ascending=False)
percentage.head()

## Exercise

In [None]:
#1. Merge the two Series (count_missing and percentage) into a single dataframe
#please put your code here




# 6. Univariate Analysis

In [None]:
#distribution plot of TotalSqFeet (histogram with a KDE curve on top)
dist_ax = sns.distplot(data['TotalSqFeet'], kde=True, color='darkblue', label='TotalSqFeet')
#note: set_title returns the title's Text object, so `fig` holds that Text, not a Figure
fig = dist_ax.set_title('Distribution Plot of Sq')

In [None]:
#horizontal box plot of TotalSqFeet
box_ax = sns.boxplot(x='TotalSqFeet', data=data, orient="h", color='cyan')
box_ax.set_title('Boxplot - TotalSqFeet')

## Exercise

In [None]:
# Create distribution plot for GarageCars
#please put your code here




In [None]:
# Create boxplot for TotBathroom
#please put your code here



## Univariate Analysis - one click to get all features

In [None]:
#create function to obtain the details of each feature
def univariate_analysis(data, features):
    """Plot and save a distribution plot and a box plot for every feature.

    For each name in ``features`` one figure with two stacked axes is created:
    the upper axes holds the distribution plot (histogram with a KDE curve),
    the lower axes a horizontal box plot. Each figure is saved as a numbered
    PNG file inside the ``exploration`` directory, which is created if missing.

    Parameters
    ----------
    data : object indexable by column name (``data[feature]``), e.g. a DataFrame.
    features : iterable of column names to plot.
    """
    import os

    #create the output folder up front so saving does not fail when it is missing
    output_dir = 'exploration'
    os.makedirs(output_dir, exist_ok=True)

    #looping through all features; numbering starts at 1 for titles and file names
    for number, feature in enumerate(features, start=1):

        #create space for two graphs
        fig, (dist_ax, box_ax) = plt.subplots(nrows=2, ncols=1, figsize=(10, 10))

        #Distribution plot
        #NOTE(review): sns.distplot is deprecated in recent seaborn releases;
        #consider sns.histplot once the installed seaborn version is confirmed
        dist_ax.set_title("%d. Distribution Plot - %s" % (number, feature), fontsize=16)
        sns.distplot(data[feature], kde=True, color='darkblue', label=feature, ax=dist_ax)

        #Boxplot
        box_ax.set_title("%d. Box Plot - %s" % (number, feature), fontsize=16)
        sns.boxplot(data=data, x=feature, color='cyan', orient="h", ax=box_ax)

        #store distribution plot and boxplot
        #os.path.join keeps the path portable: a hard-coded backslash is only a
        #directory separator on Windows and becomes part of the file name elsewhere
        file_name = '%d. Univariate analysis of %s.png' % (number, feature)
        fig.savefig(os.path.join(output_dir, file_name))

In [None]:
#run the univariate analysis for every numerical feature
univariate_analysis(data=data, features=numerical)

# 7. Bivariate Analysis

In [None]:
#scatter plot of one feature (x) against the target (y), here Age vs SalePrice
sns.scatterplot(data=data, x='Age', y='SalePrice')

## Exercise

In [None]:
# Create scatterplot for TotBathroom and SalePrice
#please put your code here



## Bivariate Analysis - one click to get all features

In [None]:
#create function to investigate the relationship of each feature to the label
def scatterplot(data, features, target):
    """Plot, save and show a scatter plot of each feature against the target.

    For each name in ``features`` a scatter plot with the feature on the x axis
    and ``target`` on the y axis is drawn, saved as a numbered PNG file inside
    the ``exploration`` directory (created if missing) and then shown.

    Parameters
    ----------
    data : data source handed to ``sns.scatterplot`` (e.g. a DataFrame).
    features : iterable of column names plotted on the x axis.
    target : column name plotted on the y axis.
    """
    import os

    #create the output folder up front so saving does not fail when it is missing
    output_dir = 'exploration'
    os.makedirs(output_dir, exist_ok=True)

    #loop through all features; numbering starts at 1 for the file names
    for number, feature in enumerate(features, start=1):

        #create scatter plot
        sns.scatterplot(x=feature, y=target, data=data)

        #store the scatter plot BEFORE showing it: once plt.show() has run the
        #current figure may already be released, and saving afterwards can
        #write an empty image
        file_name = '%d. Bivariate analysis of %s and %s.png' % (number, feature, target)
        plt.savefig(os.path.join(output_dir, file_name))

        #show the figure
        plt.show()

In [None]:
#plot every numerical feature against the sale price
scatterplot(data=data, features=numerical, target='SalePrice')

# 8. Correlation

In [None]:
#correlation with pearson method, shaded by value and displayed with two decimals
#Styler.set_precision was deprecated in pandas 1.3 and removed in pandas 2.0;
#Styler.format with a format string works on both old and new versions
data.corr(method='pearson').style.background_gradient().format('{:.2f}')

## Exercise

In [None]:
#correlation with spearman method
#put your code here


## Correlation with heatmap

In [None]:
#create correlation heatmap

#create correlation
corr = data.corr(method='pearson')

#boolean mask that is True strictly above the diagonal, so every pair is drawn
#only once (lower triangle and diagonal stay visible). An explicit boolean mask
#does not depend on the correlation values themselves: reusing the float
#correlations as a mask would leave an upper-triangle cell visible whenever
#its correlation is exactly 0
mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)

#size the figure once (20 x 5 inches is the size that was effectively in use)
fig, ax = plt.subplots(figsize=(20, 5))

#draw on the axes created above instead of relying on the implicit current axes
sns.heatmap(corr, mask=mask, vmax=0.9, square=True, annot=True, ax=ax)