In [13]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [14]:
# Load the training split of the House Prices competition data;
# keep_default_na=True keeps pandas' default set of NaN markers.
dataset = pd.read_csv(
    '../input/house-prices-advanced-regression-techniques/train.csv',
    keep_default_na=True,
)
# Report (rows, columns) of the loaded frame.
print(dataset.shape)

In [15]:
## Display all the columns of the dataframe (81 columns in this dataset).
## Use the public pd.set_option entry point rather than the accidental
## `pd.pandas` self-reference attribute.
pd.set_option('display.max_columns', None)
## Show the first 5 records
dataset.head()

In [16]:
## Missing Values
## 1. Find the features with missing values.
##    A feature qualifies as soon as it has at least one missing entry
##    (a `> 1` threshold would silently drop features with exactly one NaN).
features_with_na = [feature for feature in dataset.columns if dataset[feature].isnull().sum() > 0]

## 2. Print each feature name and the percentage of missing values.
##    isnull().mean() is a fraction in [0, 1]; multiply by 100 so the number
##    matches the '%' label.
for feature in features_with_na:
    print(feature, np.round(dataset[feature].isnull().mean() * 100, 2), ' %missing values')

In [17]:
import matplotlib.pyplot as plt
## Trying to find a relationship between missing values and SalePrice --- needs more analysis in the Feature Engineering section
for feature in features_with_na:
    # Indicator Series: 1 if the observation is missing, 0 otherwise.
    # Grouping by this Series avoids deep-copying the whole frame on every iteration;
    # the Series keeps the feature's name, so the bar chart's x-axis is still labelled with it.
    is_missing = dataset[feature].isnull().astype(int)

    # Median SalePrice where the information is missing (1) or present (0)
    dataset.groupby(is_missing)['SalePrice'].median().plot.bar()
    plt.ylabel('Median SalePrice')
    plt.title(feature)
    plt.show()

In [18]:
## Numerical variables: every column whose dtype is not object ('O')
numerical_variables = [name for name, dtype in dataset.dtypes.items() if dtype != 'O']

print('Number of Numerical Variables: ', len(numerical_variables))

In [19]:
## Temporal variables: numerical columns whose name contains 'Yr' or 'Year'
year_feature = [
    name
    for name in numerical_variables
    if any(token in name for token in ('Yr', 'Year'))
]
year_feature

In [20]:
## Show the distinct values taken by each year variable
for year_column in year_feature:
    distinct_values = dataset[year_column].unique()
    print(year_column, distinct_values)

In [21]:
## Analyze the temporal variables:
## is there a relation between the year the house was sold and the sale price?

# Median sale price per selling year, drawn as a line plot
median_price_by_year = dataset.groupby('YrSold')['SalePrice'].median()
median_price_by_year.plot()
plt.xlabel('YrSold')
plt.ylabel('SalePrice')
plt.title('House Price vs Year Sold')

In [23]:
## For every year feature except YrSold, plot SalePrice against the
## number of years between that feature and the year of sale
for feature in year_feature:
    if feature == 'YrSold':
        continue

    data = dataset.copy()
    # Replace the year column by its distance (in years) to the sale year
    years_before_sale = data['YrSold'] - data[feature]
    data[feature] = years_before_sale

    plt.scatter(data[feature], data['SalePrice'])
    plt.xlabel(feature)
    plt.ylabel('SalePrice')
    plt.show()

In [28]:
## Two types of numerical variables: continuous and discrete.
## A numerical variable is treated as discrete when it has fewer than 25
## distinct values and is neither a year feature nor the 'Id' column.

not_discrete = year_feature + ['Id']
discrete_feature = [
    column
    for column in numerical_variables
    if column not in not_discrete and len(dataset[column].unique()) < 25
]

print("Discrete Variables Count: {}".format(len(discrete_feature)))

In [29]:
# Display the list of discrete numerical feature names
discrete_feature

In [31]:
# Relationship between each discrete variable and SalePrice:
# one bar chart of the median SalePrice per distinct value
for feature in discrete_feature:
    data = dataset.copy()
    median_by_value = data.groupby(feature)['SalePrice'].median()
    median_by_value.plot.bar()

    plt.xlabel(feature)
    plt.ylabel('SalePrice')
    plt.show()
    

In [33]:
## Continuous variables: numerical variables that are not discrete,
## not year features, and not the 'Id' column
not_continuous = discrete_feature + year_feature + ['Id']
continuous_feature = [column for column in numerical_variables if column not in not_continuous]

In [35]:
## Create histograms of the continuous variables to understand their distributions

for feature in continuous_feature:
    # Plot straight from the dataset: a histogram only reads the column,
    # so no copy of the frame is needed.
    dataset[feature].hist(bins=25)
    plt.xlabel(feature)
    # The y-axis of a histogram is the number of observations per bin, not SalePrice
    plt.ylabel('Count')
    plt.title(feature)
    plt.show()

In [39]:
## EDA - Part 2: log-transform the right-skewed continuous variables
## and plot them against log(SalePrice)

for feature in continuous_feature:
    data = dataset.copy()

    # Skip the target itself and any feature containing zeros (log(0) is undefined)
    if feature == 'SalePrice' or 0 in data[feature].unique():
        continue

    data[feature] = np.log(data[feature])
    data['SalePrice'] = np.log(data['SalePrice'])

    plt.scatter(data[feature], data['SalePrice'])
    plt.xlabel(feature)
    plt.ylabel('SalePrice')
    plt.show()


In [40]:
## Take a look at the Outliers
# Start from a fresh copy of the raw dataset so this cell does not depend on
# whatever `data` an earlier cell left behind, and so that re-running it
# does not apply the log transform a second time.
data = dataset.copy()
for feature in continuous_feature:
    # Skip features containing zeros: log(0) is undefined
    if 0 in data[feature].unique():
        continue

    data[feature] = np.log(data[feature])
    data.boxplot(column=feature)
    plt.show()

In [41]:
## Categorical variables: every column whose dtype is object ('O')
categorical_features = [name for name, dtype in dataset.dtypes.items() if dtype == 'O']
categorical_features

In [42]:
# Report how many distinct values each categorical feature takes
# (unique() counts NaN as its own value)
for feature in categorical_features:
    category_count = len(dataset[feature].unique())
    print("The feature is {} and number of categories are {}".format(feature, category_count))

In [43]:
## Relationship between each categorical variable and the dependent feature (SalePrice):
## one bar chart of the median SalePrice per category
for feature in categorical_features:
    data = dataset.copy()
    median_by_category = data.groupby(feature)['SalePrice'].median()
    median_by_category.plot.bar()
    plt.xlabel(feature)
    plt.ylabel('SalePrice')
    plt.show()