# Exploratory Data Analysis (EDA)

## EDA functions

Secondary functions

In [1]:
import pandas as pd

# ---------------------------------------------------------

# Show floats with a thousands separator and three decimals wherever pandas formats them
pd.set_option('display.float_format', '{:,.3f}'.format)

# ---------------------------------------------------------

""" 
Function to highlight rows based on data type
"""
def dtype_highlight(x):
    """
    Build the style of one row of a column/type table.

    x is a row exposing a 'type' entry. Rows typed object, int64/int32,
    float64/float32 and datetime64[ns] each get their own background
    colour; any other type gets an empty colour.
    Returns one CSS string per cell of the row.
    """
    palette = (
        (('object',), '#2b83ba'),
        (('int64', 'int32'), '#abdda4'),
        (('float64', 'float32'), '#ffffbf'),
        (('datetime64[ns]',), '#fdae61'),
    )
    kind = x['type']
    shade = ''
    for aliases, hex_code in palette:
        # The first matching group wins, exactly like an if/elif chain.
        if any(kind == alias for alias in aliases):
            shade = hex_code
            break

    return ['background-color : ' + shade] * len(x)

""" 
Function to highlight the row labelled 'unique'
"""
def highlight_row_unique(x):
    """
    Style one row of a summary table.

    The row whose label (x.name) is 'unique' is painted mediumpurple;
    every other row gets an empty colour.
    Returns one CSS string per cell of the row.
    """
    shade = 'mediumpurple' if x.name == 'unique' else ''
    return ['background-color: ' + shade for _ in x]

"""
Color zero or negative values in red
"""
def color_zero_red(val):
    """
    Return the CSS background for a single cell.

    Takes a scalar and returns 'background-color: indianred' when it is
    zero or negative (val <= 0); otherwise the colour is left blank.
    """
    if val <= 0:
        shade = 'indianred'
    else:
        shade = ' '
    return 'background-color: {}'.format(shade)

"""
Color zero values in green
"""
def color_zero_green(val):
    """
    Return the CSS background for a single cell.

    Takes a scalar and returns 'background-color: lightgreen' when it
    equals 0; otherwise the colour is left blank.
    """
    shade = ' '
    if val == 0:
        shade = 'lightgreen'
    return 'background-color: {}'.format(shade)

# ---------------------------------------------------------

## Principal functions. Each name starts with the prefix `fun_`.

In [2]:
# ---------------------------------------------------------

"""
Create table with types of DataFrame
"""
def fun_df_types(df):
    """
    Tabulate the dtype of every column of df.

    Returns a Styler over a two-column table ('column', 'type') whose
    rows are styled one by one with dtype_highlight.
    """
    type_table = df.dtypes.reset_index()
    type_table.columns = ['column', 'type']

    styler = type_table.style
    return styler.apply(dtype_highlight, axis=1)

"""
Return non-numeric columns summary
"""
def fun_non_numeric_summary(df):
    """
    Summarise the object-typed columns of df.

    Returns a Styler over describe(include=['O']) with the 'unique' row
    highlighted by highlight_row_unique. If a ValueError is raised along
    the way, a notice is printed and the function returns None.
    """
    try:
        object_stats = df.describe(include=['O'])
        styled = object_stats.style.apply(highlight_row_unique, axis =1)
    except ValueError:
        print("Oops! There aren't variables of object type")
        return None
    return styled

"""
Return numeric columns summary
"""
def fun_numeric_summary(df):
    """
    Summarise the non-object columns of df, ordered by ascending mean.

    Returns a Styler over describe(exclude='object') in which every cell
    is passed through color_zero_red. If a ValueError is raised along the
    way, a notice is printed and the function returns None.
    """
    try:
        # NOTE(review): exclude='object' keeps every non-object dtype, not
        # only numeric ones -- confirm that is the intended selection.
        desc = df.describe(exclude='object') \
            .sort_values('mean', axis = 1)
        styler = desc.style
        # Styler.applymap is deprecated in favour of Styler.map (pandas 2.1+);
        # use whichever the installed pandas provides so both old and new
        # versions keep working.
        cellwise = getattr(styler, 'map', None) or styler.applymap
        return cellwise(color_zero_red)
    except ValueError:
        print("Oops! There aren't variables of numeric type")

"""
Return how many missing values in each column
"""
def fun_missing_value(df):
    """
    Count the missing values of every column of df.

    Returns a Styler over a table with one row per column of df:
    'column' (the column label), 'missing_number' (its null count) and
    'pct_missing' (the null count as a percentage of df.shape[0]),
    sorted by descending 'pct_missing'. Every cell is passed through
    color_zero_green.
    """
    missing = df.isnull().sum()
    # Build the table from the labels and counts directly. Going through
    # reset_index() names the label column after the columns axis, so a
    # named axis would not produce the expected 'index' column and the
    # 'column' field would come out empty.
    df_missing = pd.DataFrame({
        'column': list(missing.index),
        'missing_number': missing.values,
    })
    df_missing['pct_missing'] = 100 * df_missing['missing_number'] / df.shape[0]

    styler = df_missing.sort_values(by=['pct_missing'], ascending =False).style
    # Styler.applymap is deprecated in favour of Styler.map (pandas 2.1+);
    # use whichever the installed pandas provides.
    cellwise = getattr(styler, 'map', None) or styler.applymap
    return cellwise(color_zero_green)

"""
Return rows with any NaN value
"""
def fun_any_missing_row(df):
    """
    Select the rows of df that hold at least one null value.
    """
    has_null = df.isnull().any(axis = 1)
    return df.loc[has_null]

"""
Return frequency for each dataframe column
"""
def fun_freq_column(column):
    """
    Percentage frequency of each distinct value of column.

    Returns a one-column table ('ptc_frequency') indexed by the distinct
    values and ordered from most to least frequent; each entry is the
    value's count divided by one hundredth of the total count.
    """
    counts = pd.crosstab(index=column, columns="ptc_frequency")
    counts = counts.sort_values('ptc_frequency', ascending=False)

    one_percent = counts.sum() / 100
    return counts / one_percent

# ---------------------------------------------------------

## Examples of using the EDA functions

In [3]:
from sklearn.datasets import load_iris

def change(number):
    """
    Look up entry `number` of the global iris.target_names.

    Returns "error" when the lookup raises IndexError.
    """
    try:
        label = iris.target_names[number]
    except IndexError:
        label = "error"
    return label

# Load the Iris data
# Call load_iris() once and reuse the result for both frames and for change().
iris = load_iris()
df_iris = pd.DataFrame(iris.data, columns = iris.feature_names)
# Series.map applies change() to each target code; DataFrame.applymap is
# deprecated in favour of DataFrame.map (pandas 2.1+), while Series.map is
# available on both old and new pandas.
iris_target = pd.DataFrame({'target': pd.Series(iris.target).map(change)})

In [5]:
# Dtype of every column, one highlighted row per column
fun_df_types(df_iris)

Unnamed: 0,column,type
0,sepal length (cm),float64
1,sepal width (cm),float64
2,petal length (cm),float64
3,petal width (cm),float64


In [6]:
# Non-numeric columns summary (df_iris has no object columns, so only the notice is printed)
fun_non_numeric_summary(df_iris)

Oops! There aren't variables of object type


In [7]:
# Numeric columns summary, columns ordered by ascending mean
fun_numeric_summary(df_iris)

Unnamed: 0,petal width (cm),sepal width (cm),petal length (cm),sepal length (cm)
count,150.0,150.0,150.0,150.0
mean,1.19867,3.054,3.75867,5.84333
std,0.763161,0.433594,1.76442,0.828066
min,0.1,2.0,1.0,4.3
25%,0.3,2.8,1.6,5.1
50%,1.3,3.0,4.35,5.8
75%,1.8,3.3,5.1,6.4
max,2.5,4.4,6.9,7.9


In [8]:
# Missing-value count and percentage for each column
fun_missing_value(df_iris)

Unnamed: 0,column,missing_number,pct_missing
0,sepal length (cm),0,0
1,sepal width (cm),0,0
2,petal length (cm),0,0
3,petal width (cm),0,0


In [9]:
# Rows containing at least one missing value (none in df_iris)
fun_any_missing_row(df_iris)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)


In [38]:
# Percentage frequency of each target class
fun_freq_column(iris_target['target'])

col_0,ptc_frequency
target,Unnamed: 1_level_1
setosa,33.333
versicolor,33.333
virginica,33.333
