<a href="https://colab.research.google.com/github/koleshjr/ALL_MY_TEMPLATES/blob/main/Data_Driven_EDA_template.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DATA DRIVEN EDA

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

#### Finding missing values and the number of categorical and numerical columns

In [None]:
def Common_data_analysis(data, missing_threshold=10.0):
    """Print and display a dtype / missing-value overview of ``data``.

    Columns whose dtype is ``object`` are reported as categorical; every other
    column is reported as numerical. A styled table with one row per column
    (dtype, missing count, missing percentage) is shown through the
    notebook-provided ``display``.

    :param data: DataFrame to summarise.
    :param missing_threshold: rows of the table whose missing percentage is
        strictly above this value are highlighted in red.
    :return: tuple ``(columns, categorical_columns, numerical_columns)`` where
        ``columns`` is ``data.columns`` and the other two are lists of labels.
    """
    print("{:=^100}".format(" Common data analysis "))
    column = data.columns
    total_samples = data.shape[0]

    missing_values = data.isnull().sum()
    if total_samples:
        missing_value_percentage = (missing_values / total_samples * 100).round(2)
    else:
        # An empty frame has nothing missing; avoid the 0/0 division.
        missing_value_percentage = missing_values.astype(float)

    categorical_data = list(data.loc[:, data.dtypes == 'object'].columns)
    categorical_lookup = set(categorical_data)
    numerical_data = [name for name in column if name not in categorical_lookup]

    print()
    print("Numerical data list {} ---> total {} numerical values".format(numerical_data, len(numerical_data)))
    print("Categorical data list {} ---> total {} categorical values".format(categorical_data, len(categorical_data)))
    print()

    # Values are passed positionally (lists) so duplicated column labels
    # cannot trigger label alignment problems.
    percent_label = "% of Missing value"
    df = pd.DataFrame(
        {
            "data type": data.dtypes.tolist(),
            "Missing Value": missing_values.tolist(),
            percent_label: missing_value_percentage.tolist(),
        },
        index=column,
    )

    def highlight_high_missing_value(sample):
        # Look the percentage up by label: integer keys on a label-indexed
        # Series are a deprecated positional fallback.
        is_high = sample[percent_label] > missing_threshold
        colour = 'background-color: red;' if is_high else ''
        return pd.Series(colour, index=sample.index)

    display(df.style.apply(highlight_high_missing_value, axis=1))
    return (column, categorical_data, numerical_data)

# Run the overview on `data` and keep the returned column groupings;
# `numerical_data` is read again by the numerical statistics cell below.
data_columns, categorical_data, numerical_data = Common_data_analysis(data)


#### Column wise null values distribution

In [None]:
# Per-column null counts, largest first. The trailing 6 (train) and 5 (test)
# entries of each ranking are sliced off before plotting.
train_null = pd.DataFrame(train.isna().sum()).sort_values(by=0, ascending=False).iloc[:-6]
test_null = pd.DataFrame(test.isna().sum()).sort_values(by=0, ascending=False).iloc[:-5]

fig, axes = plt.subplots(1, 2, figsize=(18, 10))
null_panels = (
    (axes[0], train_null, "TRAIN DATA COLUMNS"),
    (axes[1], test_null, "TEST DATA COLUMNS"),
)
for null_axis, null_counts, null_caption in null_panels:
    # Horizontal bars: column names on y, null count on x.
    sns.barplot(x=null_counts[0], y=null_counts.index, ax=null_axis, palette="viridis")
    null_axis.set_xlabel(null_caption)

#### Row wise null values distribution

In [None]:


# For each dataset: the percentage of rows ("count") that contain a given
# number of missing values ("no").
missing_train_row = (
    train.isna().sum(axis=1)
    .value_counts(normalize=True)
    .mul(100)
    .rename_axis("no")
    .reset_index(name="count")
)
missing_test_row = (
    test.isna().sum(axis=1)
    .value_counts(normalize=True)
    .mul(100)
    .rename_axis("no")
    .reset_index(name="count")
)

fig, axes = plt.subplots(1, 2, figsize=(18, 6))
# Train goes on the left panel and test on the right so each plot matches the
# x-label set below (they were swapped before).
sns.barplot(x=missing_train_row["no"], y=missing_train_row["count"], ax=axes[0], palette="viridis")
sns.barplot(x=missing_test_row["no"], y=missing_test_row["count"], ax=axes[1], palette="viridis")
axes[0].set_ylabel("Percentage of Null values")
axes[1].set_ylabel("Percentage of Null values")
axes[0].set_xlabel("TRAIN DATASET")
axes[1].set_xlabel("TEST DATASET");



#### Missing values visualization

In [None]:
# Heatmap of the null mask of `data`, transposed so that each column of the
# dataset becomes one row of the picture.
plt.figure(figsize=(15, 10))
sns.heatmap(data.isna().T, cbar_kws=dict(label='Missing Data'))
plt.title('Heatmap showing Missing Values ', color='red', size=20)
plt.yticks(size=12)
plt.xticks(size=12)
plt.show()

#### Split of features into categorical and continuous

In [None]:
# A feature with fewer distinct values than this (train and test combined) is
# treated as categorical, otherwise as continuous.
MAX_CATEGORICAL_CARDINALITY = 15

df = pd.concat([train[FEATURES], test[FEATURES]], axis=0)
# Count distinct values once per column instead of once per list.
unique_counts = df.nunique()
del df

cat_features = [col for col in FEATURES if unique_counts[col] < MAX_CATEGORICAL_CARDINALITY]
cont_features = [col for col in FEATURES if unique_counts[col] >= MAX_CATEGORICAL_CARDINALITY]

print(f'Total number of features: {len(FEATURES)}')
# "\033[0m" resets the colour so it does not leak into later output.
print(f'\033[92mNumber of categorical features: {len(cat_features)}\033[0m')
print(f'\033[96mNumber of continuos features: {len(cont_features)}\033[0m')

plt.pie([len(cat_features), len(cont_features)],
        labels=['Categorical', 'Continuos'],
        colors=['#DE3163', '#58D68D'],
        textprops={'fontsize': 13},
        autopct='%1.1f%%')
plt.show()


#### Numerical Data stats description

In [None]:
def numerical_data_analysis(num_data):
    """Display descriptive statistics for the numeric columns of ``num_data``.

    The table has one row per column of ``num_data`` and the columns min, max,
    range, mean, standard deviation, Q1, median, Q3, inter-quartile range,
    kurtosis and skewness. Non-numeric columns keep their row but every
    statistic is shown as ``NA``. The skewness cell is coloured:

    * red   -- ``|skew| > 1`` (highly skewed)
    * blue  -- ``0.5 < |skew| <= 1`` (moderately skewed)
    * green -- ``skew == 0``

    The styled table is shown through the notebook-provided ``display``.

    :param num_data: DataFrame to summarise.
    :return: None.
    """
    print("{:=^100}".format(" Numerical data analysis "))

    # Work on the argument itself (not on notebook globals) and only on the
    # columns for which the statistics are defined.
    numeric = num_data.select_dtypes(include="number")
    min_value = numeric.min()
    max_value = numeric.max()
    q1_value = numeric.quantile(0.25)
    q3_value = numeric.quantile(0.75)

    df = pd.DataFrame(
        {
            "min": min_value,
            "max": max_value,
            "range(max-min)": max_value - min_value,
            "mean/average": numeric.mean(),
            "standard deviation": numeric.std(),
            "Q1": q1_value,
            "meadian/Q2": numeric.quantile(0.50),
            "Q3": q3_value,
            "Inter quantile range": q3_value - q1_value,
            "kurtosis": numeric.kurtosis(),
            "Skewness": numeric.skew(),
        }
    )
    # Bring back one row per input column; non-numeric ones become all-NaN.
    df = df.reindex(num_data.columns)

    def highlight_skewness(sample):
        # Only the skewness cell is ever coloured; everything else stays blank.
        style = pd.Series('', index=sample.index)
        skewness = sample["Skewness"]
        if pd.isna(skewness):
            return style
        if abs(skewness) > 1:
            style["Skewness"] = 'background-color: red;'
        elif abs(skewness) > 0.5:
            style["Skewness"] = 'background-color: blue;'
        elif skewness == 0:
            style["Skewness"] = 'background-color: green;'
        return style

    display(df.style.apply(highlight_skewness, axis=1).format(na_rep="NA"))



# Render the descriptive-statistics table for `data`.
numerical_data_analysis(data)

### Numerical Feature Distribution of both train and test data

In [None]:
ncols = 5
# Ceiling division over the features that are actually plotted, so the last
# partial row is kept instead of being dropped.
nrows = (len(cont_features) + ncols - 1) // ncols

if nrows == 0:
    print("No Continuous features")
else:
    # squeeze=False keeps `axes` two-dimensional even for a single row.
    fig, axes = plt.subplots(nrows, ncols, figsize=(18, 10), facecolor='#EAEAF2', squeeze=False)
    hist_panels = axes.ravel()

    # Overlay the train and test distribution of each continuous feature.
    for hist_panel, col in zip(hist_panels, cont_features):
        sns.histplot(x=train[col], ax=hist_panel, color='#58D68D', label='Train data', fill=True, kde=True)
        sns.histplot(x=test[col], ax=hist_panel, color='#DE3163', label='Test data', fill=True, kde=True)
        hist_panel.legend()
        hist_panel.set_ylabel('')
        hist_panel.set_xlabel(col, fontsize=8)
        hist_panel.tick_params(labelsize=5, width=0.5)
        hist_panel.xaxis.offsetText.set_fontsize(4)
        hist_panel.yaxis.offsetText.set_fontsize(4)

    # Hide the unused panels at the end of the last row.
    for hist_panel in hist_panels[len(cont_features):]:
        hist_panel.set_visible(False)
    plt.show()

#### Categorical Features train and test distribution

In [None]:
if not cat_features:
    print("No Categorical features")
else:
    ncols = 3
    # One panel per categorical feature, three per row (a single 18x5 row when
    # there are at most three features).
    nrows = (len(cat_features) + ncols - 1) // ncols

    # squeeze=False keeps `axes` two-dimensional whatever the grid size.
    fig, axes = plt.subplots(nrows, ncols, figsize=(18, 5 * nrows), squeeze=False)
    count_panels = axes.ravel()

    for count_panel, col in zip(count_panels, cat_features):
        # Pass the Series as `x=`: newer seaborn reads the first positional
        # argument as `data`, not as the variable to count.
        sns.countplot(x=train[col], ax=count_panel, palette="viridis", label='Train data')
        sns.countplot(x=test[col], ax=count_panel, palette="magma", label='Test data')
        count_panel.legend()
        count_panel.set_ylabel('')
        count_panel.set_xlabel(col, fontsize=20)
        count_panel.tick_params(labelsize=10, width=0.5)
        count_panel.xaxis.offsetText.set_fontsize(4)
        count_panel.yaxis.offsetText.set_fontsize(4)

    # Hide the unused panels at the end of the last row.
    for count_panel in count_panels[len(cat_features):]:
        count_panel.set_visible(False)
    plt.show()

#### Outlier Detection + discrete target

In [None]:
# Continuous features to inspect, one boxplot each, grouped by `target`.
cotinuous_column_list = [
    'song_duration_ms', 'acousticness', 'danceability',
    'energy', 'liveness', 'loudness',
    'speechiness', 'tempo', 'audio_valence',
]
row, col = 3, 3
fig, ax = plt.subplots(row, col, figsize=(15, 20))
# `ax.flat` walks the grid row by row, so features fill it left to right,
# top to bottom; `col_count` ends up as the number of plots drawn.
for col_count, (box_panel, feature_name) in enumerate(zip(ax.flat, cotinuous_column_list), start=1):
    sns.boxplot(data=data, x="target", y=feature_name, ax=box_panel, palette="Set2")

#### Finding the number of outliers using different methods
* For normally distributed features, use the z-score method.
* For non-normal (skewed) distributions, use the IQR method.

In [None]:
# Count the outliers of each feature: z-score method below for normally
# distributed data, IQR method for skewed data.
def find_outlier_z_score_method(data, new_feature=False, col_name=None, threshold=3):
    """Count the values lying more than ``threshold`` standard deviations from the mean.

    :param data: DataFrame (one feature per column) or a single Series.
        Only numeric columns are examined.
    :param new_feature: if True, also return a copy of the data with an extra
        ``num_of_outliers`` column holding the number of outlying values in
        each row. A Series input is returned as a one-column DataFrame.
    :param col_name: label used in the report when ``data`` is a Series;
        defaults to ``'Given feature'``. Ignored for DataFrame input.
    :param threshold: number of standard deviations that bounds the
        non-outlier range on both sides of the mean.

    :return: a DataFrame with the number of outliers per feature (label column
        ``"Features"`` for DataFrame input, ``"Feature"`` for Series input),
        preceded by the augmented data when ``new_feature`` is True.
    """
    single_feature = data.ndim == 1
    if single_feature:
        # Promote the Series to a one-column frame so both input kinds share
        # one code path (a Series has no axis=1 to sum over).
        df = data.copy().to_frame(name=col_name if col_name else 'Given feature')
    else:
        df = data.copy()

    numeric = df.select_dtypes(include="number")
    mean_each_features = numeric.mean(axis=0)
    std_each_features = numeric.std(axis=0)
    lower_limit_each_feature = mean_each_features - (threshold * std_each_features)
    upper_limit_each_feature = mean_each_features + (threshold * std_each_features)

    # True where a value falls outside its feature's limits.
    outlier_df = (numeric > upper_limit_each_feature) | (numeric < lower_limit_each_feature)
    number_of_outlier_each_feature = outlier_df.sum(axis=0)

    feature_label = "Feature" if single_feature else "Features"
    number_of_outlier_each_feature_df = pd.DataFrame({
        feature_label: number_of_outlier_each_feature.index,
        "Number of outliers": number_of_outlier_each_feature.values,
    })

    if new_feature:
        # Per-row count of outlying values, added to the returned copy only.
        df["num_of_outliers"] = outlier_df.sum(axis=1)
        return df, number_of_outlier_each_feature_df
    return number_of_outlier_each_feature_df

def find_outliers_iqr_method(data, new_feature=False, col_name=None, whisker=1.5):
    """Count the values lying outside ``[Q1 - whisker*IQR, Q3 + whisker*IQR]``.

    :param data: DataFrame (one feature per column) or a single Series.
        Only numeric columns are examined.
    :param new_feature: if True, also return a copy of the data with an extra
        ``num_of_outliers`` column holding the number of outlying values in
        each row. A Series input is returned as a one-column DataFrame.
    :param col_name: label used in the report when ``data`` is a Series;
        defaults to ``'Given feature'``. Ignored for DataFrame input.
    :param whisker: multiple of the inter-quartile range added beyond Q1 and
        Q3 to obtain the non-outlier limits.

    :return: a DataFrame with the number of outliers per feature (label column
        ``"Features"`` for DataFrame input, ``"Feature"`` for Series input),
        preceded by the augmented data when ``new_feature`` is True.
    """
    single_feature = data.ndim == 1
    if single_feature:
        # Promote the Series to a one-column frame so both input kinds share
        # one code path (a Series has no axis=1 to sum over).
        df = data.copy().to_frame(name=col_name if col_name else 'Given feature')
    else:
        df = data.copy()

    numeric = df.select_dtypes(include="number")
    q1_each_features = numeric.quantile(0.25)
    q3_each_features = numeric.quantile(0.75)
    iqr_each_feature = q3_each_features - q1_each_features
    lower_limit_each_feature = q1_each_features - (iqr_each_feature * whisker)
    upper_limit_each_feature = q3_each_features + (iqr_each_feature * whisker)

    # True where a value falls outside its feature's limits.
    outlier_df = (numeric > upper_limit_each_feature) | (numeric < lower_limit_each_feature)
    number_of_outlier_each_feature = outlier_df.sum(axis=0)

    feature_label = "Feature" if single_feature else "Features"
    number_of_outlier_each_feature_df = pd.DataFrame({
        feature_label: number_of_outlier_each_feature.index,
        "Number of outliers": number_of_outlier_each_feature.values,
    })

    if new_feature:
        # Per-row count of outlying values, added to the returned copy only.
        df["num_of_outliers"] = outlier_df.sum(axis=1)
        return df, number_of_outlier_each_feature_df
    return number_of_outlier_each_feature_df
    

# Count the IQR outliers of each listed continuous feature.
cotinuous_column_list = ['acousticness', 'danceability', 'energy',
                        'liveness', 'loudness', 'speechiness', 'tempo', 'audio_valence',
                        ] # 'song_duration_ms' is left out here: the author treats it as normally distributed, so it goes through the z-score method below.
outliers = find_outliers_iqr_method(data[cotinuous_column_list])
display(outliers)
find_outlier_z_score_method(data["song_duration_ms"], col_name="song_duration_ms") # col_name labels the report row for a single feature; without it the label falls back to 'Given feature'



#### Categorical Data Distribution (univariate)

In [None]:
# One count plot per discrete feature, stacked vertically.
descrete_column_list = ['key', 'audio_mode', 'time_signature']
row = 3
fig, ax = plt.subplots(row, 1, figsize=(8, 14))
# `col_count` ends up as the number of plots drawn.
for col_count, (count_axis, discrete_name) in enumerate(zip(ax, descrete_column_list), start=1):
    sns.countplot(data=data, x=discrete_name, ax=count_axis)

#### Correlation Analysis

In [None]:
# Pairwise correlation of the numeric columns, shown as an annotated heatmap.
fig = plt.figure(figsize=(18, 12))
sns.heatmap(
    # Correlation is only defined for numeric columns.
    data=data.select_dtypes(include="number").corr(),
    annot=True,
    # Correlations span [-1, 1]; a 0..1 colour range would render every
    # negative correlation like "no correlation".
    vmin=-1,
    vmax=1,
    center=0,
)
plt.show()

