In [1]:
import os
import numpy as np
import pandas as pd
from scipy import stats

import plotly.graph_objects as go

In [2]:
from plotly.offline import init_notebook_mode, iplot
# Initialise Plotly's notebook mode once, before any figure is displayed.
# NOTE(review): connected=True presumably loads plotly.js from the CDN instead
# of embedding it in the notebook — confirm against the plotly.offline docs.
init_notebook_mode(connected=True)

In [3]:
from read_missing_data import read_missing_df

In [4]:
import warnings
# Blanket filter: every warning raised after this cell is silenced, including
# numerical RuntimeWarnings and library deprecation notices.
# NOTE(review): consider narrowing this to specific categories/modules so that
# genuine problems are not hidden.
warnings.filterwarnings("ignore")

In [5]:
# Load the dataset analysed in the rest of the notebook.
# NOTE(review): assumed to be a pandas DataFrame (later cells call .copy() and
# .isna() on it) — confirm in read_missing_data.
data = read_missing_df()

In [6]:
class CramersVCorr():
    r"""
    Correlation matrix between categorical columns based on Cramér's V.

    Cramér's V is ``sqrt(chi2 / (n * min(r - 1, c - 1)))`` where ``chi2`` is
    Pearson's chi-squared statistic of the ``r x c`` contingency table and
    ``n`` is the total number of observations.
    https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V

    The instance is callable: ``CramersVCorr()(df, cols)`` is the same as
    ``CramersVCorr().calculate_matrix(df, cols)``.
    """
    def _score(self, cat_var1, cat_var2):
        r"""
        Cramér's V between two categorical variables.

        Parameters
        ----------
        cat_var1, cat_var2 : array-like accepted by ``pd.crosstab``
            The two variables to cross-tabulate.

        Returns
        -------
        float
            Value in ``[0, 1]``, or ``nan`` when the contingency table has
            fewer than two rows or two columns (one variable is constant, or
            there are no observations), where the statistic is undefined.
        """
        crosstab = pd.crosstab(cat_var1, cat_var2).to_numpy()
        mini = min(crosstab.shape) - 1
        if mini < 1:
            # Undefined (division by zero in the formula); report it
            # explicitly instead of relying on a 0/0 RuntimeWarning.
            return np.nan
        # correction=False: Cramér's V is defined on the plain Pearson
        # statistic. SciPy's default Yates correction is applied to every 2x2
        # table and would bias V downwards (V(x, x) would no longer be 1).
        stat = stats.chi2_contingency(crosstab, correction=False)[0]
        obs = crosstab.sum()
        return np.sqrt(stat / (obs * mini))

    def calculate_matrix(self, df, cat_var):
        r"""
        Pairwise Cramér's V for the columns ``cat_var`` of ``df``.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame holding the categorical columns.
        cat_var : sequence of column labels
            Columns to compare; also used as index and columns of the result.

        Returns
        -------
        pandas.DataFrame
            Symmetric matrix of scores rounded to 5 decimals.
        """
        names = list(cat_var)
        size = len(names)
        matrix = np.empty((size, size), dtype=float)
        for i, first in enumerate(names):
            # The measure is symmetric, so only the upper triangle is
            # computed and mirrored into the lower one.
            for j in range(i, size):
                matrix[i, j] = matrix[j, i] = self._score(df[first], df[names[j]])
        return pd.DataFrame(np.round(matrix, 5), columns=cat_var, index=cat_var)

    def __call__(self, df, *args, **kwargs):
        r"""Shortcut for :meth:`calculate_matrix`."""
        return self.calculate_matrix(df, *args, **kwargs)

### Missing value correlation matrix
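This plot is similar to a correlation matrix, but each cell shows the Cramér's V association between the missing-value indicators of two features.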

### This plot shows
- The missing-value correlation of every feature pair, drawn as a heatmap.
- A button that toggles whether the value annotations are shown.

#### Main trace
```python
trace = go.Heatmap(
    x=x,
    y=y,
    z=z,
    colorscale='YlGnBu',
    showscale=True,
    reversescale=False,
    zmin=0,
    zmax=1,
    hovertemplate="Corr ( %{x}, %{y} ) = %{z} <extra></extra>"
)
```

In [7]:
def gen_annotations(data, reverse_font_color=False):
    """
    Build one text annotation per cell of ``data`` for a heatmap.

    Each value is rounded to two decimals and placed at integer coordinates
    (column position on x, row position on y) of the ``x1``/``y1`` axes.
    Values below 0.5 are written in black and all others in white;
    ``reverse_font_color=True`` swaps the two colours.

    Returns the annotations as a list in row-major order.
    """
    rounded_rows = np.round(data.values, 2).tolist()
    col_positions = np.arange(data.shape[1])
    row_positions = np.arange(data.shape[0])

    def _font_color(value):
        # XOR flips the black/white choice when the colours are reversed.
        use_black = (value < 0.5) ^ reverse_font_color
        if use_black:
            return "#000000"
        return "#FFFFFF"

    return [
        go.layout.Annotation(
            text=str(value),
            x=col_positions[col_idx],
            y=row_positions[row_idx],
            xref="x1",
            yref="y1",
            font=dict(color=_font_color(value)),
            showarrow=False,
        )
        for row_idx, row_values in enumerate(rounded_rows)
        for col_idx, value in enumerate(row_values)
    ]

In [8]:
def plot_missing_value_correlation(data, fig=None, width=500, height=500, colorscale='YlGnBu', reverse_font_color=False):
    """
    Draw a correlation matrix as an annotated Plotly heatmap.

    Parameters
    ----------
    data : pandas.DataFrame
        Matrix to plot. Column labels become the x tick labels and index
        labels the y tick labels; values are rounded to two decimals.
    fig : plotly figure, optional
        Figure to draw into. A new ``go.Figure()`` is created when ``None``.
    width, height : int
        Figure size passed to the layout.
    colorscale : str
        Colorscale of the heatmap; the colour range is fixed to ``[0, 1]``.
    reverse_font_color : bool
        Forwarded to ``gen_annotations`` to swap the black/white annotation
        font colours (useful with a colorscale whose dark end is at 0).

    Returns
    -------
    tuple
        ``(fig, annotations)`` — the figure and the list of cell annotations
        that the "Show value" button toggles.
    """
    if fig is None:
        fig = go.Figure()

    x = data.columns.values.tolist()
    y = data.index.values.tolist()
    z = np.round(data.values, 2).tolist()

    # Cells are positioned on integer coordinates; the labels are attached
    # to those positions as tick text further down.
    x_positions = np.arange(len(x))
    y_positions = np.arange(len(y))

    fig.add_trace(
        go.Heatmap(
            x=x_positions,
            y=y_positions,
            z=z,
            colorscale=colorscale,
            showscale=True,
            reversescale=False,
            zmin=0,
            zmax=1,
            hovertemplate="Corr ( %{x}, %{y} ) = %{z} <extra></extra>"
        )
    )

    # Honour the caller's choice (it was previously hard-coded to False, which
    # made the `reverse_font_color` parameter a no-op).
    annotations = gen_annotations(data, reverse_font_color=reverse_font_color)

    fig.update_layout(
        title={
            'text': 'Missing Value Correlation',
            'y': 0.99,
            'x': 0,
            'xanchor': 'left',
            'yanchor': 'top',
            'font': {'size': 22},
        },
        annotations=annotations,
        # Tie the x scale to the y scale so that the cells stay square.
        xaxis=dict(scaleanchor='y', constrain='range'),
        width=width,
        height=height,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(238,238,238,0.2)',
        modebar=dict(bgcolor='rgba(0,0,0,0)', activecolor='rgba(68,68,68,0.7)', color='rgba(68,68,68,0.3)'),
    )

    # Reversed y axis puts the first row at the top, like a printed matrix.
    fig.update_yaxes(tickvals=y_positions, ticktext=y,
                     autorange='reversed', showgrid=False)
    fig.update_xaxes(tickvals=x_positions, ticktext=x, tickangle=-45,
                     showgrid=False)

    fig.update_layout(
        updatemenus=[
            dict(
                type="buttons",
                direction="right",
                buttons=[
                    dict(
                        # args shows the annotations, args2 removes them.
                        args=[{"annotations": annotations}],
                        args2=[{"annotations": []}],
                        label="Show value",
                        method="relayout"
                    )
                ],
                active=0,
                x=0.0, xanchor="left",
                y=1, yanchor="top",
                pad={"t": -50},
                bgcolor='rgba(255,255,255,1)',
            )
        ]
    )

    return fig, annotations

In [9]:
# Boolean mask of the missing cells, computed once and reused below
# (isna() already returns a new frame, so no defensive copy is needed).
missing_mask = data.isna()

# 0/1 indicator frame: 1 where the original value is missing, 0 otherwise.
# astype(int) gives a well-defined integer dtype, unlike replacing True/False
# on a boolean frame, which depends on implicit dtype downcasting.
data_ms = missing_mask.astype(int)

# Columns that contain at least one missing value, in their original order.
ms_cols = missing_mask.columns[missing_mask.any().values].tolist()

#### This plot shows only the features that have missing values

In [10]:
# Cramér's V matrix restricted to the columns that contain missing values.
missing_only = data_ms[ms_cols]
cramer_corr = CramersVCorr().calculate_matrix(missing_only, missing_only.columns)

fig, _ = plot_missing_value_correlation(
    data=cramer_corr,
    width=800,
    height=800,
)
fig.show(config={'displaylogo': False})
# fig.write_html('./example_plots/missing_value_corr.html', config={'displaylogo':False}, include_plotlyjs='cdn', full_html=False)

#### This plot shows all features; the correlation for any pair in which at least one feature has no missing values is `NaN`.

In [11]:
# Cramér's V matrix over every column of the missing-value indicator frame.
all_columns = data_ms.columns
cramer_corr_all = CramersVCorr().calculate_matrix(data_ms, all_columns)

fig, _ = plot_missing_value_correlation(
    data=cramer_corr_all,
    width=800,
    height=800,
)
fig.show(config={'displaylogo': False})
# fig.write_html('./example_plots/missing_value_corr_all.html', config={'displaylogo':False}, include_plotlyjs='cdn', full_html=False)