In [None]:
import pandas as pd
import altair as at
at.data_transformers.disable_max_rows()

def read_knmi_data(filename, names):
    """Load a KNMI text export into a DataFrame.

    Parameters
    ----------
    filename : path or file-like object handed to ``pd.read_csv``.
    names : list of column labels to assign (the file has no header row).

    Returns
    -------
    pandas.DataFrame with the columns labelled by ``names``; the second
    column is passed to pandas for date parsing.
    """
    csv_options = {
        'comment': '#',             # everything after a '#' is treated as a comment
        'header': None,             # no header row; labels come from `names`
        'names': names,
        'skipinitialspace': True,   # ignore the blanks that follow each ','
        'parse_dates': [1],         # let pandas parse the second column as dates
    }
    return pd.read_csv(filename, **csv_options)

# Read the KNMI export and label its columns (in file order) with English names.
knmi_data = read_knmi_data(
    'KNMI_20200218.txt',
    names=[
        'station', 'datum',
        'Wsp_avg', 'Wsp_1hravg', 'Wsp_max',
        'T_avg', 'T_min', 'T_max',
        'Sol_duration', 'Global_radiation',
        'Precip_total', 'Precip_hrmax',
        'Rel_humid', 'Evaporation',
    ],
)
# Show the first rows as a quick sanity check of the parse.
knmi_data.head()

In [None]:
# Line chart of the monthly mean of T_avg over time.
# The columns are labelled in English when the file is read, so the field is
# 'T_avg'; knmi_data has no 'T_gem' column.
at.Chart(knmi_data).mark_line().encode(
    x='yearmonth(datum):T',
    y='mean(T_avg)'
).properties(
    width=400,
    height=100
).interactive()

In [None]:
# Scatter of temperature against precipitation.
# knmi_data has no 'T_gem' or 'Neerslag' columns (the columns are labelled in
# English on load), so the English labels are used here.
# NOTE(review): 'Neerslag' presumably maps to 'Precip_total' rather than
# 'Precip_hrmax' — confirm.
at.Chart(knmi_data).mark_circle().encode(
    x='T_avg',
    y='Precip_total'
)

# Linked corr plot

Een aantal ontwerpbeslissingen:

- Alles zoveel mogelijk in pandas voorbewerken voordat het naar Altair gaat. 

## Oplossing met subsampling

In [None]:
# Pairwise correlations in long form: one row per (variable, variable2) pair.
# select_dtypes keeps only the numeric columns, so the parsed `datum` column is
# excluded explicitly instead of relying on how DataFrame.corr() treats
# non-numeric columns (that behaviour differs between pandas versions).
cor_data = (knmi_data.drop(columns=['station'])
              .select_dtypes(include='number')
              .corr().stack()
              .reset_index()
              .rename(columns={0: 'correlation', 'level_0': 'variable', 'level_1': 'variable2'}))
# Two-decimal text label that is drawn on top of each heat map tile.
cor_data['correlation_label'] = cor_data['correlation'].map('{:.2f}'.format)

# Long format: one row per (station, datum, variable).
knmi_data_long = knmi_data.melt(id_vars=['station', 'datum'])

# Join the wide frame back on and melt again, giving one row per
# (station, datum, variable, variable2) with both values side by side.
knmi_data_long = knmi_data_long.merge(knmi_data, 
                     on=['station', 'datum']).melt(id_vars=['station', 'datum', 'variable', 'value'],
                                                   var_name='variable2', value_name='value2')
# Keep 10% of the rows of every variable pair. The fixed random_state makes the
# subsample (and therefore the plot) identical on every run.
knmi_long_subsample = knmi_data_long.groupby(['variable', 'variable2'], as_index=False).apply(
    lambda df: df.sample(frac=0.1, random_state=0))
# Discard the index produced by groupby/apply; drop=True does not depend on the
# names pandas gives to those index levels.
knmi_long_subsample = knmi_long_subsample.reset_index(drop=True)
knmi_long_subsample['variable'].unique()

In [None]:
# Selection of a single heat map tile, i.e. one (variable, variable2) pair.
# It starts on (T_avg, T_avg) and cannot be cleared.
var_sel_cor = at.selection_single(
    init={'variable': 'T_avg', 'variable2': 'T_avg'},
    fields=['variable', 'variable2'],
    clear=False,
)

# Shared x/y layout for the heat map tiles and their text labels.
base = (
    at.Chart(cor_data)
    .encode(x='variable2:O', y='variable:O')
    .properties(width=300, height=300)
)

# Correlation label per tile: white when correlation > 0.5, black otherwise.
text = base.mark_text().encode(
    color=at.condition(at.datum.correlation > 0.5, at.value('white'), at.value('black')),
    text='correlation_label',
)

# Heat map tiles; the selected tile is pink, the others are coloured by correlation.
cor_plot = (
    base.mark_rect()
    .encode(color=at.condition(var_sel_cor, at.value('pink'), 'correlation:Q'))
    .add_selection(var_sel_cor)
)

# Scatter of the subsampled data, filtered to the selected variable pair.
scat_plot = (
    at.Chart(knmi_long_subsample)
    .transform_filter(var_sel_cor)
    .mark_point()
    .encode(x='value:Q', y='value2:Q')
)

# Scatter on the left, labelled heat map on the right.
scat_plot | (cor_plot + text)

In [None]:
## Oplossing met aggregatie

In [None]:

import pandas as pd
import altair as at
at.data_transformers.disable_max_rows()

def read_knmi_data(filename, names):
    """Read a KNMI text export with ``pd.read_csv`` and return the DataFrame.

    ``filename`` is passed straight to ``pd.read_csv``; ``names`` supplies the
    column labels because the file is read without a header row.
    """
    frame = pd.read_csv(
        filename,
        names=names,
        header=None,               # the file has no header row
        comment='#',               # text after '#' is a comment
        skipinitialspace=True,     # skip the spaces that follow each ','
        parse_dates=[1],           # second column is parsed as a date
    )
    return frame

# Load the KNMI export; the labels below are assigned to the columns in file order.
knmi_data = read_knmi_data(
    'KNMI_20200218.txt',
    names=['station', 'datum',
           'Wsp_avg', 'Wsp_1hravg', 'Wsp_max',
           'T_avg', 'T_min', 'T_max',
           'Sol_duration', 'Global_radiation',
           'Precip_total', 'Precip_hrmax',
           'Rel_humid', 'Evaporation'],
)
# Preview the first rows.
knmi_data.head()

In [None]:
# Pairwise correlations in long form: one row per (variable, variable2) pair.
# select_dtypes keeps only the numeric columns, so the parsed `datum` column is
# excluded explicitly instead of relying on how DataFrame.corr() treats
# non-numeric columns (that behaviour differs between pandas versions).
cor_data = (knmi_data.drop(columns=['station'])
              .select_dtypes(include='number')
              .corr().stack()
              .reset_index()
              .rename(columns={0: 'correlation', 'level_0': 'variable', 'level_1': 'variable2'}))
# Two-decimal text label that is drawn on top of each heat map tile.
cor_data['correlation_label'] = cor_data['correlation'].map('{:.2f}'.format)
cor_data.head()

In [None]:
# Build the pairwise long format in one chain:
#   1. melt to one row per (station, datum, variable),
#   2. join the wide frame back on by station and datum,
#   3. melt again so every row holds (variable, value, variable2, value2).
knmi_data_long = (
    knmi_data
    .melt(id_vars=['station', 'datum'])
    .merge(knmi_data, on=['station', 'datum'])
    .melt(
        id_vars=['station', 'datum', 'variable', 'value'],
        var_name='variable2',
        value_name='value2',
    )
)
knmi_data_long.head()

In [None]:
import numpy as np

def compute_2d_histogram(var1, var2, data=None):
    """Bin two columns into a 2D density histogram in long format.

    Parameters
    ----------
    var1 : column name histogrammed along the x direction (output column ``value``).
    var2 : column name histogrammed along the y direction (output column ``value2``).
    data : DataFrame holding both columns. Defaults to the notebook-level
        ``knmi_data`` so existing two-argument calls keep working.

    Returns
    -------
    DataFrame with one row per non-empty bin and the columns
    ``value2`` / ``value`` (bin labels formatted as ``'left - right'``),
    ``count`` (the density from ``np.histogram2d``),
    ``raw_left_value`` / ``raw_left_value2`` (numeric left bin edge, parsed
    from the label, usable for sorting) and ``variable`` / ``variable2``
    (set to ``var1`` / ``var2``).
    """
    if data is None:
        data = knmi_data

    x = data[var1]
    y = data[var2]
    # np.histogram2d cannot derive bin edges from NaN input, so only rows where
    # both values are present are binned.
    complete = x.notna() & y.notna()
    H, xedges, yedges = np.histogram2d(x[complete], y[complete], density=True)
    # Mark empty bins as missing so the final dropna() removes them.
    H[H == 0] = np.nan

    def _bin_labels(edges):
        # Turn n+1 bin edges into n labels of the form 'left - right'.
        formatted = ['{0:.4g}'.format(edge) for edge in edges]
        return [' - '.join(pair) for pair in zip(formatted[:-1], formatted[1:])]

    # np.histogram2d puts x along the first axis of H and y along the second.
    # The frame below has the y bins as rows and the x bins as columns, so H
    # must be transposed to keep each density next to its own bin labels.
    res = (pd.DataFrame(H.T, index=_bin_labels(yedges), columns=_bin_labels(xedges))
             .reset_index()
             .melt(id_vars='index')
             .rename(columns={'index': 'value2',
                              'value': 'count',
                              'variable': 'value'}))

    # Numeric left edge of every bin, recovered from the label text.
    res['raw_left_value'] = res['value'].str.split(' - ').map(lambda parts: parts[0]).astype(float)
    res['raw_left_value2'] = res['value2'].str.split(' - ').map(lambda parts: parts[0]).astype(float)
    res['variable'] = var1
    res['variable2'] = var2
    return res.dropna()

# Every column except the two identifier columns is a measurement to be binned.
value_columns = knmi_data.columns.drop(['station', 'datum'])
# One 2D histogram per ordered pair of measurement columns, stacked into a single frame.
knmi_data_2dbinned = pd.concat(
    [
        compute_2d_histogram(x_var, y_var)
        for x_var in value_columns
        for y_var in value_columns
    ]
)
knmi_data_2dbinned

In [None]:
# Selection of a single heat map tile, i.e. one (variable, variable2) pair.
# It starts on (T_avg, T_avg) and cannot be cleared.
var_sel_cor = at.selection_single(
    init={'variable': 'T_avg', 'variable2': 'T_avg'},
    fields=['variable', 'variable2'],
    clear=False,
)

# Shared x/y layout for the heat map tiles and their text labels.
base = (
    at.Chart(cor_data)
    .encode(x='variable2:O', y='variable:O')
    .properties(width=300, height=300)
)

# Correlation label per tile: white when correlation > 0.5, black otherwise.
text = base.mark_text().encode(
    color=at.condition(at.datum.correlation > 0.5, at.value('white'), at.value('black')),
    text='correlation_label',
)

# Heat map tiles; the selected tile is pink, the others are coloured by correlation.
cor_plot = (
    base.mark_rect()
    .encode(color=at.condition(var_sel_cor, at.value('pink'), 'correlation:Q'))
    .add_selection(var_sel_cor)
)

# Binned 2D histogram of the selected pair. The bin labels are nominal, so both
# axes are ordered by the numeric left bin edge (y in descending order).
scat_plot = (
    at.Chart(knmi_data_2dbinned)
    .transform_filter(var_sel_cor)
    .mark_rect()
    .encode(
        at.X('value:N', sort=at.EncodingSortField(field='raw_left_value')),
        at.Y('value2:N', sort=at.EncodingSortField(field='raw_left_value2', order='descending')),
        at.Color('count:Q', scale=at.Scale(scheme='greenblue')),
    )
)

# Histogram above the heat map; each chart gets its own colour scale.
at.vconcat(scat_plot, cor_plot + text).resolve_scale(color='independent')

## Oplossing met 1 var tegelijk

In [None]:
# Variable that every other variable is compared against in this section.
focus_var = 'Rel_humid'

# Read and process weather data
knmi_data = (read_knmi_data('KNMI_20200218.txt', 
                           names=['station', 'datum', 'Wsp_avg', 'Wsp_1hravg', 'Wsp_max', 
                                                      'T_avg', 'T_min', 'T_max', 
                                                      'Sol_duration', 'Global_radiation', 'Precip_total',
                                                      'Precip_hrmax', 'Rel_humid', 'Evaporation']))
# Copy the focus variable into 'fixed' so it stays available as an id column
# next to every melted (variable, value) row.
knmi_data['fixed'] = knmi_data[focus_var]
knmi_data_long = knmi_data.melt(id_vars=['station', 'datum', 'fixed']) 

# Calculate correlation
# select_dtypes keeps only the numeric columns, so the parsed `datum` column is
# excluded explicitly instead of relying on how DataFrame.corr() treats
# non-numeric columns (that behaviour differs between pandas versions).
cor_data = (knmi_data.drop(columns=['station', 'fixed'])
              .select_dtypes(include='number')
              .corr().stack()
              .reset_index()
              .rename(columns={0: 'correlation', 'level_0': 'variable', 'level_1': 'variable2'}))
# Two-decimal text label that is drawn on top of each heat map tile.
cor_data['correlation_label'] = cor_data['correlation'].map('{:.2f}'.format)

# Correlation heat map over all variable pairs (uses the unfiltered cor_data).
base = (
    at.Chart(cor_data)
    .encode(x='variable2:O', y='variable:O')
    .properties(width=400, height=400)
)

# Correlation label per tile: white when correlation > 0.5, black otherwise.
text = base.mark_text().encode(
    color=at.condition(at.datum.correlation > 0.5, at.value('white'), at.value('black')),
    text='correlation_label',
)

cor_plot = base.mark_rect().encode(color='correlation:Q')

# From here on only the correlations with the focus variable are needed.
cor_data = cor_data.query('variable2 == "%s"' % focus_var)

# Selection of one bar (one variable); starts on T_avg and cannot be cleared.
var_sel_cor = at.selection_single(
    init={'variable': 'T_avg'},
    fields=['variable'],
    clear=False,
)

# Bar chart of the correlation with the focus variable, strongest positive first.
cor_plot_bar = (
    at.Chart(cor_data)
    .mark_bar()
    .encode(
        at.X('correlation', title='Correlation with %s' % focus_var),
        at.Y('variable',
             sort=cor_data.sort_values(by='correlation', ascending=False)['variable'].tolist()),
        color=at.condition(var_sel_cor, at.value('pink'), 'correlation:Q'),
    )
    .properties(width=300, height=300)
    .add_selection(var_sel_cor)
)

# Scatter of the focus variable (x) against the variable selected in the bar chart (y).
scat_plot = (
    at.Chart(knmi_data_long)
    .transform_filter(var_sel_cor)
    .mark_point()
    .encode(at.X('fixed:Q', title=focus_var), y='value:Q')
    .properties(width=400, height=400)
)

# Heat map, scatter and bar chart side by side.
(cor_plot + text) | scat_plot | cor_plot_bar

In [None]:

# NOTE(review): `linked_cor_bar_scatter` is not defined in any visible cell, so this
# call presumably raises NameError on a fresh kernel. The closest visible match is
# `altair_linked_cor_bar_scatter(dataframe, focus_var, exclude)`, which is only
# defined in a later cell — confirm the intended call or remove this cell.
linked_cor_bar_scatter(knmi_data)

In [None]:
def altair_linked_cor_bar_scatter(dataframe, focus_var, exclude):
    """Build a scatter plot linked to a bar chart of correlations with one variable.

    Parameters
    ----------
    dataframe : wide DataFrame with one column per variable.
    focus_var : name of the column every other column is compared against.
    exclude : column names that are kept as identifiers and left out of the
        correlation (for example station and date columns).

    Returns
    -------
    Horizontally concatenated Altair chart: a scatter of ``focus_var`` against
    the variable selected in the bar chart, next to the bar chart of
    correlations with ``focus_var``.

    The input ``dataframe`` is not modified.
    """
    exclude = list(exclude)

    # Add the focus variable as 'fixed' on a copy of the argument. Writing it
    # into the notebook-level knmi_data would only work when that exact frame
    # is passed in, and would modify it as a side effect.
    data = dataframe.assign(fixed=dataframe[focus_var])
    # One row per (identifier columns, fixed, variable, value).
    data_long = data.melt(id_vars=exclude + ['fixed'])

    # Calculate correlation
    # Only numeric columns go into corr(); how non-numeric columns are treated
    # there differs between pandas versions.
    cor_data = (data.drop(columns=exclude + ['fixed'])
                  .select_dtypes(include='number')
                  .corr().stack()
                  .reset_index()
                  .rename(columns={0: 'correlation', 'level_0': 'variable', 'level_1': 'variable2'}))

    # Keep only the correlations with the focus variable. A boolean mask also
    # works for column names containing quotes, unlike a string-built query.
    cor_data = cor_data[cor_data['variable2'] == focus_var]

    # Selection of one bar (one variable); starts on the focus variable itself
    # and cannot be cleared.
    var_sel_cor = at.selection_single(fields=['variable'], clear=False, init={'variable': focus_var})

    # Bar chart with the correlations, strongest positive first.
    cor_plot_bar = at.Chart(cor_data).mark_bar().properties(
        width=300,
        height=300
    ).encode(
        at.X('correlation', title='Correlation with %s' % focus_var),
        at.Y('variable', sort=list(cor_data.sort_values(by='correlation', ascending=False)['variable'])),
        color=at.condition(var_sel_cor, at.value('pink'), 'correlation:Q')
    ).add_selection(var_sel_cor)

    # Scatter of the focus variable (x) against the selected variable (y).
    scat_plot = at.Chart(data_long).transform_filter(
        var_sel_cor
    ).properties(
        width=400,
        height=400
    ).mark_point().encode(
        at.X('fixed:Q', title=focus_var),
        y='value:Q'
    )

    return scat_plot | cor_plot_bar

def altair_cor_heatmap(dataframe, exclude):
    """Build a labelled correlation heat map for the columns of ``dataframe``.

    Parameters
    ----------
    dataframe : wide DataFrame with one column per variable.
    exclude : column names to leave out of the correlation.

    Returns
    -------
    Layered Altair chart: tiles coloured by correlation with the two-decimal
    correlation value printed on top of each tile.
    """
    # Only numeric columns go into corr(). Non-numeric columns that are not
    # listed in `exclude` (for example a parsed date column) are left out
    # explicitly, because their treatment by corr() differs between pandas versions.
    cor_data = (dataframe.drop(columns=list(exclude))
                  .select_dtypes(include='number')
                  .corr().stack()
                  .reset_index()
                  .rename(columns={0: 'correlation', 'level_0': 'variable', 'level_1': 'variable2'}))
    cor_data['correlation_label'] = cor_data['correlation'].map('{:.2f}'.format)

    # Correlation heat map
    base = at.Chart(cor_data).encode(
        x='variable2:O',
        y='variable:O'    
    )

    # Label colour: white when correlation > 0.5, black otherwise.
    text = base.mark_text().encode(
        text='correlation_label',
        color=at.condition(
            at.datum.correlation > 0.5, 
            at.value('white'),
            at.value('black')
        )
    )

    cor_plot = base.mark_rect().encode(
        color='correlation:Q'
    )
    
    return cor_plot + text

In [None]:
import pandas as pd
import altair as at
at.data_transformers.disable_max_rows()

def read_knmi_data(filename, names):
    """Parse a KNMI text export into a DataFrame.

    The file is read without a header row and the columns are labelled with
    ``names``. Text after ``#`` is treated as a comment, whitespace after each
    delimiter is skipped, and the second column is parsed as a date.
    """
    read_options = dict(
        comment='#',
        header=None,
        names=names,
        skipinitialspace=True,
        parse_dates=[1],
    )
    return pd.read_csv(filename, **read_options)

# Reload the KNMI export; the labels below are assigned to the columns in file order.
knmi_data = read_knmi_data(
    'KNMI_20200218.txt',
    names=['station', 'datum',
           'Wsp_avg', 'Wsp_1hravg', 'Wsp_max',
           'T_avg', 'T_min', 'T_max',
           'Sol_duration', 'Global_radiation',
           'Precip_total', 'Precip_hrmax',
           'Rel_humid', 'Evaporation'],
)

# Correlation heat map without the station column, rendered at 400x400.
altair_cor_heatmap(dataframe=knmi_data, exclude=['station']).properties(height=400, width=400)

In [None]:
# Linked scatter and correlation bar chart with 'Wsp_max' as the focus variable;
# station and datum are kept as identifier columns.
altair_linked_cor_bar_scatter(
    dataframe=knmi_data,
    focus_var='Wsp_max',
    exclude=['station', 'datum'],
)


# Linked Count/NA plot
