In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import visualizations
import plotly.express as px

### *This notebook covers several use cases, using random data, for using the `agg_plot` and `correlation_plot` functions in the associated visualizations.py file*

***
#### `agg_plot` function

#### 1. make random data
- ##### 1 subject info variable -- `Subject`
- ##### 3 independent variables -- `Var`, `Var2`, `Time`
- ##### 1 dependent variable -- `Readout`

In [None]:
subject_list = ['S{}'.format(m) for m in np.arange(1,11)]
ab = ['A', 'B']
xy = ['X', 'Y']
time = np.arange(1,21,1)

agg_data = pd.DataFrame()

for subject in subject_list:
    for var_1 in ab:
        for var_2 in xy:
            for t in time:
                agg_data = pd.concat([agg_data,
                                       pd.DataFrame({'Subject':subject,
                                                     'Var1':var_1,
                                                     'Var2':var_2,
                                                     'Time':t,
                                                     'Readout':np.random.randn(1)[0]}, index=[0])])
                
agg_data

#### 2. minimal example

In [None]:
data = agg_data.copy()
plot_var = 'Readout'
group_var = 'Time'
data[group_var] = pd.Categorical(data[group_var])
visualizations.agg_plot(data=data, plot_var=plot_var, group_var=group_var)

#### 3. minimal example with labels

In [None]:
data = agg_data.copy()
plot_var = 'Readout'
group_var = 'Time'
data[group_var] = pd.Categorical(data[group_var])
visualizations.agg_plot(
    data=data,
    plot_var=plot_var,
    group_var=group_var,
    x_title=group_var,
    y_title=plot_var,
    title='Random data grouped by time'
    )

#### 3. highly configured example

In [None]:
data = agg_data.copy()
plot_var = 'Readout'
group_var = 'Time'
overlay_var = 'Var1'
sep_var = 'Var2'
datapoint_var = 'Subject'

color_var = 'Var1'
colors = {'A':'steelblue', 'B': 'darkred'} # colors must have a color for each unique value in color_var

# convert variables to categoricals
data[group_var] = pd.Categorical(data[group_var])
data[overlay_var] = pd.Categorical(data[overlay_var])
data[sep_var] = pd.Categorical(data[sep_var])

visualizations.agg_plot(
    data=data,
    plot_var=plot_var,
    group_var=group_var,
    overlay_var=overlay_var,
    sep_var=sep_var,
    datapoint_var=datapoint_var,
    color_var=color_var,
    colors=colors,
    plot_mode='line', # switch from bar to line plot
    central_tendency='median', # switch from mean to median
    error_type='std', # switch from sem to std
    plot_datapoints=True, # plot datapoints for each subject
    plot_datalines=True, # plot datalines for each subject
    color_datapoints=True, # color code each subject by its grouping color
    plot_width=1000, # the rest are spacing and text configurations
    y_title=plot_var,
    x_title=group_var,
    h_spacing=0.05,
    x_dtick=2,
    tick_angle=0
    )
# there are more configurations that I did not include here; take a look at the documentation for the rest

#### 4. plot without aggregate data, only subjects

In [None]:
data = agg_data.copy()
plot_var = 'Readout'
group_var = 'Time'
overlay_var = 'Var1'
sep_var = 'Var2'
datapoint_var = 'Subject'

color_var = 'Var1'
colors = {'A':'steelblue', 'B': 'darkred'} # colors must have a color for each unique value in color_var

# convert variables to categoricals
data[group_var] = pd.Categorical(data[group_var])
data[overlay_var] = pd.Categorical(data[overlay_var])
data[sep_var] = pd.Categorical(data[sep_var])

visualizations.agg_plot(
    data=data,
    plot_var=plot_var,
    group_var=group_var,
    overlay_var=overlay_var,
    sep_var=sep_var,
    datapoint_var=datapoint_var,
    color_var=color_var,
    colors=colors,
    plot_agg=False, # remove the plotting of aggregate data
    plot_datapoints=False,
    plot_datalines=True,
    color_datapoints=True, 
    plot_width=1000,
    y_title=plot_var,
    x_title=group_var
    )

#### 5. add shapes to plot

In [None]:
data = agg_data.copy()
plot_var = 'Readout'
group_var = 'Time'
overlay_var = 'Var1'
datapoint_var = 'Subject'

color_var = 'Var1'
colors = {'A':'steelblue', 'B': 'darkred'} # colors must have a color for each unique value in color_var

# convert variables to categoricals
data[group_var] = pd.Categorical(data[group_var])
data[overlay_var] = pd.Categorical(data[overlay_var])

# create list of shapes
# see here for how to make shapes for plotly: https://plotly.com/python/shapes/
shapes = [dict(type='rect', x0=1, y0=-1, x1=5, y1=1, line=dict(color="royalblue", width=2), fillcolor="lightskyblue"),
          dict(type='circle', x0=10, y0=-1, x1=15, y1=1, line=dict(color="darkred", width=2), fillcolor="pink")]

visualizations.agg_plot(
    data=data,
    plot_var=plot_var,
    group_var=group_var,
    overlay_var=overlay_var,
    datapoint_var=datapoint_var,
    color_var=color_var,
    colors=colors,
    plot_mode='line', # switch from bar to line plot
    shapes_to_add=shapes
    )

***
#### `correlation_plot` function

#### 1. make random data
- ##### 1 subject info variable -- `Subject`
- ##### 1 grouping variable -- `Group`
- ##### 2 plotting variable -- `X`, `Y`

In [2]:
subject_list = ['S{}'.format(m) for m in np.arange(1,1001)]
ab = ['A', 'B']

corr_data = pd.DataFrame()
for subject in subject_list:
    for group in ab:
        corr_data = pd.concat([corr_data,
                               pd.DataFrame({'Subject':subject,
                                             'Group':group,
                                             'X':np.random.randn(1)[0],
                                             'Y':np.random.randn(1)[0]}, index=[0])])

color_dict = {'A':'steelblue', 'B':'darkred'}
corr_colors = [color_dict[g] for g in corr_data['Group']]

corr_data

Unnamed: 0,Subject,Group,X,Y
0,S1,A,1.576580,-0.391501
0,S1,B,0.469214,-1.229127
0,S2,A,0.070666,-0.862379
0,S2,B,-0.352049,0.264908
0,S3,A,0.559928,-1.887533
...,...,...,...,...
0,S998,B,-0.500282,-0.576150
0,S999,A,0.433775,-0.829318
0,S999,B,-0.727267,1.455353
0,S1000,A,-2.081743,1.164869


#### 2. minimal example

In [None]:
visualizations.correlation_plot(data_x=corr_data['X'].values, data_y=corr_data['Y'].values)

#### 3. minimal example with a line of fit for each group, color-coded

In [None]:
visualizations.correlation_plot(data_x=corr_data['X'].values,
                                data_y=corr_data['Y'].values,
                                groups=corr_data['Group'].values,
                                colors=corr_colors)

#### 4. highly configured example

In [None]:
visualizations.correlation_plot(
    data_x=corr_data['X'].values,
    data_y=corr_data['Y'].values,
    groups=corr_data['Group'],
    colors=None,
    x_title='X',
    y_title='Y',
    color_palette=px.colors.qualitative.Safe, # use color palette if colors is None
    corr_method='kendall', # switch pearson for kendall-tau
    title="X by Y correlation",
    textinfo=corr_data['Group'], # add hover text beyond the X and Y coordinates
    plot_fits=True,
    plot_identity=True, # plot identity line
    same_xy_scale=True, # plot the axes on the same scale
    showlegend=False, # remove legend
    x_range=None,
    y_range=None,
    text_size=10, # small text size
    font_family='Times', # change font from Arial
    marker_size=4, # tiny markers
    outline_width=0, # get rid of black outline around individual points
    line_width=5, # thicker lines
    opacity=0.8,
    plot_height=600,
    plot_width=600,
    save_path=None, # set absolute path including file name and extension to save plot (png, svg, or html)
    plot_scale=5 # only used for saving, if file extension is png
    )

#### 5. Plot only lines of best fit with confidence intervals, and without points

In [109]:
visualizations.correlation_plot(data_x=corr_data['X'].values,
                                data_y=corr_data['Y'].values,
                                groups=corr_data['Group'].values,
                                colors=corr_colors,
                                plot_points=False,
                                plot_fits=True,
                                plot_ci=True)