In [1]:
import altair as alt
import plotly.express as px
import pandas as pd

In [33]:
def _wilkinson_dot_panel(data_select, group_col, group_value, value_col):
    '''Build one Wilkinson dot-plot panel of `value_col` for rows where `group_col == group_value`.'''
    return alt.Chart(data_select).transform_filter(
        # Filter on the actual grouping column rather than a hard-coded field name.
        alt.datum[group_col] == group_value
    ).mark_circle().transform_window(
        # Rank rows within each level of value_col so the dots stack vertically.
        id='rank()',
        groupby=[value_col]
    ).encode(
        x=alt.X(value_col, type='ordinal'),
        y=alt.Y('id', type='ordinal', axis=None, sort='descending'),
    ).properties(title=str(group_value), height=300, width=250)


def draw_plot(data_path, variables, design, hypothesis_var, hypothesis_stat):
    '''
    Draw an exploratory plot for a hypothesis about two variables.

    Currently, only support hypothesis test with two defined variables;
    Support type pairs: (ratio, ratio), (nominal, ordinal), (nominal, ratio);
    Normality test and Equal variance test have not been wrapped into the function yet;

    Parameters
    ----------
    data_path : str or pandas.DataFrame
        A CSV path (loaded with ``pd.read_csv``) or an already-loaded frame.
    variables : list of dict
        One dict per variable with the keys 'name' and 'data type'; nominal
        variables additionally carry 'categories'.
    design : dict
        Has 'study type' plus either 'independent variables' /
        'dependent variables' (when the study type is 'experiment') or
        'contributor variables' / 'outcome variables' (otherwise). A string
        value is used directly as x / y; any other value (e.g. a list) makes
        the function fall back on ``hypothesis_var``.
    hypothesis_var : sequence of str
        ``[x, y]`` fallback names used when the design does not name a single
        x or y variable.
    hypothesis_stat : list
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    An Altair chart when the selected data has at most 5000 rows, otherwise a
    Plotly figure. ``None`` when the (x type, y type) pair is not supported.

    Raises
    ------
    ValueError
        If x, y or any name in ``hypothesis_var`` is not declared in
        ``variables``.
    '''
    # Above this many rows the Plotly renderers are used instead of Altair
    # (presumably mirrors Altair's default row limit -- confirm if changed).
    max_altair_rows = 5000

    if design['study type'] == 'experiment':
        x = design['independent variables']
        y = design['dependent variables']
    else:
        x = design['contributor variables']
        y = design['outcome variables']
    if not isinstance(x, str):
        x = hypothesis_var[0]
    if not isinstance(y, str):
        y = hypothesis_var[1]

    # Index metadata by name (first declaration wins) so x / y resolve even
    # when they come from the design rather than from hypothesis_var.
    var_by_name = {}
    for v in variables:
        var_by_name.setdefault(v['name'], v)
    for name in list(hypothesis_var) + [x, y]:
        if name not in var_by_name:
            raise ValueError('%r is not a declared variable' % (name,))

    data = pd.read_csv(data_path) if isinstance(data_path, str) else data_path
    data_select = data[[x, y]]
    alt_plt = data_select.shape[0] <= max_altair_rows

    x_type = var_by_name[x]['data type']
    y_type = var_by_name[y]['data type']

    if x_type == 'ratio' and y_type == 'ratio':
        if alt_plt:
            return alt.Chart(data_select).mark_point().encode(x=x, y=y)
        return px.scatter(data_select, x=x, y=y)

    if x_type != 'nominal':
        return None

    x_cate = var_by_name[x]['categories']
    # Keep only the rows belonging to the declared categories.
    data_select = data_select[data_select[x].isin(x_cate)]

    if y_type == 'ordinal':
        if not alt_plt:
            return px.histogram(data_select, x=y, color=x)
        if 1 <= len(x_cate) <= 2:
            # One dot-plot panel per category; also works for a single category.
            panels = [_wilkinson_dot_panel(data_select, x, cate, y)
                      for cate in x_cate]
            return alt.concat(*panels)
        # NOTE(review): maxbins is tied to the number of x categories, not to
        # the number of ordinal levels of y -- confirm this is intended.
        return alt.Chart(data_select).mark_bar(opacity=0.3).encode(
            alt.X(y, type='ordinal', bin=alt.Bin(maxbins=len(x_cate))),
            alt.Y('count()'),
            alt.Column(x),
            color=alt.Color(x)
        ).properties(width=200)

    if y_type == 'ratio':
        if alt_plt:
            box = alt.Chart(data_select).mark_boxplot().encode(
                x='%s:N' % x,
                y='%s:Q' % y,
                color=alt.Color('%s:N' % x)
            ).properties(width=200)
            # Red point marks the group mean on top of each box.
            point = alt.Chart(data_select).mark_point(color='red').encode(
                x=alt.X('%s:N' % x),
                y=alt.Y('%s:Q' % y, aggregate='mean'),
                tooltip=[alt.Tooltip('mean(%s):Q' % y),
                         alt.Tooltip('%s:N' % x)]
            )
            histogram = alt.Chart(data_select).mark_bar(opacity=0.3).encode(
                alt.X('%s:Q' % y, bin=True),
                alt.Y('count()'),
                alt.Column('%s:N' % x),
                alt.Color('%s:N' % x)
            ).properties(width=200)
            # Layer box and mean first, then stack the histograms underneath.
            return (box + point) & histogram

        com_plot = px.histogram(
            data_select,
            x=y,
            histnorm='probability density',
            color=x,
            opacity=0.5,
            marginal="box",
            nbins=70)
        com_plot.update_layout(yaxis_title='density', barmode='overlay')
        return com_plot

    # Unsupported (nominal, <other>) pair: nothing to draw.
    return None
    
    

'\nCurrently, only support hypothesis test with two defined variables;\nSupport type pairs: (ratio, ratio), (nominal, ordinal), (nominal, ratio)\n'

## AR_TV

#### Variables: 
- Condition
    - nominal
    - two values: AR, TV
- Score
    - Ordinal
    - five values: 1, 2, 3, 4, 5
    
#### Hypothesis 
- Condition: AR > TV on Score

When there is one nominal variable with only two categories and one ordinal variable, a Wilkinson dot plot is recommended for comparing mean ranks.

In [25]:
# Variable metadata for the AR/TV dataset, one entry per declared column.
variables = [
    {'name': 'ID', 'data type': 'ratio'},
    {'name': 'Condition', 'data type': 'nominal', 'categories': ['AR', 'TV']},
    {'name': 'Score', 'data type': 'ordinal', 'categories': list(range(1, 6))},
]

# Experiment: Condition is the independent variable, Score the dependent one.
experimental_design = {'study type': 'experiment',
                       'independent variables': 'Condition',
                       'dependent variables': 'Score'}

# Significance level declared for this study.
assumptions = {'Type I (False Positive) Error Rate': 0.01969}

In [26]:
# hypothesis_var is listed as [x, y] (grouping variable first), matching the
# other draw_plot calls and the function's fallback when the design does not
# name a single x / y variable.
draw_plot('data/ar_tv_long.csv', variables, experimental_design,
          ['Condition', 'Score'], ['Condition:AR > TV'])

From the dot plot, we can observe that the Score mean ranks for `AR` and `TV` are the same, which is consistent with the Mann Whitney U Test result.

In [None]:
# ar_tv_long = pd.read_csv('data/ar_tv_long.csv')

# ar = alt.Chart(ar_tv_long).transform_filter(
#     alt.datum.Condition=='AR'
# ).mark_circle().transform_window(
#     id='rank()',
#     groupby=['Score']
# ).encode(x=alt.X('Score',type='ordinal'),
#         y = alt.Y('id',type='ordinal',axis=None, sort='descending'),
# ).properties(title='AR',height=300,width=250)

# tv = alt.Chart(ar_tv_long).transform_filter(
#     alt.datum.Condition=='TV'
# ).mark_circle().transform_window(
#     id='rank()',
#     groupby=['Score']
# ).encode(x=alt.X('Score',type='ordinal'),
#         y = alt.Y('id',type='ordinal',axis=None, sort='descending'),
# ).properties(title='TV',height=300,width=250)

# alt.concat(ar,tv)

## CO2

#### Variables

- Plant
    - nominal  
    - six values: 'Qn1', 'Qn2', 'Qn3', 'Qc1', 'Qc2', 'Qc3'
- uptake
    - ratio (continuous)
    
#### Hypothesis  
- Plant: Qn1 < Qn2, Qn2 < Qn3 on uptake

Box plots and histograms are recommended for comparing differences among more than two groups on a continuous (ratio) variable.

In [27]:
# Variable metadata for the CO2 dataset, one entry per declared column.
variables = [
    {'name': 'id', 'data type': 'ratio'},
    {'name': 'Plant', 'data type': 'nominal',
     'categories': ['Qn1', 'Qn2', 'Qn3', 'Qc1', 'Qc2', 'Qc3']},
    {'name': 'uptake', 'data type': 'ratio'},
]

# Observational study: Plant contributes to the uptake outcome.
study_design = {'study type': 'observational study',
                'contributor variables': 'Plant',
                'outcome variables': 'uptake'}

# Significance level declared for this study.
assumptions = {'Type I (False Positive) Error Rate': 0.05}

In [34]:
# Plant (nominal) against uptake (ratio), read from the CO2 CSV.
draw_plot(
    data_path='data/co2.csv',
    variables=variables,
    design=study_design,
    hypothesis_var=['Plant', 'uptake'],
    hypothesis_stat=['Plant: Qn1 < Qn2', 'Plant: Qc2 < Qc3'],
)

The median values for the categories of `Plant` are close to each other, which is consistent with the Kruskal–Wallis test result.

In [None]:
# co2 = pd.read_csv('data/co2.csv')
# categories = ['Qn1', 'Qn2', 'Qn3', 'Qc1', 'Qc2', 'Qc3']
# co2_select = co2[co2.Plant.isin(categories)]

In [None]:
# co2_box = alt.Chart(co2_select).mark_boxplot().encode(
#     x='Plant', 
#     y='uptake',
#     color=alt.Color('Plant')
# ).properties(width=500)

# co2_histogram = alt.Chart(co2_select).mark_bar(
#     opacity=0.3).encode(
#     alt.X('uptake:Q', bin=True),
#     alt.Y('count()'),
#     alt.Column('Plant:N'),
#     color=alt.Color('Plant')
# ).properties(width=200)

# co2_box & co2_histogram


## Drug  

#### Variables
- drug
    - nominal   
    - two values: Ecstasy, Alcohol  
- sundayBDI  
    - ratio
- wedsBDI
    - ratio
- BDIchange
    - ratio

#### Hypothesis
- drug: Ecstasy > Alcohol on wedsBDI  
- sundayBDI ~ BDIchange 

Scatter plot: association/relationship between two continuous variables  
Box plot: comparison between continuous variables of two groups
    

In [6]:
# Six observations, labelled with the string row labels '0'..'5'.
_drug_rows = [str(i) for i in range(6)]
_drug_columns = {
    'drug': ["Ecstasy", "Ecstasy", "Ecstasy", "Alcohol", "Alcohol", "Alcohol"],
    'sundayBDI': [15, 35, 16, 16, 15, 20],
    'wedsBDI': [28, 35, 35, 5, 6, 30],
    'BDIchange': [13, 0, 19, -11, -9, 10],
}

# One Series per column, all sharing the same row labels.
d = {
    column: pd.Series(values, index=_drug_rows)
    for column, values in _drug_columns.items()
}

drug = pd.DataFrame(d)

###  drug: Ecstasy > Alcohol on wedsBDI 

if normality test necessary: q-q plot  
if equal variance test necessary: error bar plot with standard deviation  

#### Normality

In [37]:
# Normal Q-Q plot: sample quantiles of wedsBDI against the theoretical
# standard-normal quantiles at the same probabilities. Points close to a
# straight line suggest normality; plotting against the raw probability
# would only show the empirical quantile function.
alt.Chart(drug).transform_quantile(
    'wedsBDI', step=0.01, as_=['prob', 'value']
).transform_calculate(
    normal='quantileNormal(datum.prob)'
).mark_point().encode(
    x=alt.X('normal:Q', title='Theoretical normal quantile'),
    y=alt.Y('value:Q', title='wedsBDI sample quantile')
).properties(title='Q-Q Plot - wedsBDI')

**Note: The normality q-q plot does not show a normal distribution for `wedsBDI`**

#### Equal Variance

In [38]:
# Error bars spanning one standard deviation of wedsBDI per drug group.
wedsBDI_spread = alt.Chart(drug).mark_errorbar(extent='stdev').encode(
    alt.X('wedsBDI:Q', scale=alt.Scale(zero=False)),
    alt.Y('drug:N'),
).properties(height=100)

# Filled black point at each group's mean.
wedsBDI_mean = alt.Chart(drug).mark_point(filled=True, color='black').encode(
    alt.X('wedsBDI:Q', aggregate='mean'),
    alt.Y('drug:N'),
).properties(height=100)

# One tick per raw observation.
wedsBDI_ticks = alt.Chart(drug).mark_tick().encode(
    alt.X('wedsBDI:Q'),
    alt.Y('drug:N'),
).properties(height=100)

wedsBDI_spread + wedsBDI_mean + wedsBDI_ticks

**Note: The equal-variance assumption does not appear to hold**

In [35]:
# Variable metadata for the drug dataset, one entry per declared column.
variables = [
    {'name': 'drug', 'data type': 'nominal', 'categories': ['Ecstasy', 'Alcohol']},
    {'name': 'sundayBDI', 'data type': 'ratio'},
    {'name': 'wedsBDI', 'data type': 'ratio'},
    {'name': 'BDIchange', 'data type': 'ratio'},
]

# Observational study with several contributor and outcome variables.
study_design = {'study type': 'observational study',
                'contributor variables': ['drug', 'sundayBDI'],
                'outcome variables': ['BDIchange', 'wedsBDI']}

# Significance level declared for this study.
assumptions = {'Type I (False Positive) Error Rate': 0.01}

In [36]:
# drug (nominal) against wedsBDI (ratio) on the in-memory frame.
draw_plot(
    data_path=drug,
    variables=variables,
    design=study_design,
    hypothesis_var=['drug', 'wedsBDI'],
    hypothesis_stat=['drug:Ecstasy > Alcohol'],
)

The plots indicate that the mean and median for the two groups are very different, which is **against the results of the Mann–Whitney U test and the Kruskal–Wallis test**. 

**Note: against**

In [None]:
# alt.Chart(drug).mark_boxplot().encode(
#     x='drug', 
#     y='wedsBDI'
# ).properties(width=200) + \
# alt.Chart(drug).mark_point(color='red').encode(
#   x=alt.X('drug:N'),
#   y=alt.Y('wedsBDI:Q',aggregate='mean'),
#   tooltip = [alt.Tooltip('mean(wedsBDI):Q'),
#              alt.Tooltip('drug:N')]
# ) &\
# alt.Chart(drug).mark_bar(opacity=0.3).encode(
#     alt.X('wedsBDI:Q', bin=True),
#     alt.Y('count()'),
#     alt.Column('drug:N')
# ).properties(width=200)

### sundayBDI ~ BDIchange 

In [24]:
# sundayBDI against BDIchange, both ratio variables.
draw_plot(
    data_path=drug,
    variables=variables,
    design=study_design,
    hypothesis_var=['sundayBDI', 'BDIchange'],
    hypothesis_stat=['sundayBDI ~ BDIchange'],
)

Consistent with test result that there is no (monotonic) relationship between sundayBDI and BDIchange

In [10]:
# alt.Chart(drug).mark_point().encode(
#     x='sundayBDI',
#     y='BDIchange'
# )

## Olympics  

#### Variables 
- Sport
    - nominal  
    - two values: Swimming, Wrestling  
- Sex
    - nominal
    - two values: M, F
- Weight
    - ratio
    
#### Hypothesis  
- Sport: Wrestling > Swimming on Weight  
    - histogram  
- Sex: F < M on Weight  
    - histogram

In [11]:
# Load the Olympics athlete data used by the Sport and Sex comparisons below.
olympics = pd.read_csv(
    filepath_or_buffer='data/athlete_events_cleaned_weight.csv',
)

### Sport: Wrestling > Swimming on Weight  

If the dataset has more than 5000 rows, use the Plotly combined chart instead.

In [39]:
# Variable metadata for the Olympics dataset, one entry per declared column.
variables = [
    {'name': 'ID', 'data type': 'ratio'},
    {'name': 'Sport', 'data type': 'nominal', 'categories': ['Swimming', 'Wrestling']},
    {'name': 'Sex', 'data type': 'nominal', 'categories': ['M', 'F']},
    {
        'name': 'Weight',
        'data type': 'ratio',
        # Alternative ordinal encoding, kept for reference:
        # 'data type' : 'ordinal',
        # 'categories' : [22,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106]
    },
]

# Observational study: Sport and Sex contribute to the Weight outcome.
study_design = {'study type': 'observational study',
                'contributor variables': ['Sport', 'Sex'],
                'outcome variables': 'Weight'}

In [40]:
# Sport (nominal) against Weight (ratio).
draw_plot(
    data_path=olympics,
    variables=variables,
    design=study_design,
    hypothesis_var=['Sport', 'Weight'],
    hypothesis_stat=['Sport:Wrestling > Swimming'],
)

The weight distribution for wrestling is slightly right-skewed. The medians of the two sports are close to each other. 

In [None]:
# categories = ['Swimming', 'Wrestling']
# olympics_select = olympics[olympics.Sport.isin(categories)]

In [None]:
# olympics_select.shape[0] > 5000

In [None]:
# sport = px.histogram(
#     olympics_select, 
#     x="Weight", 
#     histnorm='probability density',
#     color="Sport", 
#     opacity = 0.5,
#     marginal="box",
#     nbins=70)
# sport.update_layout(yaxis_title='density',barmode='overlay')
# sport.show()

### Sex: F < M on Weight

In [16]:
# Sex (nominal) against Weight (ratio).
draw_plot(
    data_path=olympics,
    variables=variables,
    design=study_design,
    hypothesis_var=['Sex', 'Weight'],
    hypothesis_stat=['Sex:F < M'],
)

It seems that the distributions for M and F are different.

**Note: against**

In [None]:
# sport = px.histogram(
#     olympics_select, 
#     x="Weight", 
#     histnorm='probability density',
#     color="Sex", 
#     opacity = 0.5,
#     marginal="box",
#     nbins=70)
# sport.update_layout(yaxis_title='density',barmode='overlay')
# sport.show()

## US Crime

#### Variables 
- Prob
    - ratio
- Ineq
    - ratio
    
#### Hypothesis
- Ineq ~ -Prob
    - Scatter Plot

In [17]:
# Load the US crime data used by the Ineq / Prob scatter plot below.
crime = pd.read_csv(
    filepath_or_buffer='data/UScrime.csv',
)

In [18]:
# Variable metadata for the US crime dataset, one entry per declared column.
variables = [
    {'name': 'So', 'data type': 'nominal', 'categories': ['0', '1']},
    {'name': 'Prob', 'data type': 'ratio', 'range': [0, 1]},
    {'name': 'Ineq', 'data type': 'ratio'},
]

# Observational study; Prob is listed both as a contributor and as an outcome.
study_design = {'study type': 'observational study',
                'contributor variables': ['So', 'Prob'],
                'outcome variables': ['Prob', 'Ineq']}

In [21]:
# Ineq against Prob, both ratio variables.
draw_plot(
    data_path=crime,
    variables=variables,
    design=study_design,
    hypothesis_var=['Ineq', 'Prob'],
    hypothesis_stat=['Ineq ~ -Prob'],
)

In [None]:
# alt.Chart(crime).mark_point().encode(
#     x='Ineq',
#     y='Prob'
# )