In [2]:
import altair as alt
import plotly.express as px
import pandas as pd

## AR_TV

#### Variables: 
- Condition
    - nominal
    - two values: AR, TV
- Score
    - ordinal
    - five values: 1, 2, 3, 4, 5
    
#### Hypothesis 
- Condition: AR > TV on Score

When there is one nominal variable with only two categories and one ordinal variable, a Wilkinson dot plot is recommended to compare mean ranks.

In [12]:
def _wilkinson_dot_plot(data, condition):
    """Build a Wilkinson dot plot of ``Score`` for a single ``Condition`` value.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format table that provides the ``Condition`` and ``Score``
        columns referenced by the chart.
    condition : str
        Value of ``Condition`` to keep; it is also used as the chart title.

    Returns
    -------
    alt.Chart
        A 250x300 chart with one circle per remaining row.
    """
    return (
        alt.Chart(data)
        .transform_filter(alt.datum.Condition == condition)
        .mark_circle()
        # The window transform adds an `id` field computed by rank() within
        # each Score group; it is used below as the (hidden) y position.
        .transform_window(id='rank()', groupby=['Score'])
        .encode(
            x=alt.X('Score', type='ordinal'),
            y=alt.Y('id', type='ordinal', axis=None, sort='descending'),
        )
        .properties(title=condition, height=300, width=250)
    )


ar_tv_long = pd.read_csv('data/ar_tv_long.csv')

# One panel per condition, built by the same helper so both stay in sync.
ar = _wilkinson_dot_plot(ar_tv_long, 'AR')
tv = _wilkinson_dot_plot(ar_tv_long, 'TV')

alt.concat(ar, tv)

## CO2

#### Variables

- Plant
    - nominal  
    - six values: 'Qn1', 'Qn2', 'Qn3', 'Qc1', 'Qc2', 'Qc3'
- uptake
    - ratio (continuous)
    
#### Hypothesis  
- Plant: Qn1 < Qn2, Qn2 < Qn3 on uptake

Box plots and histograms are recommended for comparing differences among more than two groups on a continuous (ratio) variable.

In [13]:
# Load the CO2 table and keep only the rows whose Plant is one of the
# six plants listed below.
co2 = pd.read_csv('data/co2.csv')
categories = ['Qn1', 'Qn2', 'Qn3', 'Qc1', 'Qc2', 'Qc3']
is_selected_plant = co2['Plant'].isin(categories)
co2_select = co2.loc[is_selected_plant]

In [14]:
# Box plot of uptake, one box per selected plant.
plant_boxplot = (
    alt.Chart(co2_select)
    .mark_boxplot()
    .encode(
        alt.X('Plant'),
        alt.Y('uptake'),
    )
    .properties(width=500)
)
plant_boxplot

## Drug  

#### Variables
- drug
    - nominal   
    - two values: Ecstasy, Alcohol  
- sundayBDI  
    - ratio
- wedsBDI
    - ratio
- BDIchange
    - ratio

#### Hypothesis
- drug: Ecstasy > Alcohol on wedsBDI  
- sundayBDI ~ BDIchange 

Scatter plot: association/relationship between two continuous variables  
Box plot: comparison of a continuous variable between two groups
    

In [15]:
# Six observations, labelled with the string row labels '0'..'5'.
row_labels = [str(position) for position in range(6)]

column_values = {
    'drug': ["Ecstasy", "Ecstasy", "Ecstasy", "Alcohol", "Alcohol", "Alcohol"],
    'sundayBDI': [15, 35, 16, 16, 15, 20],
    'wedsBDI': [28, 35, 35, 5, 6, 30],
    'BDIchange': [13, 0, 19, -11, -9, 10],
}

# `d` maps each column name to a Series sharing the same string index.
d = {
    column: pd.Series(values, index=row_labels)
    for column, values in column_values.items()
}

drug = pd.DataFrame(d)

###  drug: Ecstasy > Alcohol on wedsBDI 

If a normality check is necessary: Q-Q plot  
If an equal-variance check is necessary: error bar plot with standard deviation  

#### Normality

In [17]:
# Normal Q-Q plot for wedsBDI.
# transform_quantile emits `prob` / `value` pairs (sample quantiles); a Q-Q
# plot must compare those sample quantiles with the quantiles of the reference
# distribution at the same probabilities, not with the probabilities
# themselves. `quantileNormal` supplies the standard-normal quantiles, so
# points lying close to a straight line indicate approximate normality.
alt.Chart(drug).transform_quantile(
    'wedsBDI', step=0.01
).transform_calculate(
    normal='quantileNormal(datum.prob)'
).mark_point().encode(
    x=alt.X('normal:Q', title='Theoretical normal quantile'),
    y=alt.Y('value:Q', title='wedsBDI sample quantile')
).properties(title='Q-Q Plot - wedsBDI')

#### Equal Variance

In [20]:
# Spread of wedsBDI per drug, drawn as three layers that share one base chart:
# a standard-deviation error bar, the group mean (black dot) and the raw
# observations (ticks).
spread_base = alt.Chart(drug).properties(height=100)

stdev_bar = spread_base.mark_errorbar(extent='stdev').encode(
    x=alt.X('wedsBDI:Q', scale=alt.Scale(zero=False)),
    y=alt.Y('drug:N'),
)

group_mean = spread_base.mark_point(filled=True, color='black').encode(
    x=alt.X('wedsBDI:Q', aggregate='mean'),
    y=alt.Y('drug:N'),
)

observations = spread_base.mark_tick().encode(
    x=alt.X('wedsBDI:Q'),
    y=alt.Y('drug:N'),
)

stdev_bar + group_mean + observations

In [55]:
# Box plot of wedsBDI per drug with the group mean overlaid as a red point;
# hovering the point shows the mean and the drug name.
box = (
    alt.Chart(drug)
    .mark_boxplot()
    .encode(
        alt.X('drug'),
        alt.Y('wedsBDI'),
    )
    .properties(width=200)
)

mean_marker = (
    alt.Chart(drug)
    .mark_point(color='red')
    .encode(
        x=alt.X('drug:N'),
        y=alt.Y('wedsBDI:Q', aggregate='mean'),
        tooltip=[
            alt.Tooltip('mean(wedsBDI):Q'),
            alt.Tooltip('drug:N'),
        ],
    )
)

box + mean_marker

### sundayBDI ~ BDIchange 

In [56]:
# Scatter plot: sundayBDI on x against BDIchange on y.
bdi_scatter = (
    alt.Chart(drug)
    .mark_point()
    .encode(
        alt.X('sundayBDI'),
        alt.Y('BDIchange'),
    )
)
bdi_scatter

## Olympics  

#### Variables 
- Sport
    - nominal  
    - two values: Swimming, Wrestling  
- Sex
    - nominal
    - two values: M, F
- Weight
    - ratio
    
#### Hypothesis  
- Sport: Wrestling > Swimming on Weight  
    - histogram  
- Sex: F < M on Weight  
    - histogram

In [58]:
# Read the athlete events table used in the Olympics section.
olympics_csv = 'data/athlete_events_cleaned_weight.csv'
olympics = pd.read_csv(olympics_csv)

### Sport: Wrestling > Swimming on Weight  

If the dataset has more than 5000 rows, use a plotly combined chart instead.

In [60]:
# Restrict the athletes to the two sports being compared.
categories = ['Swimming', 'Wrestling']
in_selected_sport = olympics['Sport'].isin(categories)
olympics_select = olympics.loc[in_selected_sport]

In [62]:
# Does the filtered table exceed the 5000-row threshold mentioned above?
len(olympics_select) > 5000

True

In [63]:
import plotly.graph_objects as go
import plotly.figure_factory as ff

In [65]:
# Overlaid, density-normalised Weight histograms, one colour per Sport,
# with a marginal box plot above the histogram.
sport = px.histogram(
    olympics_select,
    x="Weight",
    color="Sport",
    nbins=70,
    histnorm='probability density',
    marginal="box",
    opacity=0.5,
)
# Label the histogram's y axis.
sport.update_layout(yaxis_title='density')
sport.show()

### Sex: F < M on Weight

In [66]:
# Overlaid, density-normalised Weight histograms, one colour per Sex,
# with a marginal box plot above the histogram.
# The figure gets its own name so it no longer overwrites the `sport`
# figure from the previous section under a misleading name.
sex = px.histogram(
    olympics_select,
    x="Weight",
    histnorm='probability density',
    color="Sex",
    opacity=0.5,
    marginal="box",
    nbins=70)
# Label the histogram's y axis.
sex.update_layout(yaxis_title='density')
sex.show()

## US Crime

#### Variables 
- Prob
    - ratio
- Ineq
    - ratio
    
#### Hypothesis
- Ineq ~ -Prob
    - Scatter Plot

In [67]:
# Read the US crime table used for the Ineq / Prob scatter plot.
crime_csv = 'data/UScrime.csv'
crime = pd.read_csv(crime_csv)

In [76]:
# Scatter plot: Ineq on x against Prob on y.
crime_scatter = (
    alt.Chart(crime)
    .mark_point()
    .encode(
        alt.X('Ineq'),
        alt.Y('Prob'),
    )
)
crime_scatter