# Alternative sperm exploration

Several ideas are on the table:

- Reduce each sample (biounit) to a single quantity
    - This quantity should be a measure of central tendency or dispersion
    - For central tendency -> mean, median
    - For dispersion -> standard deviation, IQR
    - Bonus: skewness and kurtosis
- Create a KDE plot for every sample, with the hue set to the replicate, to help spot and remove outlying technical replicates
- Run ANOVA/Kruskal-Wallis (KW) tests with appropriate post-hoc tests to help identify outlying animals


In [15]:
import pandas as pd
import seaborn as sns
from sklearn.utils import resample 
import pingouin as pg
import numpy as np
import warnings
import ipywidgets as widgets
from IPython.display import display
import matplotlib.pyplot as plt

# Notebook-wide configuration: ggplot theme, and every warning silenced.
plt.style.use("ggplot")
warnings.filterwarnings("ignore")


# Load the measurements, skipping the first CSV column, then keep only the
# rows whose "RED-HLin" value is below 4000.
data = pd.read_csv("sperm_experiment.csv").iloc[:, 1:]
data = data[data["RED-HLin"] < 4000]

# Distinct sample identifiers in ascending order (feeds the sample dropdown).
samples = sorted(set(data["Sample"]))

# Names of the summary statistics offered in the statistic dropdown.
stat_options = ["mean", "median", "std", "kurtosis", "skew", "iqrd"]

#private functions
def _mean(series):
    """Return ``series.mean()`` (expects an object with a ``mean`` method,
    e.g. a pandas Series)."""
    average = series.mean()
    return average
def _median(series):
    """Return ``series.median()`` (expects an object with a ``median``
    method, e.g. a pandas Series)."""
    middle = series.median()
    return middle
def _std(series):
    """Return ``series.std()``, the standard deviation as computed by the
    series' own ``std`` method with its default arguments."""
    spread = series.std()
    return spread
def _kurtosis(series):
    """Return ``series.kurtosis()``, the kurtosis as computed by the
    series' own ``kurtosis`` method with its default arguments."""
    tailedness = series.kurtosis()
    return tailedness
def _skew(series):
    """Return ``series.skew()``, the skewness as computed by the series'
    own ``skew`` method with its default arguments."""
    asymmetry = series.skew()
    return asymmetry
def _iqrd(series):
    """Return the interquartile distance of ``series``: its 0.75 quantile
    minus its 0.25 quantile."""
    upper_quartile = series.quantile(0.75)
    lower_quartile = series.quantile(0.25)
    return upper_quartile - lower_quartile

def drawing_kdeplots(sample, **kwargs):
    """Show a density plot of the DNA-content signal of one sample.

    sample   - value matched against the ``Sample`` column of ``data``
    **kwargs - optional ``column``: "lin" plots "RED-HLin", any other value
               plots "RED-HLog"; when omitted, "RED-HLin" is plotted

    One KDE curve is drawn per value of the ``Replicate`` column.
    """
    scale = kwargs.get("column", "lin")
    signal_column = "RED-HLin" if scale == "lin" else "RED-HLog"
    rows_of_sample = data.loc[data.Sample == sample]

    # The Figure is instantiated directly and shown explicitly via display().
    figure = plt.Figure(figsize=(10, 10))
    axes = figure.add_subplot(111)
    sns.kdeplot(data=rows_of_sample, x=signal_column, hue="Replicate", ax=axes)
    display(figure)
    
def statistic_calculator(statistic="mean",column="lin"):
    """Reduce every sample currently present in ``data`` to one summary value.

    Parameters
    ----------
    statistic : str
        One of "mean", "median", "std", "kurtosis", "skew", "iqrd".
    column : str
        "lin" selects the "RED-HLin" column; any other value selects
        "RED-HLog".

    Returns
    -------
    pandas.DataFrame
        One row per sample, in ascending sample order. Each row holds the
        first seven columns of that sample's first row (with "Replicate"
        dropped and "Name" renamed to "Experiment") plus one column named
        after ``statistic`` holding the computed value.

    Raises
    ------
    KeyError
        If ``statistic`` is not one of the supported names.
    """
    _commands = {"mean":_mean, "median":_median, "std":_std,
                 "kurtosis":_kurtosis, "skew":_skew, "iqrd":_iqrd
                }
    # Resolve the reducer once, up front, so a typo fails with a useful message.
    try:
        reducer = _commands[statistic]
    except KeyError:
        raise KeyError(
            f"unknown statistic {statistic!r}; expected one of {sorted(_commands)}"
        ) from None

    col = "RED-HLin" if column == "lin" else "RED-HLog"

    # Iterate over the samples that are in ``data`` *now* rather than over a
    # list computed at load time: ``data`` is re-filtered later in the
    # notebook, and a sample left without rows would otherwise make
    # ``.iloc[0, :7]`` fail. groupby also avoids filtering the full frame
    # twice per sample and yields the groups in sorted key order.
    per_sample_rows = []
    for _, sample_rows in data.groupby("Sample", sort=True):
        # First seven columns of the first row: presumably the per-sample
        # metadata (Sample, Name, Treatment, Replicate, ...) - TODO confirm
        # against the CSV layout.
        summary = sample_rows.iloc[0, :7].copy()
        summary[statistic] = reducer(sample_rows[col])
        per_sample_rows.append(summary)

    # Each summary is a Series; concatenating them as columns and transposing
    # yields one row per sample.
    statistic_dataframe = pd.concat(per_sample_rows, axis=1).T
    statistic_dataframe.reset_index(inplace=True, drop=True)
    statistic_dataframe.drop(["Replicate"], axis=1, inplace=True)
    statistic_dataframe.rename(columns={"Name": "Experiment"}, inplace=True)
    return statistic_dataframe

def visualizer_statistic(statistic):
    """Plot the chosen statistic per sample (swarmplot) and pooled per
    experiment/treatment (pointplot), side by side.

    statistic - name accepted by ``statistic_calculator``
    """
    per_sample = statistic_calculator(statistic)
    label = statistic.capitalize()

    figure = plt.Figure(figsize=(15, 5))
    left_ax = figure.add_subplot(121)
    right_ax = figure.add_subplot(122)
    left_ax.set_title(f"Pointplot for Samples and their {label}")
    right_ax.set_title(f"Pointplot for Treatments and their {label}")

    # Left panel: one marker per sample, coloured by treatment.
    sns.swarmplot(
        data=per_sample,
        x="Sample",
        y=statistic,
        hue="Treatment",
        ax=left_ax,
        s=10,
    )
    # Right panel: samples pooled by experiment, split by treatment.
    sns.pointplot(
        data=per_sample,
        x="Experiment",
        hue="Treatment",
        y=statistic,
        dodge=True,
        join=False,
        capsize=.1,
        ax=right_ax,
    )
    display(figure)

def testing_statistic(statistic, between, parametric):
    """Test whether the chosen per-sample statistic differs between two groups.

    Parameters
    ----------
    statistic : str
        Name accepted by ``statistic_calculator``.
    between : str
        Column of the per-sample table that defines the groups
        (the notebook offers "Treatment" and "Experiment").
    parametric : bool
        True runs an unpaired t-test (``pg.ttest``); False runs a
        Mann-Whitney U test (``pg.mwu``).

    The result table is shown with ``display``; nothing is returned. Only the
    first two group labels (in sorted order) are compared.

    Raises
    ------
    ValueError
        If fewer than two distinct groups are present in ``between``.
    """
    data_t = statistic_calculator(statistic)

    # Take the group labels from the per-sample table, not from the raw
    # ``data`` frame: "Experiment" only exists after statistic_calculator
    # renames "Name". Sorting keeps the x/y assignment (and hence the sign of
    # the test statistic) stable between sessions, unlike set ordering.
    groups = sorted(data_t[between].dropna().unique())
    if len(groups) < 2:
        raise ValueError(
            f"need at least two groups in {between!r} to run a test, found {groups}"
        )

    first = list(data_t.loc[data_t[between] == groups[0], statistic])
    second = list(data_t.loc[data_t[between] == groups[1], statistic])

    if parametric:
        test = pg.ttest(x=first, y=second, paired=False)
    else:
        test = pg.mwu(x=first, y=second)

    # Prepend bookkeeping columns so the displayed table is self-describing.
    test.insert(0, "Between", between)
    test.insert(0, "Based on", statistic)
    display(test)

# Shared input widgets, reused by the interactive cells further down.

# Which summary statistic to compute; starts on the first option.
statisticW = widgets.Dropdown(
    description="Statistic",
    options=stat_options,
    value=stat_options[0],
)
# Which grouping column the statistical test compares.
betweenW = widgets.Dropdown(
    description="Between",
    options=["Treatment", "Experiment"],
    value="Treatment",
)
# Ticked: parametric test; unticked: non-parametric test.
parametricW = widgets.Checkbox(
    description='Parametric',
    value=True,
    disabled=False,
    indent=False,
)
# Which sample to plot; starts on the first sample.
samplesW = widgets.Dropdown(
    description="Sample: ",
    options=samples,
    value=samples[0],
)


# Draw KDE plots of a particular sample

In [16]:
# Wire the sample dropdown to drawing_kdeplots via ipywidgets' interactive().
widgets.interactive(drawing_kdeplots,sample=samplesW)

interactive(children=(Dropdown(description='Sample: ', options=(55, 56, 57, 58, 60, 87, 88, 89, 90, 91, 92), v…

# Drop the replicates that don't work

In [17]:
# Replicates excluded from the analysis (visually deduced from the KDE plots).
# Use an empty list here instead to keep every replicate.
reps_to_drop = ["55:2", "56:3", "57:1", "90:1", "92:1"]
data = data[~data["Replicate"].isin(reps_to_drop)]
# Show which replicates remain after the filter.
print(sorted(set(data["Replicate"])))

['55:1', '55:3', '56:1', '56:2', '57:2', '57:3', '58:1', '58:2', '58:3', '60:1', '60:2', '60:3', '87:1', '87:2', '87:3', '88:1', '88:2', '88:3', '89:1', '89:2', '89:3', '90:2', '90:3', '91:1', '91:2', '91:3', '92:2', '92:3']


# Visualize and Test

In [18]:
# Wire the statistic dropdown to visualizer_statistic via ipywidgets' interactive().
widgets.interactive(visualizer_statistic,statistic=statisticW)

interactive(children=(Dropdown(description='Statistic', options=('mean', 'median', 'std', 'kurtosis', 'skew', …

In [19]:
# Wire the statistic, grouping and parametric widgets to testing_statistic
# via ipywidgets' interactive().
widgets.interactive(testing_statistic,statistic=statisticW,between=betweenW,parametric=parametricW,)

interactive(children=(Dropdown(description='Statistic', options=('mean', 'median', 'std', 'kurtosis', 'skew', …