In [1]:
import itertools

# Our numerical workhorses
import numpy as np
import pandas as pd
import scipy.integrate

# Import Altair for high level plotting
import altair as alt
import altair_catplot as altcat

# Import Bokeh modules for interactive plotting
import bokeh.io
import bokeh.plotting

# Set up Bokeh for inline viewing
bokeh.io.output_notebook()

# Pevent bulky altair plots
alt.data_transformers.enable('json')

DataTransformerRegistry.enable('json')

In [2]:
# Load in the genotype file, call it df_gt for genotype DataFrame
df_gt = pd.read_csv("../data/fish_activity_for_validation/150717_2A_genotype_3.txt", 
                  delimiter='\t', comment ='#')

In [3]:
# Tidy the DataFrame
df_gt = pd.melt(df_gt, var_name='genotype', value_name='location')

# Drop all rows that have a NaN in them
df_gt = df_gt.dropna()

df_gt = df_gt.reset_index(drop=True)

df_gt.loc[:,'location'] = df_gt.loc[:, 'location'].astype(int)

In [4]:
df_gt.head(10)

Unnamed: 0,genotype,location
0,wt,1
1,wt,3
2,wt,8
3,wt,12
4,wt,21
5,wt,28
6,wt,31
7,wt,33
8,wt,34
9,wt,35


In [5]:
# Now, load in the CSV file
fname = '../data/fish_activity_for_validation/150717_2A_2B.csv'
df = pd.read_csv(fname, comment='#')

First, we want to test that all the columns are what we excpect them to be before we start tidying the data.

In [6]:
def test_column_names(df, fname):
    """Ensure DataFrame has proper columns."""
    column_names = ['location','animal','user','sn','an','datatype','start','end','startreason',
                    'endreason','frect', 'fredur', 'midct', 'middur', 'burct', 'burdur', 'stdate',
                    'sttime']

    assert list(df.columns) == column_names, fname + ' has wrong column names.'

In [7]:
test_column_names(df, fname)

Now, we get rid of the c from location column, and convert to an integer type so we can merge this dataframe with the genotype dataframe.

In [9]:
sanititize_loc = lambda x: int(x.replace("c", ""))

df['location'] = df['location'].apply(sanititize_loc)

In [10]:
# We perform the merge.
df = pd.merge(df, df_gt)

We are only looking for fish in instrument 2A, numbered 1 through 96 in the activity data file, so we can get rid of fish in instrument 2B, numbered 97 and above, as they are not used.

In [11]:
df.head()

Unnamed: 0,location,animal,user,sn,an,datatype,start,end,startreason,endreason,frect,fredur,midct,middur,burct,burdur,stdate,sttime,genotype
0,1,z001,ZEBRALAB02\zebralab_user,1,0,quant,0.0,60.0,Beginning of session,End of period,0,60.0,0,0.0,0,0.0,17/07/2015,14:29:59,wt
1,1,z001,ZEBRALAB02\zebralab_user,1,0,quant,60.0,120.0,End of period,End of period,0,60.0,0,0.0,0,0.0,17/07/2015,14:30:59,wt
2,1,z001,ZEBRALAB02\zebralab_user,1,0,quant,120.0,180.0,End of period,End of period,6,59.0,6,0.9,0,0.0,17/07/2015,14:31:59,wt
3,1,z001,ZEBRALAB02\zebralab_user,1,0,quant,180.0,240.0,End of period,End of period,0,60.0,0,0.0,0,0.0,17/07/2015,14:32:59,wt
4,1,z001,ZEBRALAB02\zebralab_user,1,0,quant,240.0,300.0,End of period,End of period,0,60.0,0,0.0,0,0.0,17/07/2015,14:33:59,wt


In [29]:
df["location"] = df[df["location"] < 97]
df.tail()

Unnamed: 0,location,animal,user,sn,an,datatype,start,end,startreason,endreason,frect,fredur,midct,middur,burct,burdur,stdate,sttime,genotype
325396,96,z096,ZEBRALAB02\zebralab_user,1,0,quant,246840.0,246900.0,End of period,End of period,4,59.6,4,0.4,0,0.0,20/07/2015,11:03:59,het
325397,96,z096,ZEBRALAB02\zebralab_user,1,0,quant,246900.0,246960.0,End of period,End of period,28,55.2,29,4.8,0,0.0,20/07/2015,11:04:59,het
325398,96,z096,ZEBRALAB02\zebralab_user,1,0,quant,246960.0,247020.0,End of period,End of period,69,50.2,68,9.9,0,0.0,20/07/2015,11:05:59,het
325399,96,z096,ZEBRALAB02\zebralab_user,1,0,quant,247020.0,247080.0,End of period,End of period,62,51.6,63,8.4,0,0.0,20/07/2015,11:06:59,het
325400,96,z096,ZEBRALAB02\zebralab_user,1,0,quant,247080.0,247103.7,End of period,End of session,27,19.3,26,4.4,0,0.0,20/07/2015,11:07:59,het


We don't want any of our numerical columns to be negative; we'll write a general function for it and apply it to all the numerical columns later on.

In [47]:
# Loop through all the numerical columns (given in cols) and check that they are positive
def num_negative_col(df, cols):
    results = []
    for col in cols:
       results.append("The column {} has {} negative values".format (col, (df[col] < 0).sum()))
    return results

In [48]:
cols = ["location", "sn", "an", "start", "end", "frect", "fredur", "midct", "burct", "burdur"]
num_negative_col(df, cols)

['The column location has 0 negative values',
 'The column sn has 0 negative values',
 'The column an has 0 negative values',
 'The column start has 0 negative values',
 'The column end has 0 negative values',
 'The column frect has 0 negative values',
 'The column fredur has 0 negative values',
 'The column midct has 0 negative values',
 'The column burct has 0 negative values',
 'The column burdur has 0 negative values']

We want to check that the duration of all the periods are equal! We take the most common interval duration and see how many data points deviate from the mode.

In [104]:
# Make sure that all the time intervals are equal
def test_time_intervals(df):
    df_temp = df
    df_temp["length"] = df["end"] - df["start"]
    results = []
    
    # Find the mode of the column
    mode = df_temp["length"].mode()[0]
    
    outliers = df_temp.loc[df_temp["length"] != mode]
    
    results.append("The mode is {}".format(mode))
    results.append("{} rows deviate from the mode".format(outliers.shape[0]))
    
    # commenting this out for now bc it produces a lot of output
#     for index, row in outliers.iterrows():
#         results.append("The row at index {} has a duration of {}"
#                        .format(index, round(row["length"], 2)))
    
    return results 

In [105]:
test_time_intervals(df)

['The mode is 60.0', '237 rows deviate from the mode']

Now, we get into the more *creative* data validation. The middur column is what we use for computing activity. We will first check the activity column middur. Obviously, fish cannot be spend more seconds active than the total duration of the period.

In [134]:
def test_middur(df):
    df_temp = df
    df_temp["length"] = df["end"] - df["start"]
    
    outliers = df_temp[df["middur"] > df_temp["length"]].shape[0]
    
    if outliers != 0:
        return "Something is wrong! In {} periods, fish ".format(outliers) + \
                "were active longer than the total duration"
    
    return "Number of seconds active is less than total period duration for all trials"

In [135]:
test_middur(df)

'Number of seconds active is less than total period duration for all trials'

Notes: 
1. middur columns is what we use for computing activity.
2. Only the fish in instrument 2A, numbered 1 through 96 in the activity data file 150717_2A_2B.csv, were genotyped. The fish in instrument 2B, numbered 97 and above, are not used in the assay. **does this mean we filter them out?**

Enforce that fish 1-96 are here

Dates and times need to be in a specific format

Fish have equal # data points

Numbers need to be non negative

intervals 60 sec and cont. -> fail silently

sn and an are binary -> what do they mean! They are always the same

check datatype all quant

middur is a double bw 0-60


Get rid of the c's in location combine with the other data frame and get rid of the NaNs


start and end excpet for the exception (max and min)



-----------
check for what the time interval is and then remove all the extraneous things. Only then can you really know what activity values to get rid of.

- calculate freq with fancy max function

- fit distribution to frequncy data 

- 

Questions:
    
1. When we have a row duration of more that 60 seconds, should we throw it out?