In [None]:
import itertools

# Our numerical workhorses
import numpy as np
import pandas as pd
import scipy.integrate

# Import Altair for high level plotting
import altair as alt
import altair_catplot as altcat

# Import Bokeh modules for interactive plotting
import bokeh.io
import bokeh.plotting

# Set up Bokeh for inline viewing
bokeh.io.output_notebook()

# Prevent bulky Altair plots by enabling the 'json' data transformer
# (presumably this keeps chart data out of the notebook itself -- confirm against the Altair docs)
alt.data_transformers.enable('json')

In [None]:
# Read the tab-separated genotype file into df_gt (the genotype DataFrame).
# Anything following a '#' on a line is treated as a comment and ignored.
df_gt = pd.read_csv(
    "../data/fish_activity_for_validation/150717_2A_genotype_3.txt",
    sep='\t',
    comment='#',
)

In [None]:
# Tidy the DataFrame: melt the wide layout (one column per genotype) into
# one row per (genotype, location) pair.
df_gt = pd.melt(df_gt, var_name='genotype', value_name='location')

# Drop all rows that have a NaN in them, then renumber the rows from zero
df_gt = df_gt.dropna().reset_index(drop=True)

# Convert the locations to integers so they can be matched against the activity data.
# The column is re-assigned by name on purpose: writing through
# df_gt.loc[:, 'location'] sets values in place in pandas >= 2.0, which would
# silently keep the original float dtype.
df_gt['location'] = df_gt['location'].astype(int)

In [None]:
# Preview the first ten rows of the genotype DataFrame
df_gt.head(10)

In [None]:
# Read the activity CSV into df; text after a '#' is treated as a comment.
# The path is kept in `fname` so it can be reported by the validation checks.
fname = '../data/fish_activity_for_validation/150717_2A_2B.csv'
df = pd.read_csv(filepath_or_buffer=fname, comment='#')

First, we want to test that all the columns are what we expect them to be before we start tidying the data.

In [None]:
def test_column_names(df, fname):
    """Ensure DataFrame has proper columns."""
    column_names = ['location','animal','user','sn','an','datatype','start','end','startreason',
                    'endreason','frect', 'fredur', 'midct', 'middur', 'burct', 'burdur', 'stdate',
                    'sttime']

    assert list(df.columns) == column_names, fname + ' has wrong column names.'

In [None]:
# Check the activity file's column names; raises AssertionError on a mismatch
test_column_names(df, fname)

In [None]:
# Preview the first rows of the activity data
df.head()

Now, we remove the `c` from the `location` column and convert it to an integer type so we can merge this DataFrame with the genotype DataFrame.

In [None]:
def sanititize_loc(x):
    """Return a location label with every 'c' removed, as an int (e.g. 'c12' -> 12).

    The value is passed through ``str`` first so that an already-converted
    integer is accepted as well; this keeps the cell safe to re-run.
    """
    return int(str(x).replace("c", ""))


# Convert the location labels to integers
df['location'] = df['location'].apply(sanititize_loc)

In [None]:
# Attach the genotype information to the activity records. No keys are given,
# so pandas joins on the column labels the two frames have in common.
df = df.merge(df_gt)

We are only looking for fish in instrument 2A, numbered 1 through 96 in the activity data file, so we can get rid of fish in instrument 2B, numbered 97 and above, as they are not used.

In [None]:
# Keep only the rows whose location is below 97 (the fish in instrument 2A).
# The comparison yields a boolean row mask, so it is used to filter the whole
# frame rather than being assigned into the "location" column. The copy makes
# df an independent frame, so later column assignments do not act on a slice.
df = df.loc[df["location"] < 97].copy()
df.tail()

We don't want any of our numerical columns to be negative; we'll write a general function for it and apply it to all the numerical columns later on.

In [None]:
def num_negative_col(df, col):
    """Return number of negative entries for the given column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to check.
    col : str
        Label of a numerical column in `df`.

    Returns
    -------
    int
        How many entries of ``df[col]`` are strictly less than zero.
    """
    # Summing the boolean mask counts the True entries. Summing the filtered
    # frame instead would add up the values of every column in those rows.
    negatives = int((df[col] < 0).sum())
    return negatives

# use .all

Now, we get into the more *creative* data validation. We will first check the activity column `middur`. Activity must be a non-negative double between 0 and 60 (as a fish cannot be active for more than 60 seconds in a minute).


In [None]:
def test_middur(df):
    df_t = df[df["middur"] > 60].shape[0]
    df_f = df[df[isinstance(y, float)].shape[0]
    return df_f

In [None]:
# Make sure that all the time intervals are equal
def test_time_intervals(df):
    df_temp = df
    df_temp["length"] = df["end"] - df["end"]
    return df_temp
        
    

In [None]:
# Show the first rows of the frame returned by test_time_intervals
test_time_intervals(df).head()

In [None]:
# Pull out the rows whose middur exceeds 60 and look at the first few
df_t = df.loc[df["middur"].gt(60)]
df_t.head()

Notes: 
1. The `middur` column is what we use for computing activity.
2. Only the fish in instrument 2A, numbered 1 through 96 in the activity data file 150717_2A_2B.csv, were genotyped. The fish in instrument 2B, numbered 97 and above, are not used in the assay. **does this mean we filter them out?**

Enforce that fish 1-96 are here

Dates and times need to be in a specific format

Fish have equal # data points

Numbers need to be non negative

intervals 60 sec and cont. -> fail silently

sn and an are binary -> what do they mean! They are always the same

check datatype all quant

middur is a double bw 0-60


Get rid of the c's in location combine with the other data frame and get rid of the NaNs


start and end except for the exception (max and min)



-----------
check for what the time interval is and then remove all the extraneous things. Only then can you really know what activity values to get rid of.

- calculate freq with fancy max function

- fit distribution to frequency data 

- 