# Creating Dataframe

In [None]:
%reset -f
import glob, os, sys, io
sys.path.insert(1, os.path.join(sys.path[0], '..'))
import pandas as pd
import numpy as np

from wavhandler import *
from utils import *
import multiprocessing

# Configure the root logger (getLogger() with no name returns it) so that
# only records of level ERROR or above pass its level check.
import logging
logger = logging.getLogger()
# NOTE(review): the root logger has no parent, so disabling propagation is
# probably a no-op here -- confirm whether a named logger was intended.
logger.propagate = False
logger.setLevel(logging.ERROR)
# Fix the seed of NumPy's global random state.
np.random.seed(0)
import seaborn as sns
# Switch matplotlib to seaborn's default theme for the figures below.
sns.set()

In [None]:
def process_parallel(path):
    """Compute the feature table for a single recording.

    Used as the worker function handed to ``multiprocessing.Pool.map`` in
    ``make_insect_df``, so it takes exactly one argument.

    Parameters
    ----------
    path : str
        Path of the recording to process.

    Returns
    -------
    pandas.DataFrame
        The mapping returned by ``process_signal`` turned into rows
        (``orient='index'``), with the former index exposed as an ``index``
        column and the source path stored in a ``names`` column.
    """
    data, _ = read_simple([path])
    # Derive the file stem from the path itself instead of assuming a
    # three-letter extension (the previous ``[:-4]`` slice silently mangled
    # names with any other extension, or with none at all).
    fname = os.path.splitext(os.path.basename(path))[0]
    features = process_signal(fname, data)
    df = pd.DataFrame.from_dict(features, orient='index').reset_index()
    # Keep the full path so later steps can recover directory-level labels.
    df['names'] = path
    return df

def make_insect_df(insect_class='Culex',
                   data_root='/home/yannis/data/insects/Potamitis/Wingbeats',
                   out_dir='./data'):
    """Extract features for every recording of one class and save them as CSV.

    All files found by ``WavHandler`` under ``<data_root>/<insect_class>`` are
    processed in parallel with ``process_parallel``; the per-file tables are
    concatenated and written to ``<out_dir>/<insect_class>.csv`` using ``;``
    as separator.

    Parameters
    ----------
    insect_class : str
        Name of the sub-directory of ``data_root`` to process; also used as
        the stem of the output file.
    data_root : str
        Directory that contains one sub-directory per insect class. Defaults
        to the location that used to be hard-coded.
    out_dir : str
        Directory that receives the CSV file; created when missing.

    Raises
    ------
    ValueError
        If no files are found for the requested class.
    """
    print('Setting the number of cores..')
    try:
        cpus = multiprocessing.cpu_count()
    except NotImplementedError:
        cpus = 2   # arbitrary default

    print('Gathering all files for selected class..')
    wavhdlr = WavHandler('{}/{}'.format(data_root, insect_class), sample_size=-1, recursive=True)
    wavhdlr.read(create_table=True)
    names = wavhdlr.df_table.names.tolist()
    if not names:
        # pd.concat would raise the same exception type further down, but
        # only after a pool was spun up and with a far less helpful message.
        raise ValueError('No files found for class {!r} under {!r}'.format(insect_class, data_root))

    print('Creating poll of processes..')
    # The context manager shuts the workers down once map() has returned;
    # previously the pool was never closed and its processes lingered.
    with multiprocessing.Pool(processes=cpus) as pool:
        print('Calculating..')
        frames = pool.map(process_parallel, names)
    print('Creating Dataframe..')
    df = pd.concat(frames, axis=0, sort=False)
    print('Saving Dataframe to csv..')
    # Make sure the target directory exists so the computed result is not
    # lost to a missing-directory error at the very last step.
    os.makedirs(out_dir, exist_ok=True)
    df.to_csv(os.path.join(out_dir, insect_class + '.csv'), sep=';')
    print('Done.')

In [None]:
# make_insect_df(insect_class='Culex')
# make_insect_df(insect_class='Anopheles')
# make_insect_df(insect_class='Aedes')

# Reading and Exporting Dataframe

In [None]:
def read_insect_df(insect_class='Culex'):
    """Load the saved feature CSV of one class and add metadata columns.

    Reads ``./data/<insect_class>.csv`` (``;``-separated), removes the
    ``Unnamed: 0`` column, renames ``index`` to ``fname`` and appends:

    * ``pathlen``  -- number of ``/``-separated parts of ``names``
    * ``fnamelen`` -- number of ``_``-separated parts of ``fname``
    * ``temp``     -- third-from-last ``_`` part of ``fname`` when it has more
      than five parts, otherwise NaN (presumably the recorded temperature)
    * ``humd``     -- last ``_`` part of ``fname`` under the same condition
      (presumably the recorded humidity)
    * ``label1``   -- the ``insect_class`` argument
    * ``label2``   -- third-from-last ``/`` part of ``names``

    Parameters
    ----------
    insect_class : str
        Stem of the CSV file to read; also stored in ``label1``.

    Returns
    -------
    pandas.DataFrame
    """
    frame = pd.read_csv('./data/' + insect_class + '.csv', delimiter=';')
    del frame['Unnamed: 0']
    frame = frame.rename(columns={'index': 'fname'})

    # Split every path / file name once and reuse the pieces below.
    path_parts = frame.names.apply(lambda full_path: full_path.split('/'))
    name_parts = frame.fname.apply(lambda stem: stem.split('_'))

    def part_or_nan(parts, position):
        # Only names with more than five fields carry the extra values.
        if len(parts) > 5:
            return parts[position]
        return np.nan

    frame['pathlen'] = path_parts.apply(len)
    frame['fnamelen'] = name_parts.apply(len)
    frame['temp'] = name_parts.apply(lambda parts: part_or_nan(parts, -3))
    frame['humd'] = name_parts.apply(lambda parts: part_or_nan(parts, -1))
    frame['label1'] = insect_class
    frame['label2'] = path_parts.apply(lambda parts: parts[-3])

    return frame

In [None]:
# Load the per-class tables in a fixed order, keeping each one available
# under its own name, then stack them row-wise into a single frame.
df_culex, df_anoph, df_aedes = (
    read_insect_df(insect_class=genus)
    for genus in ('Culex', 'Anopheles', 'Aedes')
)

per_class_frames = [df_culex, df_anoph, df_aedes]
df = pd.concat(per_class_frames, axis=0)
# Display five random rows as a quick sanity check.
df.sample(5)

## Analyzing dataframe

In [None]:
import matplotlib.pyplot as plt
nr_bins = 100
# One histogram per feature column, laid out on a 2x3 grid in this order.
plt.figure(figsize=(24,5))
for panel, column in enumerate(['fr0', 'fr1', 'fr2', 'pow0', 'pow1', 'pow2'], start=1):
    plt.subplot(2, 3, panel)
    plt.title(column)
    df[column].hist(bins=nr_bins)
plt.show()

## After cleaning

In [None]:
# Keep only rows that pass every threshold: lower bounds on the three
# damping values and the three powers, upper bounds on the three frequencies.
damping_ok = (df.damping_0 > 0.005) & (df.damping_1 > 0.005) & (df.damping_2 > 0.005)
power_ok = (df.pow0 > 0.025) & (df.pow1 > 0.005) & (df.pow2 > 0.0005)
frequency_ok = (df.fr0 < 900.) & (df.fr1 < 2000.) & (df.fr2 < 2500.)

df = df[damping_ok & power_ok & frequency_ok]

df.shape

In [None]:
# Persist the current contents of df as a pickle file under ./data.
df.to_pickle('./data/mosquitos.pkl')

In [None]:
import matplotlib.pyplot as plt
nr_bins = 100
# One histogram per feature column, laid out on a 2x3 grid in this order.
plt.figure(figsize=(24,7))
for panel, column in enumerate(['fr0', 'fr1', 'fr2', 'pow0', 'pow1', 'pow2'], start=1):
    plt.subplot(2, 3, panel)
    plt.title(column)
    df[column].hist(bins=nr_bins)
plt.show()

In [None]:
# Display ten randomly chosen rows of df.
df.sample(10)

In [None]:
# Plot the fr0 distribution separately for every distinct value of `label`.
label = 'label2'
mosqlist = df[label].unique().tolist()
# Two panels per row. Keep at least the original three rows, and add rows
# when there are more than six labels: with a fixed 3x2 grid plt.subplot
# raises as soon as a seventh panel is requested.
ncols = 2
nrows = max(3, (len(mosqlist) + ncols - 1) // ncols)
# Four inches of height per row reproduces the original 24x12 figure for
# three rows and keeps panels readable when more rows are needed.
plt.figure(figsize=(24, 4 * nrows))
for panel, mosquito in enumerate(mosqlist, start=1):
    plt.subplot(nrows, ncols, panel)
    plt.title(mosquito)
    df[df[label] == mosquito].fr0.hist(bins=50)
    #plt.ylim(0,8500)