In [1]:
%reset -f
%reload_ext autoreload
%autoreload 2
%matplotlib inline
%config Completer.use_jedi = False

In [2]:
import sys
sys.path.insert(0,"..")
from tqdm import tqdm
# from tqdm.notebook import tqdm as tqdm
import numpy as np
import pandas as pd
import torch
torch.manual_seed(42)
import torch.nn as nn
import torch.optim as optim
import torchaudio
from torch.utils.data import Dataset, ConcatDataset
from torch.utils.data.sampler import SubsetRandomSampler
from torchvision import datasets, transforms
torchaudio.set_audio_backend('soundfile')
import os
import random
from pathlib import Path
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import psutil
import requests
import seaborn as sns

from utils import *
from datasets import *
from models import *
from transforms import *
from profiler import *

Available workers: 16


In [3]:
maxpeaks = 15
minpeaks = 1
maxscore = 12
minscore = 3.5
selection = f"peaks>{minpeaks} & peaks<{maxpeaks} & score>{minscore} & score<={maxscore}"

# Melanogaster and Suzukii main data

In [4]:
dmel1 = WingbeatDatasetProfiler(dsname="Melanogaster_RL/Z", custom_label=[0])
dsuz1 = WingbeatDatasetProfiler(dsname="Suzukii_RL/Y", custom_label=[1])
dmel2 = WingbeatDatasetProfiler(dsname="Melanogaster_RL/Y", custom_label=[0])
dsuz2 = WingbeatDatasetProfiler(dsname="Suzukii_RL/X", custom_label=[1])

Found 24763 in dataset: Melanogaster_RL/Z, and 1 label(s): ['D. melanogaster']
Label(s) changed to [0]


Collecting all data from the dataloader..: 1548it [00:19, 78.72it/s]                           


Creating a pandas Dataframe with file-paths, clean-scores, duration, sums of abs values, indice and labels..
Finished.
Found 25732 in dataset: Suzukii_RL/Y, and 1 label(s): ['D. suzukii']
Label(s) changed to [1]


Collecting all data from the dataloader..: 1609it [00:14, 110.37it/s]                          


Creating a pandas Dataframe with file-paths, clean-scores, duration, sums of abs values, indice and labels..
Finished.
Found 29002 in dataset: Melanogaster_RL/Y, and 1 label(s): ['D. melanogaster']
Label(s) changed to [0]


Collecting all data from the dataloader..: 1813it [00:08, 202.57it/s]                          


Creating a pandas Dataframe with file-paths, clean-scores, duration, sums of abs values, indice and labels..
Finished.
Found 19657 in dataset: Suzukii_RL/X, and 1 label(s): ['D. suzukii']
Label(s) changed to [1]


Collecting all data from the dataloader..: 1229it [00:13, 88.65it/s]                           


Creating a pandas Dataframe with file-paths, clean-scores, duration, sums of abs values, indice and labels..
Finished.


In [5]:
temprs, humds = [],[]
for i in [dmel1, dmel2, dsuz1, dsuz2]:
    i.wbts.parse_filenames(temp_humd=True)
    tmp = i.wbts.df
    humds.append(tmp.humidity)
    temprs.append(tmp.temperature)
humds, temprs = pd.concat(humds), pd.concat(temprs)

print(f"Average humidity: {(humds[humds>50]-50).mean().round(0)}")
print(f"std: {(humds[humds>50]-50).std().round(0)}")
print(f"Average temperature: {temprs.mean().round(1)}")
print(f"std: {temprs.std().round(1)}")


Average humidity: 64.0
std: 5.0
Average temperature: 21.7
std: 0.6


# Plots of time signals and PSDs

In [6]:
# dmel1.plot_random_wbts(noaxis=False)

In [7]:
# dmel1.plot_random_psds(noaxis=False)

In [8]:
# plt.figure(figsize=(16,6))
# sns.scatterplot(data=dsuz2.df, x="score", y="peaks", alpha=0.9);
# sns.scatterplot(data=dmel2.df, x="score", y="peaks", alpha=0.3);
# plt.plot([minscore, maxscore], [maxpeaks, maxpeaks], linewidth=2, c='black');
# plt.plot([maxscore, maxscore], [minpeaks, maxpeaks], linewidth=2, c='black');
# plt.plot([maxscore, maxscore], [minpeaks, maxpeaks], linewidth=2, c='black');
# plt.plot([minscore, maxscore], [minpeaks, minpeaks], linewidth=2, c='black');

In [9]:
# plt.ylim(0,450)
# np_hist(dsuz1.df, 'score')

In [10]:
# plt.ylim(0,450)
# np_hist(dmel1.df, 'score');

In [11]:
# plt.ylim(0,3700)
# np_hist(dsuz1.df, 'peaks')

In [12]:
# plt.ylim(0,3700)
# np_hist(dmel1.df, 'peaks')

##### duration

In [13]:
# np_hist(dmel1.df, 'duration', res=2.5)

In [14]:
dfmel1 = dmel1.df.query(selection)
dfmel1.y = 0
dfmel2 = dmel2.df.query(selection)
dfmel2.y = 0

dfsuz1 = dsuz1.df.query(selection)
dfsuz1.y = 1
dfsuz2 = dsuz2.df.query(selection)
dfsuz2.y = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

### Clean examples

In [15]:
# dmel1.plot_random_psds(df=dfmel1, noaxis=False)

In [16]:
torch.backends.cudnn.benchmark = True
batch_size = 64

In [17]:
train_df = pd.concat([dfmel1, dfsuz1])
test_df = pd.concat([dfmel2, dfsuz2])

print(train_df.y.value_counts())
print(test_df.y.value_counts())

1    16876
0    13021
Name: y, dtype: int64
1    13567
0    12118
Name: y, dtype: int64


# Saving

In [18]:
train_df.to_parquet("../data_created/suzmel_train.parquet")
test_df.to_parquet("../data_created/suzmel_test.parquet")

In [19]:
# X, y = train_df.iloc[:, 0], train_df.iloc[:, 1]
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, shuffle=True, random_state=42)

# dfs_train = DataFrameset(pd.concat([X_train, y_train], axis=1), transform=transforms.Compose(transforms_list_train))
# dfs_valid = DataFrameset(pd.concat([X_valid, y_valid], axis=1), transform=transforms.Compose(transforms_list_test))

# train_dataloader = DataLoader(dfs_train, batch_size=32, shuffle=True, num_workers=4, pin_memory=True, worker_init_fn=worker_init_fn)
# valid_dataloader = DataLoader(dfs_valid, batch_size=32, shuffle=True, num_workers=4, pin_memory=True, worker_init_fn=worker_init_fn)

## MeBioS Suzukii and Melanogaster data

In [20]:
ds1 = WingbeatDatasetProfiler(dsname="Suzukii_RL/R", custom_label=[1])
ds2 = WingbeatDatasetProfiler(dsname="Suzukii_RL/L", custom_label=[1])
ds3 = WingbeatDatasetProfiler(dsname="Melanogaster_RL/X", custom_label=[0])
ds4 = WingbeatDatasetProfiler(dsname="Melanogaster_RL/W", custom_label=[0])

Found 14348 in dataset: Suzukii_RL/R, and 1 label(s): ['D. suzukii']
Label(s) changed to [1]


Collecting all data from the dataloader..: 897it [00:09, 97.41it/s]                          


Creating a pandas Dataframe with file-paths, clean-scores, duration, sums of abs values, indice and labels..
Finished.
Found 21940 in dataset: Suzukii_RL/L, and 1 label(s): ['D. suzukii']
Label(s) changed to [1]


Collecting all data from the dataloader..: 1372it [00:19, 72.16it/s]                           


Creating a pandas Dataframe with file-paths, clean-scores, duration, sums of abs values, indice and labels..
Finished.
Found 2086 in dataset: Melanogaster_RL/X, and 1 label(s): ['D. melanogaster']
Label(s) changed to [0]


Collecting all data from the dataloader..: 131it [00:02, 52.88it/s]                         


Creating a pandas Dataframe with file-paths, clean-scores, duration, sums of abs values, indice and labels..
Finished.
Found 1882 in dataset: Melanogaster_RL/W, and 1 label(s): ['D. melanogaster']
Label(s) changed to [0]


Collecting all data from the dataloader..: 118it [00:02, 52.06it/s]                          


Creating a pandas Dataframe with file-paths, clean-scores, duration, sums of abs values, indice and labels..
Finished.


In [21]:
temprs, humds = [],[]
for i in [ds1, ds2, ds3, ds4]:
    i.wbts.parse_filenames(temp_humd=True)
    tmp = i.wbts.df
    humds.append(tmp.humidity)
    temprs.append(tmp.temperature)
humds, temprs = pd.concat(humds), pd.concat(temprs)

print(f"Average humidity: {(humds[humds>50]-50).mean().round(0)}")
print(f"std: {(humds[humds>50]-50).std().round(0)}")
print(f"Average temperature: {temprs.mean().round(1)}")
print(f"std: {temprs.std().round(1)}")


Average humidity: 55.0
std: 9.0
Average temperature: 23.3
std: 1.1


In [22]:
extra_df = pd.concat([ds1.df,ds2.df,ds3.df,ds4.df])
extra_df.y.value_counts()

1    36288
0     3968
Name: y, dtype: int64

In [23]:
extra_df = extra_df.query(selection)
extra_df.y.value_counts()

1    21606
0     1176
Name: y, dtype: int64

In [24]:
test_df.shape[0]

25685

In [25]:
extra_df.shape

(22782, 13)

In [26]:
extra_df.to_parquet("../data_created/suzmel_extra.parquet")

In [27]:
pd.concat([train_df, extra_df]).to_parquet("../data_created/suzmel_traindf_plus_extradf.parquet")

In [28]:
nu1 = WingbeatDatasetProfiler(dsname="/home/kalfasyan/data/wingbeats/wingbeats_20211129/wingbeats_correct_dates/Melanogaster_202111_SD", custom_label=[0])
nu2 = WingbeatDatasetProfiler(dsname="/home/kalfasyan/data/wingbeats/wingbeats_20211129/wingbeats_correct_dates/Suzukii_202111_SD", custom_label=[1])
nu_df = pd.concat([nu1.df,nu2.df])

nu_df = nu_df.query(selection)
nu_df.y.value_counts()

Found 11136 in dataset: /home/kalfasyan/data/wingbeats/wingbeats_20211129/wingbeats_correct_dates/Melanogaster_202111_SD, and 1 label(s): ['wingbeats_correct_dates']
Label(s) changed to [0]


Collecting all data from the dataloader..: 100%|██████████| 696/696 [00:03<00:00, 214.64it/s]


Creating a pandas Dataframe with file-paths, clean-scores, duration, sums of abs values, indice and labels..
Finished.
Found 2725 in dataset: /home/kalfasyan/data/wingbeats/wingbeats_20211129/wingbeats_correct_dates/Suzukii_202111_SD, and 1 label(s): ['wingbeats_correct_dates']
Label(s) changed to [1]


Collecting all data from the dataloader..: 171it [00:01, 170.21it/s]                         


Creating a pandas Dataframe with file-paths, clean-scores, duration, sums of abs values, indice and labels..
Finished.


0    5918
1    1704
Name: y, dtype: int64

In [29]:
pd.concat([train_df, extra_df,nu_df]).to_parquet("../data_created/suzmel_traindf_plus_extradf_plus_nudf.parquet")

In [30]:
train_df.y.value_counts()

1    16876
0    13021
Name: y, dtype: int64

In [31]:
pd.concat([train_df, extra_df]).y.value_counts()

1    38482
0    14197
Name: y, dtype: int64

In [32]:
pd.concat([train_df, extra_df,nu_df]).y.value_counts()

1    40186
0    20115
Name: y, dtype: int64

In [110]:
selection

'peaks>1 & peaks<15 & score>3.5 & score<=12'

In [114]:
extra_df.shape

(22744, 13)