In [1]:
%reset -f
import sys
sys.path.insert(0, "..")
from wavhandler import Dataset
import numpy as np
import math
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, balanced_accuracy_score, classification_report, make_scorer, log_loss
from utils_train import *
import deepdish as dd
from configs import DatasetConfiguration
import os
import pandas as pd
from sklearn.metrics import confusion_matrix
import seaborn as sb
import matplotlib.pyplot as plt

seed = 42
np.random.seed(seed=seed)

splitting = 'random'
data_setting = 'raw'
model_setting = 'dl4tsc_inc'

assert splitting in ['random','randomcv','custom'], "Wrong splitting method given."
assert data_setting in ['raw','stft','psd_dB', 'cwt'], "Wrong data settting given."
assert model_setting in ['wavenet','lstm','gru','conv1d','conv1d_psd',
                        'DenseNet121','DenseNet169','DenseNet201',
                        'InceptionResNetV2','VGG16','VGG19',
                        'dl4tsc_fcn','dl4tsc_res', 'tsc_res_baseline',
                        'tsc_fcn_baseline', 'conv1d_baseline', 'dl4tsc_inc'], "Wrong model setting given"

In [2]:
dataset = Dataset('Pcfruit_sensor49')
dataset.read(loadmat=False)
dataset.make_array(setting='raw');
dataset.make_array(setting='psd_dB');

X = dataset.filenames.tolist()
y = dataset.y.copy()

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

Species: all.
Read 10264 filenames in 0.05 seconds.


In [3]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, shuffle=True)
X_sens49, y_sens49 = X, y

### Creating model extractor

In [4]:
dconf = DatasetConfiguration(names=['Wingbeats','LG','Leafminers'])
dconf.select(name='Wingbeats',species=['Ae. aegypti','Ae. albopictus', 'C. quinquefasciatus', 'C. pipiens'])
dconf.select(name='LG', species=['LG_zapr_26_09'])
dconf.select(name='Leafminers', species=['P_Cichorii'])
dconf.read()

using_conv2d = False
traincf = TrainConfiguration(dataset=dconf, setting=data_setting, model_name=f'paper2_{splitting}_{data_setting}_{model_setting}_weights')
modelconf = ModelConfiguration(model_setting=model_setting, data_setting=data_setting, target_names=traincf.target_names).config

modelconf.load_weights(traincf.top_weights_path)

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense

new_model = Model(modelconf.inputs, modelconf.layers[-2].output)
x = Dense(traincf.targets, activation='relu')(new_model.layers[-1].output)
x = Dense(traincf.targets, activation='softmax')(x)

del modelconf
model = Model(inputs=new_model.inputs, outputs=x)



############ INPUT SHAPE:(5000, 1)
/home/kalfasyan/projects/wingbeat_frequencies/temp_data/


In [5]:
xtractor_model = Model(inputs=model.input, outputs=model.get_layer(model.layers[-2].name).output)

In [6]:
# Xx = xtractor_model.predict_generator(valid_generator(X_sens49, 
#                                                 y_sens49, 
#                                                 batch_size=traincf.batch_size, 
#                                                 setting=traincf.setting, 
#                                                 target_names=traincf.target_names,
#                                                 preprocessing_train_stats='',
#                                                 using_conv2d=using_conv2d),
#         steps = int(math.ceil(float(len(X_sens49)) / float(traincf.batch_size))))

## Modelling pca features of 2 flies

In [7]:
# 0.84 accuracy

# from sklearn.decomposition import PCA
# from sklearn.preprocessing import StandardScaler


# X_std = StandardScaler(with_std=True).fit_transform(Xx)

# pca = PCA(n_components=6)

# X_final =pca.fit_transform(X_std)

# plt.figure(figsize=(15,10))
# sb.scatterplot(X_final[:,0], X_final[:,1],
#                 alpha=0.5, 
#                 legend='full', 
#                 style=y_test,
# #                 size=sub.y_hours,
#                 hue=y_test, # HERE SELECT HOURS OR DAYS
#                )#palette=sns.color_palette("cubehelix", 8)) 
# plt.xlabel('component 1')
# plt.ylabel('component 2')
# plt.show()

# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import cross_val_score
# from sklearn.tree import DecisionTreeClassifier
# from xgboost import XGBClassifier

# estimator = XGBClassifier(n_estimators=150, n_jobs=8) #LogisticRegression(solver='lbfgs', n_jobs=-1)
# estimator.fit(X_final, y_test)

# np.mean(cross_val_score(estimator, X_final, y_test, cv=5))

## Visualization of all insects

### Fetching pre-trained test data

In [8]:
dconf = DatasetConfiguration(names=['Wingbeats','LG','Leafminers'])
dconf.select(name='Wingbeats',species=['Ae. aegypti','Ae. albopictus', 'C. quinquefasciatus', 'C. pipiens'])
dconf.select(name='LG', species=['LG_zapr_26_09'])
dconf.select(name='Leafminers', species=['P_Cichorii'])
dconf.read()

In [9]:
X_pre = dconf.fnames.tolist()
le_pre = LabelEncoder()
y_pre = le_pre.fit_transform(dconf.labels.tolist())

In [10]:
names_pre = list(get_integer_mapping(le_pre).keys())

In [11]:
X_pre,y_pre = shuffle(X_pre, y_pre, random_state=seed)
_, X_test_pre, _, _, y_test_pre, _ = train_test_val_split(X_pre,y_pre,test_size=0.1, val_size=0.2, random_state=seed)

#### Adding 2 anopheles mosquitoes

In [12]:
dconf_an = DatasetConfiguration(names=['Wingbeats'])
dconf_an.select(name='Wingbeats',species=['An. arabiensis', 'An. gambiae'])
dconf_an.read()

In [13]:
sample = np.random.randint(0,len(dconf_an.fnames), 5000)

X_an = dconf_an.fnames.loc[sample].tolist()
y_an = dconf_an.labels.loc[sample].tolist()

### Combining data

In [42]:
X_test_all = X_test_pre + X_an #X_sens49 + X_test_pre + X_an

len_bdir = len(BASE_DIR.split('/'))
y_test_all = pd.Series(X_test_all).apply(lambda x: x.split('/')[len_bdir]).tolist()
target_names = pd.Series(y_test_all).unique().tolist()

le_all = LabelEncoder()
y_test_all = le_all.fit_transform(y_test_all)

In [43]:
pd.Series(y_test_all).value_counts()

0    8555
5    7460
3    3568
4    3042
1    2023
2    1432
6     721
7     160
dtype: int64

In [44]:
X_all = xtractor_model.predict_generator(valid_generator(X_test_all, 
                                                y_test_all, 
                                                batch_size=traincf.batch_size, 
                                                setting=traincf.setting, 
                                                target_names=target_names,
                                                preprocessing_train_stats='',
                                                using_conv2d=using_conv2d),
        steps = int(math.ceil(float(len(X_test_all)) / float(traincf.batch_size))))

### Plotting PCA of all

In [45]:
# from sklearn.decomposition import PCA
# from sklearn.preprocessing import StandardScaler
# import seaborn as sns

# X_std = StandardScaler(with_std=True).fit_transform(X_all)

# pca = PCA(n_components=6)

# X_final =pca.fit_transform(X_std)

# plt.figure(figsize=(15,10))
# sb.scatterplot(X_final[:,0], X_final[:,1],
#                 alpha=0.5, 
#                 legend='full', 
# #                 style=y_test_all,
# #                 size=sub.y_hours,
#                 hue=y_test_all, # HERE SELECT HOURS OR DAYS
#                palette=sns.color_palette("cubehelix", len(target_names))) 
# plt.xlabel('component 1')
# plt.ylabel('component 2')
# plt.show()

In [46]:
from sklearn.manifold import TSNE

params = {"learning_rate": 10,#,100,200],#,500,1000],
          "perplexity": 50,#[20,30,50,100,150],
          "init": 'pca', #,'random'],
          "steps": 1000}#, 5000]}  

tSNE = TSNE(learning_rate=params['learning_rate'], 
            perplexity=params['perplexity'], 
            init=params['init'], 
            random_state=0, 
            n_iter=params['steps'], 
            n_jobs=-1)

X_final = tSNE.fit_transform(X_all)

In [47]:
# import seaborn as sb
# plt.figure(figsize=(20,12))
# sb.scatterplot(X_final[:,0], X_final[:,1],alpha=0.9, legend='full', 
#                 hue=y_test_all,
#                 palette=sb.color_palette("cubehelix", len(target_names)))
# get_integer_mapping(le_all)

In [48]:
# df = pd.DataFrame(X_all[:,:2], columns=['a','b'])
# df['labels'] = y_test_all
df = pd.DataFrame({
'x': X_all[:,0],
'y': X_all[:,1],
'l': y_test_all,
# 'group': list(get_integer_mapping(le_all).keys())
})

import seaborn as sb

In [49]:
%matplotlib widget

fig = plt.figure(figsize=(16,8))

# sb.scatterplot(X_final[:,0], X_final[:,1],alpha=0.9, legend='full', 
#                 hue=y_test_all,
#                 palette=sb.color_palette("cubehelix", len(target_names)))
ax = fig.gca()

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

x_min, x_max = np.min(X_final, 0), np.max(X_final, 0)
Xxx = (X_final - x_min) / (x_max - x_min)

for i in range(df.shape[0]):
    ax.text(Xxx[i,0], Xxx[i,1], y_test_all[i], horizontalalignment='center', size='small', color = plt.cm.Set3(y_test_all[i]), fontdict = {'weight': 'bold', 'size': 4})

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [50]:
get_integer_mapping(le_all)

{'Ae. aegypti': 0,
 'Ae. albopictus': 1,
 'An. arabiensis': 2,
 'An. gambiae': 3,
 'C. pipiens': 4,
 'C. quinquefasciatus': 5,
 'LG_zapr_26_09': 6,
 'P_Cichorii': 7}

In [51]:
from scipy.special import softmax

In [52]:
np.argmax(softmax(X_all, axis=1), axis=1)

array([0, 4, 4, ..., 0, 0, 0])

In [53]:
plt.figure()
pd.DataFrame(X_all)[0].plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.axes._subplots.AxesSubplot at 0x7f7bb1d7a978>