In [2]:
%reset -f
import glob, os, sys, io
sys.path.insert(1, os.path.join(sys.path[0], '..'))
import pandas as pd
import numpy as np

from wavhandler import *
from utils import *
import multiprocessing
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.metrics import confusion_matrix
sn.set()

import logging
# Fetch the root logger (getLogger() with no name) and raise its threshold so
# that only ERROR and above are handled.
logger = logging.getLogger()
# NOTE(review): `propagate` is being set on the root logger, which has no
# parent to propagate to — presumably this line has no effect; confirm.
logger.propagate = False
logger.setLevel(logging.ERROR)
# Seed NumPy's global random generator.
np.random.seed(0)

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import xgboost
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle

In [2]:
# Load the semicolon-separated mosquito table.
df_mosquitos = pd.read_csv('./data/mosquitos.csv', sep=';')

In [3]:
#bi_classes = ['LG_drosophila_10_09', 'LG_zapr_26_09']

# Request names only (only_names=True) plus their labels for the classes in
# `all_6`, with nr_signals=10000.
# NOTE(review): `get_data` and `all_6` are not defined in this notebook —
# presumably they come from the wildcard imports of `wavhandler` / `utils`;
# whether nr_signals is a per-class or overall cap cannot be told from here.
X_names, y = get_data(target_names=all_6, nr_signals=10000, only_names=True)

100%|██████████| 6/6 [00:00<00:00,  8.35it/s]


# Creating a dataframe of PSDs for all mosquito classes

In [4]:
# Build the PSD table for the selected signal names and transpose it, then
# append the labels as the last column.
df_concat = make_df_parallel(df_mosquitos, setting='psd', names=X_names).T
df_concat['label'] = y
# Swap every label value for the matching entry of `all_6`.
df_concat['label'] = df_concat['label'].map(lambda idx: all_6[idx])
df_concat.shape

(62205, 130)

# Loading the general dataframe (with custom features)

In [None]:
# Alternative source: df_mosquitos = pd.read_pickle('./data/mosquitos.pkl')
# Remove the listed columns in place, then index the frame by 'fname'.
df_mosquitos.drop(columns=['names', 'pathlen', 'fnamelen', 'temp', 'humd'], inplace=True)
df_mosquitos.set_index(keys='fname', inplace=True)
# Keep the raw label values around as a plain array.
labelarray_mosq = df_mosquitos['label'].values
print(df_mosquitos.shape)
df_mosquitos.tail(n=10)

In [None]:
# df = pd.merge(df_mosquitos, df_concat, left_index=True, right_index=True)
# df.drop(['label_x','label_y'], axis=1, inplace=True)
# # df.to_pickle('./data/big_df.pkl')
# print(df.shape)
# df.head()

## Selecting which dataframe to use

In [5]:
# Frame used for modelling: the PSD frame. To use the custom-feature frame
# instead, assign df_mosquitos.iloc[:,:-1] here.
df = df_concat
cols = df.columns.tolist()
labels = df['label']
# Sorted array of the distinct class names.
classes = np.unique(labels)

In [6]:
# Number of samples per class before balancing (`labels` is already a Series).
labels.value_counts()

Ae. aegypti            10762
C. quinquefasciatus    10646
An. gambiae            10395
C. pipiens             10205
Ae. albopictus         10103
An. arabiensis         10094
Name: label, dtype: int64

# Training a classifier

In [3]:
# X, y = get_data(target_names=all_6, nr_signals=20000, only_names=False)
# X = transform_data(X)

100%|██████████| 6/6 [00:14<00:00,  2.38s/it]
100%|██████████| 120906/120906 [04:04<00:00, 494.48it/s]


In [7]:
from imblearn.under_sampling import RandomUnderSampler

# Shuffle features and labels together; every column of `df` except the
# trailing 'label' column is a feature.
X, y = shuffle(df.iloc[:,:-1].values, labels, random_state=3)

# Balance the classes by random under-sampling. `fit_resample` fits the
# sampler itself, so the separate `fit` call that used to precede it only
# repeated that work and has been removed.
ros = RandomUnderSampler(random_state=0)
X, y = ros.fit_resample(X,y)
print('After undersampling: \n{}\n'.format(pd.DataFrame(y).iloc[:,0].value_counts()))

After undersampling: 
Ae. aegypti            10094
Ae. albopictus         10094
C. quinquefasciatus    10094
An. gambiae            10094
An. arabiensis         10094
C. pipiens             10094
Name: 0, dtype: int64



In [6]:
# Hold out 20% of the balanced data as the test split; random_state is fixed
# so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [7]:
# Gradient-boosted tree classifier: 650 trees, learning rate 0.2; n_jobs=-1
# asks for all available cores.
classifier = xgboost.XGBClassifier(n_estimators=650, learning_rate=0.2, n_jobs=-1)

In [8]:
# Train on the training split; the cell output is the fitted estimator's repr.
classifier.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.2, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=650,
       n_jobs=-1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [9]:
# Score the held-out split.
y_pred = classifier.predict(X_test)
ac = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Name: %s, ac: %f" % ('XGBoost', ac))

# Optional 3-fold cross-validated accuracy (disabled):
#cv_ac = cross_val_score(classifier, X, y, cv=3, scoring='accuracy')
#print("Name: %s, cv_ac: %f" % ('XGBoost', np.mean(cv_ac)))

Name: XGBoost, ac: 0.781284


In [None]:
# Progress marker: prints once execution has reached this cell.
print('done')

## Feature importance

In [None]:
# One importance value per training feature. The classifier was fitted on
# df.iloc[:,:-1] (every column except the trailing 'label'), so the index
# must leave 'label' out as well; passing all of df.columns supplies one name
# too many and pandas raises a shape-mismatch ValueError.
feature_names = df.columns[:-1]
feature_importances = pd.DataFrame(classifier.feature_importances_,
                                   index=feature_names,
                                   columns=['importance']).sort_values('importance', ascending=False)
feature_importances.head(10)

### Saving the model

In [None]:
import pickle

# Persist the trained model. The context manager closes (and flushes) the
# file handle even if pickling fails part-way; the bare open() used before
# left the handle open.
with open("./data/pima.pickle.dat", "wb") as model_file:
    pickle.dump(classifier, model_file)
# Store the index of the frame the features were taken from.
pd.Series(df.index).to_csv('./data/pima_idx_used.csv')

# To reload (only unpickle files you trust — pickle can execute arbitrary code):
# with open("./data/pima.pickle.dat", "rb") as model_file:
#     loaded_model = pickle.load(model_file)

In [None]:
# Write the feature matrix to CSV.
# NOTE(review): `X` is the whole under-sampled feature matrix, not `X_test`,
# although the file is named "mosquitos_test.csv" — confirm this is intended.
pd.DataFrame(X).to_csv('./data/unsupervised/mosquitos_test.csv')

# Confusion matrix

In [None]:
# Pin the row/column order to `classes` so the matrix always has one row and
# one column per class, in the same order as the class names used to label it
# when plotting (otherwise the order is inferred from the labels that happen
# to occur in y_test / y_pred).
cm = confusion_matrix(y_test, y_pred, labels=classes)
cm

In [None]:
# Wrap the confusion matrix in a frame labelled with the class names on both
# axes, then draw it as an annotated heat map.
class_names = list(classes)
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
fig, ax = plt.subplots(figsize=(12, 7))
sn.heatmap(df_cm, annot=True, fmt='g', ax=ax)

plt.show()

## Distance Matrix

In [None]:
# Build a frame of the features with the labels as the last column, sorted by
# label. The label column is attached to the frame directly rather than via
# np.concatenate: concatenating a numeric matrix with a label array coerces
# every feature to a non-numeric dtype, and `y.reshape` only works when `y`
# is a NumPy array (np.asarray also accepts a pandas Series).
sub = pd.DataFrame(X)
label_col = sub.shape[1]  # next free integer column name, i.e. the last column
sub[label_col] = np.asarray(y)
sub = sub.sort_values(by=label_col)

In [None]:
# Discard every row that contains at least one missing value.
sub = sub.dropna(axis=0, how='any')


In [None]:
from scipy.spatial.distance import pdist, squareform

# Pairwise Euclidean distances between all rows of `sub` (label column
# excluded), expanded from condensed form into a square matrix.
# Other metric names accepted by pdist: 'braycurtis', 'canberra', 'chebyshev',
# 'cityblock', 'correlation', 'cosine', 'dice', 'euclidean', 'hamming',
# 'jaccard', 'kulsinski', 'mahalanobis', 'matching', 'minkowski',
# 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener',
# 'sokalsneath', 'sqeuclidean', 'yule'.
condensed = pdist(sub.values[:,:-1], metric='euclidean')
D = squareform(condensed)
del condensed

fig, ax = plt.subplots(figsize=(20, 14))
image = ax.imshow(D)
fig.colorbar(image, ax=ax)

# Time warping

In [None]:
def find_median_signal(D=None):
    """Return the column index of `D` with the smallest non-zero median.

    The median of every column is computed with NaNs ignored. Columns whose
    median is exactly zero (or NaN, i.e. all-NaN columns) are not candidates.

    Parameters
    ----------
    D : array-like, 2-D
        Matrix whose columns are compared, e.g. a pairwise distance matrix.

    Returns
    -------
    int
        Index into the columns of `D` (not into the filtered candidates).

    Raises
    ------
    ValueError
        If no column has a non-zero, non-NaN median.
    """
    col_medians = np.nanmedian(D, axis=0)
    # Remember the ORIGINAL positions of the candidate columns: taking argmin
    # on the filtered array alone yields a position within the filtered array,
    # which is shifted whenever a zero-median column precedes the minimum.
    candidates = np.flatnonzero((col_medians != 0) & ~np.isnan(col_medians))
    if candidates.size == 0:
        raise ValueError("find_median_signal: no column with a non-zero, non-NaN median")
    return int(candidates[np.argmin(col_medians[candidates])])

In [None]:
# Rebind X to the feature columns of `sub` (everything except the trailing
# label column). `sub` was sorted by label earlier, so from this point the
# rows of X no longer line up with `y`.
X = sub.values[:,:-1]

In [None]:
import numpy as np
from scipy.spatial.distance import euclidean

from fastdtw import fastdtw

# Two example signals: rows 50 and 13 of X.
xx, yy = X[50,:], X[13,:]

# fastdtw yields the accumulated distance and the alignment path; each path
# entry pairs a position in xx with a position in yy.
distance, path = fastdtw(xx, yy, dist=euclidean)
print(distance)

# Split the path into the per-signal index sequences.
xx_idx = np.array([step[0] for step in path])
yy_idx = np.array([step[1] for step in path])

In [None]:
# 2x2 overview of the DTW alignment:
#   top-left:     the two original signals
#   top-right:    both signals re-indexed along the warping path
#   bottom-left:  xx before and after warping
#   bottom-right: yy before and after warping
fig, ((ax_raw, ax_warped), (ax_xx, ax_yy)) = plt.subplots(2, 2, figsize=(20, 8))

ax_raw.plot(xx, c='r')
ax_raw.plot(yy, c='y')
ax_raw.legend(('xx','yy'))

ax_warped.plot(xx[xx_idx], c='b')
ax_warped.plot(yy[yy_idx], c='c')
ax_warped.legend(('xx_new','yy_new'))

ax_xx.plot(xx, c='r')
ax_xx.plot(xx[xx_idx], c='b')
ax_xx.legend(('xx','xx_new'))

ax_yy.plot(yy, c='y')
ax_yy.plot(yy[yy_idx], c='c')
ax_yy.legend(('yy','yy_new'))

In [None]:
# Pick the signal with the smallest median distance to all the others. The
# selector has to be given the pairwise distance matrix `D` (computed from the
# same rows of `sub` that X holds, so its columns correspond to rows of X).
# Passing the feature matrix X instead yields per-feature medians, and the
# resulting feature index would then be misused as a row index into X.
median_signal_idx = find_median_signal(D=D)
median_signal = X[median_signal_idx,:]
plt.plot(median_signal)

In [None]:
#X.shape

# NOTE(review): D_dtw is allocated here but never read or written anywhere
# else in the visible notebook — presumably a placeholder for a DTW distance
# matrix; confirm whether it is still needed.
D_dtw = np.zeros((600,600))

def warp_with_median_signal(xx, median_signal, distance=euclidean):
    """Align `xx` to `median_signal` with fastdtw and return the warped `xx`.

    The warping path returned by fastdtw is used to re-index `xx`; the result
    is cut to its first 128 samples.

    Parameters
    ----------
    xx : array
        Signal to warp.
    median_signal : array
        Reference signal the alignment is computed against.
    distance : callable
        Point-wise distance handed to fastdtw as `dist`.

    Returns
    -------
    array
        `xx` sampled along the warping path, at most 128 values long.
    """
    _, alignment = fastdtw(xx, median_signal, dist=distance)
    # First component of every path entry is the position within `xx`.
    xx_positions = np.array([pair[0] for pair in alignment])
    return xx[xx_positions][:128]

In [None]:
# Side-by-side check on one example (row 10 of X): warped signal on the left,
# the untouched signal on the right.
sig = X[10,:]
fig, (ax_warped, ax_original) = plt.subplots(1, 2)
ax_warped.plot(warp_with_median_signal(sig, median_signal, distance=euclidean))
ax_original.plot(sig)

In [None]:
from tqdm import tqdm

# Warp every row of X onto the median signal, with a progress bar.
XX = [
    warp_with_median_signal(X[row,:], median_signal, distance=euclidean)
    for row in tqdm(range(X.shape[0]))
]

In [None]:
# Spot-check the length of one warped signal (the warp helper cuts its output
# to at most 128 samples).
len(XX[2])

In [None]:
# Stack the list of warped signals row-wise into one 2-D array.
XX = np.vstack(XX)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import xgboost
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle

# Row i of XX was warped from row i of `sub`, which was sorted by label and
# had its NaN rows dropped. The labels therefore have to be read from the last
# column of `sub`; the earlier `y` is still in pre-sort order (and may differ
# in length), so pairing it with XX would train on misaligned labels.
y_warped = sub.values[:, -1]
assert len(y_warped) == XX.shape[0], "labels and warped signals must have the same number of rows"

X, y = shuffle(XX, y_warped, random_state=3)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

classifier = xgboost.XGBClassifier(n_estimators=300, n_jobs=-1)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
ac = accuracy_score(y_test, y_pred)
print("Name: %s, ac: %f" % ('XGBoost', ac))

In [None]:
# Confusion matrix for the model trained on the warped signals. The row and
# column order is pinned to `classes` so the matrix always has one row and
# one column per class, in a known order.
cm = confusion_matrix(y_test, y_pred, labels=classes)
cm