In [1]:
%reset -f
import glob, os, sys, io
sys.path.insert(1, os.path.join(sys.path[0], '..'))
import pandas as pd
import numpy as np

from wavhandler import *
from utils import *
import multiprocessing
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.metrics import confusion_matrix
sn.set()

# Seed NumPy's legacy global RNG so code that draws from np.random starts
# from the same state on every fresh run of the notebook.
np.random.seed(0)

# Quieten logging: the root logger only lets ERROR and above through.
import logging
logger = logging.getLogger()
logger.setLevel(logging.ERROR)
logger.propagate = False  # the root logger has no parent; kept as an explicit guard

In [2]:
# Load the pickled object that is handed to make_df_parallel in the next cell
# (presumably a DataFrame of recordings -- TODO confirm its schema).
# NOTE(review): unpickling can execute arbitrary code; only point this at a
# file produced by this project.
_mosquitos_pickle = './data/mosquitos.pkl'
df_mosquitos = pd.read_pickle(_mosquitos_pickle)

# Creating dataframes that contain PSDs for each insect class

In [3]:
# Sample size passed to make_df_parallel for every class.
sz = 10000

# (insect_class, short label) pairs, processed in the same order as before:
# Anopheles, then Aedes, then Culex.
_class_tags = (('Anopheles', 'an'), ('Aedes', 'ae'), ('Culex', 'cu'))

_psd_by_tag = {}
for _insect_class, _tag in _class_tags:
    # The helper's output is transposed before use
    # (presumably so that each row is one sample -- TODO confirm).
    _frame = make_df_parallel(
        df_mosquitos,
        setting='psd',
        insect_class=_insect_class,
        sample_size=sz,
    ).T
    # Tag every row with its class so the frames can be stacked later.
    _frame['label'] = _tag
    _psd_by_tag[_tag] = _frame

df_an = _psd_by_tag['an']
df_ae = _psd_by_tag['ae']
df_cu = _psd_by_tag['cu']

In [4]:
# Stack the labelled per-class PSD frames row-wise (order: ae, an, cu), then
# clean up missing values in two passes.
df_concat = (
    pd.concat([df_ae, df_an, df_cu], axis=0, sort=False)
    .dropna(how='all', axis=1)  # columns that are NaN for every row
    .dropna(how='any', axis=0)  # rows that still contain any NaN
)

# NOTE(review): rows removed above are NOT removed from df_ae/df_an/df_cu,
# which later cells pass through tsfresh and then join column-wise with
# df_concat.values. If any row is dropped here the row counts will differ.

# The 'label' column deliberately stays in df_concat: a later cell reads the
# last column of the combined matrix as the target.
labelarray = df_concat['label'].values

In [5]:
def _to_tsfresh_input(labelled_frame):
    """Strip the 'label' column, transpose, and pass the result to tsfresh_transform."""
    signals_only = labelled_frame.drop(columns='label')
    return tsfresh_transform(signals_only.T)


# NOTE(review): each name is rebound to the transformed frame, so this cell
# consumes its own inputs; re-running it without first re-running the cell
# that builds the labelled PSD frames will presumably fail on the 'label' drop.
df_an = _to_tsfresh_input(df_an)
df_ae = _to_tsfresh_input(df_ae)
df_cu = _to_tsfresh_input(df_cu)

In [6]:
from tsfresh import extract_features

# Every class uses the same column roles: 'id' identifies a series and
# 'time' orders the values inside it.
_tsfresh_columns = dict(column_id='id', column_sort='time')

# Slow step: the recorded progress bars show roughly 8-11 minutes per class.
features_an = extract_features(df_an, **_tsfresh_columns)
features_ae = extract_features(df_ae, **_tsfresh_columns)
features_cu = extract_features(df_cu, **_tsfresh_columns)

# No 'label' column is attached to these feature tables; the class labels are
# taken from df_concat when the final matrix is assembled.

Feature Extraction: 100%|██████████| 20/20 [08:09<00:00, 12.73s/it]
Feature Extraction: 100%|██████████| 20/20 [11:12<00:00, 12.64s/it]
Feature Extraction: 100%|██████████| 20/20 [10:20<00:00, 12.80s/it]


In [7]:
# Stack the per-class tsfresh feature tables row-wise, in the same class
# order (ae, an, cu) that df_concat uses, so rows can be paired by position.
featurelist = [features_ae, features_an, features_cu]

df = pd.concat(featurelist, axis=0)

# Fix: the previous `df.reset_index().drop('id', axis=1, inplace=True)` ran
# the in-place drop on the temporary frame returned by reset_index(), so `df`
# itself was never modified. Rebinding with drop=True discards the old index
# and gives `df` a fresh 0..n-1 index, which is what that line was aiming for.
# The feature columns (and therefore df.values) are unchanged.
df = df.reset_index(drop=True)

In [8]:
# Join the tsfresh features and the PSD table (whose last column is 'label')
# side by side. Rows are paired purely by position, so both inputs must have
# the same number of rows in the same order.
_tsfresh_block = df.values
_psd_block = df_concat.values
_combined = np.concatenate((_tsfresh_block, _psd_block), axis=1)

# NOTE(review): `.values` does not include the index, so column 0 here is the
# first column of `df` -- presumably a real tsfresh feature rather than an id
# column. Confirm that dropping it is intended.
DF = pd.DataFrame(_combined).drop(columns=0)

In [9]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
import xgboost

# The last column of DF is the class label; every column before it is a feature.
_table = DF.values
_feature_matrix = _table[:, :-1]
_label_vector = _table[:, -1]
X, y = shuffle(_feature_matrix, _label_vector, random_state=3)

# Disabled experiment (kept verbatim): random under-sampling of the classes.
"""
from imblearn.under_sampling import RandomUnderSampler
ros = RandomUnderSampler(random_state=0)
ros.fit(X,y)
X, y = ros.fit_resample(X,y)
print('After undersampling: \n{}\n'.format(pd.DataFrame(y).iloc[:,0].value_counts()))
"""

# Hold out 20% of the shuffled rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

# Gradient-boosted trees; n_jobs=-1 asks XGBoost to use all available cores.
_xgb_params = dict(n_estimators=650, learning_rate=0.2, n_jobs=-1)
classifier = xgboost.XGBClassifier(**_xgb_params)
classifier.fit(X_train, y_train)

# Accuracy on the held-out split.
y_pred = classifier.predict(X_test)
ac = accuracy_score(y_test, y_pred)
print("Name: %s, ac: %f" % ('XGBoost', ac))

Name: XGBoost, ac: 0.853833


In [12]:
# One importance score per input column of the fitted classifier, kept in
# column order, so the row index is the positional column number in X.
# NOTE(review): X combines the tsfresh columns with the df_concat columns, so
# df.columns on its own would presumably not line up as an index here.
feature_importances = pd.DataFrame(data=classifier.feature_importances_)
feature_importances.iloc[:10]

Unnamed: 0,0
0,0.000301
1,0.000677
2,0.00173
3,0.001203
4,0.000752
5,0.000226
6,0.000451
7,0.000827
8,0.001128
9,0.002106
