In [None]:
%reset -f
import glob, os, sys, io
sys.path.insert(1, os.path.join(sys.path[0], '..'))
import pandas as pd
import numpy as np

from wavhandler import *
from utils import *
import multiprocessing

import logging

# Root logger: only ERROR and above get through, keeping notebook output quiet.
logger = logging.getLogger()
logger.setLevel(logging.ERROR)
logger.propagate = False

# Seed NumPy's legacy global RNG so np.random-based steps repeat on a fresh run.
np.random.seed(0)

In [None]:
def get_psd(fname, data, plot=False):
    """Return the ``pow_amp`` column of one recording's PSD, labelled with ``fname``.

    The signal is passed through ``butter_bandpass_filter``, cropped with
    ``crop_signal`` and handed to ``psd_process``. The filter settings
    (``L_CUTOFF``, ``H_CUTOFF``, ``F_S``, ``B_ORDER``) are module-level names
    that are expected to come from the star imports at the top of the notebook.

    Parameters
    ----------
    fname : str
        Label of the recording; used as the ``name`` of the returned Series so
        that it becomes the column label when results are concatenated.
    data
        Raw signal, forwarded unchanged to ``butter_bandpass_filter``.
    plot : bool, optional
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    pandas.Series
        ``psd.pow_amp`` named ``fname``. When cropping yields ``None`` or an
        empty result, an all-NaN Series of length 2500 (also named ``fname``)
        is returned instead, so the failed recording stays identifiable.
    """
    sig_bandpass = butter_bandpass_filter(data=data, lowcut=L_CUTOFF, highcut=H_CUTOFF, fs=F_S, order=B_ORDER)
    sig_cropped = crop_signal(sig_bandpass, window=300, intens_threshold=0.0004, offset=200)

    if sig_cropped is None or sig_cropped.empty:
        # Name the placeholder too: an anonymous Series would end up under a
        # bare integer column label and the file name would be lost.
        return pd.Series(np.ones(2500,)*np.nan, name=fname)

    psd = psd_process(sig_cropped, fs=F_S, scaling='density', window='hamming', nfft=8192, noverlap=None, crop_hz=2500)
    # Relabel without writing a new column into `psd` (which could clobber an
    # existing column if `fname` happened to match one).
    return pd.Series(psd.pow_amp, name=fname)

def process_parallel(path):
    """Load one recording and return its PSD column (worker for ``Pool.map``).

    Parameters
    ----------
    path : str
        Path of the recording; passed to ``read_simple`` as a one-element list.

    Returns
    -------
    pandas.Series
        Whatever ``get_psd`` returns for this recording, labelled with the
        file's base name without its extension.
    """
    data, _ = read_simple([path])
    # Base name minus extension, independent of the path separator and of the
    # extension length (the previous slice assumed '/' and exactly 4 characters).
    fname = os.path.splitext(os.path.basename(path))[0]
    return get_psd(fname, data)

def make_insect_df(insect_class='Culex', base_dir='/home/yannis/data/insects/Potamitis/Wingbeats'):
    """Build a DataFrame with one PSD column per recording of ``insect_class``.

    Parameters
    ----------
    insect_class : str, optional
        Name of the sub-directory of ``base_dir`` to process.
    base_dir : str, optional
        Directory that contains one sub-directory per insect class. Defaults to
        the location this notebook was originally written against.

    Returns
    -------
    pandas.DataFrame
        Column-wise concatenation (``axis=1``) of the Series returned by
        ``process_parallel`` for every file listed by ``WavHandler``.

    Raises
    ------
    ValueError
        If ``WavHandler`` lists no files for the requested class.
    """
    print('Setting the number of cores..')
    try:
        cpus = multiprocessing.cpu_count()
    except NotImplementedError:
        cpus = 2   # arbitrary default

    print('Gathering all files for selected class..')
    wavhdlr = WavHandler(os.path.join(base_dir, insect_class), sample_size=-1, recursive=True)
    wavhdlr.read(create_table=True)
    names = wavhdlr.df_table.names.tolist()
    if not names:
        # pd.concat would raise a ValueError anyway; fail early with context
        # and without spawning worker processes first.
        raise ValueError('No recordings found for class {!r} under {!r}'.format(insect_class, base_dir))

    print('Creating poll of processes..')
    # The context manager tears the workers down once map() has returned, so
    # repeated calls do not accumulate idle processes.
    with multiprocessing.Pool(processes=cpus) as pool:
        print('Calculating..')
        psd_columns = pool.map(process_parallel, names)

    print('Creating Dataframe..')
    return pd.concat(psd_columns, axis=1, sort=False)

In [None]:
# One-off export: uncomment to rebuild ./data/df_an.csv, which the PCA section below reads.
#df_an = make_insect_df(insect_class='Anopheles')
#print('Saving Dataframe..')
#df_an.to_csv('./data/df_an.csv', sep=';')

In [None]:
# One-off export: uncomment to rebuild ./data/df_ae.csv.
#df_ae = make_insect_df(insect_class='Aedes')
#print('Saving Dataframe..')
#df_ae.to_csv('./data/df_ae.csv', sep=';')

In [None]:
# One-off export: uncomment to rebuild ./data/df_cu.csv.
#df_cu = make_insect_df(insect_class='Culex')
#print('Saving Dataframe..')
#df_cu.to_csv('./data/df_cu.csv', sep=';')

# Principal Component Analysis

In [None]:
# Load the semicolon-separated PSD table from the local data directory.
df_an = pd.read_csv('./data/df_an.csv', sep=';')


In [None]:
# Drop the 'Unnamed: 0' column (presumably the index written by to_csv).
# errors='ignore' plus re-binding keeps the cell safe to re-run: the previous
# in-place drop raised KeyError the second time it was executed.
print(df_an.shape)
df_an = df_an.drop(columns='Unnamed: 0', errors='ignore')
print(df_an.shape)

In [None]:
# Quick look at the first rows of the table.
df_an.head(n=5)

In [None]:
# Keep only the columns that contain no missing values at all.
df_an = df_an.dropna(axis='columns', how='any')

# Remember the surviving column labels, then turn the table into a plain array
# with one row per original column.
cols = df_an.columns
X = df_an.values.T

# Free the DataFrame's memory.
del df_an

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Optional subsampling for quicker experiments:
#idx = np.random.randint(X.shape[0], size=10000)
#X = X[idx,:]

# with_std=False: features are only mean-centred, not scaled to unit variance.
centering = StandardScaler(with_std=False)
X_std = centering.fit_transform(X)

# 10 to get 80% explained variance
pca = PCA(n_components=3)
# NOTE(review): the first character of the name below looks like a non-ASCII
# homoglyph of the Latin letter X -- confirm, and consider renaming it everywhere.
Χ_final = pca.fit_transform(X_std)

explained = pca.explained_variance_ratio_.sum()
print("EXPLAINED VARIANCE: {0:.2f}".format(explained))

In [None]:
#import matplotlib.pyplot as plt
#import seaborn as sns
#plt.figure(figsize=(15,11))
#sns.scatterplot(Χ_final[:,0], Χ_final[:,1] ,alpha=0.4, size=Χ_final[:,2])
#sns.
#plt.xlabel('component 1')
#plt.ylabel('component 2')


In [None]:
# One row per principal component, one column per input feature.
component_labels = ['PC-1', 'PC-2', 'PC-3']
df_pca = pd.DataFrame(data=pca.components_, index=component_labels)
df_pca

In [None]:
# Loadings of the first principal component across the input features.
df_pca.loc['PC-1'].plot.line()

In [None]:
# Loadings of the second principal component across the input features.
df_pca.loc['PC-2'].plot.line()

In [None]:
# Loadings of the third principal component across the input features.
df_pca.loc['PC-3'].plot.line()

In [None]:
import os

import plotly
import plotly.plotly as py

# SECURITY: never commit plotly credentials. They are read from the environment
# (PLOTLY_USERNAME / PLOTLY_API_KEY); when these are not set, the call is skipped
# and plotly is expected to fall back to credentials already stored on this machine.
# NOTE(review): the API key that was previously hard-coded in this cell should be
# treated as leaked and regenerated.
_plotly_user = os.environ.get('PLOTLY_USERNAME')
_plotly_key = os.environ.get('PLOTLY_API_KEY')
if _plotly_user and _plotly_key:
    plotly.tools.set_credentials_file(username=_plotly_user, api_key=_plotly_key)

import plotly.graph_objs as go

import numpy as np

# 3-D scatter of the three PCA components; marker outlines are coloured by the
# third component.
x, y, z = Χ_final[:,0], Χ_final[:,1], Χ_final[:,2]
trace1 = go.Scatter3d(
    x=x,
    y=y,
    z=z,
    mode='markers',
    marker=dict(
        size=1,
        line=dict(
            color=z,#'rgba(217, 217, 217, 0.14)',
            colorscale='Viridis',
            width=0.5
        ),
        opacity=0.9
    )
)


data = [trace1]
layout = go.Layout(
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    )
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='simple-3d-scatter')

In [None]:
from sklearn.cluster import KMeans

# Split the PCA projection into two clusters (k-means++ seeding, 10 restarts).
kmeans = KMeans(n_clusters=2, n_init=10, init='k-means++')
kmeans.fit(Χ_final)

In [None]:
import os

import plotly
import plotly.plotly as py

# SECURITY: never commit plotly credentials. They are read from the environment
# (PLOTLY_USERNAME / PLOTLY_API_KEY); when these are not set, the call is skipped
# and plotly is expected to fall back to credentials already stored on this machine.
# NOTE(review): the API key that was previously hard-coded in this cell should be
# treated as leaked and regenerated.
_plotly_user = os.environ.get('PLOTLY_USERNAME')
_plotly_key = os.environ.get('PLOTLY_API_KEY')
if _plotly_user and _plotly_key:
    plotly.tools.set_credentials_file(username=_plotly_user, api_key=_plotly_key)

import plotly.graph_objs as go

import numpy as np

# Same 3-D scatter of the PCA components, with marker outlines coloured by the
# k-means cluster label of each point.
x, y, z = Χ_final[:,0], Χ_final[:,1], Χ_final[:,2]
trace1 = go.Scatter3d(
    x=x,
    y=y,
    z=z,
    mode='markers',
    marker=dict(
        size=1,
        line=dict(
            color=kmeans.labels_,#'rgba(217, 217, 217, 0.14)',
            colorscale='Viridis',
            width=0.5
        ),
        opacity=0.9
    )
)


data = [trace1]
layout = go.Layout(
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    )
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='simple-3d-scatter')

# You are here (work in progress)

In [None]:
# `df_an` was deleted right after `X` and `cols` were extracted from it, so
# transposing it here fails on a fresh run (and would flip the orientation on
# every re-run). Rebuild the table from the kept arrays instead: `X` was already
# transposed at extraction time (one row per original column) and `cols` holds
# the matching labels.
df_an = pd.DataFrame(X, index=cols)
df_an.shape

In [None]:
# Store the cluster id assigned by k-means as an extra column.
cluster_ids = kmeans.labels_
df_an['kmeans'] = cluster_ids

In [None]:
# Spot-check ten randomly chosen rows.
df_an.sample(n=10)

In [None]:
import pickle

def save_obj(obj, name ):
    """Pickle ``obj`` to ``./data/<name>.pkl`` with the highest pickle protocol.

    The ``./data`` directory must already exist; an existing file of the same
    name is overwritten.
    """
    target_path = './data/' + name + '.pkl'
    with open(target_path, mode='wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    """Load and return the object pickled at ``./data/<name>.pkl``.

    SECURITY: unpickling can execute arbitrary code; only use this on files
    that this project wrote itself.
    """
    source_path = './data/' + name + '.pkl'
    with open(source_path, mode='rb') as handle:
        loaded = pickle.load(handle)
    return loaded

In [None]:
# Persist the row-label -> cluster-id mapping as ./data/df_an_clusters.pkl.
cluster_by_label = df_an['kmeans'].to_dict()
save_obj(cluster_by_label, 'df_an_clusters')