# EMPIRICAL MODE DECOMPOSITION (EMD)
This notebook contains the EMD processing for all stations.
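- 1. Finds the intrinsic mode functions (IMFs) of each station's time series 
- 2. Separates meaningful and noise IMFs (an IMF is meaningful when the absolute value of its Pearson correlation with the original series exceeds a threshold: 0.15 in the processing loop, 0.3 in the experimental cells)
- 3. Aggregates (sums) the meaningful IMFs and the noise IMFs into one component each (this shows the best results in the original paper)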
- 4. Saves the data in CSV format for later use. 

In [49]:
import os
import datetime

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import time
from PyEMD import EMD
import unidecode

import warnings
warnings.filterwarnings('ignore')

In [50]:
## Read the cleaned transaction counts; `timestamp` is parsed as datetime.
input_path = 'data/'
df = pd.read_csv(
    input_path + 'clean_transactions.csv',
    parse_dates=['timestamp'],
)

## Work on a copy so the raw frame `df` is left untouched.
transactions = df.copy()

## Station columns are all columns whose name does not contain 'time'.
is_time_column = transactions.columns.str.contains('time')
stations = transactions.columns[~is_time_column]

## log(x + 1) transform of every station column.
transactions[stations] = np.log(transactions[stations] + 1)

## Calendar features derived from the timestamp.
## `weekday` is 1 for dayofweek 0..4 and 0 otherwise.
timestamps = transactions['timestamp']
transactions['day'] = timestamps.dt.dayofweek
transactions['hour'] = timestamps.dt.hour
transactions['weekday'] = (transactions['day'] <= 4).astype(int)

In [51]:
## The processing loop writes its results to 'data/EDM/', so that is the
## directory to inspect when looking for stations that are already done.
## A missing directory simply means that nothing has been processed yet.
emd_output_dir = 'data/EDM'
existing_files = os.listdir(emd_output_dir) if os.path.isdir(emd_output_dir) else []

## Dropping accent marks
## File names are transliterated to ASCII so they can be compared with the
## transliterated station names. Caveat: names that differ only by their
## accent marks become indistinguishable after this step.
existing_files = [unidecode.unidecode(x) for x in existing_files]

In [53]:
## Dry run: report, per station, whether an output file is already listed in
## existing_files. No decomposition is computed in this cell.
counter = 0
total_stations = len(stations)
for counter, station in enumerate(stations, start=1):
    start = time.time()
    print(station)

    ## Accent-free file name, comparable with the entries of existing_files.
    target_name = unidecode.unidecode(station + '.csv')
    if target_name in existing_files:
        status = 'Station EDM already exist. Moving on to the next station'
    else:
        status = 'Creating EDM file...'
    print(status)

    end = time.time()
    print('Processing time:{:.2f}'.format(end - start))
    print('')

    ## Announce completion right after the last station has been reported.
    if counter == total_stations:
        print('all stations are done')

(02000) Cabecera Autopista Norte
Station EDM already exist. Moving on to the next station
Processing time:0.00

(02001) Centro Comercial Santa Fe
Station EDM already exist. Moving on to the next station
Processing time:0.00

(02101) Toberín
Station EDM already exist. Moving on to the next station
Processing time:0.00

(02102) Calle 161
Station EDM already exist. Moving on to the next station
Processing time:0.00

(02103) Mazurén
Station EDM already exist. Moving on to the next station
Processing time:0.00

(02104) Calle 146
Station EDM already exist. Moving on to the next station
Processing time:0.00

(02105) Calle 142
Station EDM already exist. Moving on to the next station
Processing time:0.00

(02200) Alcalá
Station EDM already exist. Moving on to the next station
Processing time:0.00

(02201) Prado
Station EDM already exist. Moving on to the next station
Processing time:0.00

(02202) Calle 127
Station EDM already exist. Moving on to the next station
Processing time:0.00

(02204) Pe

In [48]:
## Directory that receives one CSV per station.
EMD_OUTPUT_DIR = 'data/EDM/'

## An IMF is kept as "meaningful" when |Pearson r| with the signal is above
## this value.
## NOTE(review): the notebook introduction mentions 0.3 while this loop uses
## 0.15 -- confirm which value is intended.
CORRELATION_THRESHOLD = 0.15


def aggregate_imfs(signal, imfs, threshold=CORRELATION_THRESHOLD):
    """Split IMFs into a meaningful and a noise group and sum each group.

    An IMF is meaningful when the absolute Pearson correlation between it and
    ``signal`` is strictly greater than ``threshold``. Every other IMF counts
    as noise, including one whose correlation is NaN (e.g. a constant IMF).

    Parameters
    ----------
    signal : 1-D array of length N
        The series that was decomposed.
    imfs : 2-D array-like of shape (n_imfs, N)
        One component per row.
    threshold : float
        Lower bound (exclusive) on |r| for an IMF to be meaningful.

    Returns
    -------
    numpy.ndarray of shape (N, 2)
        Column 0 is the sum of the meaningful IMFs, column 1 the sum of the
        noise IMFs. A group without members yields a column of zeros.
    """
    imfs = np.asarray(imfs)
    correlations = np.array([np.corrcoef(signal, imf)[0, 1] for imf in imfs])
    is_meaningful = np.abs(correlations) > threshold

    ## Boolean row selection keeps the second dimension even when no row is
    ## selected, so an empty group sums to a zero vector of length N instead
    ## of the scalar 0.0 (which could not be stacked with the other group).
    meaningful = imfs[is_meaningful].sum(axis=0)
    noise = imfs[~is_meaningful].sum(axis=0)
    return np.stack([meaningful, noise]).T


## Make sure the output directory exists before any file is written.
os.makedirs(EMD_OUTPUT_DIR, exist_ok=True)

for station in stations:
    start = time.time()
    print (station)

    ## Accents are stripped only for the comparison; the station name itself
    ## (and therefore the output file name) is preserved.
    if unidecode.unidecode(station + '.csv') in existing_files:
        print ('Station EDM already exist. Moving on to the next station')
    else:
        print ('Creating EDM file...')
        ## Estimates IMFs
        s = np.array(transactions[station])
        t = np.array(transactions[station].index)
        IMF = EMD().emd(s,t)

        ## Separate and aggregate meaningful and noise IMFs
        imfs = aggregate_imfs(s, IMF)

        ## Saving to file
        ## .assign() returns a new frame, so nothing is written into a column
        ## selection of `transactions` and the raw `df` is not overwritten.
        station_df = transactions[['timestamp',station,'day','hour','weekday']].assign(
            IMF_meaningful=imfs[:, 0],
            IMF_noise=imfs[:, 1],
        )
        station_df.to_csv(os.path.join(EMD_OUTPUT_DIR, station + '.csv'))
    end = time.time()
    print ('Processing time:{:.2f} seconds'.format(end - start))
    print ('')

(02000) Cabecera Autopista Norte
Creating EDM file...
Processing time:244.82 seconds

(02001) Centro Comercial Santa Fe
Creating EDM file...
Processing time:210.25 seconds

(02101) Toberín
Creating EDM file...
Processing time:211.55 seconds

(02102) Calle 161
Creating EDM file...
Processing time:171.23 seconds

(02103) Mazurén
Creating EDM file...
Processing time:182.16 seconds

(02104) Calle 146
Creating EDM file...
Processing time:200.09 seconds

(02105) Calle 142
Creating EDM file...
Processing time:221.71 seconds

(02200) Alcalá
Creating EDM file...
Processing time:222.77 seconds

(02201) Prado
Creating EDM file...
Processing time:206.19 seconds

(02202) Calle 127
Creating EDM file...
Processing time:179.22 seconds

(02204) Pepe Sierra
Creating EDM file...
Processing time:172.33 seconds

(02205) Calle 106
Creating EDM file...
Processing time:172.07 seconds

(02300) Calle 100
Creating EDM file...
Processing time:203.21 seconds

(02302) Virrey
Creating EDM file...
Processing time:169

## Experimental code 

In [3]:
%%time 
# Experimental single-station run: decompose one station's series and time it.
col_name = '(02000) Cabecera Autopista Norte'
# Signal values and their index positions as plain numpy arrays.
s = np.array(transactions[col_name])
t = np.array(transactions[col_name].index)
# Run PyEMD's decomposition; the result is kept in IMF for the cells below.
IMF = EMD().emd(s,t)

CPU times: user 3min 44s, sys: 9.78 s, total: 3min 54s
Wall time: 3min 56s


In [20]:
# Number of components returned by the decomposition.
len(IMF)

20

In [110]:
# N = IMF.shape[0]+1

# # Plot results
# plt.subplots(figsize=(20, 100))
# plt.subplot(N,1,1)
# plt.plot(t, s, 'r')
# plt.title("Input signal: $S(t)=cos(22\pi t^2) + 6t^2$")
# plt.xlabel("Time [s]")

# for n, imf in enumerate(IMF):
#     plt.subplot(N,1,n+2)
#     plt.plot(t, imf, 'g')
#     plt.title("IMF "+str(n+1))
#     plt.xlabel("Time [s]")

# plt.tight_layout()
# plt.savefig('simple_example')
# plt.show()

In [25]:
# Inspect the input signal that was passed to EMD.
s

array([0., 0., 0., ..., 0., 0., 0.])

In [27]:
# Inspect the first component returned by EMD.
IMF[0]

array([0.43430627, 0.60432584, 0.18736505, ..., 1.13185124, 1.12181047,
       1.09168058])

In [86]:
## Partition the IMFs by how strongly each one correlates with the signal `s`.
meaningful_list = []
noise_list = []

for imf in IMF:
    r = np.corrcoef(s, imf)   # 2x2 correlation matrix of (s, imf)
    pho = r[0,1]              # off-diagonal entry: correlation of s with imf
    r2 = pho**2               # squared correlation; not used by the split

    ## |pho| above 0.3 -> meaningful, everything else -> noise.
    bucket = meaningful_list if abs(pho) > 0.3 else noise_list
    bucket.append(imf)

In [87]:
## Sum each group element-wise into a single component.
## An empty group is replaced by a zero vector of the signal's length:
## summing an empty list would give the scalar 0.0, and a scalar cannot be
## stacked together with the other (vector) component afterwards.
zero_component = np.zeros(len(s))
meaningful = np.array(meaningful_list).sum(axis = 0) if meaningful_list else zero_component
noise = np.array(noise_list).sum(axis = 0) if noise_list else zero_component.copy()

In [97]:
## Two-column array: column 0 = meaningful component, column 1 = noise.
a = np.stack((meaningful, noise), axis=1)

In [109]:
station = '(02000) Cabecera Autopista Norte'
## Build the export frame with .assign() instead of adding columns to a
## column selection in place; the in-place form is chained assignment, and
## its warning is hidden by the global warnings filter.
df = transactions[['timestamp',station,'day','hour','weekday']].assign(
    IMF_meaningful=a[:, 0],
    IMF_noise=a[:, 1],
)
df.to_csv('data/' + station + '.csv')

In [51]:
# Sum of the two aggregated components. Every IMF was placed in exactly one
# of the two groups, so this equals the sum of all rows of IMF.
meaningful + noise

array([ 0.00000000e+00, -4.44089210e-16, -1.11022302e-15, ...,
       -8.88178420e-16, -2.22044605e-16, -1.55431223e-15])

In [7]:
from sklearn.metrics import r2_score

In [None]:
# NOTE(review): `y_pred` is not defined in any visible cell, and this cell has
# no execution count -- it raises NameError unless y_pred is created first.
r2_score(s, y_pred)

In [1]:
import json

## Minimal JSON round trip: write a dict to disk, then read the raw text back.
dictionary_data = {"a": 1, "b": 2}

## Context managers close the file even if dump()/read() raises.
with open("data.json", "w") as a_file:
    json.dump(dictionary_data, a_file)

with open("data.json", "r") as a_file:
    output = a_file.read()
print(output)

{"a": 1, "b": 2}
