# Experiment description
## Hypothesis: 
The band-density distribution is similar across 'localidades' with similar predictability values. 
## Method: 
Fourier analysis to identify periodicity in the time-series signal of each localidad, followed by band-density estimation. Time series to analyze***:

- Localidades with high predictability (P>0.7) and low variance (variance < 0.2): Ciudad Bolívar, Bosa, Usme, San Cristobal, Rafael Uribe Uribe, Suba, Kennedy. 
- Localidades with medium predictability ([0.4, 0.7]) and variance ([0.2, 0.4]): Engativa, Tunjuelito, Fontibon, Usaquen
- Localidades with low predictability (P < 0.4) and high variance (variance > 0.4): Antonio Nariño, Santa Fe, Candelaria, Barrios Unidos and Los Mártires.

** Localidades that don't fit the former classification: Puente Aranda, Chapinero, Teusaquillo

*** Classification based on the predictability results obtained in experiment 08 and the variance obtained in experiment 11

## Built-in methods

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import calendar
import scipy as sp
import scipy.fftpack
from scipy.signal import find_peaks

# Show full cell contents and every column when displaying DataFrames.
# None means "no limit"; the older -1 sentinel for max_colwidth was deprecated
# in pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

In [None]:
def set_initial_dataset_day(df_by_date, name_day):
    """Trim the leading rows so the frame starts on the first `name_day`.

    Args:
        df_by_date: DataFrame that yields a datetime `date` column after
            `reset_index()`.
        name_day: English weekday name as returned by `Series.dt.day_name()`,
            e.g. 'Monday'.

    Returns:
        DataFrame indexed by `date`, starting at the first row whose weekday
        is `name_day`, with the original columns.

    Raises:
        IndexError: if no row falls on `name_day`.
    """
    flat = df_by_date.reset_index()
    # Temporary helper column, removed again before returning.
    flat = flat.assign(day_of_week=flat['date'].dt.day_name())
    first_match = flat.index[flat['day_of_week'] == name_day].tolist()[0]
    trimmed = flat.iloc[first_match:]
    return trimmed.set_index('date').drop(columns=['day_of_week'])

In [None]:
def preprocess_df_localidad(df_by_localidad):
    """Turn one localidad's daily counts into a max-normalized signal.

    Steps: start the series on its first Monday, insert any missing calendar
    days with a value of 0, cut the series to a whole number of 7-day windows
    and divide by the maximum value.

    Args:
        df_by_localidad: DataFrame with a `date` index and a `total_eventos`
            column.

    Returns:
        1-D NumPy array of `total_eventos` values divided by their maximum.
    """
    # Window length kept from experiment 11; it fixes the usable signal length.
    window_days = 7

    # Make sure the dataset starts on Monday for the experiment.
    aligned = set_initial_dataset_day(df_by_localidad, 'Monday')

    # Make sure the dataset includes every consecutive date in the period.
    dates = aligned.reset_index().date
    full_range = pd.date_range(min(dates), max(dates))
    aligned = aligned.reindex(full_range, fill_value=0)

    # Largest multiple of the window length that fits in the series.
    usable_days = (len(aligned) // window_days) * window_days

    counts = aligned['total_eventos'].values[:usable_days]
    return counts / counts.max()

In [None]:
def fourier_analysis(signal):
    """Return the one-sided FFT amplitude spectrum of `signal`.

    The amplitude of each bin is the magnitude of the complex FFT
    coefficient. Taking only the real part (as this function previously did)
    discards every sine-phase component, so a periodic signal could show
    little or no energy at its own frequency depending on its phase.

    Args:
        signal: 1-D sequence of real samples.

    Returns:
        1-D float array with the first `len(signal) // 2` amplitudes.
        Element 0 is the DC term (callers that want to ignore it slice
        it off).
    """
    amplitudes = np.abs(np.fft.fft(signal))
    # The spectrum of a real signal is symmetric; keep the first half only.
    return amplitudes[:len(amplitudes) // 2]

In [None]:
def bands_density(freq_signal, bands_number, localidad):
    """Build one result row: the localidad name followed by its band densities.

    The band edges are `bands_number + 1` evenly spaced values from 1 to
    `len(freq_signal)`. For each pair of consecutive edges the integer bounds
    `int(lower)` and `int(upper - 1)` are handed to `density_estimator`.
    Because the first edge is 1, position 0 of `freq_signal` is never used as
    a lower bound.

    Args:
        freq_signal: spectrum values to split into bands.
        bands_number: number of bands.
        localidad: label stored as the first element of the row.

    Returns:
        List `[localidad, density_1, ..., density_bands_number]`.
    """
    edges = np.linspace(1, len(freq_signal), bands_number + 1)
    row = [localidad]
    for lower_edge, upper_edge in zip(edges[:-1], edges[1:]):
        band_low = int(lower_edge)
        band_high = int(upper_edge - 1)
        row.append(density_estimator(freq_signal, band_low, band_high))
    return row
        

In [None]:
def print_freq_bands(freq_signal, bands_number):
    """Print the integer bounds of each band, one line per band.

    The bounds come from `bands_number + 1` evenly spaced values between 1
    and `len(freq_signal)`: each band runs from `int(lower)` to
    `int(upper - 1)`. The printed numbers are therefore positions within
    `freq_signal`; the ' Hz' suffix is part of the output text.

    Args:
        freq_signal: spectrum whose length defines the band edges.
        bands_number: number of bands to print.
    """
    edges = np.linspace(1, len(freq_signal), bands_number + 1)
    for band_no, (lower_edge, upper_edge) in enumerate(zip(edges[:-1], edges[1:]), start=1):
        band_low = int(lower_edge)
        band_high = int(upper_edge - 1)
        print(f'band{band_no}: ', f'{band_low} Hz', 'to ', f'{band_high} Hz')

In [None]:
def density_estimator(freq_signal, low_freq, high_freq):
    """Sum the squared magnitudes of `freq_signal` over one band.

    The band covers positions `low_freq` through `high_freq`, both included,
    which is the range the callers compute and `print_freq_bands` reports.
    (The previous slice `low_freq:high_freq-1` silently left out the last two
    positions of every band.)

    Args:
        freq_signal: 1-D array-like of spectrum values.
        low_freq: first position of the band.
        high_freq: last position of the band (inclusive).

    Returns:
        The band's summed squared magnitude rounded to 2 decimals, as a NumPy
        scalar; 0.0 when the band is empty.
    """
    band = np.abs(np.asarray(freq_signal)[low_freq:high_freq + 1])
    # np.sum returns 0.0 for an empty band, so rounding never fails there.
    return np.round(np.sum(band * band), 2)

In [None]:
def density_table(df_density):
    """Display `df_density` as a heatmap with the 'Blues' colormap.

    Args:
        df_density: DataFrame whose values can be cast to float; rows and
            columns become the heatmap's axes.
    """
    _, axes = plt.subplots(1, 1, sharex=True, sharey=True)
    heatmap = sns.heatmap(
        df_density.astype('float'),
        annot=False,
        linewidths=0,
        cmap="Blues",
        cbar=True,
    )
    # Keep the row labels horizontal.
    heatmap.set_yticklabels(heatmap.get_yticklabels(), rotation=0)
    # Shift both y-limits by half a unit.
    # NOTE(review): looks like the common workaround for heatmap rows being
    # cut in half on some matplotlib releases - confirm it is still needed.
    y_first, y_second = axes.get_ylim()
    axes.set_ylim(y_first + 0.5, y_second - 0.5)
    # Only used by the disabled savefig call below.
    file_path = 'periodicity_experiments/predictability/figures/'
    #plt.savefig(workingPath+file_path+'table_'+str(name_experiment)+'_p_variance_time_'+str(lenWindow)+'_levels_'+str(nLevel),dpi=300,bbox_inches = "tight")
    plt.show()

## Data

In [None]:
# Source CSV with the event records (absolute path on the author's machine).
data_location = '/Users/anamaria/Desktop/dev/security_project/datasets/06. verify_enrich_nuse_11022020.csv'
df_input = pd.read_csv(data_location, sep=",")

In [None]:
# Work on a separate copy so the loaded frame stays untouched.
df = df_input.copy(deep=True)

In [None]:
# Keep every record whose ANIO is not 2019. The explicit copy makes the
# filtered frame independent, so adding a column below does not write into a
# slice of the previous frame.
df = df.loc[df['ANIO'] != 2019].copy()
# Truncate each FECHA timestamp to midnight so events group by calendar day.
df['date'] = pd.to_datetime(df['FECHA']).dt.normalize()
# Number of records per (date, PERIODO_TS, LOCALIDAD) combination.
df_by_date = df.groupby(['date', 'PERIODO_TS', 'LOCALIDAD']).size().to_frame('total_eventos')

In [None]:
# Move every index level back into columns, then index the rows by date only.
flat_counts = df_by_date.reset_index()
df_by_date = flat_counts.set_index('date')

In [None]:
# Distinct LOCALIDAD values present in the data.
df_by_date['LOCALIDAD'].unique()

# Results
## All localidades

In [None]:
# Every distinct LOCALIDAD value, as a plain list.
localidadesList = list(df_by_date['LOCALIDAD'].unique())

In [None]:
# Leave these two entries out of the analysis. Filtering (instead of
# list.remove) does not fail when an entry is absent, so the cell can be
# re-run safely.
excluded_localidades = {'SIN LOCALIZACION', 'SUMAPAZ'}
localidadesList = [name for name in localidadesList if name not in excluded_localidades]

In [None]:
# Number of bands each localidad's spectrum is split into; also drives the
# column names of the result table.
bands_number = 10

# One row per localidad: [name, band1 density, ..., bandN density].
# Rows are collected in a list so the densities stay numeric instead of being
# converted to text inside a mixed NumPy array.
bands_values = []
for localidad in localidadesList:
    df_by_localidad = df_by_date[df_by_date['LOCALIDAD'] == localidad]
    df_localidad_values = preprocess_df_localidad(df_by_localidad)

    fft_localidad = fourier_analysis(df_localidad_values)[1:]  # ignore DC component
    #fft_localidad = fft_localidad/(fft_localidad.max()) #normalize signal

    bands = bands_density(fft_localidad, bands_number, localidad)
    bands_values.append(bands)

band_columns = ['localidad'] + ['band' + str(i) for i in range(1, bands_number + 1)]
df_bands_values = pd.DataFrame(bands_values, columns=band_columns)
df_bands_values.set_index('localidad', inplace=True)
# The printed band limits are derived from the last localidad's spectrum.
print_freq_bands(fft_localidad, bands_number)
density_table(df_bands_values)

## Experiment localidades with high predictability and low predictability variance

In [None]:
# High predictability, low predictability variance.
localidadesList = [
    'CIUDAD BOLIVAR',
    'BOSA',
    'USME',
    'SAN CRISTOBAL',
    'RAFAEL URIBE URIBE',
    'SUBA',
    'KENNEDY',
]

In [None]:
# Number of bands each localidad's spectrum is split into; also drives the
# column names of the result table.
bands_number = 10

# One row per localidad: [name, band1 density, ..., bandN density].
# Rows are collected in a list so the densities stay numeric instead of being
# converted to text inside a mixed NumPy array.
bands_values = []
for localidad in localidadesList:
    df_by_localidad = df_by_date[df_by_date['LOCALIDAD'] == localidad]
    df_localidad_values = preprocess_df_localidad(df_by_localidad)

    fft_localidad = fourier_analysis(df_localidad_values)[1:]  # ignore DC component
    #fft_localidad = fft_localidad/(fft_localidad.max()) #normalize signal

    bands = bands_density(fft_localidad, bands_number, localidad)
    bands_values.append(bands)

band_columns = ['localidad'] + ['band' + str(i) for i in range(1, bands_number + 1)]
df_bands_values = pd.DataFrame(bands_values, columns=band_columns)
df_bands_values.set_index('localidad', inplace=True)
# The printed band limits are derived from the last localidad's spectrum.
print_freq_bands(fft_localidad, bands_number)
density_table(df_bands_values)

## Experiment localidades with medium predictability values and variance

In [None]:
# Medium predictability and medium predictability variance.
localidadesList = [
    'ENGATIVA',
    'TUNJUELITO',
    'FONTIBON',
    'USAQUEN',
]

In [None]:
# Number of bands each localidad's spectrum is split into; also drives the
# column names of the result table.
bands_number = 10

# One row per localidad: [name, band1 density, ..., bandN density].
# Rows are collected in a list so the densities stay numeric instead of being
# converted to text inside a mixed NumPy array.
bands_values = []
for localidad in localidadesList:
    df_by_localidad = df_by_date[df_by_date['LOCALIDAD'] == localidad]
    df_localidad_values = preprocess_df_localidad(df_by_localidad)

    fft_localidad = fourier_analysis(df_localidad_values)[1:]  # ignore DC component
    #fft_localidad = fft_localidad/(fft_localidad.max()) #normalize signal

    bands = bands_density(fft_localidad, bands_number, localidad)
    bands_values.append(bands)

band_columns = ['localidad'] + ['band' + str(i) for i in range(1, bands_number + 1)]
df_bands_values = pd.DataFrame(bands_values, columns=band_columns)
df_bands_values.set_index('localidad', inplace=True)
# The printed band limits are derived from the last localidad's spectrum.
print_freq_bands(fft_localidad, bands_number)
density_table(df_bands_values)

## Experiment localidades with low predictability values and high predictability variance

In [None]:
# Low predictability, high predictability variance.
localidadesList = [
    'ANTONIO NARIÑO',
    'SANTA FE',
    'CANDELARIA',
    'BARRIOS UNIDOS',
    'LOS MARTIRES',
]

In [None]:
# Number of bands each localidad's spectrum is split into; also drives the
# column names of the result table.
bands_number = 10

# One row per localidad: [name, band1 density, ..., bandN density].
# Rows are collected in a list so the densities stay numeric instead of being
# converted to text inside a mixed NumPy array.
bands_values = []
for localidad in localidadesList:
    df_by_localidad = df_by_date[df_by_date['LOCALIDAD'] == localidad]
    df_localidad_values = preprocess_df_localidad(df_by_localidad)

    fft_localidad = fourier_analysis(df_localidad_values)[1:]  # ignore DC component
    #fft_localidad = fft_localidad/(fft_localidad.max()) #normalize signal

    bands = bands_density(fft_localidad, bands_number, localidad)
    bands_values.append(bands)

band_columns = ['localidad'] + ['band' + str(i) for i in range(1, bands_number + 1)]
df_bands_values = pd.DataFrame(bands_values, columns=band_columns)
df_bands_values.set_index('localidad', inplace=True)
# The printed band limits are derived from the last localidad's spectrum.
print_freq_bands(fft_localidad, bands_number)
density_table(df_bands_values)

## Experiment particular localidades

In [None]:
# Localidades analysed separately from the three groups.
localidadesList = [
    'PUENTE ARANDA',
    'CHAPINERO',
    'TEUSAQUILLO',
]

In [None]:
# Number of bands each localidad's spectrum is split into; also drives the
# column names of the result table.
bands_number = 10

# One row per localidad: [name, band1 density, ..., bandN density].
# Rows are collected in a list so the densities stay numeric instead of being
# converted to text inside a mixed NumPy array.
bands_values = []
for localidad in localidadesList:
    df_by_localidad = df_by_date[df_by_date['LOCALIDAD'] == localidad]
    df_localidad_values = preprocess_df_localidad(df_by_localidad)

    fft_localidad = fourier_analysis(df_localidad_values)[1:]  # ignore DC component
    #fft_localidad = fft_localidad/(fft_localidad.max()) #normalize signal

    bands = bands_density(fft_localidad, bands_number, localidad)
    bands_values.append(bands)

band_columns = ['localidad'] + ['band' + str(i) for i in range(1, bands_number + 1)]
df_bands_values = pd.DataFrame(bands_values, columns=band_columns)
df_bands_values.set_index('localidad', inplace=True)
# The printed band limits are derived from the last localidad's spectrum.
print_freq_bands(fft_localidad, bands_number)
density_table(df_bands_values)