# Experiment description
## Hypothesis: 
The peaks identified in the sample dataset (periods of 3.5 and 7 days) are also observed in the localidades with higher predictability in the full dataset. 
## Method: 
Fourier analysis to identify periodicity in the time-series signal of each localidad. Time series to analyze***:

- Localidades with high predictability (P > 0.7) and low variance (variance < 0.2): Ciudad Bolívar, Bosa, Usme, San Cristóbal, Rafael Uribe Uribe, Suba, Kennedy.
- Localidades with medium predictability (P in [0.4, 0.7]) and medium variance (variance in [0.2, 0.4]): Engativá, Tunjuelito, Fontibón, Usaquén.
- Localidades with low predictability (P < 0.4) and high variance (variance > 0.4): Antonio Nariño, Santa Fe, Candelaria, Barrios Unidos and Los Mártires.

** Localidades that do not fit the classification above: Puente Aranda, Chapinero, Teusaquillo.

*** Classification based on the predictability results obtained in experiment 08 and the variance obtained in experiment 11.

## Built-in methods

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import calendar
import scipy as sp
import scipy.fftpack
from scipy.signal import find_peaks

# Show full cell contents and every column when displaying DataFrames.
# None means "no limit"; a negative max_colwidth was deprecated in pandas 1.0
# and is no longer accepted by recent releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

In [None]:
def set_initial_dataset_day(df_by_date, name_day):
    """Drop the leading rows that precede the first occurrence of a weekday.

    Parameters
    ----------
    df_by_date : pandas.DataFrame
        Frame whose index, once reset, yields a datetime column named
        ``date``.
    name_day : str
        Weekday name as produced by ``Series.dt.day_name()``
        (for example ``'Monday'``).

    Returns
    -------
    pandas.DataFrame
        The rows from the first ``name_day`` onwards, indexed by ``date``.

    Raises
    ------
    IndexError
        If no row falls on ``name_day``.
    """
    flat = df_by_date.reset_index()
    # Temporary helper column; it is removed again before returning.
    flat['day_of_week'] = flat['date'].dt.day_name()
    matching_rows = flat.index[flat['day_of_week'] == name_day]
    first_match = matching_rows.tolist()[0]
    trimmed = flat.iloc[first_match:]
    return trimmed.set_index('date').drop(columns=['day_of_week'])

In [None]:
def low_pass_filter(signal, fc=0.04, b=0.08):
    """Convolve the mean-removed signal with a Blackman-windowed sinc kernel.

    Parameters
    ----------
    signal : array-like
        Samples to filter; the NaN-ignoring mean is subtracted first.
    fc : float
        Cutoff used in the sinc argument ``2 * fc * (n - (N - 1) / 2)``.
        NOTE(review): presumably a fraction of the sampling rate - confirm.
    b : float
        Sets the kernel length ``N = ceil(4 / b)``, raised by one when even.

    Returns
    -------
    numpy.ndarray
        Output of ``np.convolve`` in its default mode, which has
        ``len(signal) + N - 1`` samples.
    """
    taps = int(np.ceil(4 / b))
    if taps % 2 == 0:
        # Force an odd number of taps so the kernel has a centre sample.
        taps += 1
    n = np.arange(taps)

    # Blackman window, written out term by term.
    window = (0.42
              - 0.5 * np.cos(2 * np.pi * n / (taps - 1))
              + 0.08 * np.cos(4 * np.pi * n / (taps - 1)))
    kernel = np.sinc(2 * fc * (n - (taps - 1) / 2.)) * window
    # Normalise so the coefficients sum to 1.
    kernel = kernel / np.sum(kernel)

    centred = signal - np.nanmean(signal)
    return np.convolve(centred, kernel)

In [None]:
def fourier_analysis(signal):
    """Return the one-sided amplitude spectrum of a real-valued signal.

    Parameters
    ----------
    signal : array-like
        Evenly spaced samples in the time domain.

    Returns
    -------
    numpy.ndarray
        Magnitudes ``|X[k]|`` of the FFT coefficients for bins
        ``0 .. len(signal) // 2 - 1``. Bin 0 (the sum of the samples) is
        included.
    """
    spectrum = np.fft.fft(signal)
    # Use the complex modulus rather than the real part alone: the real part
    # depends on the phase of each component, so a sine-phased oscillation
    # would contribute (almost) nothing and its peak would be missed.
    magnitudes = np.abs(spectrum)
    # For real input the second half mirrors the first, so keep one side.
    return magnitudes[:len(magnitudes) // 2]

In [None]:
def get_max_frequencies(unilateral_fft_signal, fs_original_signal):
    """Print the ten largest spectrum bins and the periods they map to.

    Parameters
    ----------
    unilateral_fft_signal : numpy.ndarray
        One-sided spectrum; its ``argsort`` supplies the bin indices.
    fs_original_signal : number
        Numerator of the period computation ``fs_original_signal / bin``.

    Returns
    -------
    None
        The results are only printed.
    """
    # Indices of the largest values, strongest first (at most ten).
    # NOTE(review): if bin 0 is among them, the division below yields inf.
    strongest_bins = unilateral_fft_signal.argsort()[::-1][:10]
    print('Max frequencies: ', strongest_bins)
    print('Periods: ', fs_original_signal / strongest_bins)

In [None]:
def get_frequency_peaks(unilateral_fft, custom_distance):
    """Locate local maxima of the spectrum with a minimum spacing.

    Parameters
    ----------
    unilateral_fft : array-like
        One-dimensional spectrum to search.
    custom_distance : number
        Passed to ``scipy.signal.find_peaks`` as ``distance``: the minimum
        number of samples between neighbouring peaks.

    Returns
    -------
    numpy.ndarray
        Indices of the detected peaks.
    """
    # find_peaks returns (indices, properties); only the indices are needed.
    peak_indices = find_peaks(unilateral_fft, distance=custom_distance)[0]
    return peak_indices

In [None]:
def get_peaks_magnitude(unilateral_fft, peaks_location_array):
    """Return the spectrum value at each peak, rounded to two decimals.

    Parameters
    ----------
    unilateral_fft : sequence
        Spectrum indexed by bin.
    peaks_location_array : iterable of int
        Bin indices to look up.

    Returns
    -------
    list
        One rounded magnitude per peak, in the order given.
    """
    return [round(unilateral_fft[location], 2) for location in peaks_location_array]

In [None]:
def get_peaks_period(array_frequency_peaks, fs):
    """Convert peak bin indices into periods, rounded to two decimals.

    Parameters
    ----------
    array_frequency_peaks : iterable of int
        Bin indices of the peaks.
    fs : number
        Numerator of the conversion ``fs / bin``.

    Returns
    -------
    list
        ``round(fs / bin, 2)`` for each bin, in the order given.
    """
    return [round(fs / frequency_bin, 2) for frequency_bin in array_frequency_peaks]

In [None]:
def get_periods_by_peak(peaks, n_periods):
    """Regroup the stored periods by their position in each entry's list.

    Parameters
    ----------
    peaks : dict
        Maps a key to a dict that holds a ``'periods'`` sequence.
    n_periods : int
        Number of leading positions to collect from every sequence.

    Returns
    -------
    dict
        ``{position: [periods[position] for every key in peaks]}`` for
        ``position`` in ``range(n_periods)``.
    """
    return {
        position: [peaks[key]['periods'][position] for key in peaks]
        for position in range(n_periods)
    }

In [None]:
def plot_fft_period_axis(signal_time_domain, unilateral_fft):
    """Plot the normalised one-sided spectrum against period instead of bin.

    Parameters
    ----------
    signal_time_domain : sequence
        Time-domain samples; only the length is used.
    unilateral_fft : numpy.ndarray
        One-sided spectrum of that signal. Bin 0 is left out of the plot.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    n_samples = len(signal_time_domain)
    n_bins = len(unilateral_fft)

    # Walk the bins from highest to lowest so the period axis is increasing.
    reversed_spectrum = unilateral_fft[1:][::-1]
    bin_offsets = np.arange(n_bins)[1:]
    period = n_samples / (n_bins - bin_offsets)
    # Scale so the tallest plotted bin has amplitude 1.
    normalised = reversed_spectrum / reversed_spectrum.max()

    fig, ax = plt.subplots(1, 1, sharex=True, sharey=True)
    ax.plot(period, normalised)
    ax.set_ylabel("Amplitude")
    ax.set_xlabel("Period (in days)")
    # Only periods up to 15 are shown.
    ax.axis([0, 15, 0, 1.2])
    plt.show()
    

## Data

In [None]:
# Absolute path of the input CSV on the author's machine; point it at the
# local copy of the dataset before running.
data_location = '/Users/anamaria/Desktop/dev/security_project/datasets/06. verify_enrich_nuse_11022020.csv'
# Load the comma-delimited file into a DataFrame.
df_input = pd.read_csv(data_location,delimiter=",")

In [None]:
# Work on a copy so df_input keeps the data exactly as loaded.
df = df_input.copy()

In [None]:
# Exclude the 2019 records. The explicit copy detaches the filtered frame so
# that adding a column below does not raise SettingWithCopyWarning.
df = df.loc[df['ANIO'] != 2019].copy()
# Truncate every timestamp to midnight so events can be grouped by calendar
# day; normalize() does this without a round trip through strings.
# NOTE(review): assumes FECHA parses to timezone-naive datetimes - confirm.
df['date'] = pd.to_datetime(df['FECHA']).dt.normalize()
# Number of events per (date, PERIODO_TS, LOCALIDAD) combination.
df_by_date = df.groupby(['date', 'PERIODO_TS', 'LOCALIDAD']).size().to_frame('total_eventos')

In [None]:
# Turn the grouping keys back into columns, then index the rows by date only;
# PERIODO_TS and LOCALIDAD stay as regular columns.
df_by_date = df_by_date.reset_index().set_index('date')

In [None]:
# Distinct LOCALIDAD values present in the data (the names used for filtering).
df_by_date.LOCALIDAD.unique()

# Results
## Experiment: localidades with high predictability and low predictability variance

In [None]:
# Localidades analysed in this experiment, spelled as they are compared
# against the LOCALIDAD column.
localidadesList = ['CIUDAD BOLIVAR','BOSA','USME','SAN CRISTOBAL','RAFAEL URIBE URIBE','SUBA','KENNEDY']

In [None]:
# keep length windows of experiment 11 to set lT
lenWindow = 7
lT=(len(df['date'].unique())//lenWindow)*lenWindow 

In [None]:
# Spectral peaks per localidad: FFT bin index, period and magnitude.
peaks = {}

for localidad in localidadesList:
    # Rows of this localidad, trimmed so the series starts on a Monday.
    df_by_localidad = set_initial_dataset_day(
        df_by_date[df_by_date['LOCALIDAD'] == localidad], 'Monday')

    # Insert zero-filled rows for missing calendar days so that the dates
    # are consecutive.
    idx = pd.date_range(df_by_localidad.index.min(), df_by_localidad.index.max())
    df_by_localidad = df_by_localidad.reindex(idx, fill_value=0)

    # At most lT daily totals, scaled so the largest value is 1.
    df_localidad_values = df_by_localidad['total_eventos'].values[0:lT]
    df_localidad_values = df_localidad_values / df_localidad_values.max()

    # Spectrum against FFT bin index (bin 0 is not plotted).
    fft_localidad = fourier_analysis(df_localidad_values)
    fig, ax = plt.subplots(1, 1, sharex=True, sharey=True)
    ax.plot(fft_localidad[1:])
    ax.set_ylabel("Amplitude")
    ax.set_xlabel("Frequency")
    plt.show()

    # Same spectrum against period in days.
    plot_fft_period_axis(df_localidad_values, fft_localidad)

    # Peaks at least 90 bins apart, with their magnitudes and periods.
    peak_points = get_frequency_peaks(fft_localidad, 90)
    peak_values = get_peaks_magnitude(fft_localidad, peak_points)
    peak_periods = get_peaks_period(list(peak_points), len(df_localidad_values))
    peaks[localidad] = {
        'frequency': list(peak_points),
        'periods': peak_periods,
        'values': peak_values,
    }
    print(localidad + ': ' + str(peaks[localidad]))

## Experiment: localidades with medium predictability and medium variance

In [None]:
# Localidades analysed in this experiment, spelled as they are compared
# against the LOCALIDAD column.
localidadesList = ['ENGATIVA','TUNJUELITO','FONTIBON','USAQUEN']

In [None]:
# Spectral peaks per localidad: FFT bin index, period and magnitude.
peaks = {}

for localidad in localidadesList:
    # Rows of this localidad, trimmed so the series starts on a Monday.
    df_by_localidad = set_initial_dataset_day(
        df_by_date[df_by_date['LOCALIDAD'] == localidad], 'Monday')

    # Insert zero-filled rows for missing calendar days so that the dates
    # are consecutive.
    idx = pd.date_range(df_by_localidad.index.min(), df_by_localidad.index.max())
    df_by_localidad = df_by_localidad.reindex(idx, fill_value=0)

    # At most lT daily totals, scaled so the largest value is 1.
    df_localidad_values = df_by_localidad['total_eventos'].values[0:lT]
    df_localidad_values = df_localidad_values / df_localidad_values.max()

    # Spectrum against FFT bin index (bin 0 is not plotted).
    fft_localidad = fourier_analysis(df_localidad_values)
    fig, ax = plt.subplots(1, 1, sharex=True, sharey=True)
    ax.plot(fft_localidad[1:])
    ax.set_ylabel("Amplitude")
    ax.set_xlabel("Frequency")
    plt.show()

    # Same spectrum against period in days.
    plot_fft_period_axis(df_localidad_values, fft_localidad)

    # Peaks at least 90 bins apart, with their magnitudes and periods.
    peak_points = get_frequency_peaks(fft_localidad, 90)
    peak_values = get_peaks_magnitude(fft_localidad, peak_points)
    peak_periods = get_peaks_period(list(peak_points), len(df_localidad_values))
    peaks[localidad] = {
        'frequency': list(peak_points),
        'periods': peak_periods,
        'values': peak_values,
    }
    print(localidad + ': ' + str(peaks[localidad]))

## Experiment: localidades with low predictability and high predictability variance

In [None]:
# Localidades analysed in this experiment, spelled as they are compared
# against the LOCALIDAD column.
localidadesList = ['ANTONIO NARIÑO','SANTA FE','CANDELARIA','BARRIOS UNIDOS','LOS MARTIRES']

In [None]:
# Spectral peaks per localidad: FFT bin index, period and magnitude.
peaks = {}

for localidad in localidadesList:
    # Rows of this localidad, trimmed so the series starts on a Monday.
    df_by_localidad = set_initial_dataset_day(
        df_by_date[df_by_date['LOCALIDAD'] == localidad], 'Monday')

    # Insert zero-filled rows for missing calendar days so that the dates
    # are consecutive.
    idx = pd.date_range(df_by_localidad.index.min(), df_by_localidad.index.max())
    df_by_localidad = df_by_localidad.reindex(idx, fill_value=0)

    # At most lT daily totals, scaled so the largest value is 1.
    df_localidad_values = df_by_localidad['total_eventos'].values[0:lT]
    df_localidad_values = df_localidad_values / df_localidad_values.max()

    # Spectrum against FFT bin index (bin 0 is not plotted).
    fft_localidad = fourier_analysis(df_localidad_values)
    fig, ax = plt.subplots(1, 1, sharex=True, sharey=True)
    ax.plot(fft_localidad[1:])
    ax.set_ylabel("Amplitude")
    ax.set_xlabel("Frequency")
    plt.show()

    # Same spectrum against period in days.
    plot_fft_period_axis(df_localidad_values, fft_localidad)

    # Peaks at least 90 bins apart, with their magnitudes and periods.
    peak_points = get_frequency_peaks(fft_localidad, 90)
    peak_values = get_peaks_magnitude(fft_localidad, peak_points)
    peak_periods = get_peaks_period(list(peak_points), len(df_localidad_values))
    peaks[localidad] = {
        'frequency': list(peak_points),
        'periods': peak_periods,
        'values': peak_values,
    }
    print(localidad + ': ' + str(peaks[localidad]))

## Experiment: localidades that do not fit the classification

In [None]:
# Localidades analysed in this experiment, spelled as they are compared
# against the LOCALIDAD column.
localidadesList = ['PUENTE ARANDA','CHAPINERO','TEUSAQUILLO']

In [None]:
# Spectral peaks per localidad: FFT bin index, period and magnitude.
peaks = {}

for localidad in localidadesList:
    # Rows of this localidad, trimmed so the series starts on a Monday.
    df_by_localidad = set_initial_dataset_day(
        df_by_date[df_by_date['LOCALIDAD'] == localidad], 'Monday')

    # Insert zero-filled rows for missing calendar days so that the dates
    # are consecutive.
    idx = pd.date_range(df_by_localidad.index.min(), df_by_localidad.index.max())
    df_by_localidad = df_by_localidad.reindex(idx, fill_value=0)

    # At most lT daily totals, scaled so the largest value is 1.
    df_localidad_values = df_by_localidad['total_eventos'].values[0:lT]
    df_localidad_values = df_localidad_values / df_localidad_values.max()

    # Spectrum against FFT bin index (bin 0 is not plotted).
    fft_localidad = fourier_analysis(df_localidad_values)
    fig, ax = plt.subplots(1, 1, sharex=True, sharey=True)
    ax.plot(fft_localidad[1:])
    ax.set_ylabel("Amplitude")
    ax.set_xlabel("Frequency")
    plt.show()

    # Same spectrum against period in days.
    plot_fft_period_axis(df_localidad_values, fft_localidad)

    # Peaks at least 90 bins apart, with their magnitudes and periods.
    peak_points = get_frequency_peaks(fft_localidad, 90)
    peak_values = get_peaks_magnitude(fft_localidad, peak_points)
    peak_periods = get_peaks_period(list(peak_points), len(df_localidad_values))
    peaks[localidad] = {
        'frequency': list(peak_points),
        'periods': peak_periods,
        'values': peak_values,
    }
    print(localidad + ': ' + str(peaks[localidad]))