# Experiment description
## Hypothesis: 
The peaks identified in the sample dataset (periods of 3.5 and 7 days) are also observed in the full dataset for the localidades with higher predictability and lower predictability variance.
## Method: 
Fourier analysis is used to identify periodicity in the daily time series of each localidad. Time series to analyze:
- Higher predictability and lower variance (all months): Ciudad Bolívar, Usme, San Cristobal 
- Higher predictability and medium variance (all months): Bosa, Rafael Uribe Uribe, Engativa, Tunjuelito
- Lower predictability: Teusaquillo, Los Mártires, Candelaria, Barrios Unidos and Chapinero.

## Helper functions

In [1]:
import calendar
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import scipy as sp
import scipy.fftpack
import seaborn as sns
from IPython.display import display, HTML
from scipy.signal import find_peaks

# Display settings: never truncate cell contents and show every column.
# `None` means "no limit". The legacy value -1 for max_colwidth was deprecated
# in pandas 1.0 and is no longer accepted by recent pandas releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

In [2]:
def set_initial_dataset_day(df_by_date,name_day):
    """Drop leading rows so the frame starts on the first row whose weekday is ``name_day``.

    Parameters
    ----------
    df_by_date : pandas.DataFrame
        Frame whose index is named ``'date'`` and holds datetimes (the index
        is reset and the resulting ``'date'`` column is read with ``.dt``).
    name_day : str
        English weekday name as produced by ``Series.dt.day_name()``,
        e.g. ``'Monday'``.

    Returns
    -------
    pandas.DataFrame
        Rows from the first matching weekday onwards, indexed by ``'date'``
        again and without the temporary ``'day_of_week'`` column.

    Raises
    ------
    IndexError
        If no row falls on ``name_day``.
    """
    flat = df_by_date.reset_index()
    flat['day_of_week'] = flat['date'].dt.day_name()

    # Row positions (after the reset) that fall on the requested weekday.
    matching_rows = flat.index[flat['day_of_week'] == name_day].tolist()
    first_match = matching_rows[0]

    trimmed = flat[first_match:]
    return trimmed.set_index('date').drop(['day_of_week'], axis=1)

In [3]:
def low_pass_filter(signal, fc =0.04, b =0.08):
    """Subtract the mean of ``signal`` and convolve it with a Blackman-windowed sinc kernel.

    Parameters
    ----------
    signal : array-like
        One-dimensional input samples. NaNs are ignored when computing the
        mean (``np.nanmean``) but are not removed before the convolution.
    fc : float
        Cutoff used in the sinc argument ``2 * fc * (n - (N - 1) / 2)``.
        Presumably a fraction of the sampling rate; confirm with the caller.
    b : float
        Sets the kernel length: ``N = ceil(4 / b)``, increased by one when
        even so that the kernel has a centre tap.

    Returns
    -------
    numpy.ndarray
        The full convolution, of length ``len(signal) + N - 1``.
    """
    taps = int(np.ceil((4 / b)))
    if taps % 2 == 0:
        taps += 1
    k = np.arange(taps)

    # Sinc kernel centred on the middle tap, shaped by a Blackman window and
    # normalised so its coefficients sum to one.
    kernel = np.sinc(2 * fc * (k - (taps - 1) / 2.))
    blackman = 0.42 - 0.5 * np.cos(2 * np.pi * k / (taps - 1)) + 0.08 * np.cos(4 * np.pi * k / (taps - 1))
    kernel = kernel * blackman
    kernel = kernel / np.sum(kernel)

    centered = signal - np.nanmean(signal)
    return np.convolve(centered, kernel)

In [4]:
def fourier_analysis(signal):
    """Return the one-sided amplitude spectrum of a one-dimensional signal.

    Parameters
    ----------
    signal : array-like
        Time-domain samples.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(signal) // 2``. Element ``k`` is the
        magnitude of FFT coefficient ``k``; element 0 is the DC term.
    """
    spectrum = np.fft.fft(signal)

    # Use the complex magnitude, not only the real part. The real part alone
    # depends on the phase of each component, so a sine-phased oscillation
    # would show (almost) no amplitude at its own frequency.
    amplitude = np.abs(spectrum)

    # Keep the first half of the bins (those below index len(signal) // 2).
    return amplitude[:len(signal) // 2]

In [5]:
def get_max_frequencies(unilateral_fft_signal,fs_original_signal):
    """Print and return the ten strongest frequency bins and their periods.

    Parameters
    ----------
    unilateral_fft_signal : array-like
        One-sided amplitude spectrum, indexed by frequency bin.
    fs_original_signal : int or float
        Numerator used to convert a bin index into a period
        (``period = fs_original_signal / bin``).

    Returns
    -------
    tuple of numpy.ndarray
        ``(bins, periods)``: the bin indices of the (up to) ten largest
        amplitudes in descending order of amplitude, and the matching periods.
        Bin 0 (the DC term) maps to an infinite period.
    """
    max_freq = np.asarray(unilateral_fft_signal).argsort()[-10:][::-1]
    print('Max frequencies: ',max_freq)

    # Bin 0 can be among the strongest bins; dividing by it yields inf, which
    # is the intended "no periodicity" value, so silence the RuntimeWarning.
    with np.errstate(divide='ignore', invalid='ignore'):
        periodicity = fs_original_signal/max_freq
    print('Periods: ', periodicity)

    return max_freq, periodicity

In [6]:
def get_frequency_peaks(unilateral_fft, custom_distance):
    """Locate local maxima of a spectrum with ``scipy.signal.find_peaks``.

    Parameters
    ----------
    unilateral_fft : array-like
        One-sided amplitude spectrum.
    custom_distance : number
        Minimum horizontal distance, in samples, required between
        neighbouring peaks (passed as ``distance``).

    Returns
    -------
    numpy.ndarray
        Indices of the detected peaks; the properties dict returned by
        ``find_peaks`` is discarded.
    """
    peak_indices = find_peaks(unilateral_fft, distance=custom_distance)[0]
    return peak_indices

In [7]:
def get_peaks_magnitude(unilateral_fft, peaks_location_array):
    """Return the spectrum value at each peak index, rounded to two decimals.

    Parameters
    ----------
    unilateral_fft : sequence
        Spectrum values, indexable by the entries of ``peaks_location_array``.
    peaks_location_array : iterable of int
        Peak indices.

    Returns
    -------
    list
        One rounded magnitude per peak, in the order given.
    """
    return [round(unilateral_fft[location], 2) for location in peaks_location_array]

In [8]:
def get_peaks_period(array_frequency_peaks,fs):
    """Convert peak frequency bins into periods, rounded to two decimals.

    Parameters
    ----------
    array_frequency_peaks : iterable of int
        Frequency-bin indices of the peaks; must not contain 0.
    fs : int or float
        Numerator of the conversion ``period = fs / bin``. The experiment
        cells in this notebook pass the number of samples in the signal.

    Returns
    -------
    list
        One period per peak, in the order given.
    """
    return [round((fs / bin_index), 2) for bin_index in array_frequency_peaks]

In [9]:
def get_periods_by_peak(peaks,n_periods):
    """Regroup peak periods by rank across all entries of ``peaks``.

    Parameters
    ----------
    peaks : dict
        Maps a key to a dict holding a ``'periods'`` sequence (in this
        notebook the keys are localidad names).
    n_periods : int
        Number of leading periods to collect from every entry.

    Returns
    -------
    dict
        ``{rank: [periods[rank] for every key in peaks]}`` for
        ``rank`` in ``range(n_periods)``.

    Raises
    ------
    IndexError
        If an entry holds fewer than ``n_periods`` periods.
    """
    return {
        rank: [peaks[key]['periods'][rank] for key in peaks]
        for rank in range(n_periods)
    }

In [10]:
def plot_fft_period_axis(signal_time_domain,unilateral_fft):
    """Plot the normalised one-sided spectrum against period instead of frequency bin.

    Parameters
    ----------
    signal_time_domain : sequence
        Original time-domain samples; only its length is used, as the
        numerator of ``period = length / bin``.
    unilateral_fft : numpy.ndarray
        One-sided amplitude spectrum; bin 0 is excluded from the plot.

    Notes
    -----
    Amplitudes are divided by their maximum. The view is fixed to periods
    0-15 and amplitudes 0-1.2. The x label says days, which assumes one
    sample per day.
    """
    n_samples = len(signal_time_domain)
    n_bins = len(unilateral_fft)

    # Bins n_bins-1 down to 1, so the corresponding periods are increasing.
    descending_bins = n_bins - np.arange(n_bins)[1:]
    period = n_samples / descending_bins

    # Amplitudes in the same (reversed) bin order, scaled to a maximum of 1.
    amplitude = unilateral_fft[1:][::-1]
    amplitude = amplitude / amplitude.max()

    fig, ax = plt.subplots(1,1,sharex=True, sharey=True)
    ax.plot(period, amplitude)
    ax.set_ylabel("Amplitude")
    ax.set_xlabel("Period (in days)")
    ax.axis([0, 15, 0, 1.2])
    plt.show()
    

## Data

In [23]:
# Path to the NUSE events CSV. Set the NUSE_DATA_PATH environment variable to
# run the notebook on another machine; the default is the original author's
# local path.
data_location = os.environ.get(
    'NUSE_DATA_PATH',
    '/Users/anamaria/Desktop/dev/security_project/datasets/06. verify_enrich_nuse_11022020.csv',
)
df_input = pd.read_csv(data_location,delimiter=",")

In [27]:
# Work on a copy so the frame loaded from disk (df_input) stays untouched by
# the transformations applied to df in later cells.
df = df_input.copy()

Unnamed: 0,STR_NUMERO_INTERNO,FECHA,HORA,ANIO,MES,COD_LOCALIDAD,LOCALIDAD,COD_UPZ,UPZ,COD_SEC_CATAST,SEC_CATASTRAL,COD_BARRIO,BARRIO,TIPO_UNICO,TIPO_DETALLE,LATITUD,LONGITUD,STR_DIRECCION_INCIDENTE,ESTADO_INCIDENTE,PERIODO_TS,LOG_TEXT
0,NU11_169253142,2014-02-01 22:16:40,2216,2014,2,19,CIUDAD BOLIVAR,UPZ68,EL TESORO,2534,CASA DE TEJA,1345,S.C. CASA DE TEJA,934,934 - RI?A,4.541922,-74.136036,ND,CERRADO,2014/02,"OUTGOING~3008249263~169253142~CIUDAD BOLIVAR~CASA DE TEJA~FRENTE AL ASADERO INF DE RI?A CON HERIDOS POR ARMA BLANACA ~3~910, 934~NU11_169253142~DIRECTION/INCIDENT CALLER TELEPHONE NUMBER/INCIDENT ID/TOWN/DISTRICT/INCIDENT CREATION DATE/TIME/INCIDENT PRIORITY/INCIDENT TYPE/CUSTOMER FORMAT INCIDENT ID~~"
1,NU10_152698142,2014-02-01 00:33:37,33,2014,2,13,TEUSAQUILLO,UPZ107,QUINTA PAREDES,6210,CENTRO NARINO,3414,CENTRO NARI?O,934,934 - RI?A,4.626935,-74.090799,KR 36 23 76,CERRADO,2014/02,OUTGOING~ANTONIO~3112593895~152698142~TEUSAQUILLO~CENTRO ANTONIO NARI?O~KR 36 23~76~RI?A INTRAFAMILIAR AL INTERIOR DEL AP HAY UN SJTO EN ESTADO 922//NO SABE DE HERIDOS//NO SABE DE ARMAS~PORTERIA 5~3~CO CENTRO NARI?O~934~BL 5 AP 207~NU10_152698142~DIRECTION/INCIDENT CALLER TELEPHONE DESCRIPTION/INCIDENT CALLER TELEPHONE NUMBER/INCIDENT ID/TOWN/DISTRICT/STREET NAME/PREMISE NUMBER/INCIDENT CREATION DATE/TIME/INCIDENT ADDITIONAL INFORMATION/INCIDENT PRIORITY/ORGANISATION NAME/INCIDENT TYPE/BUILDING NAME/CUSTOMER FORMAT INCIDENT ID~~
2,NU11_170452142,2014-02-01 23:18:51,2318,2014,2,11,SUBA,UPZ71,TIBABUYES,9225,TIBABUYES UNIVERSAL,3541,TIBABUYES UNIVERSAL,934,934 - RI?A,4.746107,-74.113194,CL 138 132B,CERRADO,2014/02,OUTGOING~3202402858~170452142~SUBA~TIBABUYES~CL 138 132B~RI?A EN VIA PUBLICA ENTRE PAREJA NO ARMAS NOHERIDOS~3~934~NU11_170452142~DIRECTION/INCIDENT CALLER TELEPHONE NUMBER/INCIDENT ID/TOWN/DISTRICT/STREET NAME/INCIDENT CREATION DATE/TIME/INCIDENT PRIORITY/INCIDENT TYPE/CUSTOMER FORMAT INCIDENT ID~~
3,NU10_169415142,2014-02-01 22:23:26,2223,2014,2,9,FONTIBON,UPZ75,FONTIBON,6406,VILLEMAR,637,VILLEMAR,934,934 - RI?A,4.672305,-74.137839,CL 21 96D 28,CERRADO,2014/02,OUTGOING~PEDRO ANTONIO JACOME ~3138805381~BOGOTA D.C.~169415142~FONTIBON~VILLEMAR FONTIBON~CL 21 96D~28~INFO DE LOS INQUILINOS Y LOS SUE?OS DE LA VIVIENDA // NO INFO DE HERIDOS NI ARMAS ~INFO QUE YA FUE PONAL Y NECESITAN PATRULLA ~3~934~NU10_169415142~DIRECTION/INCIDENT CALLER TELEPHONE DESCRIPTION/INCIDENT CALLER TELEPHONE NUMBER/COUNTY/INCIDENT ID/TOWN/DISTRICT/STREET NAME/PREMISE NUMBER/INCIDENT CREATION DATE/TIME/INCIDENT ADDITIONAL INFORMATION/INCIDENT PRIORITY/INCIDENT TYPE/CUSTOMER FORMAT INCIDENT ID~~
4,NU10_164818142,2014-02-01 18:13:11,1813,2014,2,16,PUENTE ARANDA,UPZ43,SAN RAFAEL,4206,SAN FRANCISCO,992,PRIMAVERA NORTE - SAN FRANCISCO,934,934 - RI?A,4.619172,-74.110462,AUTOPISTA SUR,CERRADO,2014/02,"OUTGOING~MARIA OMAIRA MENDOZA~3138145139~164818142~PUENTE ARANDA~AUTOPISTA SUR~IND DE ACC ENTRE DOS COLECTIVOS DE SERVICIO PUBLICO//NO HERIDOS NO ATRAPADOS//LOS CONDUCTORES ESTAN INICIANDO UNA RI?A~BAJANDO EL PUENTE DE MATATIGRES DE NORTE A SUR~3~934, 942~NU10_164818142~DIRECTION/INCIDENT CALLER TELEPHONE DESCRIPTION/INCIDENT CALLER TELEPHONE NUMBER/INCIDENT ID/TOWN/STREET NAME/INCIDENT CREATION DATE/TIME/INCIDENT ADDITIONAL INFORMATION/INCIDENT PRIORITY/INCIDENT TYPE/CUSTOMER FORMAT INCIDENT ID~~"


In [28]:
# Exclude 2019 records. `.copy()` makes df an independent frame, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
# NOTE(review): presumably 2019 is dropped because that year is incomplete in
# this extract; confirm.
df = df.loc[df['ANIO'] != 2019].copy()

# Calendar day of each event (time of day truncated to midnight); this is the
# key of the daily time series.
df['date'] = pd.to_datetime(df['FECHA']).dt.normalize()

# Number of events per (date, PERIODO_TS, LOCALIDAD).
df_by_date = df.groupby(['date','PERIODO_TS','LOCALIDAD']).size().to_frame('total_eventos')

In [29]:
# Flatten the (date, PERIODO_TS, LOCALIDAD) index and keep only 'date' as the
# index, leaving PERIODO_TS and LOCALIDAD as ordinary columns for filtering.
df_by_date = df_by_date.reset_index().set_index('date')

In [34]:
# List the distinct localidad names present in the aggregated data; the
# experiment lists below must use these exact spellings.
df_by_date.LOCALIDAD.unique()

array(['ANTONIO NARIÑO', 'BARRIOS UNIDOS', 'BOSA', 'CANDELARIA',
       'CHAPINERO', 'CIUDAD BOLIVAR', 'ENGATIVA', 'FONTIBON', 'KENNEDY',
       'LOS MARTIRES', 'PUENTE ARANDA', 'RAFAEL URIBE URIBE',
       'SAN CRISTOBAL', 'SANTA FE', 'SUBA', 'SUMAPAZ', 'TEUSAQUILLO',
       'TUNJUELITO', 'USAQUEN', 'USME', 'SIN LOCALIZACION'], dtype=object)

# Results
## Experiment: localidades with high predictability and low predictability variance

In [15]:
# Localidades of the "higher predictability and lower variance" group listed
# in the method description.
localidadesList = ['CIUDAD BOLIVAR','USME','SAN CRISTOBAL']

In [31]:
# keep length windows of experiment 11 to set lT
lenWindow = 7
lT=(len(df['date'].unique())//lenWindow)*lenWindow 

1826

In [None]:
lT = 756 #keep number of samples from predictability experiment(04); 756 days = 108 whole weeks
PEAK_MIN_DISTANCE = 90  # minimum spacing, in frequency bins, between reported spectral peaks
peaks={}

for localidad in localidadesList:
    df_by_localidad = df_by_date[df_by_date['LOCALIDAD'] == localidad]

    #Make sure dataset starts on Monday for the experiment
    df_by_localidad = set_initial_dataset_day(df_by_localidad,'Monday')

    #Make sure dataset include consecutive dates in period (days without events become 0)
    idx = pd.date_range(df_by_localidad.index.min(), df_by_localidad.index.max())
    df_by_localidad = df_by_localidad.reindex(idx, fill_value=0)

    # Daily event counts, truncated to the first lT days.
    df_localidad_values = df_by_localidad['total_eventos'].to_numpy()[:lT]

    fft_localidad = fourier_analysis(df_localidad_values)

    # Spectrum by frequency bin. Bin 0 (the DC term) is skipped, so the bin
    # numbers are passed explicitly; otherwise the x axis would be shifted by
    # one bin relative to the peak indices printed below.
    fig, ax = plt.subplots(1,1,sharex=True, sharey=True)
    ax.plot(np.arange(1, len(fft_localidad)), fft_localidad[1:])
    ax.set_title(localidad)
    ax.set_ylabel("Amplitude")
    ax.set_xlabel("Frequency")
    plt.show()
    plt.close(fig)  # one figure is created per localidad; release it

    plot_fft_period_axis(df_localidad_values,fft_localidad)

    # Peak bins, their amplitudes, and their periods (samples / bin).
    peak_points = get_frequency_peaks(fft_localidad, PEAK_MIN_DISTANCE)
    peak_values = get_peaks_magnitude(fft_localidad,peak_points)
    peak_periods = get_peaks_period(list(peak_points),len(df_localidad_values))
    peaks[localidad] = {
        'frequency': list(peak_points),
        'periods': peak_periods,
        'values': peak_values,
    }
    print(localidad+': '+str(peaks[localidad]))

## Experiment: localidades with lower predictability values

In [None]:
# Localidades of the "lower predictability" group.
# NOTE(review): the method description at the top of the notebook also lists
# Candelaria for this group, but 'CANDELARIA' is not included here; confirm
# whether the omission is intentional.
localidadesList = ['TEUSAQUILLO','LOS MARTIRES','CHAPINERO','BARRIOS UNIDOS']

In [None]:
lT = 756 #keep number of samples from predictability experiment(04); 756 days = 108 whole weeks
PEAK_MIN_DISTANCE = 90  # minimum spacing, in frequency bins, between reported spectral peaks
peaks={}

for localidad in localidadesList:
    df_by_localidad = df_by_date[df_by_date['LOCALIDAD'] == localidad]

    #Make sure dataset starts on Monday for the experiment
    df_by_localidad = set_initial_dataset_day(df_by_localidad,'Monday')

    #Make sure dataset include consecutive dates in period (days without events become 0)
    idx = pd.date_range(df_by_localidad.index.min(), df_by_localidad.index.max())
    df_by_localidad = df_by_localidad.reindex(idx, fill_value=0)

    # Daily event counts, truncated to the first lT days.
    df_localidad_values = df_by_localidad['total_eventos'].to_numpy()[:lT]

    fft_localidad = fourier_analysis(df_localidad_values)

    # Spectrum by frequency bin. Bin 0 (the DC term) is skipped, so the bin
    # numbers are passed explicitly; otherwise the x axis would be shifted by
    # one bin relative to the peak indices printed below.
    fig, ax = plt.subplots(1,1,sharex=True, sharey=True)
    ax.plot(np.arange(1, len(fft_localidad)), fft_localidad[1:])
    ax.set_title(localidad)
    ax.set_ylabel("Amplitude")
    ax.set_xlabel("Frequency")
    plt.show()
    plt.close(fig)  # one figure is created per localidad; release it

    plot_fft_period_axis(df_localidad_values,fft_localidad)

    # Peak bins, their amplitudes, and their periods (samples / bin).
    peak_points = get_frequency_peaks(fft_localidad, PEAK_MIN_DISTANCE)
    peak_values = get_peaks_magnitude(fft_localidad,peak_points)
    peak_periods = get_peaks_period(list(peak_points),len(df_localidad_values))
    peaks[localidad] = {
        'frequency': list(peak_points),
        'periods': peak_periods,
        'values': peak_values,
    }
    print(localidad+': '+str(peaks[localidad]))