# Experiment description
## Hypothesis: 
H1: Predictability results are not affected by the 'localidad' size.
## Method: 
Compute Colwell's predictability, together with its two complementary components (constancy and contingency), on daily event counts normalized by 'localidad' size.

## Built-in methods

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import pickle
import matplotlib.pyplot as plt
import scipy
import math
from math import pi
import geopandas as gpd
%matplotlib inline

In [None]:
def set_initial_dataset_day(df_by_date,name_day):
    """Trim a date-indexed frame so that it starts on the first `name_day`.

    Parameters
    ----------
    df_by_date : pandas.DataFrame
        Frame whose index becomes a datetime column called 'date' after
        `reset_index()`.
    name_day : str
        Weekday name in the form returned by `Series.dt.day_name()`
        (for example 'Monday').

    Returns
    -------
    pandas.DataFrame
        Every row from the first occurrence of `name_day` onwards, indexed by
        'date' again and without the temporary 'day_of_week' column.

    Raises
    ------
    IndexError
        If no row falls on `name_day`.
    """
    flat = df_by_date.reset_index()
    flat['day_of_week'] = flat['date'].dt.day_name()

    # Positions (on the fresh RangeIndex) of every row that falls on name_day.
    matching_positions = flat.index[flat['day_of_week'] == name_day].tolist()
    first_position = matching_positions[0]

    trimmed = flat.iloc[first_position:]
    return trimmed.set_index('date').drop(['day_of_week'], axis=1)

In [None]:
# https://plot.ly/python/v3/fft-filters/
def low_pass_filter(signal_values, fc =0.04, b =0.08):
    """Smooth a signal with a Blackman-windowed sinc kernel.

    Parameters
    ----------
    signal_values : array-like
        Samples to filter. The NaN-ignoring mean is removed before filtering.
    fc : float
        Scales the argument of the sinc (presumably the cutoff frequency as a
        fraction of the sampling rate -- TODO confirm against the linked recipe).
    b : float
        Controls the kernel length, which is ceil(4 / b) rounded up to the next
        odd number (presumably the transition bandwidth -- TODO confirm).

    Returns
    -------
    numpy.ndarray
        The full convolution of the centred signal with the unit-sum kernel,
        i.e. len(signal_values) + kernel_length - 1 samples.
    """
    taps = int(np.ceil((4 / b)))
    if taps % 2 == 0:
        # An odd number of taps keeps the kernel symmetric about a central sample.
        taps += 1
    k = np.arange(taps)
    last = taps - 1

    kernel = np.sinc(2 * fc * (k - last / 2.))
    blackman = 0.42 - 0.5 * np.cos(2 * np.pi * k / last) + 0.08 * np.cos(4 * np.pi * k / last)
    kernel = kernel * blackman
    kernel = kernel / np.sum(kernel)

    centred = signal_values - np.nanmean(signal_values)
    return np.convolve(centred, kernel)

In [None]:
# Methods for slicing windows
def im2patches(im,n):
    """Slice a 1-D series into overlapping, normalised windows of length `n`.

    Each window is shifted to zero mean (NaN-ignoring mean) and, when its
    Euclidean norm is strictly positive, scaled to unit norm. Windows whose
    norm is zero or NaN are left unscaled.

    Fixes relative to the previous version:
    * every window now holds `n` samples; the old slice `im[i:(i+n-1)]`
      silently produced `n - 1` samples;
    * the result is always a 2-D array of shape (len(im) - n, n), including
      the zero- and one-window cases, so callers can rely on `.tolist()`;
    * rows are written into a preallocated array instead of repeated
      `np.vstack` calls.

    Parameters
    ----------
    im : array-like
        One-dimensional numeric series.
    n : int
        Window length, at least 1.

    Returns
    -------
    numpy.ndarray
        Array of shape (max(len(im) - n, 0), n). Window `i` starts at sample
        `i`. The number of windows is unchanged from the previous version, so
        the last possible window (ending on the final sample) is not produced.

    Raises
    ------
    ValueError
        If `n` is smaller than 1.
    """
    if n < 1:
        raise ValueError("window length n must be >= 1, got " + str(n))

    series = np.asarray(im, dtype=float)
    n_windows = max(len(series) - n, 0)
    patches = np.empty((n_windows, n), dtype=float)

    for i in range(n_windows):
        patch = series[i:i + n]
        patch = patch - np.nanmean(patch)
        norm = np.linalg.norm(patch)
        # A constant window has norm 0 and a window with NaN has norm NaN;
        # neither can be scaled, so both are stored as they are.
        if norm > 0:
            patch = patch / norm
        patches[i] = patch
    return patches


def writeEmbeding(timeSeries,lenWindow,samplePath, scenarioName):
    """Pickle the sliding windows of a series and the series itself.

    Two files are written (pickle protocol 2), both named
    ``<prefix>_<samplePath>_<scenarioName>_<lenWindow>_.pickle``:
    the windows returned by `im2patches(timeSeries, lenWindow)` go to the
    ``slicing/`` directory with prefix ``slicingWindows``, and the raw series
    goes to the ``timeSeries/`` directory with prefix ``timeSeries``.

    Parameters
    ----------
    timeSeries : numpy.ndarray
        Series to embed; must expose `.tolist()`.
    lenWindow : int
        Window length forwarded to `im2patches`.
    samplePath, scenarioName
        Labels embedded (via `str`) in both file names.
    """
    windows = im2patches(timeSeries, lenWindow)
    prevStation = str(samplePath)
    # Both output files share the same name tail.
    nameTail = "_"+str(prevStation)+"_"+str(scenarioName)+"_"+str(lenWindow)+'_.pickle'

    slicingDir = '/Users/anamaria/Desktop/dev/security_project/periodicity_experiments/predictability/slicing/'
    with open(slicingDir+'slicingWindows'+nameTail, 'wb') as out:
        pickle.dump(windows.tolist(), out, protocol=2)

    seriesDir = '/Users/anamaria/Desktop/dev/security_project/periodicity_experiments/predictability/timeSeries/'
    with open(seriesDir+'timeSeries'+nameTail, 'wb') as out:
        pickle.dump(timeSeries.tolist(), out, protocol=2)



In [None]:
#Methods for predictability
def getBarcode(samplePath,lenWindow,scenarioName):
    """Load a pickled time series written under the ``timeSeries/`` directory.

    Parameters
    ----------
    samplePath : str
        First label of the file name (concatenated directly, so it must be a str).
    lenWindow, scenarioName
        Remaining labels of the file name, converted with `str`.

    Returns
    -------
    tuple
        ``(barcode, timeSeries)`` where `barcode` is always an empty list (no
        barcode is read here) and `timeSeries` is the unpickled object.
    """
    workingPath = '/Users/anamaria/Desktop/dev/security_project/periodicity_experiments/predictability/'
    fileName = 'timeSeries_'+samplePath+"_"+str(scenarioName)+'_'+str(lenWindow)+'_'+'.pickle'

    # NOTE: pickle.load can execute arbitrary code; only read files this project wrote.
    with open(workingPath+'timeSeries/'+fileName, 'rb') as handle:
        loadedSeries = pickle.load(handle)
    return ([], loadedSeries)

def computeBarcodeEntropy(barsLenB0):
    """Shannon entropy (natural log) of the normalised bar lengths.

    Parameters
    ----------
    barsLenB0 : array-like
        One-dimensional collection of bar lengths. They are divided by their
        sum to form the weights.

    Returns
    -------
    float
        ``-sum(w * log(w))`` over the non-zero weights `w` (the integer 0 when
        the input is empty).
    """
    weights = np.array(barsLenB0)
    weights = weights / weights.sum()

    entropy = 0
    for k in range(weights.shape[0]):
        w = weights[k]
        if w == 0:
            # A zero weight contributes nothing and log(0) must be avoided.
            continue
        entropy -= w * np.log(w)
    return entropy


def _shannon_entropy(counts, total):
    """Return -sum(p * log(p)) over the non-zero entries of `counts`, with p = count / total."""
    nonzero = counts[counts != 0]
    probs = nonzero / total
    return -np.sum(probs * np.log(probs))


def computeGeneralPredictability(timeSeries,binsData,lenWindow):
    """Colwell's constancy, contingency and predictability of a periodic series.

    Reference: Colwell, R. K. (1974). Predictability, constancy, and
    contingency of periodic phenomena. Ecology, 55(5), 1148-1153.

    The series is folded into rows of `lenWindow` samples, so column `j` holds
    every observation made at phase `j` of the cycle. A contingency table `N`
    (states x phases) is built by histogramming each column with the edges in
    `binsData`, and the measures are derived from the entropies of its column
    totals (hx), row totals (hy) and cells (hxy).

    Fixes relative to the previous version:
    * phase 0 is now included in the table; the old loop started at column 1,
      which dropped the first phase (for example every Monday) and reported an
      alternating two-phase series as perfectly constant;
    * fewer than two bins raises ValueError instead of dividing by log(1) = 0;
    * the unused grand-mean computation was removed.

    Parameters
    ----------
    timeSeries : array-like
        One-dimensional series whose length is a multiple of `lenWindow`
        (otherwise the reshape raises ValueError).
    binsData : array-like
        Monotonic bin edges passed to `np.histogram`; `len(binsData) - 1` is
        the number of states.
    lenWindow : int
        Number of phases per cycle.

    Returns
    -------
    tuple
        ``(constancy, contingency, predictability)`` where
        predictability = 1 - (hxy - hx) / log(states),
        constancy = 1 - hy / log(states) and
        contingency = predictability - constancy.

    Raises
    ------
    ValueError
        If `binsData` defines fewer than two bins.
    """
    binsData = np.asarray(binsData)
    nLevels = binsData.shape[0] - 1
    if nLevels < 2:
        raise ValueError("binsData must define at least 2 bins, got " + str(nLevels))

    series = np.asarray(timeSeries)
    matStations = series.reshape((series.shape[0] // lenWindow, lenWindow))

    # Contingency table: rows are states (bins), columns are phases of the cycle.
    N = np.zeros((nLevels, lenWindow))
    for phase in range(lenWindow):
        N[:, phase], _ = np.histogram(matStations[:, phase], bins=binsData)

    X = np.sum(N, axis=0)  # observations per phase
    Y = np.sum(N, axis=1)  # observations per state
    Z = np.sum(Y)          # observations that fell inside the bins

    hx = _shannon_entropy(X, Z)
    hy = _shannon_entropy(Y, Z)
    hxy = _shannon_entropy(N, Z)

    logStates = np.log(nLevels)
    # predictability
    p = 1 - (hxy - hx) / logStates
    # constancy
    c = 1 - hy / logStates
    # Returns constancy, contingency and predictability, in that order.
    return (c, p - c, p)



In [None]:
# Size of each localidad, used as the divisor that normalises the daily event
# counts in predictability_experiment.
# NOTE(review): the figures look like inhabitant counts -- confirm source and year.
localidadSize = {
    'ANTONIO NARIÑO': 109199,
    'BARRIOS UNIDOS': 270280,
    'BOSA': 753496,
    'CANDELARIA': 22243,
    'CHAPINERO': 126192,
    'CIUDAD BOLIVAR': 748012,
    'ENGATIVA': 883319,
    'FONTIBON': 424038,
    'KENNEDY': 1230539,
    'LOS MARTIRES': 93248,
    'PUENTE ARANDA': 218555,
    'RAFAEL URIBE URIBE': 348023,
    'SAN CRISTOBAL': 392220,
    'SANTA FE': 93857,
    'SUBA': 1315509,
    'TEUSAQUILLO': 140135,
    'TUNJUELITO': 186383,
    'USAQUEN': 475275,
    'USME': 342940,
}

def predictability_experiment(df_by_date,lenWindow,localidadesList,Levels,lT):
    """Compute Colwell's measures per localidad and number of levels.

    Stage 1 writes, for every localidad, the size-normalised daily series
    (starting on a Monday, gaps filled with 0, truncated to `lT` samples) to
    disk through `writeEmbeding`. Stage 2 reads each series back with
    `getBarcode` and evaluates `computeGeneralPredictability` for every entry
    of `Levels`.

    Fixes relative to the previous version:
    * the bin edges are built with `nLevels + 1` points so the series is really
      discretised into `nLevels` states (the old `linspace(..., nLevels)`
      produced only `nLevels - 1` bins, so a "3 level" run used 2 states);
    * the per-localidad slice is copied before a column is overwritten, which
      avoids pandas' SettingWithCopyWarning;
    * the result is always 2-D, also when there is a single row, so it can be
      wrapped in a DataFrame with seven named columns;
    * a localidad missing from `localidadSize` is reported by name.

    Parameters
    ----------
    df_by_date : pandas.DataFrame
        Indexed by 'date', with columns 'LOCALIDAD' and 'total_eventos'.
    lenWindow : int
        Cycle length (in samples) used for the embedding and the measures.
    localidadesList : list
        Localidades to process; each must be a key of `localidadSize`.
    Levels : iterable of int
        Numbers of states to discretise the series into.
    lT : int
        Maximum number of samples kept per series.

    Returns
    -------
    numpy.ndarray
        String array with one row per (localidad, level) and the columns
        [experiment name, localidad, lenWindow, nLevels, predictability,
        contingency, constancy]; shape (0, 7) when nothing was computed.

    Raises
    ------
    KeyError
        If a localidad has no entry in `localidadSize`.
    """
    expName = 'aggressiveBehavior'

    # Stage 1: write the embedding of every localidad.
    for localidad in localidadesList:
        print(localidad)
        if localidad not in localidadSize:
            raise KeyError("no size registered in localidadSize for localidad: " + str(localidad))

        # Work on a copy: the column below is overwritten and the caller's
        # frame must stay untouched.
        df_by_localidad = df_by_date[df_by_date['LOCALIDAD'] == localidad].copy()

        # Normalize events considering localidad size
        df_by_localidad['total_eventos'] = df_by_localidad['total_eventos']/localidadSize[localidad]
        print(df_by_localidad['total_eventos'].values[0:5])

        # Make sure the dataset starts on Monday for the experiment
        df_by_localidad = set_initial_dataset_day(df_by_localidad,'Monday')

        # Make sure the dataset includes every consecutive date in the period;
        # days without a record count as 0 events.
        idx = pd.date_range(df_by_localidad.index.min(), df_by_localidad.index.max())
        df_by_localidad = df_by_localidad.reindex(idx, fill_value=0)

        df_localidad_values = df_by_localidad['total_eventos'].values[0:lT]
        print(df_localidad_values[0:5])
        print('=============')
        writeEmbeding(df_localidad_values,lenWindow,expName,localidad)

    # Stage 2: find predictability, constancy and contingency.
    rows = []
    for localidad in localidadesList:
        for nLevels in Levels:
            (barcode,timeSeries) = getBarcode(expName,lenWindow,localidad)
            # nLevels bins need nLevels + 1 edges.
            binsLevels = np.linspace(np.min(timeSeries),np.max(timeSeries),nLevels + 1)
            c,m,p = computeGeneralPredictability(timeSeries,binsLevels,lenWindow)
            rows.append([expName,localidad,lenWindow,nLevels,p,m,c])

    if not rows:
        return np.empty((0, 7), dtype=str)
    # Mixed str/number rows are converted to a single string array, as before.
    return np.array(rows)
        
    

In [None]:
def table_predictability_report(df_agressiveBehavior,lenWindow,localidadesList,levelCategories,name_experiment):
    """Draw and save a heatmap of predictability per localidad and crime level.

    Fix relative to the previous version: `DataFrame.pivot` is called with
    keyword arguments, because positional arguments are rejected by
    pandas >= 2.0. The never-taken row-reordering branch was removed.

    Parameters
    ----------
    df_agressiveBehavior : pandas.DataFrame
        Needs the columns 'localidad', 'crime_level' and 'predictability';
        the values must be convertible to float.
    lenWindow
        Only used in the output file name.
    localidadesList
        Unused; kept so existing calls keep working.
    levelCategories : list
        Column order of the heatmap; also embedded in the output file name.
    name_experiment
        Label embedded in the output file name.
    """
    join = df_agressiveBehavior.pivot(index='localidad', columns='crime_level', values='predictability')
    if len(levelCategories) > 0:
        join = join.reindex(levelCategories, axis=1)

    fig, ax = plt.subplots(1,1,sharex=True, sharey=True)
    fig.set_size_inches(7, 6)
    g = sns.heatmap(join.astype('float'),annot=True,fmt=".1%",linewidths=0,cmap="Blues",cbar=False,ax=ax)
    g.set_yticklabels(g.get_yticklabels(), rotation = 0)
    # NOTE(review): widening the y-limits by half a cell looks like the usual
    # workaround for heatmap rows being clipped in matplotlib 3.1.1 -- confirm
    # it is still needed with the installed version.
    bottom, top = ax.get_ylim()
    ax.set_ylim(bottom + 0.5, top - 0.5)
    file_path = '/Users/anamaria/Desktop/dev/security_project/periodicity_experiments/predictability/figures/'
    plt.savefig(file_path+'table_aggressiveBehavior_localidades'+str(name_experiment)+'_predictability_time_'+str(lenWindow)+'_levels_'+str(levelCategories),dpi=300,bbox_inches = "tight")
    plt.show()

In [None]:
def table_constancy_report(df_agressiveBehavior,lenWindow,localidadesList,levelCategories,name_experiment):
    """Draw and save a heatmap of constancy per localidad and crime level.

    Fix relative to the previous version: `DataFrame.pivot` is called with
    keyword arguments, because positional arguments are rejected by
    pandas >= 2.0. The never-taken row-reordering branch was removed.

    Parameters
    ----------
    df_agressiveBehavior : pandas.DataFrame
        Needs the columns 'localidad', 'crime_level' and 'constancy';
        the values must be convertible to float.
    lenWindow
        Only used in the output file name.
    localidadesList
        Unused; kept so existing calls keep working.
    levelCategories : list
        Column order of the heatmap; also embedded in the output file name.
    name_experiment
        Label embedded in the output file name.
    """
    join = df_agressiveBehavior.pivot(index='localidad', columns='crime_level', values='constancy')
    if len(levelCategories) > 0:
        join = join.reindex(levelCategories, axis=1)

    fig, ax = plt.subplots(1,1,sharex=True, sharey=True)
    fig.set_size_inches(7, 6)
    g = sns.heatmap(join.astype('float'),annot=True,fmt=".1%",linewidths=0,cmap="Blues",cbar=False,ax=ax)
    g.set_yticklabels(g.get_yticklabels(), rotation = 0)
    # NOTE(review): widening the y-limits by half a cell looks like the usual
    # workaround for heatmap rows being clipped in matplotlib 3.1.1 -- confirm
    # it is still needed with the installed version.
    bottom, top = ax.get_ylim()
    ax.set_ylim(bottom + 0.5, top - 0.5)
    file_path = '/Users/anamaria/Desktop/dev/security_project/periodicity_experiments/predictability/figures/'
    plt.savefig(file_path+'table_aggressiveBehavior_localidades'+str(name_experiment)+'_constancy_time_'+str(lenWindow)+'_levels_'+str(levelCategories),dpi=300,bbox_inches = "tight")
    plt.show()

In [None]:
def map_localidad(ax,df,col_localidad,col_vals,vmin=None,vmax=None):
    """Plot a choropleth of `col_vals` over the localidad polygons on `ax`.

    The polygon file is read on every call and joined to `df` by matching its
    'LocNombre' field against `df[col_localidad]`. Polygons without a match
    are dropped by the default (inner) merge.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on.
    df : pandas.DataFrame
        Values to map, one row per localidad.
    col_localidad : str
        Column of `df` holding the localidad names.
    col_vals : str
        Column of `df` that drives the colour scale.
    vmin, vmax : optional
        Colour-scale limits forwarded to the plot call.
    """
    polygons_path = "/Users/anamaria/Desktop/dev/security_project/assets/localidades_polygon.json"
    polygons = gpd.read_file(polygons_path)
    merged = polygons.merge(df, left_on='LocNombre', right_on=col_localidad)
    merged.plot(column=col_vals, cmap='viridis', legend=True, ax=ax, vmin=vmin, vmax=vmax)

In [None]:
def map_predictability(df_crime, crime_level, lenWindow,name_experiment):
    """Draw and save a map of predictability for one crime level and window.

    Fix relative to the previous version: the filtered rows are copied before
    the 'predictability' column is converted, so the assignment no longer
    targets a slice of `df_crime` (pandas' SettingWithCopyWarning).

    Parameters
    ----------
    df_crime : pandas.DataFrame
        Needs the columns 'crime_level', 'lenWindow', 'localidad' and
        'predictability'. 'lenWindow' is compared against `str(lenWindow)`,
        so it is expected to hold strings.
    crime_level
        Value of 'crime_level' to keep.
    lenWindow
        Window length to keep; also embedded in the output file name.
    name_experiment
        Label embedded in the output file name.
    """
    wanted = (df_crime['crime_level'] == crime_level) & (df_crime['lenWindow'] == str(lenWindow))
    subdata = df_crime[wanted].copy()
    subdata["predictability"] = pd.to_numeric(subdata["predictability"])

    fig, ax = plt.subplots(figsize=(12,12))
    map_localidad(ax,subdata,'localidad','predictability')
    ax.axis('off')
    file_path = '/Users/anamaria/Desktop/dev/security_project/periodicity_experiments/predictability/figures/'
    plt.savefig(file_path+'map_aggressiveBehavior_localidades'+str(name_experiment)+'_predictability_time_'+str(lenWindow)+'_levels_'+str(crime_level),dpi=300,bbox_inches = "tight")
    plt.show()

## Load data

In [None]:
# Source CSV; each row is counted as one event by the aggregation that follows.
# NOTE(review): absolute local path -- the notebook only runs on this machine.
data_location = '/Users/anamaria/Desktop/dev/security_project/datasets/verify_enrich_nuse_29112019.csv'
df_input = pd.read_csv(data_location, sep=",")

In [None]:
# Parse the 'FECHA' column into a datetime 'date' column, then count the rows
# of every (date, LOCALIDAD) pair as 'total_eventos'.
df_input['date'] = pd.to_datetime(df_input['FECHA'])
df_by_date = (
    df_input
    .groupby(['date', 'LOCALIDAD'])
    .size()
    .to_frame(name="total_eventos")
)

In [None]:
# Keep only 'date' as the index; 'LOCALIDAD' becomes a regular column.
df_by_date = (
    df_by_date
    .reset_index()
    .set_index('date')
)

## Experiment to validate H1

In [None]:
# Numbers of levels to discretise the series into, and their string form
# (used as heatmap column labels).
Levels = [3, 5, 10]
levelCategories = [str(level) for level in Levels]

# Every localidad present in the data except the 'SIN LOCALIZACION' placeholder.
localidadesList = list(df_by_date.LOCALIDAD.unique())
localidadesList.remove('SIN LOCALIZACION')

# Cycle lengths in days; lT = 756 samples is a multiple of each of them.
timeWindows = [7, 14, 28, 84]
lT = 756
name_experiment = '_normalizadas'

In [None]:
for lenWindow in timeWindows:
    predValues = predictability_experiment(df_by_date,lenWindow,localidadesList,Levels,lT)
    df_prediction = pd.DataFrame(predValues, columns=['crime_type', 'localidad','lenWindow','crime_level','predictability','contingency','constancy'])
    df_agressiveBehavior = df_prediction[df_prediction['crime_type']=='aggressiveBehavior']
    table_predictability_report(df_agressiveBehavior,lenWindow,localidadesList,levelCategories,name_experiment)
    crime_level = levelCategories[-1]