# Experiment description
## Hypothesis: 
SEPP prediction performance is better for localidades with higher predictability values (Colwell's index; see periodicity_experiments/experiment_08). Hit rates should therefore be higher in localidades with higher predictability values.

## Method: 

- Train SEPP model for localidades with high and low predictability index:
    - Use dynamic data to train model size (200,600)
- Evaluate model predictions using the hit-rate measure:
    - Use traditional coverage values [0.01: 0.1]
    - Compute predictions for 3am, 9am, 3pm, 9pm.
- Compare hit rates between localidades with high vs. low predictability values.


In [50]:
%matplotlib inline
import datetime
import os
import pickle

import dateutil.parser
import matplotlib.pyplot as plt
import numpy as np
import open_cp
import pandas as pd
import pyproj
from PIL import Image

In [2]:
import import_ipynb
import training_sepp_builtin

importing Jupyter notebook from training_sepp_builtin.ipynb
Collecting https://github.com/QuantCrimAtLeeds/PredictCode/zipball/master
  Using cached https://github.com/QuantCrimAtLeeds/PredictCode/zipball/master
[33mYou are using pip version 18.1, however version 20.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Collecting https://github.com/MatthewDaws/SEPP/zipball/master
  Using cached https://github.com/MatthewDaws/SEPP/zipball/master
[33mYou are using pip version 18.1, however version 20.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


Failed to import `rtree`.
Failed to import `rtree`.


In [20]:
def selectData(nameLoc,dateIni,dateEnd):
    """Build the timed event points and bounding region for one localidad.

    Reads the module-level ``df`` and keeps the rows whose ``LOCALIDAD``
    equals ``nameLoc``. A timestamp is assembled from the ``FECHA`` and
    ``HORA`` columns, and only events strictly between ``dateIni`` and
    ``dateEnd`` (both bounds exclusive) are kept.

    Parameters
    ----------
    nameLoc : value compared against ``df["LOCALIDAD"]``.
    dateIni, dateEnd : lower / upper bound compared against the parsed
        timestamps (the notebook passes ``'YYYY-MM-DD'`` strings).

    Returns
    -------
    (points_crime, region) : an ``open_cp.TimedPoints`` built from the
        filtered events and the ``open_cp.RectangularRegion`` spanning the
        min/max of their projected coordinates.

    Raises
    ------
    ValueError
        If no event falls inside the requested window. The original code
        raised the same exception type, with a less helpful message, when
        calling ``max()`` on the empty coordinate array.
    """
    # Work on an independent copy: adding columns to a filtered slice of `df`
    # is chained assignment and triggers pandas' SettingWithCopyWarning.
    dfloc = df[df["LOCALIDAD"]==nameLoc].copy()

    # HORA is stringified and split by length: values of one or two characters
    # are treated as minutes past midnight; otherwise the last two characters
    # are the minutes and the leading characters the hour.
    hour_strings = dfloc["HORA"].astype(str).values
    date_strings = dfloc["FECHA"].values
    timesVals = []
    for date_str, hour_str in zip(date_strings, hour_strings):
        if len(hour_str)<=2:
            timeStr = date_str +" " + "00:"+hour_str+":00"
        else:
            timeStr = date_str +" " + hour_str[:-2]+":"+hour_str[-2:]+":00"
        timesVals.append(timeStr)
    dfloc["FECHA_HORA"] = timesVals

    dfloc["FECHA_HORA_TS"] = dfloc.FECHA_HORA.map(dateutil.parser.parse)
    dfFilter = dfloc[(dfloc.FECHA_HORA_TS > dateIni) & (dfloc.FECHA_HORA_TS < dateEnd)]
    if dfFilter.empty:
        raise ValueError(
            "No events for localidad " + repr(nameLoc)
            + " between " + str(dateIni) + " and " + str(dateEnd)
        )
    timestamps = dfFilter.FECHA_HORA_TS

    # Project longitude/latitude into EPSG:3116 planar coordinates.
    # NOTE(review): the `init=` form is deprecated in pyproj >= 2; confirm the
    # installed version before switching to another constructor.
    xcoords, ycoords = (dfFilter.LONGITUD.values,dfFilter.LATITUD.values)
    proj = pyproj.Proj(init="EPSG:3116")
    xcoords, ycoords = proj(xcoords,ycoords)

    points_crime = open_cp.TimedPoints.from_coords(timestamps, xcoords, ycoords)

    # Build the bounding rectangle that serves as the grid region for the algorithms.
    maxx = max(xcoords)
    minx = min(xcoords)
    maxy = max(ycoords)
    miny = min(ycoords)

    region = open_cp.RectangularRegion(xmin=minx, xmax=maxx, ymin=miny, ymax=maxy)

    return (points_crime,region)


In [10]:
def load_model(localidad, train_date):
    """Load the pickled model saved for ``localidad`` and ``train_date``.

    The file name follows the pattern used by the training cells:
    ``<path>/aggressive_behavior_model/SEPP/pkl/TrainRina_<localidad>_<train_date>.pkl``,
    where ``path`` is the module-level project root.

    Parameters
    ----------
    localidad : localidad name; converted with ``str()`` for the file name.
    train_date : str, the date suffix used when the model was saved.

    Returns
    -------
    The unpickled object.

    Raises
    ------
    OSError (e.g. FileNotFoundError) if the pickle file cannot be opened.
    """
    custom_path = path+'/aggressive_behavior_model/SEPP/'+'pkl/TrainRina_'+str(localidad)+'_'+train_date+'.pkl'
    # SECURITY: pickle.load can execute arbitrary code; only load files that
    # were produced by this project's own training cells.
    # The context manager closes the file even when unpickling fails.
    with open(custom_path,'rb') as infile:
        return pickle.load(infile)

In [64]:
def run_prediction(localidad,modelCrime,predict_time,coverages=(0.01,0.1),date_ini=None,date_end=None):
    """Predict with a trained model and score the prediction with hit rates.

    Parameters
    ----------
    localidad : localidad name forwarded to ``selectData``.
    modelCrime : trained predictor; its ``data`` attribute is overwritten
        with the events of the evaluation window before predicting.
    predict_time : time passed to ``modelCrime.predict``.
    coverages : iterable of coverage levels passed to
        ``open_cp.evaluation.hit_rates``. Defaults to ``(0.01, 0.1)``, the
        values previously hard-coded.
        NOTE(review): open_cp documents this argument as *percentage*
        coverages; confirm whether 0.01 / 0.1 are meant as 1% / 10%.
    date_ini, date_end : bounds of the evaluation window. When ``None`` they
        fall back to the module-level ``initial_date_prediction`` and
        ``final_date_prediction``, which was the original behaviour.

    Returns
    -------
    (gridpred, hitrates) : the gridded prediction and the object returned by
        ``hit_rates`` (indexed by coverage level by the calling cell).
    """
    # Resolve the window lazily so the globals are only required when the
    # caller does not supply explicit bounds.
    if date_ini is None:
        date_ini = initial_date_prediction
    if date_end is None:
        date_end = final_date_prediction

    points_crime,region = selectData(localidad,date_ini,date_end)
    modelCrime.data = points_crime
    prediction = modelCrime.predict(predict_time)
    # 150, 150 are passed through unchanged from the original call
    # (presumably the grid cell width/height -- TODO confirm against open_cp).
    gridpred = open_cp.predictors.GridPredictionArray.from_continuous_prediction_region(prediction, region, 150, 150)
    hitrates = open_cp.evaluation.hit_rates(gridpred, points_crime, list(coverages))
    return gridpred, hitrates

In [62]:
def plot_gridpred(localidad,predict_time,gridpred):
    """Draw the intensity matrix of a gridded prediction as a colour mesh.

    Creates a new 20x10 figure, plots ``gridpred.intensity_matrix`` over the
    coordinates returned by ``gridpred.mesh_data()``, titles it with the
    localidad and prediction time, and attaches a colour bar. Returns nothing.
    """
    figure, axes = plt.subplots(figsize=(20,10))
    mesh_coords = gridpred.mesh_data()
    intensity_mesh = axes.pcolormesh(
        *mesh_coords,
        gridpred.intensity_matrix,
        cmap="CMRmap_r",
    )
    title_text = "Predicción localidad: "+localidad+'; fecha: '+str(predict_time)
    axes.set_title(title_text)
    figure.colorbar(intensity_mesh, ax=axes)

## Select data

In [3]:
# Project root used to build every dataset and pickle path in this notebook.
# Set the SECURITY_PROJECT_PATH environment variable to run on another
# machine; the original hard-coded location remains the default.
path = os.environ.get('SECURITY_PROJECT_PATH', '/Users/anamaria/Desktop/dev/security_project')

In [4]:
# Load the events CSV into the global `df`, which selectData and the
# training cells read directly.
df = pd.read_csv(path+'/datasets/verify_enrich_nuse_29112019.csv')

In [5]:
# Localidades compared in this experiment, grouped by predictability
# (the hypothesis at the top of the notebook refers to Colwell's index).
higher_predictability_localidades = ['CIUDAD BOLIVAR', 'BOSA', 'USME', 'SAN CRISTOBAL', 'RAFAEL URIBE URIBE']
lower_predictability_localidades = ['TEUSAQUILLO', 'CHAPINERO']

## Train datasets

In [6]:
# Date handed to trainModel_3 and embedded in each saved pickle's file name
# (presumably the last day of training data -- confirm in training_sepp_builtin).
final_date = '2017-01-30'

In [None]:
# Train one model per high-predictability localidad and pickle it under the
# file-name pattern that load_model reads back.
for localidad in higher_predictability_localidades:
    print(localidad)
    localidad_predictor = training_sepp_builtin.trainModel_3(df, localidad, final_date)
    # The context manager closes the file even if pickling raises.
    with open(path+'/aggressive_behavior_model/SEPP/'+'pkl/TrainRina_'+str(localidad)+'_'+final_date+'.pkl','wb') as localidad_outfile:
        pickle.dump(localidad_predictor, localidad_outfile)

In [None]:
# Train one model per low-predictability localidad and pickle it under the
# file-name pattern that load_model reads back.
for localidad in lower_predictability_localidades:
    print(localidad)
    localidad_predictor = training_sepp_builtin.trainModel_3(df, localidad, final_date)
    # The context manager closes the file even if pickling raises.
    with open(path+'/aggressive_behavior_model/SEPP/'+'pkl/TrainRina_'+str(localidad)+'_'+final_date+'.pkl','wb') as localidad_outfile:
        pickle.dump(localidad_predictor, localidad_outfile)

## Test prediction

In [56]:
# Date suffix of the pickled models to load; load_model builds the file name
# from it, so it has to match the value used when the models were saved.
train_date = '2017-01-30'
# Evaluation window handed to selectData, which keeps events strictly between
# these two bounds. The first date is also the base day for the prediction hours.
initial_date_prediction = '2017-01-31'
final_date_prediction = '2017-02-01'

In [None]:
# Run the prediction for every localidad at 03:00, 09:00, 15:00 and 21:00 of
# the first prediction day and collect the hit rates for both coverage levels.
localidades_list = higher_predictability_localidades+lower_predictability_localidades
hours_timedelta = [3, 9, 15, 21]
# Midnight of the prediction day; loop-invariant, so it is parsed only once.
prediction_day = datetime.datetime.strptime(initial_date_prediction,'%Y-%m-%d')
# Rows are gathered in a plain list and converted once at the end: stacking
# mixed str/float rows with np.array/np.vstack coerced every hit rate to a
# string and re-copied the whole array on each iteration.
hitrate_rows = []
for localidad in localidades_list:
    print(localidad)
    for hour_value in hours_timedelta:
        modelCrime = load_model(localidad,train_date)
        predict_time = prediction_day+datetime.timedelta(hours=hour_value)
        gridpred, hitrates = run_prediction(localidad,modelCrime,predict_time)
        #plot_gridpred(localidad,predict_time,gridpred) ## uncomment to plot gridpred
        predict_time = predict_time.strftime('%Y-%m-%d %H:%M:%S')
        hitrate_rows.append([localidad,predict_time,hitrates[0.01],hitrates[0.1]])

# Object dtype keeps the 2-D array layout while preserving the names as str
# and the hit rates as numbers.
hitrates_values = np.array(hitrate_rows, dtype=object)


In [76]:
# Tabulate the collected hit rates. The rate columns are cast to float
# explicitly: when `hitrates_values` is an array built from mixed str/float
# rows, NumPy stores every entry as a string, which would make numeric
# comparisons between localidades silently wrong.
df_hitrates = (
    pd.DataFrame(hitrates_values, columns=['localidad','prediction_time','hitrate_0.01','hitrate_0.1'])
    .astype({'hitrate_0.01': float, 'hitrate_0.1': float})
)
df_hitrates

Unnamed: 0,localidad,prediction_time,hitrate_0.01,hitrate_0.1
0,CIUDAD BOLIVAR,2017-01-31 03:00:00,0.0,0.0519480519480519
1,CIUDAD BOLIVAR,2017-01-31 09:00:00,0.0,0.0389610389610389
2,CIUDAD BOLIVAR,2017-01-31 15:00:00,0.0,0.0389610389610389
3,CIUDAD BOLIVAR,2017-01-31 21:00:00,0.0,0.0389610389610389
4,BOSA,2017-01-31 03:00:00,0.0,0.0161290322580645
5,BOSA,2017-01-31 09:00:00,0.0,0.0161290322580645
6,BOSA,2017-01-31 15:00:00,0.0,0.0161290322580645
7,BOSA,2017-01-31 21:00:00,0.0,0.032258064516129
8,USME,2017-01-31 03:00:00,0.0,0.0
9,USME,2017-01-31 09:00:00,0.0,0.2424242424242424
