# Experiment description
## Hypothesis: 
We can predict with an average hitrate higher than 0.1 (10% coverage) the aggressive behavior occurrence over a week.

## Method: 

Estimate hitrates over predictions between 2017-01-22 and 2017-01-29. Experiment parameters:
- Use city trained model between 2017-01-01 and 2017-01-21
- Predict for each day in the following hours: [3, 9, 15, 21]
- Estimate hitrate and PAI using 5% and 10% coverage
- Estimate average PAI

## Results:
- Time elapsed experiment: 56 hours aprox
- Hitrates were lower than 0.1 in all predictions.
- average PAI 5%:  0.5338693700851119
- average PAI 10%:  0.47119505453105576

In [None]:
%matplotlib inline
import pandas as pd
import pickle
import dateutil.parser
import pyproj
import open_cp
from PIL import Image
import datetime
import matplotlib.pyplot as plt
import numpy as np

In [None]:
import import_ipynb
import training_sepp_builtin

In [None]:
def selectDataPrediction(nameLoc,dateIni,dateEnd):
    if nameLoc == 'city':
        dfloc = df
    else:
        dfloc = df[df["LOCALIDAD"]==nameLoc]
            
    dfloc["FECHA_HORA"]=dfloc["HORA"].astype(str)
    valHour = dfloc["FECHA_HORA"].values
    valDate = dfloc["FECHA"].values
    timesVals = [];
    k = 0;
    for i in valHour:
        if len(i)<=2:
            timeStr = valDate[k] +" " + "00:"+i+":00"
        else:
            timeStr = valDate[k] +" " + i[:-2]+":"+i[-2:]+":00"
        
        k = k + 1

        timesVals.append(timeStr)
    dfloc["FECHA_HORA"] = timesVals;
    
    dfloc["FECHA_HORA_TS"] = dfloc.FECHA_HORA.map(dateutil.parser.parse)
    dfFilter = dfloc[(dfloc.FECHA_HORA_TS > dateIni) & (dfloc.FECHA_HORA_TS < dateEnd)]
    timestamps = dfFilter.FECHA_HORA_TS
    
    print("TimeStamps")
    print(type(timestamps))
    print([timestamps])

    xcoords, ycoords = (dfFilter.LONGITUD.values,dfFilter.LATITUD.values)
    proj = pyproj.Proj(init="EPSG:3116")
    xcoords, ycoords = proj(xcoords,ycoords)
    
    points_crime = open_cp.TimedPoints.from_coords(timestamps, xcoords, ycoords)
    
    #Generando Grilla para los algoritmos. 
    maxx = max(xcoords)
    minx = min(xcoords)
    maxy = max(ycoords)
    miny = min(ycoords)
    #print(maxx,minx,maxy,miny)

    region = open_cp.RectangularRegion(xmin=minx, xmax=maxx, ymin=miny, ymax=maxy)
    #print(region)
    
    return (points_crime,region)


In [None]:
def load_model(localidad, experiment_name):
    custom_path = path+'/aggressive_behavior_model/SEPP/'+'pkl/TrainRina_'+experiment_name+'.pkl'
    infile = open(custom_path,'rb')
    modelCrime = pickle.load(infile)
    infile.close()
    return modelCrime

In [None]:
def run_prediction(localidad,modelCrime,time_window_prediction,predict_time):
    points_crime,region = selectDataPrediction(localidad,time_window_prediction['start'],time_window_prediction['end'])
    modelCrime.data = points_crime 
    prediction = modelCrime.predict(predict_time)
    gridpred = open_cp.predictors.GridPredictionArray.from_continuous_prediction_region(prediction, region, 150, 150)
    hitrates = open_cp.evaluation.hit_rates(gridpred, points_crime, [0.05,0.1])
    return gridpred, hitrates

In [None]:
def plot_gridpred(localidad,predict_time,gridpred):
    fig, ax = plt.subplots(figsize=(20,10))
    m = ax.pcolormesh(*gridpred.mesh_data(), gridpred.intensity_matrix, cmap="CMRmap_r")
    ax.set_title("Predicción localidad: "+localidad+'; fecha: '+str(predict_time))
    fig.colorbar(m, ax=ax)

## Select data

In [None]:
path = '/Users/anamaria/Desktop/dev/security_project'

In [None]:
df = pd.read_csv(path+'/datasets/verify_enrich_nuse_29112019.csv')

## Test prediction

In [None]:
localidad = 'city'
parameters = {"time_bw":144,"space_bw":50,"time_cutoff":90,"space_cutoff":500}
train_initial_date = '2017-01-01'
train_final_date = '2017-01-22'
trained_model_name = str(localidad)+'_'+train_initial_date+'_'+train_final_date+'_time_cutoff_'+str(parameters['time_cutoff'])

In [None]:
initial_date_prediction = '2017-01-22'
final_date_prediction = '2017-01-29'

In [None]:
dates_to_predict = []
for i in range(7):
    current_datetime = datetime.datetime.strptime(initial_date_prediction,'%Y-%m-%d')+datetime.timedelta(days=i)
    dates_to_predict.append(current_datetime.strftime('%Y-%m-%d'))
    
dates_to_predict

In [None]:
flagF = True
modelCrime = load_model(localidad,trained_model_name)

for initial_date in dates_to_predict:
    hours_timedelta = [3, 9, 15, 21]
    end_date = datetime.datetime.strptime(initial_date,'%Y-%m-%d')+datetime.timedelta(days=1)
    end_date = end_date.strftime('%Y-%m-%d')
    time_window_prediction = {'start':initial_date,'end':end_date}

    for hour_value in hours_timedelta:
        current_date_prediction = time_window_prediction['start']
        predict_time = datetime.datetime.strptime(current_date_prediction,'%Y-%m-%d')+datetime.timedelta(hours=hour_value)
        gridpred, hitrates = run_prediction(localidad,modelCrime,time_window_prediction,predict_time)   
        plot_gridpred(localidad,predict_time,gridpred) ## uncomment to plot gridpred
        predict_time = predict_time.strftime('%Y-%m-%d %H:%M:%S')
        if flagF==True:
            flagF = False
            hitrates_values = np.array([localidad,predict_time,hitrates[0.05],hitrates[0.1]]);
        else:
            hitrates_values = np.vstack((hitrates_values, [localidad,predict_time,hitrates[0.05],hitrates[0.1]]))


In [None]:
df_hitrates = pd.DataFrame(hitrates_values, columns=['localidad','prediction_time','hitrate_0.05','hitrate_0.1'])

### Estimate PAI

In [None]:
df_hitrates['PAI_0.05']=pd.to_numeric(df_hitrates['hitrate_0.05'])/0.05
df_hitrates['PAI_0.1']=pd.to_numeric(df_hitrates['hitrate_0.1'])/0.1

In [None]:
df_hitrates

In [None]:
experiment_name = 'city_hitrates_2017-01-22_2017-01-28'
hitrates_outfile = open(path+'/aggressive_behavior_model/SEPP/'+'pkl/PredictionRina_'+experiment_name+'.pkl','wb')
pickle.dump(df_hitrates, hitrates_outfile)
hitrates_outfile.close()

In [None]:
print('average PAI 5%: ', df_hitrates['PAI_0.05'].mean())
print('average PAI 10%: ', df_hitrates['PAI_0.1'].mean())

## Processing results

In [79]:
experiment_name = 'city_hitrates_2017-01-22_2017-01-28'
custom_path = path+'/aggressive_behavior_model/SEPP/'+'pkl/PredictionRina_'+experiment_name+'.pkl'
infile = open(custom_path,'rb')
df_hitrates = pickle.load(infile)
infile.close()

In [80]:
df_hitrates['hitrate_0.05']=pd.to_numeric(df_hitrates['hitrate_0.05'])
df_hitrates['hitrate_0.1']=pd.to_numeric(df_hitrates['hitrate_0.1'])
df_hitrates['PAI_0.05']=pd.to_numeric(df_hitrates['PAI_0.05'])
df_hitrates['PAI_0.1']=pd.to_numeric(df_hitrates['PAI_0.1'])
df_hitrates['hitrate_0.05']=df_hitrates['hitrate_0.05'].round(4)
df_hitrates['hitrate_0.1']=df_hitrates['hitrate_0.1'].round(4)
df_hitrates['PAI_0.05']=df_hitrates['PAI_0.05'].round(4)
df_hitrates['PAI_0.1']=df_hitrates['PAI_0.1'].round(4)

In [81]:
df_hitrates['date']=[d[0:10] for d in df_hitrates['prediction_time']]
df_hitrates['hour']=[d[11:16] for d in df_hitrates['prediction_time']]

In [84]:
import plotly.express as px
fig = px.scatter(df_hitrates, x="date", y="PAI_0.1", color="hour",size="PAI_0.1")
fig.show()

### Average by date

In [52]:
flagF=True

for date in dates_to_predict:
    df_group = df_prueba[df_prueba['prediction_time'].str.contains(date)]
    hitrate_005 = df_group['hitrate_0.05'].mean().round(4)
    hitrate_01 = df_group['hitrate_0.1'].mean().round(4)
    PAI_005 = df_group['PAI_0.05'].mean().round(4)
    PAI_01 = df_group['PAI_0.1'].mean().round(4)
    if flagF==True:
        flagF = False
        summarize_values = np.array([date,hitrate_005,hitrate_01,PAI_005,PAI_01]);
    else:
        summarize_values = np.vstack((summarize_values, [date,hitrate_005,hitrate_01,PAI_005,PAI_01]))


In [53]:
df_summarize = pd.DataFrame(summarize_values, columns=['prediction_time','hitrate_0.05','hitrate_0.1','PAI_0.05','PAI_0.1'])

In [54]:
df_summarize

Unnamed: 0,prediction_time,hitrate_0.05,hitrate_0.1,PAI_0.05,PAI_0.1
0,2017-01-22,0.0167,0.0359,0.3338,0.3595
1,2017-01-23,0.0299,0.0468,0.5985,0.4678
2,2017-01-24,0.0336,0.0556,0.673,0.5558
3,2017-01-25,0.0266,0.0483,0.532,0.483
4,2017-01-26,0.0291,0.0534,0.5818,0.5345
5,2017-01-27,0.026,0.045,0.52,0.4498
6,2017-01-28,0.0249,0.0448,0.4985,0.4478


### Average by hour

In [57]:
hours = ['03:00:00','09:00:00','15:00:00','21:00:00']

In [58]:
flagF=True

for hour in hours:
    df_group = df_prueba[df_prueba['prediction_time'].str.contains(hour)]
    hitrate_005 = df_group['hitrate_0.05'].mean().round(4)
    hitrate_01 = df_group['hitrate_0.1'].mean().round(4)
    PAI_005 = df_group['PAI_0.05'].mean().round(4)
    PAI_01 = df_group['PAI_0.1'].mean().round(4)
    if flagF==True:
        flagF = False
        summarize_values = np.array([hour,hitrate_005,hitrate_01,PAI_005,PAI_01]);
    else:
        summarize_values = np.vstack((summarize_values, [hour,hitrate_005,hitrate_01,PAI_005,PAI_01]))


In [59]:
df_summarize = pd.DataFrame(summarize_values, columns=['prediction_time','hitrate_0.05','hitrate_0.1','PAI_0.05','PAI_0.1'])

In [60]:
df_summarize

Unnamed: 0,prediction_time,hitrate_0.05,hitrate_0.1,PAI_0.05,PAI_0.1
0,03:00:00,0.0247,0.0362,0.4943,0.3623
1,09:00:00,0.0248,0.0384,0.4973,0.3837
2,15:00:00,0.0291,0.0537,0.582,0.5371
3,21:00:00,0.0281,0.0601,0.5621,0.6014
