# Experiment description
## Hypothesis: 
We can predict with an average hitrate higher than 0.1 (10% coverage) the aggressive behavior occurrence over a week at locality scope.

## Method: 

Estimate hitrates over predictions between 2017-01-22 and 2017-01-29. Experiment parameters:
- Use localidades trained model between 2017-01-01 and 2017-01-21
- Predict for each day in the following hours: [3, 9, 15, 21]
- Estimate hitrate and PAI using 5% and 10% coverage
- Estimate average PAI

## Results:
- Time elapsed experiment: 3 hours
- Hitrates were higher than 0.1 in some predictions.
- average PAI 5%:  0.08242157347447497
- average PAI 10%:  0.12990330529277822

In [1]:
%matplotlib inline
import pandas as pd
import pickle
import dateutil.parser
import pyproj
import open_cp
from PIL import Image
import datetime
import matplotlib.pyplot as plt
import numpy as np

In [2]:
import import_ipynb
import training_sepp_builtin

importing Jupyter notebook from training_sepp_builtin.ipynb
Collecting https://github.com/QuantCrimAtLeeds/PredictCode/zipball/master
  Using cached https://github.com/QuantCrimAtLeeds/PredictCode/zipball/master
[33mYou are using pip version 18.1, however version 20.1b1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Collecting https://github.com/MatthewDaws/SEPP/zipball/master
  Using cached https://github.com/MatthewDaws/SEPP/zipball/master
[33mYou are using pip version 18.1, however version 20.1b1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


Failed to import `rtree`.
Failed to import `rtree`.


In [3]:
def selectDataPrediction(nameLoc,dateIni,dateEnd):
    if nameLoc == 'city':
        dfloc = df
    else:
        dfloc = df[df["LOCALIDAD"]==nameLoc]
            
    dfloc["FECHA_HORA"]=dfloc["HORA"].astype(str)
    valHour = dfloc["FECHA_HORA"].values
    valDate = dfloc["FECHA"].values
    timesVals = [];
    k = 0;
    for i in valHour:
        if len(i)<=2:
            timeStr = valDate[k] +" " + "00:"+i+":00"
        else:
            timeStr = valDate[k] +" " + i[:-2]+":"+i[-2:]+":00"
        
        k = k + 1

        timesVals.append(timeStr)
    dfloc["FECHA_HORA"] = timesVals;
    
    dfloc["FECHA_HORA_TS"] = dfloc.FECHA_HORA.map(dateutil.parser.parse)
    dfFilter = dfloc[(dfloc.FECHA_HORA_TS > dateIni) & (dfloc.FECHA_HORA_TS < dateEnd)]
    timestamps = dfFilter.FECHA_HORA_TS
    
    print("TimeStamps")
    print(type(timestamps))
    print([timestamps])

    xcoords, ycoords = (dfFilter.LONGITUD.values,dfFilter.LATITUD.values)
    proj = pyproj.Proj(init="EPSG:3116")
    xcoords, ycoords = proj(xcoords,ycoords)
    
    points_crime = open_cp.TimedPoints.from_coords(timestamps, xcoords, ycoords)
    
    #Generando Grilla para los algoritmos. 
    maxx = max(xcoords)
    minx = min(xcoords)
    maxy = max(ycoords)
    miny = min(ycoords)
    #print(maxx,minx,maxy,miny)

    region = open_cp.RectangularRegion(xmin=minx, xmax=maxx, ymin=miny, ymax=maxy)
    #print(region)
    
    return (points_crime,region)


In [18]:
def load_model(experiment_name):
    custom_path = path+'/aggressive_behavior_model/SEPP/'+'pkl/TrainRina_'+experiment_name+'.pkl'
    infile = open(custom_path,'rb')
    modelCrime = pickle.load(infile)
    infile.close()
    return modelCrime

In [5]:
def run_prediction(localidad,modelCrime,time_window_prediction,predict_time):
    points_crime,region = selectDataPrediction(localidad,time_window_prediction['start'],time_window_prediction['end'])
    modelCrime.data = points_crime 
    prediction = modelCrime.predict(predict_time)
    gridpred = open_cp.predictors.GridPredictionArray.from_continuous_prediction_region(prediction, region, 150, 150)
    hitrates = open_cp.evaluation.hit_rates(gridpred, points_crime, [0.05,0.1])
    return gridpred, hitrates

In [6]:
def plot_gridpred(localidad,predict_time,gridpred):
    fig, ax = plt.subplots(figsize=(20,10))
    m = ax.pcolormesh(*gridpred.mesh_data(), gridpred.intensity_matrix, cmap="CMRmap_r")
    ax.set_title("Predicción localidad: "+localidad+'; fecha: '+str(predict_time))
    fig.colorbar(m, ax=ax)

## Select data

In [7]:
path = '/Users/anamaria/Desktop/dev/security_project'

In [21]:
df = pd.read_csv(path+'/datasets/verify_enrich_nuse_29112019.csv')

In [26]:
train_initial_date = '2017-01-01'
train_final_date = '2017-01-22'

## Test prediction

In [22]:
initial_date_prediction = '2017-01-22'
final_date_prediction = '2017-01-29'

In [None]:
dates_to_predict = []
for i in range(7):
    current_datetime = datetime.datetime.strptime(initial_date_prediction,'%Y-%m-%d')+datetime.timedelta(days=i)
    dates_to_predict.append(current_datetime.strftime('%Y-%m-%d'))
    
dates_to_predict

In [30]:
localidades_list = list(df.LOCALIDAD.unique())

In [None]:
localidades_list.remove('CANDELARIA')
localidades_list.remove('PUENTE ARANDA')
localidades_list.remove('SIN LOCALIZACION')

In [None]:
flagF = True
for localidad in localidades_list:
    trained_model_name = 'localidad_'+str(localidad)+'_'+train_initial_date+'_'+train_final_date
    modelCrime = load_model(trained_model_name)

    for initial_date in dates_to_predict:
        hours_timedelta = [3, 9, 15, 21]
        end_date = datetime.datetime.strptime(initial_date,'%Y-%m-%d')+datetime.timedelta(days=1)
        end_date = end_date.strftime('%Y-%m-%d')
        time_window_prediction = {'start':initial_date,'end':end_date}

        for hour_value in hours_timedelta:
            current_date_prediction = time_window_prediction['start']
            predict_time = datetime.datetime.strptime(current_date_prediction,'%Y-%m-%d')+datetime.timedelta(hours=hour_value)
            gridpred, hitrates = run_prediction(localidad,modelCrime,time_window_prediction,predict_time)   
            plot_gridpred(localidad,predict_time,gridpred) ## uncomment to plot gridpred
            predict_time = predict_time.strftime('%Y-%m-%d %H:%M:%S')
            if flagF==True:
                flagF = False
                hitrates_values = np.array([localidad,predict_time,hitrates[0.05],hitrates[0.1]]);
            else:
                hitrates_values = np.vstack((hitrates_values, [localidad,predict_time,hitrates[0.05],hitrates[0.1]]))
    
    df_hitrates = pd.DataFrame(hitrates_values, columns=['localidad','prediction_time','hitrate_0.05','hitrate_0.1'])
    
    experiment_name = 'localidades_hitrates_2017-01-22_2017-01-28'
    hitrates_outfile = open(path+'/aggressive_behavior_model/SEPP/'+'pkl/PredictionRina_'+experiment_name+'.pkl','wb')
    pickle.dump(df_hitrates, hitrates_outfile)
    hitrates_outfile.close()


In [35]:
df_hitrates = pd.DataFrame(hitrates_values, columns=['localidad','prediction_time','hitrate_0.05','hitrate_0.1'])

In [36]:
df_hitrates

Unnamed: 0,localidad,prediction_time,hitrate_0.05,hitrate_0.1
0,ANTONIO NARIÑO,2017-01-22 03:00:00,0.0,0.0
1,ANTONIO NARIÑO,2017-01-22 09:00:00,0.0,0.0
2,ANTONIO NARIÑO,2017-01-22 15:00:00,0.0,0.0
3,ANTONIO NARIÑO,2017-01-22 21:00:00,0.0,0.0
4,ANTONIO NARIÑO,2017-01-23 03:00:00,0.0,0.0
...,...,...,...,...
471,SANTA FE,2017-01-27 21:00:00,0.0,0.0
472,SANTA FE,2017-01-28 03:00:00,0.0,0.0
473,SANTA FE,2017-01-28 09:00:00,0.0,0.0
474,SANTA FE,2017-01-28 15:00:00,0.0,0.0


### Estimate PAI

In [None]:
df_hitrates['PAI_0.05']=pd.to_numeric(df_hitrates['hitrate_0.05'])/0.05
df_hitrates['PAI_0.1']=pd.to_numeric(df_hitrates['hitrate_0.1'])/0.1

In [38]:
df_hitrates

Unnamed: 0,localidad,prediction_time,hitrate_0.05,hitrate_0.1,PAI_0.05,PAI_0.1
0,ANTONIO NARIÑO,2017-01-22 03:00:00,0.0,0.0,0.0,0.0
1,ANTONIO NARIÑO,2017-01-22 09:00:00,0.0,0.0,0.0,0.0
2,ANTONIO NARIÑO,2017-01-22 15:00:00,0.0,0.0,0.0,0.0
3,ANTONIO NARIÑO,2017-01-22 21:00:00,0.0,0.0,0.0,0.0
4,ANTONIO NARIÑO,2017-01-23 03:00:00,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
471,SANTA FE,2017-01-27 21:00:00,0.0,0.0,0.0,0.0
472,SANTA FE,2017-01-28 03:00:00,0.0,0.0,0.0,0.0
473,SANTA FE,2017-01-28 09:00:00,0.0,0.0,0.0,0.0
474,SANTA FE,2017-01-28 15:00:00,0.0,0.0,0.0,0.0


In [41]:
experiment_name = 'localidades_hitrates_2017-01-22_2017-01-28'
hitrates_outfile = open(path+'/aggressive_behavior_model/SEPP/'+'pkl/PredictionRina_'+experiment_name+'.pkl','wb')
pickle.dump(df_hitrates, hitrates_outfile)
hitrates_outfile.close()

In [39]:
print('average PAI 5%: ', df_hitrates['PAI_0.05'].mean())
print('average PAI 10%: ', df_hitrates['PAI_0.1'].mean())

average PAI 5%:  0.08242157347447497
average PAI 10%:  0.12990330529277822
