In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import numpy as np
from tqdm import tqdm

# Funções

In [2]:
def metrica_avaliacao(df, predict_column='risk_index',
                      viaturas=3, number_random_executions=1): 
    sum_true = 0
    sum_pred = 0
    for month in df.month_number.unique():
        df_m = df[df.month_number==month]
        for dia in df_m.dia.unique():
            df_d = df_m[df_m.dia==dia]
            for p in df_d.periodo.unique():
                df_p_real = df_d[df_d.periodo == p].copy()
                df_p_pred = df_p_real.copy()
                
                sum_true += df_p_real.sort_values(
                    by='acidentes_graves', ascending=False)['acidentes_graves'][:viaturas].sum()
                sum_pred += df_p_pred.sort_values(
                    by=predict_column, ascending=False)['acidentes_graves'][:viaturas].sum()
                
    return sum_true, sum_pred

# Classe

In [9]:
class WeightedProb:
    def __init__(self, df, side_var_columns, train_til_year):
        
        self.df = df
        self.side_var_columns = side_var_columns
        self.train_til_year = train_til_year
        
        # inicializando os pesos
        self.best_opt_weights = np.ones(len(side_var_columns))
    
    def get_mes_dia_mais_proximo(self, df_recorte, mes_in, dia_in, periodo_in):
        meses = df_recorte.month_number.unique()
        
        if len(meses)>0:
            mes_mais_prox = meses[np.argmin(np.abs(np.array(meses) - mes_in))]
        
            dias = df_recorte[(df_recorte['month_number']==mes_mais_prox)].dia.unique()
            dia_mais_prox = dias[np.argmin(np.abs(np.array(dias) - dia_in))]

            periodos = df_recorte[(df_recorte['month_number']==mes_mais_prox) &(df_recorte['dia']==dia_in)].periodo.unique()
            
            condition = (mes_mais_prox == mes_in) and (dia_mais_prox == dia_in) and (periodo_in in periodos)
        else:
            condition = False
              
        return condition 
    
    def func_risk_index(self, dict_side_var_columns_probs, x):
        risk=0
        for i, sv in enumerate(self.side_var_columns):
            try:
                risk += dict_side_var_columns_probs[sv][x[sv]]*self.best_opt_weights[i]
            except:
                pass
        return risk
    
    def func_risco_otimizando(self, dict_side_var_columns_probs, x):
        risk=0
        for i, sv in enumerate(self.side_var_columns):
            try:
                risk += dict_side_var_columns_probs[sv][x[sv]]*self.pop_opt_weights[i]
            except:
                pass
        return risk
    
    def fit(self):

        self.df_train = self.df[(self.df['ano'] <= self.train_til_year)]
        
        self.dict_side_var_columns_probs = {}

        for sv in self.side_var_columns:
            dict_uniques_sv_prob = {}
            for u in self.df_train[sv].unique():
                dict_uniques_sv_prob[u] = self.df_train[
                    (self.df_train[sv]==u) & (
                        self.df_train.acidentes_graves==1)].count(
                ).values[0]/self.df_train[(self.df_train[sv]==u)].count().values[0]

            self.dict_side_var_columns_probs[sv] = dict_uniques_sv_prob

        self.df_train['risk_index'] = self.df_train[self.side_var_columns].apply((
            lambda x: self.func_risk_index(self.dict_side_var_columns_probs, x)), axis=1)                      

        for sv in self.side_var_columns:
            column_name_prob = sv + "_prob"            
            self.df_train[column_name_prob] = self.df_train[sv].apply(lambda x: self.dict_side_var_columns_probs[sv][x]) 
        
        return self
    
    def predict(self, df_in):
        
        self.df_out = df_in.copy()        
        self.df_out['risk_index'] = self.df_out[self.side_var_columns].apply(
            (lambda x: self.func_risk_index(self.dict_side_var_columns_probs, x)), axis=1)
        
        return self.df_out.copy()
    
    def metrica_avaliacao(self, df, predict_column='risk_index', real_column='acidentes_graves',
                          viaturas=3):  
        sum_true = 0
        sum_pred = 0

        for month in df.month_number.unique():
            df_m = df[df.month_number==month]
            for dia in df_m.dia.unique():
                df_d = df_m[df_m.dia==dia]
                for p in df_d.periodo.unique():
                    df_p_real = df_d[df_d.periodo == p].copy()
                    df_p_pred = df_p_real.copy()
                    sum_true += df_p_real.sort_values(
                        by=real_column, ascending=False)[real_column][:viaturas].sum()
                    sum_pred += df_p_pred.sort_values(
                        by=predict_column, ascending=False)[real_column][:viaturas].sum()

        return sum_true, sum_pred

    def optimize(self,
                   df_objetivo,
                   real_column=['mortos','feridos_graves'],
                   predict_column='risco_otimizado',
                   gama=0.5,
                   viaturas=10,
                   epochs=10,
                   pop_size=100):
        
        df = self.predict(df_objetivo)
        
        best_sum_pred = 0
        len_opt_weigths = len(self.best_opt_weights)
        
        sum_true, _ = self.metrica_avaliacao(df)

        print('Objetivo:',sum_true)
        
        population = np.random.randn(pop_size,len_opt_weigths)
        population[0,:] = self.best_opt_weights
        fitness = np.zeros(pop_size)
        
        for z in range(epochs):
            
            for pop in range(population.shape[0]):
                risco_otimizado = []
                
                self.pop_opt_weights = population[pop,:]

                df[predict_column] = df[self.side_var_columns].apply((
                                    lambda x: self.func_risco_otimizando(self.dict_side_var_columns_probs, x)), axis=1)  

                _, sum_pred = self.metrica_avaliacao(df, predict_column=predict_column)
                            
                print(sum_true, sum_pred, pop)
                fitness[pop] = (sum_true - sum_pred)
            
            best_fathers = population[fitness.argsort()][:int(population.shape[0]/2),:]
            worst_fathers = population[fitness.argsort()][int(population.shape[0]/2):,:]
            
            for pop in range(population.shape[0]):
                
                if pop < 0.25*population.shape[0]:
                    population[pop,:int(population.shape[1]*(population.shape[0]-pop)/population.shape[0])] = best_fathers[int(pop/2),:int(population.shape[1]*(population.shape[0]-pop)/population.shape[0])]
                elif pop < 0.50*population.shape[0] and pop > 0.25*population.shape[0]:
                    population[pop,:int(population.shape[1]*(population.shape[0]-pop)/population.shape[0])] = best_fathers[int(pop/2),:int(population.shape[1]*(population.shape[0]-pop)/population.shape[0])]
                    population[pop,:] += np.random.randn(len_opt_weigths)
                else:
                    population[pop,:int(population.shape[1]*(population.shape[0]-pop)/population.shape[0])] = worst_fathers[int(pop/2),:int(population.shape[1]*(population.shape[0]-pop)/population.shape[0])]
                    population[pop,:] += np.random.randn(len_opt_weigths)
            
            fitness.sort()
            if fitness[0] < np.abs(sum_true - best_sum_pred):
                best_sum_pred = sum_true - fitness[0]
                best_opt_weights = best_fathers[0,:]
            
            print(best_sum_pred)
            
        self.best_opt_weights = best_opt_weights
        
        return sum_true, best_sum_pred
    
    def cross_val_lombra(self, year_test=[2016,2017,2018], caso='todos_no_GA'):
        
        self.cross_val_result = []
        year_test = year_test
        
        for yt in year_test:
            df_train = self.df[self.df.ano != yt]
            df_test = self.df[self.df.ano == yt]
            dict_side_var_columns_probs = {}
            for sv in self.side_var_columns:
                dict_uniques_sv_prob = {}
                for u in df_train[sv].unique():
                    dict_uniques_sv_prob[u] = df_train[
                        (df_train[sv]==u) & (df_train.acidentes_graves==1)].count(
                    ).values[0]/df_train[(df_train[sv]==u)].count().values[0]

                dict_side_var_columns_probs[sv] = dict_uniques_sv_prob

            df_train['risk_index'] = df_train[self.side_var_columns].apply((
                lambda x: self.func_risk_index(dict_side_var_columns_probs, x)), axis=1)                      

            for sv in self.side_var_columns:
                column_name_prob = sv + "_prob"            
                df_train[column_name_prob] = df_train[sv].apply(lambda x: dict_side_var_columns_probs[sv][x]) 
            
            df_out = df_test.copy()        
            df_out['risk_index'] = df_out[self.side_var_columns].apply(
            (lambda x: self.func_risk_index(dict_side_var_columns_probs, x)), axis=1)
            
            sum_true, sum_pred = self.metrica_avaliacao(df_out)
            
            self.cross_val_result.append([yt, sum_true, sum_pred])
            
            df_out.to_csv('cross_val_lombra_wp/'+str(yt)+caso+'.csv', index=None)
            
        return self

In [8]:
a = pd.DataFrame(columns=['a'])
a.a = ['a']*10
a.head(10)

Unnamed: 0,a
0,a
1,a
2,a
3,a
4,a
5,a
6,a
7,a
8,a
9,a


## Open data

In [3]:
df = pd.read_csv('df_19_18_17_16_pad.csv')

print(df.condicao_meteorologica.unique())

df.drop(['index','km'],axis=1, inplace=True)

df['acidentes_graves'] = df['mortos'
                                    ]+df['feridos_graves'
                                    ]+df['feridos']
#                                     ]+df['feridos_leves'
#                                     ]+df['ilesos']

df['acidentes_graves'] = df['acidentes_graves'].apply(lambda x: 1 if (x > 0) else 0)

df.head()

['ignorada' 'tempo_claro' 'tempo_adverso']


Unnamed: 0,dia_semana,br,condicao_meteorologica,tipo_pista,tracado_via,uso_solo,mortos,feridos_leves,feridos_graves,ilesos,...,feridos,veiculos,feriado,km_bins,periodo,ano,month_number,dia,data,acidentes_graves
0,terca,104,ignorada,dupla,reta,sim,0,0,0,0,...,0,0,not-feriado,"[60, 65)","[8, 12)",2019,1,1,2019-01-01 00:00:00,0
1,terca,104,ignorada,dupla,reta,sim,0,0,0,0,...,0,0,not-feriado,"[60, 65)","[16, 20)",2019,1,1,2019-01-01 00:00:00,0
2,terca,104,ignorada,dupla,reta,sim,0,0,0,0,...,0,0,not-feriado,"[60, 65)","[20, 24)",2019,1,1,2019-01-01 00:00:00,0
3,terca,104,ignorada,dupla,reta,sim,0,0,0,0,...,0,0,not-feriado,"[60, 65)","[12, 16)",2019,1,1,2019-01-01 00:00:00,0
4,terca,104,ignorada,dupla,reta,sim,0,0,0,0,...,0,0,not-feriado,"[60, 65)","[4, 8)",2019,1,1,2019-01-01 00:00:00,0


## training

In [10]:
# km_bins vai fazer o papel do main.
side_var_columns = ['km_bins','br','tipo_pista', 'tracado_via', 'uso_solo','periodo', 'condicao_meteorologica',
                    'feriado', 'dia_semana','month_number','dia']

WPudo = WeightedProb(df, side_var_columns, train_til_year=2018)
WPudo.fit()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


<__main__.WeightedProb at 0x1cfaf212808>

## predicting for padded

In [11]:
df_2019 = df[df.ano==2019]

In [12]:
df_2019_out = WPudo.predict(df_2019)

In [13]:
metrica_avaliacao(df_2019_out, viaturas=3)

(293, 145)

In [9]:
df_2019_out.to_csv('cross_val_lombra_wp/2019sograve_no_GA.csv', index=None)

## Cross Validation

In [10]:
WPudo.cross_val_lombra(year_test=[2016, 2017, 2018], caso='sograve_no_GA')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


<__main__.WeightedProb at 0x14ceb0d2f88>

In [11]:
WPudo.cross_val_result

[[2016, 351, 144], [2017, 323, 151], [2018, 253, 123]]

In [12]:
WPudo.best_opt_weights

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

## otimizando

In [13]:
WPudo.optimize(viaturas=3, df_objetivo=df_2019_out, pop_size=20, epochs=5)

Objetivo: 293
293 145 0
293 134 1
293 1 2
293 87 3
293 48 4
293 94 5
293 84 6
293 77 7
293 101 8
293 61 9
293 35 10
293 101 11
293 140 12
293 67 13
293 108 14
293 26 15
293 57 16
293 23 17
293 81 18
293 115 19
145.0
293 145 0
293 145 1
293 140 2
293 140 3
293 59 4
293 120 5
293 117 6
293 104 7
293 81 8
293 89 9
293 56 10
293 112 11
293 85 12
293 140 13
293 126 14
293 52 15
293 71 16
293 31 17
293 73 18
293 41 19
145.0
293 145 0
293 145 1
293 145 2
293 145 3
293 68 4
293 3 5
293 76 6
293 90 7
293 109 8
293 81 9
293 70 10
293 40 11
293 127 12
293 114 13
293 135 14
293 115 15
293 69 16
293 16 17
293 77 18
293 40 19
145.0
293 145 0
293 145 1
293 145 2
293 145 3
293 70 4
293 6 5
293 146 6
293 140 7
293 111 8
293 126 9
293 52 10
293 0 11
293 140 12
293 110 13
293 118 14
293 114 15
293 64 16
293 22 17
293 5 18
293 113 19
146.0
293 146 0
293 146 1
293 145 2
293 145 3
293 70 4
293 39 5
293 133 6
293 147 7
293 90 8
293 56 9
293 55 10
293 0 11
293 22 12
293 104 13
293 41 14
293 36 15
293 63 16
29

(293, 147.0)

In [14]:
df_2019_opt = WPudo.predict(df_2019_out)

In [15]:
metrica_avaliacao(df_2019_opt, viaturas=3)

(293, 147)

In [16]:
WPudo.best_opt_weights

array([ 1.66954849, -0.40323901,  1.82367703,  3.28080945,  1.06568867,
        0.7110041 ,  1.96407835,  1.91126351,  5.63694876,  1.29987127,
       -1.76266977])

In [17]:
df_2019_opt.to_csv('cross_val_lombra_wp/2019sograve_yes_GA.csv')

## Cross Validation

In [18]:
WPudo.cross_val_lombra(caso='sograve_yes_GA')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


<__main__.WeightedProb at 0x14ceb0d2f88>

In [19]:
WPudo.cross_val_result

[[2016, 351, 143], [2017, 323, 156], [2018, 253, 121]]