In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import numpy as np
from tqdm import tqdm

# Funções

In [3]:
def metrica_avaliacao(df, predict_column='risk_index',
                      viaturas=3, number_random_executions=1): 
    sum_true = 0
    sum_pred = 0
    for month in df.month_number.unique():
        df_m = df[df.month_number==month]
        for dia in df_m.dia.unique():
            df_d = df_m[df_m.dia==dia]
            for p in df_d.periodo.unique():
                df_p_real = df_d[df_d.periodo == p].copy()
                df_p_pred = df_p_real.copy()
                
                sum_true += df_p_real.sort_values(
                    by='acidentes_graves', ascending=False)['acidentes_graves'][:viaturas].sum()
                sum_pred += df_p_pred.sort_values(
                    by=predict_column, ascending=False)['acidentes_graves'][:viaturas].sum()
                
    return sum_true, sum_pred

# Classe

In [4]:
class WeightedProb:
    def __init__(self, df, side_var_columns, train_til_year):
        
        self.df = df
        self.side_var_columns = side_var_columns
        self.train_til_year = train_til_year
        
        # inicializando os pesos
        self.best_opt_weights = np.ones(len(side_var_columns))
    
    def get_mes_dia_mais_proximo(self, df_recorte, mes_in, dia_in, periodo_in):
        meses = df_recorte.month_number.unique()
        
        if len(meses)>0:
            mes_mais_prox = meses[np.argmin(np.abs(np.array(meses) - mes_in))]
        
            dias = df_recorte[(df_recorte['month_number']==mes_mais_prox)].dia.unique()
            dia_mais_prox = dias[np.argmin(np.abs(np.array(dias) - dia_in))]

            periodos = df_recorte[(df_recorte['month_number']==mes_mais_prox) &(df_recorte['dia']==dia_in)].periodo.unique()
            
            condition = (mes_mais_prox == mes_in) and (dia_mais_prox == dia_in) and (periodo_in in periodos)
        else:
            condition = False
              
        return condition 
    
    def func_risk_index(self, dict_side_var_columns_probs, x):
        risk=0
        for i, sv in enumerate(self.side_var_columns):
            try:
                risk += dict_side_var_columns_probs[sv][x[sv]]*self.best_opt_weights[i]
            except:
                pass
        return risk
    
    def func_risco_otimizando(self, dict_side_var_columns_probs, x):
        risk=0
        for i, sv in enumerate(self.side_var_columns):
            try:
                risk += dict_side_var_columns_probs[sv][x[sv]]*self.pop_opt_weights[i]
            except:
                pass
        return risk
    
    def fit(self):

        self.df_train = self.df[(self.df['ano'] <= self.train_til_year)]
        
        self.dict_side_var_columns_probs = {}

        for sv in self.side_var_columns:
            dict_uniques_sv_prob = {}
            for u in self.df_train[sv].unique():
                dict_uniques_sv_prob[u] = self.df_train[
                    (self.df_train[sv]==u) & (
                        self.df_train.acidentes_graves==1)].count(
                ).values[0]/self.df_train[(self.df_train[sv]==u)].count().values[0]

            self.dict_side_var_columns_probs[sv] = dict_uniques_sv_prob

        self.df_train['risk_index'] = self.df_train[self.side_var_columns].apply((
            lambda x: self.func_risk_index(self.dict_side_var_columns_probs, x)), axis=1)                      

        for sv in self.side_var_columns:
            column_name_prob = sv + "_prob"            
            self.df_train[column_name_prob] = self.df_train[sv].apply(lambda x: self.dict_side_var_columns_probs[sv][x]) 
        
        return self
    
    def predict(self, df_in):
        
        self.df_out = df_in.copy()        
        self.df_out['risk_index'] = self.df_out[self.side_var_columns].apply(
            (lambda x: self.func_risk_index(self.dict_side_var_columns_probs, x)), axis=1)
        
        return self.df_out.copy()
    
    def metrica_avaliacao(self, df, predict_column='risk_index', real_column='acidentes_graves',
                          viaturas=3):  
        sum_true = 0
        sum_pred = 0

        for month in df.month_number.unique():
            df_m = df[df.month_number==month]
            for dia in df_m.dia.unique():
                df_d = df_m[df_m.dia==dia]
                for p in df_d.periodo.unique():
                    df_p_real = df_d[df_d.periodo == p].copy()
                    df_p_pred = df_p_real.copy()
                    sum_true += df_p_real.sort_values(
                        by=real_column, ascending=False)[real_column][:viaturas].sum()
                    sum_pred += df_p_pred.sort_values(
                        by=predict_column, ascending=False)[real_column][:viaturas].sum()

        return sum_true, sum_pred

    def optimize(self,
                   df_objetivo,
                   real_column=['mortos','feridos_graves'],
                   predict_column='risco_otimizado',
                   gama=0.5,
                   viaturas=10,
                   epochs=10,
                   pop_size=100):
        
        df = self.predict(df_objetivo)
        
        best_sum_pred = 0
        len_opt_weigths = len(self.best_opt_weights)
        
        sum_true, _ = self.metrica_avaliacao(df)

        print('Objetivo:',sum_true)
        
        population = np.random.randn(pop_size,len_opt_weigths)
        population[0,:] = self.best_opt_weights
        fitness = np.zeros(pop_size)
        
        for z in range(epochs):
            
            for pop in range(population.shape[0]):
                risco_otimizado = []
                
                self.pop_opt_weights = population[pop,:]

                df[predict_column] = df[self.side_var_columns].apply((
                                    lambda x: self.func_risco_otimizando(self.dict_side_var_columns_probs, x)), axis=1)  

                _, sum_pred = self.metrica_avaliacao(df, predict_column=predict_column)
                            
                print(sum_true, sum_pred, pop)
                fitness[pop] = (sum_true - sum_pred)
            
            best_fathers = population[fitness.argsort()][:int(population.shape[0]/2),:]
            worst_fathers = population[fitness.argsort()][int(population.shape[0]/2):,:]
            
            for pop in range(population.shape[0]):
                
                if pop < 0.25*population.shape[0]:
                    population[pop,:int(population.shape[1]*(population.shape[0]-pop)/population.shape[0])] = best_fathers[int(pop/2),:int(population.shape[1]*(population.shape[0]-pop)/population.shape[0])]
                elif pop < 0.50*population.shape[0] and pop > 0.25*population.shape[0]:
                    population[pop,:int(population.shape[1]*(population.shape[0]-pop)/population.shape[0])] = best_fathers[int(pop/2),:int(population.shape[1]*(population.shape[0]-pop)/population.shape[0])]
                    population[pop,:] += np.random.randn(len_opt_weigths)
                else:
                    population[pop,:int(population.shape[1]*(population.shape[0]-pop)/population.shape[0])] = worst_fathers[int(pop/2),:int(population.shape[1]*(population.shape[0]-pop)/population.shape[0])]
                    population[pop,:] += np.random.randn(len_opt_weigths)
            
            fitness.sort()
            if fitness[0] < np.abs(sum_true - best_sum_pred):
                best_sum_pred = sum_true - fitness[0]
                best_opt_weights = best_fathers[0,:]
            
            print(best_sum_pred)
            
        self.best_opt_weights = best_opt_weights
        
        return sum_true, best_sum_pred
    
    def cross_val_lombra(self, year_test=[2016,2017,2018], caso_to_print='todos_no_GA'):
        
        self.cross_val_result = []
        year_test = year_test
        
        for yt in year_test:
            df_train = self.df[self.df.ano != yt]
            df_test = self.df[self.df.ano == yt]
            dict_side_var_columns_probs = {}
            for sv in self.side_var_columns:
                dict_uniques_sv_prob = {}
                for u in df_train[sv].unique():
                    dict_uniques_sv_prob[u] = df_train[
                        (df_train[sv]==u) & (df_train.acidentes_graves==1)].count(
                    ).values[0]/df_train[(df_train[sv]==u)].count().values[0]

                dict_side_var_columns_probs[sv] = dict_uniques_sv_prob

            df_train['risk_index'] = df_train[self.side_var_columns].apply((
                lambda x: self.func_risk_index(dict_side_var_columns_probs, x)), axis=1)                      

            for sv in self.side_var_columns:
                column_name_prob = sv + "_prob"            
                df_train[column_name_prob] = df_train[sv].apply(lambda x: dict_side_var_columns_probs[sv][x]) 
            
            df_out = df_test.copy()        
            df_out['risk_index'] = df_out[self.side_var_columns].apply(
            (lambda x: self.func_risk_index(dict_side_var_columns_probs, x)), axis=1)
            
            sum_true, sum_pred = self.metrica_avaliacaoa_avaliacao(df_out)
            
            self.cross_val_result.append([yt, sum_true, sum_pred])
            
            df_out.to_csv('cross_val_lombra_wp/'+str(yt)+caso+'.csv', index=None)
            
        return self

## Open data

In [6]:
df = pd.read_csv('df_19_18_17_16_pad.csv')

print(df.condicao_meteorologica.unique())

df.drop(['index','km'],axis=1, inplace=True)

df['acidentes_graves'] = df['mortos'
                                    ]+df['feridos_graves'
                                    ]+df['feridos'
                                    ]+df['feridos_leves'
                                    ]+df['ilesos']

df['acidentes_graves'] = df['acidentes_graves'].apply(lambda x: 1 if (x > 0) else 0)

df.head()

['ignorada' 'tempo_claro' 'tempo_adverso']


Unnamed: 0,dia_semana,br,condicao_meteorologica,tipo_pista,tracado_via,uso_solo,mortos,feridos_leves,feridos_graves,ilesos,...,feridos,veiculos,feriado,km_bins,periodo,ano,month_number,dia,data,acidentes_graves
0,terca,104,ignorada,dupla,reta,sim,0,0,0,0,...,0,0,not-feriado,"[60, 65)","[8, 12)",2019,1,1,2019-01-01 00:00:00,0
1,terca,104,ignorada,dupla,reta,sim,0,0,0,0,...,0,0,not-feriado,"[60, 65)","[16, 20)",2019,1,1,2019-01-01 00:00:00,0
2,terca,104,ignorada,dupla,reta,sim,0,0,0,0,...,0,0,not-feriado,"[60, 65)","[20, 24)",2019,1,1,2019-01-01 00:00:00,0
3,terca,104,ignorada,dupla,reta,sim,0,0,0,0,...,0,0,not-feriado,"[60, 65)","[12, 16)",2019,1,1,2019-01-01 00:00:00,0
4,terca,104,ignorada,dupla,reta,sim,0,0,0,0,...,0,0,not-feriado,"[60, 65)","[4, 8)",2019,1,1,2019-01-01 00:00:00,0


## training

In [7]:
# km_bins vai fazer o papel do main.
side_var_columns = ['km_bins','br','tipo_pista', 'tracado_via', 'uso_solo','periodo', 'condicao_meteorologica',
                    'feriado', 'dia_semana','month_number','dia']

WPudo = WeightedProb(df, side_var_columns, train_til_year=2018)
WPudo.fit()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


<__main__.WeightedProb at 0x266c5c37a88>

## predicting for padded

In [8]:
df_2019 = df[df.ano==2019]

In [9]:
df_2019_out = WPudo.predict(df_2019)

In [10]:
metrica_avaliacao(df_2019_out, viaturas=3)

(353, 152)

In [11]:
df_2019_out.to_csv('val_1802.csv', index=None)

## Cross Validation

In [None]:
WPudo.cross_val_lombra(year_test=[2016], caso_to_print='todos_no_GA')

## otimizando

In [12]:
WPudo.optimize(viaturas=3, df_objetivo=df_2019_out, pop_size=20, epochs=10)

Objetivo: 353
353 152 0
353 46 1
353 5 2
353 45 3
353 146 4
353 93 5
353 48 6
353 132 7
353 96 8
353 152 9
353 133 10
353 101 11
353 18 12
353 107 13
353 65 14
353 164 15
353 131 16
353 35 17
353 14 18
353 50 19
164.0
353 164 0
353 164 1
353 152 2
353 152 3
353 152 4
353 103 5
353 45 6
353 64 7
353 98 8
353 100 9
353 128 10
353 139 11
353 107 12
353 126 13
353 30 14
353 147 15
353 139 16
353 14 17
353 52 18
353 41 19
164.0
353 164 0
353 164 1
353 164 2
353 164 3
353 152 4
353 115 5
353 83 6
353 150 7
353 109 8
353 141 9
353 39 10
353 105 11
353 130 12
353 22 13
353 12 14
353 107 15
353 145 16
353 33 17
353 31 18
353 4 19
164.0
353 164 0
353 164 1
353 164 2
353 164 3
353 164 4
353 101 5
353 176 6
353 176 7
353 143 8
353 36 9
353 32 10
353 125 11
353 113 12
353 16 13
353 56 14
353 152 15
353 132 16
353 17 17
353 27 18
353 1 19
176.0
353 176 0
353 176 1
353 176 2
353 176 3
353 164 4
353 91 5
353 156 6
353 134 7
353 157 8
353 47 9
353 32 10
353 120 11
353 95 12
353 22 13
353 55 14
353 136 

(353, 180.0)

In [13]:
df_2019_opt = WPudo.predict(df_2019_out)

In [14]:
metrica_avaliacao(df_2019_opt, viaturas=3)

(353, 180)

In [15]:
WPudo.side_var_columns

['km_bins',
 'br',
 'tipo_pista',
 'tracado_via',
 'uso_solo',
 'periodo',
 'condicao_meteorologica',
 'feriado',
 'dia_semana',
 'month_number',
 'dia']

In [16]:
WPudo.best_opt_weights

array([ 2.17306582,  0.6411744 ,  0.75536902,  0.1945381 , -1.52533669,
        0.00969761,  3.02000162,  1.36572505,  0.06111644,  3.98226079,
       -2.07200212])

In [17]:
df_2019_opt.to_csv('val_1802_opt.csv')

## Cross Validation

In [None]:
WPudo.cross_val_lombra(caso_to_print='todos_with_GA')