In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import numpy as np
from tqdm import tqdm

# Funções

In [2]:
def metrica_avaliacao(df, predict_column='risk_index',
                      viaturas=3, number_random_executions=1,
                      real_column='acidentes_graves'):
    """Compare an ideal allocation with the allocation implied by a score.

    The rows of ``df`` are split into buckets sharing the same
    (``month_number``, ``dia``, ``periodo``). Inside every bucket the
    ``viaturas`` rows with the highest value are selected twice:

    * ranked by ``real_column`` itself (best achievable selection);
    * ranked by ``predict_column`` (selection driven by the model score).

    In both cases the ``real_column`` values of the selected rows are summed
    and the totals are accumulated over all buckets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``month_number``, ``dia``, ``periodo``,
        ``predict_column`` and ``real_column``.
    predict_column : str
        Column used to rank the rows for the predicted selection.
    viaturas : int
        Number of rows selected per bucket.
    number_random_executions : int
        Not used by the computation; kept so existing calls keep working.
    real_column : str
        Column holding the observed outcome (same meaning as the
        ``real_column`` argument of ``WeightedProb.metrica_avaliacao``).

    Returns
    -------
    tuple
        ``(sum_true, sum_pred)``.
    """
    sum_true = 0
    sum_pred = 0

    # One pass over the buckets instead of re-filtering the frame for each
    # month, day and period. Rows keep their original relative order inside a
    # bucket, so the per-bucket ranking is the same as with explicit filters.
    for _, bucket in df.groupby(['month_number', 'dia', 'periodo'], sort=False):
        sum_true += bucket.sort_values(
            by=real_column, ascending=False)[real_column][:viaturas].sum()
        sum_pred += bucket.sort_values(
            by=predict_column, ascending=False)[real_column][:viaturas].sum()

    return sum_true, sum_pred

# Classe

In [3]:
class WeightedProb:
    """Risk index built as a weighted sum of per-category severe-accident rates.

    For every column listed in ``side_var_columns`` the model estimates, on
    the training rows, the fraction of rows with ``acidentes_graves == 1`` for
    each distinct value of that column. The risk index of a row is the sum,
    over the side-variable columns, of the rate of the row's value multiplied
    by the weight of the column. Values that were not seen during training
    contribute nothing. Weights start at 1 and can be tuned with
    :meth:`optimize`.

    Besides the side-variable columns, the methods read the columns ``ano``,
    ``month_number``, ``dia``, ``periodo``, ``acidentes_graves`` and (in
    :meth:`final_output`) ``condicao_meteorologica``.
    """

    def __init__(self, df, side_var_columns, train_til_year):
        """Store the data and configuration; no computation happens here.

        Parameters
        ----------
        df : pandas.DataFrame
            Full data set (all years).
        side_var_columns : list of str
            Columns whose values are turned into rates.
        train_til_year : int
            :meth:`fit` trains on rows with ``ano <= train_til_year``.
        """
        self.df = df
        self.side_var_columns = side_var_columns
        self.train_til_year = train_til_year

        # Initial weights: every side variable counts the same.
        self.best_opt_weights = np.ones(len(side_var_columns))
        # Candidate weights read by func_risco_otimizando; optimize() overwrites them.
        self.pop_opt_weights = self.best_opt_weights.copy()

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _estimate_probs(self, df_train):
        """Return ``{column: {value: share of rows with acidentes_graves == 1}}``."""
        is_severe = (df_train['acidentes_graves'] == 1).astype(float)
        return {
            # sort=False keeps the values in order of first appearance.
            sv: is_severe.groupby(df_train[sv].values, sort=False).mean().to_dict()
            for sv in self.side_var_columns
        }

    def _prob_matrix(self, df, probs):
        """Return a (rows x side variables) array of rates; unseen values become 0."""
        columns = [
            np.asarray(df[sv].map(probs.get(sv, {})).fillna(0.0), dtype=float)
            for sv in self.side_var_columns
        ]
        if not columns:
            return np.zeros((len(df), 0))
        return np.column_stack(columns)

    @staticmethod
    def _weighted_sum(prob_matrix, weights):
        """Accumulate ``prob_matrix[:, j] * weights[j]`` column by column."""
        risk = np.zeros(prob_matrix.shape[0])
        for j in range(prob_matrix.shape[1]):
            risk += prob_matrix[:, j] * weights[j]
        return risk

    def _score(self, df, probs, weights):
        """Vectorised risk index of every row of ``df``."""
        return self._weighted_sum(self._prob_matrix(df, probs), weights)

    def _row_risk(self, dict_side_var_columns_probs, x, weights):
        """Risk index of one row ``x`` (anything indexable by column name)."""
        risk = 0
        for i, sv in enumerate(self.side_var_columns):
            try:
                risk += dict_side_var_columns_probs[sv][x[sv]] * weights[i]
            except KeyError:
                # Value (or column) without a trained rate: contributes nothing.
                continue
        return risk

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def get_mes_dia_mais_proximo(self, df_recorte, mes_in, dia_in, periodo_in):
        """Tell whether ``df_recorte`` has this exact month, day and period.

        The month of ``df_recorte`` closest to ``mes_in`` and the day of that
        month closest to ``dia_in`` are located first. The result is true only
        when both coincide with the requested values and ``periodo_in`` occurs
        on that day. Returns ``False`` when ``df_recorte`` has no month at all.
        """
        meses = df_recorte.month_number.unique()
        if len(meses) == 0:
            return False

        mes_mais_prox = meses[np.argmin(np.abs(np.array(meses) - mes_in))]
        do_mes = df_recorte[df_recorte['month_number'] == mes_mais_prox]

        dias = do_mes.dia.unique()
        dia_mais_prox = dias[np.argmin(np.abs(np.array(dias) - dia_in))]

        periodos = do_mes[do_mes['dia'] == dia_mais_prox].periodo.unique()

        return bool((mes_mais_prox == mes_in)
                    and (dia_mais_prox == dia_in)
                    and (periodo_in in periodos))

    def func_risk_index(self, dict_side_var_columns_probs, x):
        """Risk index of a single row using ``self.best_opt_weights``."""
        return self._row_risk(dict_side_var_columns_probs, x, self.best_opt_weights)

    def func_risco_otimizando(self, dict_side_var_columns_probs, x):
        """Risk index of a single row using the candidate ``self.pop_opt_weights``."""
        return self._row_risk(dict_side_var_columns_probs, x, self.pop_opt_weights)

    def fit(self):
        """Estimate the per-value rates on rows with ``ano <= train_til_year``.

        Sets ``self.df_train`` (training rows plus ``risk_index`` and one
        ``<column>_prob`` column per side variable) and
        ``self.dict_side_var_columns_probs``. Returns ``self``.
        """
        # Explicit copy: new columns are added below and must not be written
        # to a slice of self.df.
        self.df_train = self.df[self.df['ano'] <= self.train_til_year].copy()

        self.dict_side_var_columns_probs = self._estimate_probs(self.df_train)

        self.df_train['risk_index'] = self._score(
            self.df_train, self.dict_side_var_columns_probs, self.best_opt_weights)

        for sv in self.side_var_columns:
            self.df_train[sv + "_prob"] = self.df_train[sv].map(
                self.dict_side_var_columns_probs[sv])

        return self

    def predict(self, df_in):
        """Return a copy of ``df_in`` with a ``risk_index`` column added.

        Uses the rates from :meth:`fit` and ``self.best_opt_weights``. The
        scored frame is also kept in ``self.df_out``.
        """
        self.df_out = df_in.copy()
        self.df_out['risk_index'] = self._score(
            self.df_out, self.dict_side_var_columns_probs, self.best_opt_weights)

        return self.df_out.copy()

    def final_output(self, df_in):
        """Score ``df_in`` under both weather scenarios.

        The rows of ``df_in`` are stacked twice, once with
        ``condicao_meteorologica`` forced to ``'tempo_claro'`` and once to
        ``'tempo_adverso'``, and ``risk_index`` is computed for the result.
        The stacked frame is kept in ``self.final_df``; a copy is returned.
        ``df_in`` itself is left untouched.
        """
        # assign() returns a new frame, so no chained assignment is involved;
        # pd.concat replaces DataFrame.append, which newer pandas no longer has.
        tempo_claro = df_in.assign(condicao_meteorologica='tempo_claro')
        tempo_adverso = df_in.assign(condicao_meteorologica='tempo_adverso')
        self.final_df = pd.concat([tempo_claro, tempo_adverso])

        self.final_df['risk_index'] = self._score(
            self.final_df, self.dict_side_var_columns_probs, self.best_opt_weights)

        return self.final_df.copy()

    def metrica_avaliacao(self, df, predict_column='risk_index', real_column='acidentes_graves',
                          viaturas=3):
        """Compare the best achievable selection with the score-driven one.

        For each (``month_number``, ``dia``, ``periodo``) bucket the
        ``viaturas`` rows with the highest ``real_column`` and the ``viaturas``
        rows with the highest ``predict_column`` are selected; the
        ``real_column`` values of each selection are summed over all buckets.

        Returns
        -------
        tuple
            ``(sum_true, sum_pred)``.
        """
        sum_true = 0
        sum_pred = 0

        # Rows keep their original relative order inside a bucket, so ranking
        # a bucket gives the same rows as filtering the frame step by step.
        for _, bucket in df.groupby(['month_number', 'dia', 'periodo'], sort=False):
            sum_true += bucket.sort_values(
                by=real_column, ascending=False)[real_column][:viaturas].sum()
            sum_pred += bucket.sort_values(
                by=predict_column, ascending=False)[real_column][:viaturas].sum()

        return sum_true, sum_pred

    def optimize(self,
                   df_objetivo,
                   real_column=['mortos','feridos_graves'],
                   predict_column='risco_otimizado',
                   gama=0.5,
                   viaturas=10,
                   epochs=10,
                   pop_size=100,
                   verbose=True,
                   random_state=None):
        """Search for better weights with a simple population-based heuristic.

        Each individual is a weight vector. Its fitness is
        ``sum_true - sum_pred`` as returned by :meth:`metrica_avaliacao` on
        ``df_objetivo`` (lower is better). After each epoch the population is
        rebuilt from the better and worse halves plus Gaussian noise, with the
        best individual copied unchanged into position 0.

        Parameters
        ----------
        df_objetivo : pandas.DataFrame
            Rows used to evaluate the candidates.
        real_column, gama
            Not used by the computation; kept so existing calls keep working.
        predict_column : str
            Name of the working column holding the candidate score.
        viaturas : int
            Rows selected per bucket when evaluating a candidate.
        epochs : int
            Number of generations.
        pop_size : int
            Number of individuals (at least 2).
        verbose : bool
            Print the target and the progress lines (default, as before).
        random_state : int or None
            Seed for a private ``numpy.random.RandomState``; ``None`` draws
            from the global NumPy generator.

        Returns
        -------
        tuple
            ``(sum_true, best_sum_pred)``. ``self.best_opt_weights`` is
            replaced by the best weights found, or left as it was when no
            candidate improved on a ``sum_pred`` of 0.
        """
        rng = np.random if random_state is None else np.random.RandomState(random_state)

        df = self.predict(df_objetivo)
        len_opt_weigths = len(self.best_opt_weights)

        # `viaturas` is forwarded so target and candidates use the same budget.
        sum_true, _ = self.metrica_avaliacao(df, viaturas=viaturas)
        if verbose:
            print('Objetivo:',sum_true)

        # The rate of every row/column does not depend on the weights:
        # look it up once instead of once per individual per epoch.
        prob_matrix = self._prob_matrix(df, self.dict_side_var_columns_probs)

        population = rng.randn(pop_size, len_opt_weigths)
        population[0, :] = self.best_opt_weights
        fitness = np.zeros(pop_size)

        best_sum_pred = 0
        # Fallback when no epoch improves (or epochs == 0).
        best_opt_weights = np.array(self.best_opt_weights, dtype=float)

        half = int(pop_size / 2)

        for _epoch in range(epochs):

            # --- evaluation -------------------------------------------------
            for pop in range(pop_size):
                self.pop_opt_weights = population[pop, :].copy()
                df[predict_column] = self._weighted_sum(prob_matrix, self.pop_opt_weights)

                _, sum_pred = self.metrica_avaliacao(
                    df, predict_column=predict_column, viaturas=viaturas)

                if verbose:
                    print(sum_true, sum_pred, pop)
                fitness[pop] = (sum_true - sum_pred)

            # --- selection --------------------------------------------------
            ranked = population[fitness.argsort()]   # fancy indexing -> copy
            best_fathers = ranked[:half, :]
            worst_fathers = ranked[half:, :]

            # --- reproduction -----------------------------------------------
            for pop in range(pop_size):
                # The number of inherited leading genes shrinks with position.
                n_genes = int(len_opt_weigths * (pop_size - pop) / pop_size)
                parent = int(pop / 2)

                if pop < 0.25 * pop_size:
                    population[pop, :n_genes] = best_fathers[parent, :n_genes]
                elif pop < 0.50 * pop_size:
                    # Includes pop == 0.25 * pop_size, which must not fall
                    # through to the worst-parents branch.
                    population[pop, :n_genes] = best_fathers[parent, :n_genes]
                    population[pop, :] += rng.randn(len_opt_weigths)
                else:
                    population[pop, :n_genes] = worst_fathers[parent, :n_genes]
                    population[pop, :] += rng.randn(len_opt_weigths)

            # --- bookkeeping ------------------------------------------------
            best_fitness = fitness.min()
            if best_fitness < np.abs(sum_true - best_sum_pred):
                best_sum_pred = sum_true - best_fitness
                best_opt_weights = best_fathers[0, :].copy()

            if verbose:
                print(best_sum_pred)

        self.best_opt_weights = best_opt_weights

        return sum_true, best_sum_pred

    def cross_val_lombra(self, year_test=[2016,2017,2018], caso='todos_no_GA'):
        """Leave-one-year-out evaluation.

        For each year in ``year_test`` the rates are estimated on every other
        year of ``self.df``, the held-out year is scored with
        ``self.best_opt_weights`` and evaluated with :meth:`metrica_avaliacao`
        (default ``viaturas``). ``[year, sum_true, sum_pred]`` is appended to
        ``self.cross_val_result`` and the scored rows are written to
        ``EvidenciaDuvida/<year><caso>.csv`` (the directory must exist).

        Returns ``self``.
        """
        self.cross_val_result = []

        for yt in year_test:
            df_train = self.df[self.df.ano != yt]
            df_out = self.df[self.df.ano == yt].copy()

            dict_side_var_columns_probs = self._estimate_probs(df_train)

            df_out['risk_index'] = self._score(
                df_out, dict_side_var_columns_probs, self.best_opt_weights)

            sum_true, sum_pred = self.metrica_avaliacao(df_out)

            self.cross_val_result.append([yt, sum_true, sum_pred])

            df_out.to_csv('EvidenciaDuvida/'+str(yt)+caso+'.csv', index=False)

        return self

## Open data

In [4]:
# Load the accident table used throughout the notebook.
df = pd.read_csv('df_19_18_17_16_pad.csv')

# Show which weather labels occur in the raw data.
print(df.condicao_meteorologica.unique())

# Discard two columns this notebook does not use.
df = df.drop(columns=['index', 'km'])

# Binary target: 1 when the row has at least one death or serious injury,
# 0 otherwise. The columns feridos, feridos_leves and ilesos are intentionally
# not part of the sum.
df['acidentes_graves'] = (
    (df['mortos'] + df['feridos_graves']) > 0
).astype('int64')

df.head()

['ignorada' 'tempo_claro' 'tempo_adverso']


Unnamed: 0,dia_semana,br,condicao_meteorologica,tipo_pista,tracado_via,uso_solo,mortos,feridos_leves,feridos_graves,ilesos,...,feridos,veiculos,feriado,km_bins,periodo,ano,month_number,dia,data,acidentes_graves
0,terca,104,ignorada,dupla,reta,sim,0,0,0,0,...,0,0,not-feriado,"[60, 65)","[8, 12)",2019,1,1,2019-01-01 00:00:00,0
1,terca,104,ignorada,dupla,reta,sim,0,0,0,0,...,0,0,not-feriado,"[60, 65)","[16, 20)",2019,1,1,2019-01-01 00:00:00,0
2,terca,104,ignorada,dupla,reta,sim,0,0,0,0,...,0,0,not-feriado,"[60, 65)","[20, 24)",2019,1,1,2019-01-01 00:00:00,0
3,terca,104,ignorada,dupla,reta,sim,0,0,0,0,...,0,0,not-feriado,"[60, 65)","[12, 16)",2019,1,1,2019-01-01 00:00:00,0
4,terca,104,ignorada,dupla,reta,sim,0,0,0,0,...,0,0,not-feriado,"[60, 65)","[4, 8)",2019,1,1,2019-01-01 00:00:00,0


## Ajuste

In [5]:
# Normalise category labels. Each result is assigned back to its column:
# calling replace(..., inplace=True) on a column obtained from the frame is a
# chained in-place operation, which does not update `df` when pandas
# Copy-on-Write is active.

# Weekday names with their accents restored.
df['dia_semana'] = df['dia_semana'].replace({'terca': 'terça', 'sabado': 'sábado'})

# Every road-layout label listed here is folded into 'curva'.
df['tracado_via'] = df['tracado_via'].replace({
    'interseção de vias': 'curva',
    'cruzamento': 'curva',
    'viaduto': 'curva',
    'retorno regulamentado': 'curva',
    'desvio temporário': 'curva',
    'ponte': 'curva',
    'rotatória': 'curva',
})

# 'múltipla' is treated as 'dupla'.
df['tipo_pista'] = df['tipo_pista'].replace({'múltipla': 'dupla'})

## training

In [6]:
# km_bins plays the role of the main variable.

# 'tipo_pista' and 'tracado_via' were removed because of the "problem of the 1"
# (not explained further in this notebook).
side_var_columns = [
    'km_bins',
    'br',
    'uso_solo',
    'periodo',
    'condicao_meteorologica',
    'feriado',
    'dia_semana',
    'month_number',
    'dia',
]

# Train on every year up to and including 2018.
WPudo = WeightedProb(df=df, side_var_columns=side_var_columns, train_til_year=2018)
WPudo.fit()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


<__main__.WeightedProb at 0x186dfa6fd48>

In [7]:
# Inspect the rates estimated by fit(): for each side variable, the share of
# training rows with acidentes_graves == 1 per distinct value.
WPudo.dict_side_var_columns_probs

{'km_bins': {'[105, 110)': 0.001141552511415525,
  '[195, 200)': 0.0013698630136986301,
  '[125, 130)': 0.0019786910197869103,
  '[140, 145)': 0.001141552511415525,
  '[110, 115)': 0.0013698630136986301,
  '[120, 125)': 0.0009893455098934551,
  '[130, 135)': 0.0013698630136986301,
  '[185, 190)': 0.0012176560121765602,
  '[100, 105)': 0.001141552511415525,
  '[200, 205)': 0.0013698630136986301,
  '[180, 185)': 0.0016742770167427702,
  '[145, 150)': 0.0013698630136986301,
  '[160, 165)': 0.00106544901065449,
  '[205, 210)': 0.0006088280060882801,
  '[135, 140)': 0.00106544901065449,
  '[170, 175)': 0.0013698630136986301,
  '[210, 215)': 0.0015220700152207,
  '[165, 170)': 0.0013698630136986301,
  '[115, 120)': 0.0012937595129375952,
  '[190, 195)': 0.0009132420091324201,
  '[175, 180)': 0.0015220700152207,
  '[155, 160)': 0.0006088280060882801,
  '[150, 155)': 0.0012176560121765602,
  '[75, 80)': 0.0016742770167427702,
  '[80, 85)': 0.0015220700152207,
  '[50, 55)': 0.0015220700152207,


## predicting

In [8]:
# Rows from 2019. WPudo was created with train_til_year=2018, so fit() did not
# use these rows.
df_2019 = df[df.ano==2019]

In [9]:
# Copy of the 2019 rows with a risk_index column computed from the fitted
# rates and the current weights.
df_2019_out = WPudo.predict(df_2019)

In [10]:
# Returns (sum_true, sum_pred): per (month, day, period), the acidentes_graves
# total of the best possible 3 rows versus the 3 rows with the highest risk_index.
metrica_avaliacao(df_2019_out, viaturas=3)

(156, 38)

In [11]:
# Stack the 2019 rows twice, once as 'tempo_claro' and once as 'tempo_adverso',
# and score both versions.
test_out_final = WPudo.final_output(df_2019)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [14]:
# Check: only the two weather labels forced by final_output() should appear.
test_out_final.condicao_meteorologica.unique()

array(['tempo_claro', 'tempo_adverso'], dtype=object)

In [12]:
# Write the two-scenario risk table to an Excel file in the working directory.
test_out_final.to_excel('teste_auto_final.xlsx')

In [25]:
# Write the scored 2019 rows to Excel (the EvidenciaDuvida directory must exist).
df_2019_out.to_excel('EvidenciaDuvida/2019sograve_no_GA.xlsx')

Exception ignored in: <function ZipFile.__del__ at 0x000002411B736AE8>
Traceback (most recent call last):
  File "C:\Users\hugow\AppData\Local\Programs\Python\Python37\lib\zipfile.py", line 1789, in __del__
    self.close()
  File "C:\Users\hugow\AppData\Local\Programs\Python\Python37\lib\zipfile.py", line 1798, in close
    raise ValueError("Can't close the ZIP file while there is "
ValueError: Can't close the ZIP file while there is an open writing handle on it. Close the writing handle before closing the zip.


## Cross Validation

In [26]:
# Leave-one-year-out evaluation for 2016, 2017 and 2018. Also writes one CSV
# per year under EvidenciaDuvida/, with 'sograve_no_GA' in the file name.
WPudo.cross_val_lombra(year_test=[2016, 2017, 2018], caso='sograve_no_GA')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


<__main__.WeightedProb at 0x241509f7b70>

In [27]:
# One [year, sum_true, sum_pred] entry per held-out year.
WPudo.cross_val_result

[[2016, 173, 25], [2017, 144, 20], [2018, 106, 21]]

In [None]:
# Current weights, one per side-variable column (all ones until optimize() runs).
WPudo.best_opt_weights

## Otimizando

In [None]:
# Tune the weights with the population-based search in optimize()
# (20 individuals, 5 epochs).
# NOTE(review): the weights are tuned on the 2019 rows, which are also the
# rows evaluated in the cells below — confirm this is intended.
WPudo.optimize(viaturas=3, df_objetivo=df_2019_out, pop_size=20, epochs=5)

Objetivo: 353
353 108 0
353 41 1
353 36 2
353 21 3
353 46 4
353 48 5
353 82 6
353 23 7
353 89 8
353 29 9
353 55 10
353 26 11
353 4 12
353 19 13
353 22 14
353 39 15
353 11 16
353 14 17
353 25 18
353 2 19
108.0
353 108 0
353 108 1
353 89 2
353 89 3
353 82 4
353 39 5
353 50 6
353 65 7
353 22 8
353 74 9
353 13 10
353 27 11
353 1 12
353 2 13
353 13 14
353 10 15
353 29 16
353 4 17
353 3 18
353 1 19
108.0
353 108 0
353 108 1
353 108 2
353 108 3
353 89 4
353 17 5
353 105 6
353 106 7
353 74 8
353 92 9
353 11 10
353 26 11
353 3 12
353 5 13
353 1 14
353 5 15
353 37 16
353 10 17
353 3 18
353 7 19
108.0
353 108 0
353 108 1
353 108 2
353 108 3
353 108 4
353 14 5
353 73 6
353 84 7
353 97 8
353 95 9
353 5 10
353 85 11
353 16 12
353 12 13
353 5 14
353 41 15
353 36 16
353 29 17


In [None]:
# Re-score the 2019 rows; predict() uses the weights stored by optimize().
df_2019_opt = WPudo.predict(df_2019_out)

In [None]:
# Same evaluation as before optimisation, now on the re-scored rows.
metrica_avaliacao(df_2019_opt, viaturas=3)

In [None]:
# Weights kept by optimize(), one per side-variable column.
WPudo.best_opt_weights

In [None]:
# The target file has an .xlsx extension, so it is written with to_excel
# (as the earlier 2019 export does); to_csv would put CSV text in that file.
df_2019_opt.to_excel('EvidenciaDuvida/2019todos_yes_GA.xlsx')

## Cross Validation (after optimization)

In [None]:
# Repeat the leave-one-year-out evaluation with the default test years
# (2016, 2017, 2018), now using the weights stored by optimize(). The CSV
# file names get 'todos_yes_GA'.
WPudo.cross_val_lombra(caso='todos_yes_GA')

In [None]:
# One [year, sum_true, sum_pred] entry per held-out year, from the run above.
WPudo.cross_val_result