## Recupera os dados analisados e tratados, treina o modelo, fazendo a classificação dos acidentes aeronáuticos por período de ocorrência.

In [91]:
# importa bibliotecas
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

pd.set_option('display.max_rows', None) # permite exibição de todas as linhas
pd.set_option('display.max_columns', None) # permite exibição de todas as colunas
import warnings
warnings.simplefilter('ignore') # desativa avisos de warning

In [92]:
# carregando arquivo tratado em dataframe
df_acidentes_aero_trans = pd.read_csv('arquivos_tratados/df_acidentes_aero_trans.csv')

In [93]:
# conferindo a dimensão do dataframe, linhas e colunas
df_acidentes_aero_trans.shape

(6888, 46)

In [94]:
# ordenando pelo index o dataframe carregado
df_acidentes_aero_trans = df_acidentes_aero_trans.sort_index()

In [95]:
df_acidentes_aero_trans.head(5)

Unnamed: 0,ocorrencia_classificacao,ocorrencia_saida_pista,aeronave_tipo_veiculo,aeronave_motor_tipo,aeronave_motor_quantidade,aeronave_tipo_operacao,aeronave_nivel_dano,periodo,ocorrencia_latitude,ocorrencia_longitude,aeronave_assentos,one-hot__ocorrencia_classificacao_ACIDENTE,one-hot__ocorrencia_classificacao_INCIDENTE,one-hot__ocorrencia_classificacao_INCIDENTE GRAVE,one-hot__ocorrencia_saida_pista_NÃO,one-hot__ocorrencia_saida_pista_SIM,one-hot__aeronave_tipo_veiculo_***,one-hot__aeronave_tipo_veiculo_AVIÃO,one-hot__aeronave_tipo_veiculo_HELICÓPTERO,one-hot__aeronave_tipo_veiculo_ULTRALEVE,one-hot__aeronave_motor_tipo_***,one-hot__aeronave_motor_tipo_JATO,one-hot__aeronave_motor_tipo_PISTÃO,one-hot__aeronave_motor_tipo_TURBOEIXO,one-hot__aeronave_motor_tipo_TURBOÉLICE,one-hot__aeronave_motor_quantidade_***,one-hot__aeronave_motor_quantidade_BIMOTOR,one-hot__aeronave_motor_quantidade_MONOMOTOR,one-hot__aeronave_motor_quantidade_SEM TRAÇÃO,one-hot__aeronave_tipo_operacao_***,one-hot__aeronave_tipo_operacao_AGRÍCOLA,one-hot__aeronave_tipo_operacao_EXPERIMENTAL,one-hot__aeronave_tipo_operacao_INSTRUÇÃO,one-hot__aeronave_tipo_operacao_POLICIAL,one-hot__aeronave_tipo_operacao_PRIVADA,one-hot__aeronave_tipo_operacao_REGULAR,one-hot__aeronave_tipo_operacao_TÁXI AÉREO,one-hot__aeronave_nivel_dano_***,one-hot__aeronave_nivel_dano_DESTRUÍDA,one-hot__aeronave_nivel_dano_LEVE,one-hot__aeronave_nivel_dano_NENHUM,one-hot__aeronave_nivel_dano_SUBSTANCIAL,minmax__ocorrencia_latitude,minmax__ocorrencia_longitude,minmax__aeronave_assentos,periodo_oe
0,INCIDENTE,NÃO,AVIÃO,PISTÃO,BIMOTOR,PRIVADA,***,00:00 às 06:00,0.0,0.0,6.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.528177,0.537901,0.009105,0.0
1,INCIDENTE,NÃO,AVIÃO,PISTÃO,MONOMOTOR,PRIVADA,NENHUM,18:00 às 00:00,0.0,0.0,4.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.528177,0.537901,0.00607,1.0
2,INCIDENTE,NÃO,AVIÃO,JATO,BIMOTOR,***,***,18:00 às 00:00,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.528177,0.537901,0.0,1.0
3,INCIDENTE,NÃO,AVIÃO,PISTÃO,MONOMOTOR,INSTRUÇÃO,***,00:00 às 06:00,0.0,0.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.528177,0.537901,0.003035,0.0
4,INCIDENTE,NÃO,AVIÃO,PISTÃO,BIMOTOR,TÁXI AÉREO,NENHUM,18:00 às 00:00,0.0,0.0,10.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.528177,0.537901,0.015175,1.0


In [96]:
# verifica os códigos numéricos dos períodos com os códigos categóricos
print('periodo_oe', df_acidentes_aero_trans['periodo_oe'].value_counts())
print('periodo', df_acidentes_aero_trans['periodo'].value_counts())

periodo_oe periodo_oe
2.0    2548
1.0    1885
0.0    1526
3.0     929
Name: count, dtype: int64
periodo periodo
12:00 às 18:00    2548
18:00 às 00:00    1885
00:00 às 06:00    1526
06:00 às 12:00     929
Name: count, dtype: int64


In [97]:
# criando lista de colunas originais categóricas e numéricas que participarão da clusterização
columns_aero = [
        'ocorrencia_classificacao',
        'ocorrencia_saida_pista',
        'aeronave_tipo_veiculo',
        'aeronave_motor_tipo',
        'aeronave_motor_quantidade',
        'aeronave_tipo_operacao',
        'aeronave_nivel_dano',
        'periodo',     

        #'ocorrencia_latitude',
        #'ocorrencia_longitude', 
        #'aeronave_assentos',

        #'minmax__ocorrencia_latitude',
        #'minmax__ocorrencia_longitude', 
        #'minmax__aeronave_assentos',
        ]

columns_trans_apaga = [
        'ocorrencia_classificacao',
        'ocorrencia_saida_pista',
        'aeronave_tipo_veiculo',
        'aeronave_motor_tipo',
        'aeronave_motor_quantidade',
        'aeronave_tipo_operacao',
        'aeronave_nivel_dano',
        'periodo',     

        'ocorrencia_latitude',
        'ocorrencia_longitude', 
        'aeronave_assentos',

        'minmax__ocorrencia_latitude',
        'minmax__ocorrencia_longitude', 
        #'minmax__aeronave_assentos',
        ]        
      

In [98]:
# carregando colunas listadas anteriormente para um novo dataframe
df_acidentes_aero = df_acidentes_aero_trans[columns_aero]

In [99]:
# conferindo a dimensão(linhas x colunas) do dataframe de colunas originais
# temos 8 colunas categoricas e 2 colunas numericas
df_acidentes_aero.shape

(6888, 8)

In [100]:
# fazendo um reconhecimento inicial das primeiras linhas do dataframe criado com variáveis(colunas) originais
df_acidentes_aero.head(5)

Unnamed: 0,ocorrencia_classificacao,ocorrencia_saida_pista,aeronave_tipo_veiculo,aeronave_motor_tipo,aeronave_motor_quantidade,aeronave_tipo_operacao,aeronave_nivel_dano,periodo
0,INCIDENTE,NÃO,AVIÃO,PISTÃO,BIMOTOR,PRIVADA,***,00:00 às 06:00
1,INCIDENTE,NÃO,AVIÃO,PISTÃO,MONOMOTOR,PRIVADA,NENHUM,18:00 às 00:00
2,INCIDENTE,NÃO,AVIÃO,JATO,BIMOTOR,***,***,18:00 às 00:00
3,INCIDENTE,NÃO,AVIÃO,PISTÃO,MONOMOTOR,INSTRUÇÃO,***,00:00 às 06:00
4,INCIDENTE,NÃO,AVIÃO,PISTÃO,BIMOTOR,TÁXI AÉREO,NENHUM,18:00 às 00:00


In [101]:
# cria dataframe somente com as colunas que sofreram transformação
df_acidentes_aero_trans = df_acidentes_aero_trans.drop(columns=columns_trans_apaga, axis=1)

- Essas colunas que sofreram transformação serão as colunas utilizadas para a definição do cluster ao qual o acidente de cada linha do dataframe vai pertencer

In [102]:
# verificando o tamanho do dataframe(linhas x colunas) criado com as colunas transformadas
# temos 47 variáveis binárias categóricas e 2 variáveis numéricas que variam de 0 a 1
df_acidentes_aero_trans.shape

(6888, 33)

In [103]:
# verificando as primeiras linhas do novo dataframe
df_acidentes_aero_trans.head(5)

Unnamed: 0,one-hot__ocorrencia_classificacao_ACIDENTE,one-hot__ocorrencia_classificacao_INCIDENTE,one-hot__ocorrencia_classificacao_INCIDENTE GRAVE,one-hot__ocorrencia_saida_pista_NÃO,one-hot__ocorrencia_saida_pista_SIM,one-hot__aeronave_tipo_veiculo_***,one-hot__aeronave_tipo_veiculo_AVIÃO,one-hot__aeronave_tipo_veiculo_HELICÓPTERO,one-hot__aeronave_tipo_veiculo_ULTRALEVE,one-hot__aeronave_motor_tipo_***,one-hot__aeronave_motor_tipo_JATO,one-hot__aeronave_motor_tipo_PISTÃO,one-hot__aeronave_motor_tipo_TURBOEIXO,one-hot__aeronave_motor_tipo_TURBOÉLICE,one-hot__aeronave_motor_quantidade_***,one-hot__aeronave_motor_quantidade_BIMOTOR,one-hot__aeronave_motor_quantidade_MONOMOTOR,one-hot__aeronave_motor_quantidade_SEM TRAÇÃO,one-hot__aeronave_tipo_operacao_***,one-hot__aeronave_tipo_operacao_AGRÍCOLA,one-hot__aeronave_tipo_operacao_EXPERIMENTAL,one-hot__aeronave_tipo_operacao_INSTRUÇÃO,one-hot__aeronave_tipo_operacao_POLICIAL,one-hot__aeronave_tipo_operacao_PRIVADA,one-hot__aeronave_tipo_operacao_REGULAR,one-hot__aeronave_tipo_operacao_TÁXI AÉREO,one-hot__aeronave_nivel_dano_***,one-hot__aeronave_nivel_dano_DESTRUÍDA,one-hot__aeronave_nivel_dano_LEVE,one-hot__aeronave_nivel_dano_NENHUM,one-hot__aeronave_nivel_dano_SUBSTANCIAL,minmax__aeronave_assentos,periodo_oe
0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.009105,0.0
1,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.00607,1.0
2,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.003035,0.0
4,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.015175,1.0


In [104]:
# separa variáveis independentes e dependentes
X = df_acidentes_aero_trans.drop(columns=['periodo_oe'], axis=1)
y = df_acidentes_aero_trans[['periodo_oe']]

In [105]:
# separa os sets de treino e teste em 20% para teste e 80% para treino
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [106]:
# verifica o tamanho de cada set resultante do split
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(5510, 32)
(5510, 1)
(1378, 32)
(1378, 1)


In [107]:
# treina o modelo
clf = LogisticRegression(random_state=42)
clf.fit(X_train, y_train)

In [108]:
# prediz y com predict e predict_proba
y_pred_proba = clf.predict_proba(X_test) # fornece o percentual de cada classe
y_pred = clf.predict(X_test)

In [109]:
# visualiza a predição de y de X_test
y_pred

array([1., 2., 2., ..., 2., 0., 2.])

In [110]:
# gera a matriz de confusão para análise
# tivemos 123 acertos(soma da diagonal principal) contra 78 erros(vrs restantes) 
# de 201 total dando 61,19% de acerto
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

array([[ 74,  23, 194,  15],
       [ 42,  38, 275,  19],
       [ 41,  32, 406,  28],
       [ 24,  11, 129,  27]], dtype=int64)

In [111]:
# verifica a acurácia, precisão e recall do resultado que corresponde à matriz de confusão
print('acurácia =', metrics.accuracy_score(y_test, y_pred))
print('precisão =', metrics.precision_score(y_test, y_pred, average='weighted'))
print('recall   =', metrics.recall_score(y_test, y_pred, average='weighted'))
print('f1-score =', metrics.f1_score(y_test, y_pred, average='weighted'))

acurácia = 0.3955007256894049
precisão = 0.3807870472459023
recall   = 0.3955007256894049
f1-score = 0.3350885080689251


In [112]:
# visualiza a predição das probabilidades de y
print(y_pred_proba)

[[0.15209217 0.39685067 0.35969118 0.09136599]
 [0.19807862 0.21977255 0.4582921  0.12385673]
 [0.22080703 0.24726406 0.44541241 0.0865165 ]
 ...
 [0.20066802 0.26674721 0.42638933 0.10619544]
 [0.30693205 0.29076435 0.24659034 0.15571326]
 [0.17062306 0.31159574 0.38621477 0.13156643]]


In [113]:
# transforma array em dataframe
dfproba = pd.DataFrame(y_pred_proba)

In [114]:
# renomeia as colunas de resultado do predict_proba
dfproba.columns = ['12:00 às 18:00', '18:00 às 00:00', '00:00 às 06:00', '06:00 às 12:00']

In [115]:
# arredonda os resultados com duas casas decimai
dfproba = round(dfproba*100, 2)

In [116]:
# soma o total de percentuais para checar se fecha em 100%
dfproba['soma_perc'] = dfproba['12:00 às 18:00'] + dfproba['18:00 às 00:00'] + \
                       dfproba['00:00 às 06:00'] + dfproba['06:00 às 12:00']

In [117]:
# exibe o dataframe para checar os percentuais
dfproba.head(10)

Unnamed: 0,12:00 às 18:00,18:00 às 00:00,00:00 às 06:00,06:00 às 12:00,soma_perc
0,15.21,39.69,35.97,9.14,100.01
1,19.81,21.98,45.83,12.39,100.01
2,22.08,24.73,44.54,8.65,100.0
3,18.98,25.09,45.91,10.02,100.0
4,32.07,29.0,18.55,20.38,100.0
5,20.07,26.67,42.64,10.62,100.0
6,18.98,25.09,45.91,10.02,100.0
7,18.66,32.62,39.98,8.74,100.0
8,15.84,23.35,46.19,14.62,100.0
9,18.98,25.09,45.91,10.02,100.0


In [118]:
df_acidentes_aero = pd.concat([df_acidentes_aero, dfproba], axis=1)

In [119]:
df_acidentes_aero.head(5)

Unnamed: 0,ocorrencia_classificacao,ocorrencia_saida_pista,aeronave_tipo_veiculo,aeronave_motor_tipo,aeronave_motor_quantidade,aeronave_tipo_operacao,aeronave_nivel_dano,periodo,12:00 às 18:00,18:00 às 00:00,00:00 às 06:00,06:00 às 12:00,soma_perc
0,INCIDENTE,NÃO,AVIÃO,PISTÃO,BIMOTOR,PRIVADA,***,00:00 às 06:00,15.21,39.69,35.97,9.14,100.01
1,INCIDENTE,NÃO,AVIÃO,PISTÃO,MONOMOTOR,PRIVADA,NENHUM,18:00 às 00:00,19.81,21.98,45.83,12.39,100.01
2,INCIDENTE,NÃO,AVIÃO,JATO,BIMOTOR,***,***,18:00 às 00:00,22.08,24.73,44.54,8.65,100.0
3,INCIDENTE,NÃO,AVIÃO,PISTÃO,MONOMOTOR,INSTRUÇÃO,***,00:00 às 06:00,18.98,25.09,45.91,10.02,100.0
4,INCIDENTE,NÃO,AVIÃO,PISTÃO,BIMOTOR,TÁXI AÉREO,NENHUM,18:00 às 00:00,32.07,29.0,18.55,20.38,100.0


## Faz o Deploy do Modelo

In [120]:
# faz o deploy do modelo
from joblib import dump

dump(clf, 'modelos/LR_cenipa_acidentes_aero.pkl')

['modelos/LR_cenipa_acidentes_aero.pkl']