## Recupera os dados analisados e tratados, treina o modelo, fazendo a classificação dos acidentes aeronáuticos por período de ocorrência.

In [197]:
# importa bibliotecas
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

pd.set_option('display.max_rows', None) # permite exibição de todas as linhas
pd.set_option('display.max_columns', None) # permite exibição de todas as colunas
import warnings
warnings.simplefilter('ignore') # desativa avisos de warning

In [198]:
# carregando arquivo tratado em dataframe
df_acidentes_aero_trans = pd.read_csv('arquivos_tratados/df_acidentes_aero_trans.csv')

In [199]:
# conferindo a dimensão do dataframe, linhas e colunas
df_acidentes_aero_trans.shape

(6888, 58)

In [200]:
# ordenando pelo index o dataframe carregado
df_acidentes_aero_trans = df_acidentes_aero_trans.sort_index()

In [201]:
df_acidentes_aero_trans.head(5)

Unnamed: 0,ocorrencia_classificacao,ocorrencia_saida_pista,aeronave_tipo_veiculo,aeronave_motor_tipo,aeronave_motor_quantidade,aeronave_tipo_operacao,aeronave_nivel_dano,periodo,ocorrencia_latitude,ocorrencia_longitude,aeronave_assentos,one-hot__ocorrencia_classificacao_ACIDENTE,one-hot__ocorrencia_classificacao_INCIDENTE,one-hot__ocorrencia_classificacao_INCIDENTE GRAVE,one-hot__ocorrencia_saida_pista_NÃO,one-hot__ocorrencia_saida_pista_SIM,one-hot__aeronave_tipo_veiculo_***,one-hot__aeronave_tipo_veiculo_ANFÍBIO,one-hot__aeronave_tipo_veiculo_AVIÃO,one-hot__aeronave_tipo_veiculo_BALÃO,one-hot__aeronave_tipo_veiculo_DIRIGÍVEL,one-hot__aeronave_tipo_veiculo_GIROCÓPTERO,one-hot__aeronave_tipo_veiculo_HELICÓPTERO,one-hot__aeronave_tipo_veiculo_HIDROAVIÃO,one-hot__aeronave_tipo_veiculo_PLANADOR,one-hot__aeronave_tipo_veiculo_TRIKE,one-hot__aeronave_tipo_veiculo_ULTRALEVE,one-hot__aeronave_motor_tipo_***,one-hot__aeronave_motor_tipo_JATO,one-hot__aeronave_motor_tipo_PISTÃO,one-hot__aeronave_motor_tipo_SEM TRAÇÃO,one-hot__aeronave_motor_tipo_TURBOEIXO,one-hot__aeronave_motor_tipo_TURBOÉLICE,one-hot__aeronave_motor_quantidade_***,one-hot__aeronave_motor_quantidade_BIMOTOR,one-hot__aeronave_motor_quantidade_MONOMOTOR,one-hot__aeronave_motor_quantidade_QUADRIMOTOR,one-hot__aeronave_motor_quantidade_SEM TRAÇÃO,one-hot__aeronave_motor_quantidade_TRIMOTOR,one-hot__aeronave_tipo_operacao_***,one-hot__aeronave_tipo_operacao_AGRÍCOLA,one-hot__aeronave_tipo_operacao_ESPECIALIZADA,one-hot__aeronave_tipo_operacao_EXPERIMENTAL,one-hot__aeronave_tipo_operacao_INSTRUÇÃO,one-hot__aeronave_tipo_operacao_NÃO REGULAR,one-hot__aeronave_tipo_operacao_POLICIAL,one-hot__aeronave_tipo_operacao_PRIVADA,one-hot__aeronave_tipo_operacao_REGULAR,one-hot__aeronave_tipo_operacao_TÁXI AÉREO,one-hot__aeronave_nivel_dano_***,one-hot__aeronave_nivel_dano_DESTRUÍDA,one-hot__aeronave_nivel_dano_LEVE,one-hot__aeronave_nivel_dano_NENHUM,one-hot__aeronave_nivel_dano_SUBSTANCIAL,minmax__ocorrencia_latitude,minmax__ocorrencia_longitude,minmax__aeronave_assentos,periodo_oe
0,INCIDENTE,NÃO,AVIÃO,PISTÃO,BIMOTOR,PRIVADA,***,00:00 às 06:00,0.0,0.0,6.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.528177,0.537901,0.009105,0.0
1,INCIDENTE,NÃO,AVIÃO,PISTÃO,MONOMOTOR,PRIVADA,NENHUM,18:00 às 00:00,0.0,0.0,4.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.528177,0.537901,0.00607,1.0
2,INCIDENTE,NÃO,AVIÃO,JATO,BIMOTOR,***,***,18:00 às 00:00,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.528177,0.537901,0.0,1.0
3,INCIDENTE,NÃO,AVIÃO,PISTÃO,MONOMOTOR,INSTRUÇÃO,***,00:00 às 06:00,0.0,0.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.528177,0.537901,0.003035,0.0
4,INCIDENTE,NÃO,AVIÃO,PISTÃO,BIMOTOR,TÁXI AÉREO,NENHUM,18:00 às 00:00,0.0,0.0,10.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.528177,0.537901,0.015175,1.0


In [202]:
# verifica os códigos numéricos dos períodos com os códigos categóricos
print('periodo_oe', df_acidentes_aero_trans['periodo_oe'].value_counts())
print('periodo', df_acidentes_aero_trans['periodo'].value_counts())

periodo_oe periodo_oe
2.0    2548
1.0    1885
0.0    1526
3.0     929
Name: count, dtype: int64
periodo periodo
12:00 às 18:00    2548
18:00 às 00:00    1885
00:00 às 06:00    1526
06:00 às 12:00     929
Name: count, dtype: int64


In [203]:
# criando lista de colunas originais categóricas e numéricas que participarão da clusterização
columns = [
        'ocorrencia_classificacao',
        'ocorrencia_saida_pista',
        'aeronave_tipo_veiculo',
        'aeronave_motor_tipo',
        'aeronave_motor_quantidade',
        'aeronave_tipo_operacao',
        'aeronave_nivel_dano',
        'periodo',     

        'ocorrencia_latitude',
        'ocorrencia_longitude', 
        'aeronave_assentos',

        'minmax__ocorrencia_latitude',
        'minmax__ocorrencia_longitude', 
        'minmax__aeronave_assentos',

        ]

In [204]:
# carregando colunas listadas anteriormente para um novo dataframe
df_acidentes_aero = df_acidentes_aero_trans[columns]

In [205]:
# conferindo a dimensão(linhas x colunas) do dataframe de colunas originais
# temos 8 colunas categoricas e 2 colunas numericas
df_acidentes_aero.shape

(6888, 14)

In [206]:
# fazendo um reconhecimento inicial das primeiras linhas do dataframe criado com variáveis(colunas) originais
df_acidentes_aero.head(5)

Unnamed: 0,ocorrencia_classificacao,ocorrencia_saida_pista,aeronave_tipo_veiculo,aeronave_motor_tipo,aeronave_motor_quantidade,aeronave_tipo_operacao,aeronave_nivel_dano,periodo,ocorrencia_latitude,ocorrencia_longitude,aeronave_assentos,minmax__ocorrencia_latitude,minmax__ocorrencia_longitude,minmax__aeronave_assentos
0,INCIDENTE,NÃO,AVIÃO,PISTÃO,BIMOTOR,PRIVADA,***,00:00 às 06:00,0.0,0.0,6.0,0.528177,0.537901,0.009105
1,INCIDENTE,NÃO,AVIÃO,PISTÃO,MONOMOTOR,PRIVADA,NENHUM,18:00 às 00:00,0.0,0.0,4.0,0.528177,0.537901,0.00607
2,INCIDENTE,NÃO,AVIÃO,JATO,BIMOTOR,***,***,18:00 às 00:00,0.0,0.0,0.0,0.528177,0.537901,0.0
3,INCIDENTE,NÃO,AVIÃO,PISTÃO,MONOMOTOR,INSTRUÇÃO,***,00:00 às 06:00,0.0,0.0,2.0,0.528177,0.537901,0.003035
4,INCIDENTE,NÃO,AVIÃO,PISTÃO,BIMOTOR,TÁXI AÉREO,NENHUM,18:00 às 00:00,0.0,0.0,10.0,0.528177,0.537901,0.015175


In [207]:
# cria dataframe somente com as colunas que sofreram transformação
df_acidentes_aero_trans = df_acidentes_aero_trans.drop(columns=columns, axis=1)

- Essas colunas que sofreram transformação serão as colunas utilizadas para a definição do cluster ao qual o acidente de cada linha do dataframe vai pertencer

In [208]:
# verificando o tamanho do dataframe(linhas x colunas) criado com as colunas transformadas
# temos 47 variáveis binárias categóricas e 2 variáveis numéricas que variam de 0 a 1
df_acidentes_aero_trans.shape

(6888, 44)

In [209]:
# verificando as primeiras linhas do novo dataframe
df_acidentes_aero_trans.head(5)

Unnamed: 0,one-hot__ocorrencia_classificacao_ACIDENTE,one-hot__ocorrencia_classificacao_INCIDENTE,one-hot__ocorrencia_classificacao_INCIDENTE GRAVE,one-hot__ocorrencia_saida_pista_NÃO,one-hot__ocorrencia_saida_pista_SIM,one-hot__aeronave_tipo_veiculo_***,one-hot__aeronave_tipo_veiculo_ANFÍBIO,one-hot__aeronave_tipo_veiculo_AVIÃO,one-hot__aeronave_tipo_veiculo_BALÃO,one-hot__aeronave_tipo_veiculo_DIRIGÍVEL,one-hot__aeronave_tipo_veiculo_GIROCÓPTERO,one-hot__aeronave_tipo_veiculo_HELICÓPTERO,one-hot__aeronave_tipo_veiculo_HIDROAVIÃO,one-hot__aeronave_tipo_veiculo_PLANADOR,one-hot__aeronave_tipo_veiculo_TRIKE,one-hot__aeronave_tipo_veiculo_ULTRALEVE,one-hot__aeronave_motor_tipo_***,one-hot__aeronave_motor_tipo_JATO,one-hot__aeronave_motor_tipo_PISTÃO,one-hot__aeronave_motor_tipo_SEM TRAÇÃO,one-hot__aeronave_motor_tipo_TURBOEIXO,one-hot__aeronave_motor_tipo_TURBOÉLICE,one-hot__aeronave_motor_quantidade_***,one-hot__aeronave_motor_quantidade_BIMOTOR,one-hot__aeronave_motor_quantidade_MONOMOTOR,one-hot__aeronave_motor_quantidade_QUADRIMOTOR,one-hot__aeronave_motor_quantidade_SEM TRAÇÃO,one-hot__aeronave_motor_quantidade_TRIMOTOR,one-hot__aeronave_tipo_operacao_***,one-hot__aeronave_tipo_operacao_AGRÍCOLA,one-hot__aeronave_tipo_operacao_ESPECIALIZADA,one-hot__aeronave_tipo_operacao_EXPERIMENTAL,one-hot__aeronave_tipo_operacao_INSTRUÇÃO,one-hot__aeronave_tipo_operacao_NÃO REGULAR,one-hot__aeronave_tipo_operacao_POLICIAL,one-hot__aeronave_tipo_operacao_PRIVADA,one-hot__aeronave_tipo_operacao_REGULAR,one-hot__aeronave_tipo_operacao_TÁXI AÉREO,one-hot__aeronave_nivel_dano_***,one-hot__aeronave_nivel_dano_DESTRUÍDA,one-hot__aeronave_nivel_dano_LEVE,one-hot__aeronave_nivel_dano_NENHUM,one-hot__aeronave_nivel_dano_SUBSTANCIAL,periodo_oe
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


In [210]:
# separa variáveis independentes e dependentes
X = df_acidentes_aero_trans.drop(columns=['periodo_oe'], axis=1)
y = df_acidentes_aero_trans[['periodo_oe']]

In [211]:
# separa os sets de treino e teste em 20% para teste e 80% para treino
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [212]:
# verifica o tamanho de cada set resultante do split
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(5510, 43)
(5510, 1)
(1378, 43)
(1378, 1)


In [213]:
# treina o modelo
clf = LogisticRegression(random_state=42)
clf.fit(X_train, y_train)

In [214]:
# prediz y com predict e predict_proba
y_pred_proba = clf.predict_proba(X_test) # fornece o percentual de cada classe
y_pred = clf.predict(X_test)

In [215]:
# visualiza a predição de y de X_test
y_pred

array([1., 2., 2., ..., 2., 0., 2.])

In [216]:
# gera a matriz de confusão para análise
# tivemos 123 acertos(soma da diagonal principal) contra 78 erros(vrs restantes) 
# de 201 total dando 61,19% de acerto
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

array([[ 73,  24, 191,  18],
       [ 38,  44, 269,  23],
       [ 41,  33, 403,  30],
       [ 25,  13, 125,  28]], dtype=int64)

In [217]:
# verifica a acurácia, precisão e recall do resultado que corresponde à matriz de confusão
print('acurácia =', metrics.accuracy_score(y_test, y_pred))
print('precisão =', metrics.precision_score(y_test, y_pred, average='weighted'))
print('recall   =', metrics.recall_score(y_test, y_pred, average='weighted'))
print('f1-score =', metrics.f1_score(y_test, y_pred, average='weighted'))

acurácia = 0.397677793904209
precisão = 0.3856147319467636
recall   = 0.397677793904209
f1-score = 0.34119115124544974


In [218]:
# visualiza a predição das probabilidades de y
print(y_pred_proba)

[[0.15275294 0.39584737 0.35847716 0.09292253]
 [0.19930636 0.21919559 0.4541196  0.12737845]
 [0.21945304 0.24757876 0.44600344 0.08696477]
 ...
 [0.20018755 0.26707575 0.4263186  0.10641811]
 [0.30498737 0.29192806 0.24490084 0.15818373]
 [0.17051259 0.31143    0.38606506 0.13199234]]


In [219]:
# transforma array em dataframe
dfproba = pd.DataFrame(y_pred_proba)

In [220]:
# renomeia as colunas de resultado do predict_proba
dfproba.columns = ['12:00 às 18:00', '18:00 às 00:00', '00:00 às 06:00', '06:00 às 12:00']

In [221]:
# arredonda os resultados com duas casas decimai
dfproba = round(dfproba*100, 2)

In [222]:
# soma o total de percentuais para checar se fecha em 100%
dfproba['soma_perc'] = dfproba['12:00 às 18:00'] + dfproba['18:00 às 00:00'] + \
                       dfproba['00:00 às 06:00'] + dfproba['06:00 às 12:00']

In [223]:
# exibe o dataframe para checar os percentuais
dfproba.head(10)

Unnamed: 0,12:00 às 18:00,18:00 às 00:00,00:00 às 06:00,06:00 às 12:00,soma_perc
0,15.28,39.58,35.85,9.29,100.0
1,19.93,21.92,45.41,12.74,100.0
2,21.95,24.76,44.6,8.7,100.01
3,18.89,25.04,46.0,10.06,99.99
4,34.53,22.3,18.19,24.98,100.0
5,20.02,26.71,42.63,10.64,100.0
6,18.89,25.04,46.0,10.06,99.99
7,18.86,32.58,39.81,8.75,100.0
8,15.94,23.54,46.17,14.35,100.0
9,18.89,25.04,46.0,10.06,99.99


## Faz o Deploy do Modelo

In [224]:
# faz o deploy do modelo
from joblib import dump

dump(clf, 'modelos/LR_cenipa_acidentes_aero.pkl')

['modelos/LR_cenipa_acidentes_aero.pkl']