<h2>Importar as bibliotecas</h2>

In [11]:
import numpy as np
import pandas as pd
import shap
import xgboost
from sklearn.compose import ColumnTransformer
# NOTE(review): Imputer only exists in scikit-learn < 0.22 (later releases provide
# sklearn.impute.SimpleImputer instead); it is not used by the cells shown here.
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

<h2>Conjunto de dados</h2>

In [2]:
# Load the Braga traffic-incident export. The literal string 'NaN' is listed
# explicitly as a missing-value marker (pandas also treats it as NA by default).
dataset = pd.read_csv(
    'Traffic_Incidents_Braga_Until_20190228.csv',
    na_values='NaN',
)

# Preview the first rows to sanity-check the parsed columns.
dataset.head()

Unnamed: 0,city_name,description,cause_of_incident,from_road,to_road,affected_roads,incident_category_desc,magnitude_of_delay_desc,length_in_meters,delay_in_seconds,incident_date
0,Braga,stationary traffic,,Braga-Circular (A11),Celeirós (IP9) (A11),A11,Jam,Major,583,123,2019-02-28 23:45:00.098000
1,Braga,closed,,Avenida Doutor Francisco Salgado Zenha,Avenida Robert Smith,,Road Closed,Undefined,60,0,2019-02-28 23:45:00.098000
2,Braga,stationary traffic,,Braga-Circular (A11),Celeirós (IP9) (A11),A11,Jam,Major,662,191,2019-02-28 23:25:00.353000
3,Braga,closed,,Avenida Doutor Francisco Salgado Zenha,Avenida Robert Smith,,Road Closed,Undefined,60,0,2019-02-28 23:25:00.353000
4,Braga,slow traffic,,Braga-Circular (A11),Celeirós (IP9) (A11),A11,Jam,Minor,662,26,2019-02-28 23:05:00.154000


<h2>Verificação dos valores em falta</h2>

In [3]:
# Count the missing values in each column so the fill/drop decisions made in the
# following cells can be checked against the actual data.
dataset.isnull().sum()

<h2>Tratamento dos valores em falta</h2>

In [4]:
# Replace missing entries in these two text columns with the placeholder string 'None',
# so they hold a regular value instead of NaN.
text_columns_with_gaps = ['cause_of_incident', 'affected_roads']
dataset[text_columns_with_gaps] = dataset[text_columns_with_gaps].fillna('None')

<h2>Eliminação de colunas sem valor adicional</h2>

In [5]:
# Discard the city, road and timestamp columns; the remaining frame keeps the
# incident descriptors plus the length and delay measurements.
unused_columns = ['city_name', 'from_road', 'to_road', 'affected_roads', 'incident_date']
dataset = dataset.drop(columns=unused_columns)

# Preview the reduced frame.
dataset.head()

Unnamed: 0,description,cause_of_incident,incident_category_desc,magnitude_of_delay_desc,length_in_meters,delay_in_seconds
0,stationary traffic,,Jam,Major,583,123
1,closed,,Road Closed,Undefined,60,0
2,stationary traffic,,Jam,Major,662,191
3,closed,,Road Closed,Undefined,60,0
4,slow traffic,,Jam,Minor,662,26


<h2>Obter os valores</h2>

In [6]:
# Work on an independent copy of the cleaned frame. The previous full-slice selection
# (`dataset.iloc[:, :]`) is not guaranteed to be independent of `dataset`, so the in-place
# encoding applied to X further down could also modify `dataset` or trigger
# SettingWithCopyWarning, depending on the pandas version.
X = dataset.copy()

<h2>Identificar as colunas categóricas</h2>

In [7]:
# Boolean Series indexed by column name: True where the column has object dtype
# (the text columns in the preview above).
categorical_feature_mask = X.dtypes.eq(object)

# Names of the flagged columns, in their original column order.
categorical_cols = [
    column_name
    for column_name, is_categorical in categorical_feature_mask.items()
    if is_categorical
]

<h2>Codificação das variáveis independentes categóricas</h2>

In [10]:
# Integer-encode every categorical column. The same LabelEncoder instance is re-fitted for
# each column, so afterwards it only holds the classes of the last column it processed.
labelEncoder = LabelEncoder()

X[categorical_cols] = X[categorical_cols].apply(lambda col: labelEncoder.fit_transform(col))

# One-hot encode the categorical columns and pass the remaining columns through unchanged.
# ColumnTransformer replaces the `categorical_features` argument of OneHotEncoder, which
# scikit-learn deprecated in 0.20 and removed in 0.22. The output layout is the same as
# before: one-hot columns first, the other columns stacked to the right.
# sparse_threshold=0 forces a dense ndarray, which is what `sparse=False` produced.
oneHotEncoder = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(categories='auto'), categorical_cols)],
    remainder='passthrough',
    sparse_threshold=0,
)

X_oneHotEncoder = oneHotEncoder.fit_transform(X)

# NOTE(review): the cells visible below keep working on the label-encoded X;
# X_oneHotEncoder is not consumed by any of them.
X.head()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Unnamed: 0,description,cause_of_incident,incident_category_desc,magnitude_of_delay_desc,length_in_meters,delay_in_seconds
0,6,0,1,0,583,123
1,1,0,3,3,60,0
2,6,0,1,0,662,191
3,1,0,3,3,60,0
4,5,0,1,1,662,26


<h2>Feature Scaling</h2>

In [15]:
# Standardize every column to zero mean and unit variance.
# NOTE(review): the categorical columns are integer label codes at this point, so they are
# scaled as if they were numeric measurements — confirm this is intended.
sc = StandardScaler()
X = sc.fit(X).transform(X)

# Show the scaled matrix; X is now a NumPy array rather than a DataFrame.
X

array([[ 1.03454698, -0.02524751, -0.89435272, -1.16973962,  0.62749341,
        -0.00979197],
       [-1.03576071, -0.02524751,  1.09986431,  0.9861592 , -0.75935593,
        -0.75995856],
       [ 1.03454698, -0.02524751, -0.89435272, -1.16973962,  0.83697926,
         0.40493428],
       ...,
       [-0.20763763, -0.02524751, -0.89435272,  0.26752626,  1.33550253,
        -0.17446268],
       [-0.20763763, -0.02524751, -0.89435272, -1.16973962, -0.2820464 ,
        -0.42451821],
       [ 1.03454698, -0.02524751, -0.89435272, -1.16973962, -0.38811519,
         0.44152777]])

<h2>PCA</h2>

In [20]:
from sklearn.decomposition import PCA
# Reduce the feature matrix to its first two principal components.
pca = PCA(n_components = 2)
# NOTE(review): X is rebound to the 2-column projection here, so any cell that runs after
# this one no longer sees the standardized per-attribute matrix.
X = pca.fit_transform(X)
# Share of the total variance explained by each retained component.
explained_variance = pca.explained_variance_ratio_
explained_variance

array([0.6317583 , 0.16695396])

<h2>Calcular fatores de normalização para cada atributo</h2>

In [19]:
# Column-wise mean of X (despite the variable name, this is a mean, not a sum).
# NOTE(review): the stored output has six values and was produced by execution 19, i.e.
# before the PCA cell (execution 20) rebound X. Run top to bottom, X is the 2-component
# PCA projection at this point — confirm which matrix this cell is meant to use.
accumulated_sum = X.mean(axis=0)
accumulated_sum

array([ 4.97383024e-18, -1.27454400e-17, -3.97906419e-17, -9.94766047e-18,
        1.98953209e-17,  1.98953209e-17])