## Modulos

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
from pprint import pprint

---

### Importacao e limpeza

In [2]:
df_raw = pd.read_csv('base.csv', sep=';', encoding='latin-1')
df_raw.head()

Unnamed: 0,Classe de Manutenção - Código,Classe de Manutenção - Descrição,Data de Entrada,Data de Saída,Funcionário - Código,Funcionário - Descrição,Sistema Veicular - Código,Sistema Veicular - Código.1,Subsistema Veicular - Código,Subsistema Veicular - Descrição,\nHoras,\n%Sistema Veicular,\n%Funcionário,\n%Data de Saída,\n%Data de Entrada,\n%Classe de Manutenção,\n% Geral
0,29,Corret. Emergencial,01/04/2020,01/04/2020,22955,Marcelo Venceslau da Silva,1400,SISTEMA HIDRÁULICO,1402,Cilindro hidráulico,2.35,100.0,100.0,2.1,0.11,0.0,0.0
1,29,Corret. Emergencial,01/04/2020,01/04/2020,30413,Agnaldo Aparecido Monteiro,1300,SISTEMA ELÉTRICO,1312,Indicadores Diversos,4.74,100.0,100.0,4.23,0.23,0.0,0.0
2,29,Corret. Emergencial,01/04/2020,01/04/2020,41598,Ademir de Jesus Aparecido,1400,SISTEMA HIDRÁULICO,1410,Mangueiras,2.99,100.0,100.0,2.67,0.15,0.0,0.0
3,29,Corret. Emergencial,01/04/2020,01/04/2020,50082,Alessandro Miranda,1000,PNEUS E AROS,1003,Pneus traseiros,0.29,100.0,100.0,0.26,0.01,0.0,0.0
4,29,Corret. Emergencial,01/04/2020,01/04/2020,52596,Josenaldo Silva Sousa,1300,SISTEMA ELÉTRICO,1301,Alternador,3.35,100.0,100.0,2.99,0.16,0.0,0.0


In [3]:
df_raw.columns = df_raw.columns.str.lower().str.strip('\n')
rename = {
    'funcionário - código': 'cod_funcionario',
    'classe de manutenção - código': 'cod_classe_manutencao',
    'sistema veicular - código': 'cod_sistema',
    'subsistema veicular - código': 'cod_subsistema',
    'data de entrada': 'data_entrada',
    'data de saída': 'data_saida',
    'horas': 'horas',
    '%sistema veicular': '%sistema_veicular',
    '%funcionário': '%funcionario',
    '%data de entrada': '%data-entrada',
    '%data de saída': '%data_saida',
    '%classe de manutenção': '%classe',
    '% geral': '%geral'
    
}
df = df_raw.rename(rename, axis=1)[['cod_classe_manutencao', 'cod_sistema', 'cod_subsistema', 'horas', '%sistema_veicular', '%funcionario', '%data-entrada', '%data_saida', '%classe', '%geral']]
df['data_entrada'] = pd.to_datetime(df_raw['data de entrada'])
df['data_saida'] = pd.to_datetime(df_raw['data de saída'])

---

#### Dados duplicados (ou estranhos mesmo)

In [4]:
manutencao_mesmo_dia = (pd.to_numeric(df['data_saida'] - df['data_entrada'])) == 0
df['manutencao_mesmo_dia'] = manutencao_mesmo_dia

In [5]:
df[(df['manutencao_mesmo_dia'] == True) & (df['horas'] >= 24)]

Unnamed: 0,cod_classe_manutencao,cod_sistema,cod_subsistema,horas,%sistema_veicular,%funcionario,%data-entrada,%data_saida,%classe,%geral,data_entrada,data_saida,manutencao_mesmo_dia
2590,29,1100,1107,219.98,100.0,100.0,4.08,4.18,0.19,0.15,2020-04-30,2020-04-30,True
2591,29,1100,1107,219.98,100.0,100.0,4.08,4.18,0.19,0.15,2020-04-30,2020-04-30,True
2592,29,1100,1107,219.98,100.0,100.0,4.08,4.18,0.19,0.15,2020-04-30,2020-04-30,True
2593,29,1100,1107,219.98,100.0,100.0,4.08,4.18,0.19,0.15,2020-04-30,2020-04-30,True
2594,29,1100,1107,219.98,100.0,100.0,4.08,4.18,0.19,0.15,2020-04-30,2020-04-30,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27641,29,1100,1107,219.90,100.0,100.0,14.17,14.17,0.19,0.15,2020-12-31,2020-12-31,True
27642,29,1100,1107,219.98,100.0,100.0,14.18,14.18,0.19,0.15,2020-12-31,2020-12-31,True
27643,29,1100,1107,219.98,100.0,100.0,14.18,14.18,0.19,0.15,2020-12-31,2020-12-31,True
27644,29,1100,1107,219.98,100.0,100.0,14.18,14.18,0.19,0.15,2020-12-31,2020-12-31,True


Manutencoes que entraram e sairam no mesmo dia, mas duraram mais de 24 horas?<br>
Eles inventaram uma maquina do tempo?

In [6]:
# remocao dessas linhas
df.drop(df[(df['manutencao_mesmo_dia'] == True) & (df['horas'] >= 24)].index, inplace=True)
df.drop(df[df['horas'] == 0].index, inplace=True)

##### Adicao colunas data

In [7]:
df['dia_semana'] = df['data_entrada'].dt.dayofweek
df['mes'] = df['data_entrada'].dt.month

#### Funções para avaliação dos resultados

In [8]:
results = {}

In [9]:
def evaluate(model, x_test, y_test, name='', dicto = results):
    predictions = model.predict(x_test)
    mape = mean_absolute_percentage_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    accuracy = round(1 - mape, 3)*100
    print('Performance')
    print(f'R2: {r2}')
    print(f'Erro medio (MAE): {mae}')
    print(f'Erro medio %(MAPE): {mape}')
    print(f'Precisão = {accuracy}%')
    
    results[name] = accuracy
    return accuracy

---

### Modelo 1b: Random Forest Regressor
    Features adicionais

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score

#### Separacao em set de treino e teste

In [11]:
x1 = df.drop(['horas', 'data_entrada', 'data_saida'], axis=1)
y1 = df['horas']

x1_train, x1_test, y1_train, y1_test = train_test_split(x1, y1, test_size=0.15)

#### Parametros e fit dos sets

In [12]:
regressor = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
regressor.fit(x1_train, y1_train)

RandomForestRegressor(n_jobs=-1, random_state=42)

#### Avaliacao dos resultados

In [13]:
evaluate(regressor, x1_test, y1_test, 'Random Forest Regressor')

Performance
R2: 0.9806969746514415
Erro medio (MAE): 0.22643811298051586
Erro medio %(MAPE): 0.17390895712637594
Precisão = 82.6%


82.6

In [14]:
x1_train

Unnamed: 0,cod_classe_manutencao,cod_sistema,cod_subsistema,%sistema_veicular,%funcionario,%data-entrada,%data_saida,%classe,%geral,manutencao_mesmo_dia,dia_semana,mes
31384,31,1100,1112,52.00,13.00,0.77,13.00,0.00,0.0,False,0,5
3195,29,600,609,47.96,17.74,0.25,0.55,0.00,0.0,True,1,5
21910,29,1,26,100.00,100.00,1.40,7.35,0.01,0.0,False,1,10
10446,29,600,603,100.00,13.35,0.16,0.29,0.00,0.0,True,4,8
3070,29,1,2,100.00,100.00,0.36,0.48,0.00,0.0,True,6,4
...,...,...,...,...,...,...,...,...,...,...,...,...
1511,29,400,412,100.00,78.42,1.95,4.35,0.00,0.0,True,0,4
29517,31,1600,1676,53.85,10.45,1.09,3.23,0.00,0.0,True,6,1
49813,38,1300,1303,20.35,20.35,2.07,2.07,0.01,0.0,True,2,10
20500,29,1000,1002,9.77,9.77,0.10,0.19,0.00,0.0,True,4,1


### Modelo 1c: Random Forest Regressor
    Hyperparameters, RandomSearchCV

In [14]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

#### Parametros atuais

In [15]:
regressor.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

#### Parametros para teste

In [16]:
# ref: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
# Number of trees in random forest
n_estimators = [50, 200, 500, 1000]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [50, 200, 500, 1000]}


#### Modelo

##### Procura pelos melhores parametros

In [17]:
#rRegressor = RandomForestRegressor()
# random_regressor = RandomizedSearchCV(estimator=rRegressor,
#                                       param_distributions=random_grid,
#                                       n_iter=50,
#                                       cv=3,
#                                       verbose=3,
#                                       random_state=42,
#                                       n_jobs=-1)

In [18]:
# random_regressor.fit(x1_train, y1_train)

In [19]:
# best_params = random_regressor.best_params_
# pprint(best_params)

##### Previsão

In [20]:
best_regressor = RandomForestRegressor(bootstrap=True,
                                       max_depth=110,
                                       max_features='auto',
                                       min_samples_leaf=2,
                                       min_samples_split=5,
                                       n_estimators=50,
                                       n_jobs=-1,
                                       random_state=42
                                      )

In [21]:
best_regressor.fit(x1_train, y1_train)

RandomForestRegressor(max_depth=110, min_samples_leaf=2, min_samples_split=5,
                      n_estimators=50, n_jobs=-1, random_state=42)

#### Avaliação dos resultados

In [22]:
evaluate(best_regressor, x1_test, y1_test, 'Random Forest Regressor - Tunned')

Performance
Erro medio: 0.19005580714754242
Precisão = 81.0%


81.0

---

### Modelo 2: Gradient boost

In [23]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

In [24]:
xgbr_x_train, xgbr_x_test, xgbr_y_train, xgbr_y_test = train_test_split(x1, y1, 
                                                        test_size=0.15)

#### Modelo

In [25]:
xgbr_model = XGBRegressor()
xgbr_model.fit(xgbr_x_train, xgbr_y_train, verbose=False)

XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method=None, validate_parameters=False, verbosity=None)

In [26]:
evaluate(xgbr_model, xgbr_x_test, xgbr_y_test, 'XGBRegressor')

Performance
Erro medio: 0.35729906898950525
Precisão = 64.3%


64.3

<i>ouch</i>

#### Tuning

In [27]:
pprint(xgbr_model.get_params())

{'base_score': 0.5,
 'booster': None,
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'gpu_id': -1,
 'importance_type': 'gain',
 'interaction_constraints': None,
 'learning_rate': 0.300000012,
 'max_delta_step': 0,
 'max_depth': 6,
 'min_child_weight': 1,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': 0,
 'num_parallel_tree': 1,
 'objective': 'reg:squarederror',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'subsample': 1,
 'tree_method': None,
 'validate_parameters': False,
 'verbosity': None}


In [28]:
grid_xgbr_params = {'n_estimators': [100, 250],
                    'learning_rate': [0.1, 0.25, 0.5],
                    'max_depth': [10],
                    'random_state': [22]}

In [29]:
grid_xgbr_model = XGBRegressor()
xgbr_grid = GridSearchCV(estimator=grid_xgbr_model,
                         param_grid=grid_xgbr_params,
                         cv=3,
                         verbose=3,
                         n_jobs=-1)

In [30]:
xgbr_grid.fit(xgbr_x_train, xgbr_y_train,
              eval_set=[(xgbr_x_train, xgbr_y_train), (xgbr_x_test, xgbr_y_test)],
              early_stopping_rounds=5)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[0]	validation_0-rmse:3.42276	validation_1-rmse:3.57179
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 5 rounds.
[1]	validation_0-rmse:3.10051	validation_1-rmse:3.23274
[2]	validation_0-rmse:2.81145	validation_1-rmse:2.93062
[3]	validation_0-rmse:2.55254	validation_1-rmse:2.66058
[4]	validation_0-rmse:2.31963	validation_1-rmse:2.41776
[5]	validation_0-rmse:2.11048	validation_1-rmse:2.20119
[6]	validation_0-rmse:1.92256	validation_1-rmse:2.00545
[7]	validation_0-rmse:1.75417	validation_1-rmse:1.83202
[8]	validation_0-rmse:1.60355	validation_1-rmse:1.67432
[9]	validation_0-rmse:1.46876	validation_1-rmse:1.53384
[10]	validation_0-rmse:1.34822	validation_1-rmse:1.41054
[11]	validation_0-rmse:1.24016	validation_1-rmse:1.29888
[12]	validation_0-rmse:1.14376	validation_1-rmse:1.20133
[13]	validation_0-rmse:1.05753	validation_1-rmse:1.

GridSearchCV(cv=3,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=None, subsample=None,
                                    tree_method=None, validate_para

Melhores parametros

In [31]:
pprint(xgbr_grid.best_params_)
xgbr_best_params = xgbr_grid.best_params_
# xgbr_best_params_ = {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 250, 'random_state': 22}
learning_rate = 0.25
max_depth = 10
n_estimators = 250
random_state = 22
early_stopping_rounds = 5

{'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 250, 'random_state': 22}


In [32]:
xgbr_best_model = XGBRegressor(learning_rate=learning_rate,
                               max_depth=max_depth,
                               n_estimators=n_estimators,
                               random_state=22)
xgbr_best_model.fit(xgbr_x_train, xgbr_y_train, verbose=False)
evaluate(xgbr_best_model, xgbr_x_test, xgbr_y_test, 'XGBRegressor - Tunned')

Performance
Erro medio: 0.2241890563367031
Precisão = 77.60000000000001%


77.60000000000001

In [33]:
results

{'Random Forest Regressor': 82.19999999999999,
 'Random Forest Regressor - Tunned': 81.0,
 'XGBRegressor': 64.3,
 'XGBRegressor - Tunned': 77.60000000000001}

### Modelo 3: AdaBoost