# Modulos

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
from pprint import pprint
from collections import namedtuple
import numpy as np

---

## Funções auxiliares

### Avaliação dos resultados

In [2]:
results = {}
def evaluate(model, x_test, y_test, name='', dicto = results):
    predictions = model.predict(x_test)
    mape = mean_absolute_percentage_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    accuracy = round(1 - mape, 3)*100
    print('Performance')
    print(f'R2: {r2}')
    print(f'Erro medio (MAE): {mae}')
    print(f'Erro medio % (MAPE): {mape}')
    print(f'Erro quad medio (MSE): {mse}')
    print(f'Precisão = {accuracy}%')
    
    dif = abs(predictions-y_test)
    dif_in_minutes = dif * 60
    print(f'\nErro medio em minutos: {dif_in_minutes.mean()}')
    predictions_df = pd.DataFrame({'pred': predictions, 'real': y_test, 'dif': dif, 'dif_in_minutes': dif_in_minutes})
    
    results[name] = {'mae': mae, 
                     'mape': mape,
                     'mse': mse,
                     'accuracy': accuracy,
                     'erro_medio_minutos': dif_in_minutes.mean()}

    return predictions_df

In [3]:
def ntuples(df):
    """ Facilita o autocomplete de nomes de colunas """
    list_of_names = df.columns.values
    list_of_names_dict = {x:x for x in list_of_names}

    Varnames = namedtuple('Varnames', list_of_names) 
    return Varnames(**list_of_names_dict)

## Importacao e limpeza

In [4]:
df_raw = pd.read_csv('base19-21.csv', sep=';', encoding='latin-1')
df_raw

Unnamed: 0,cod_classe_manutencao,cod_sistema,cod_subsistema,cod_funcionario,data_entrada,data_saida,horas
0,29,1400,1402,22955,2020-04-01,2020-04-01,2.35
1,29,1300,1312,30413,2020-04-01,2020-04-01,4.74
2,29,1400,1410,41598,2020-04-01,2020-04-01,2.99
3,29,1000,1003,50082,2020-04-01,2020-04-01,0.29
4,29,1300,1301,52596,2020-04-01,2020-04-01,3.35
...,...,...,...,...,...,...,...
103599,38,400,424,110209,2021-03-31,2021-04-02,2.71
103600,38,200,208,57143,2021-03-31,2021-04-05,5.77
103601,38,200,208,94070,2021-03-31,2021-04-05,5.77
103602,38,200,208,108106,2021-03-31,2021-04-05,2.63


In [5]:
def fix_columns(df):
    df_raw.columns = df_raw.columns.str.lower().str.strip('\n')
    rename = {
        'funcionário - código': 'cod_funcionario',
        'classe de manutenção - código': 'cod_classe_manutencao',
        'sistema veicular - código': 'cod_sistema',
        'subsistema veicular - código': 'cod_subsistema',
        'data de entrada': 'data_entrada',
        'data de saída': 'data_saida',
        'horas': 'horas',
        '%sistema veicular': '%sistema_veicular',
        '%funcionário': '%funcionario',
        '%data de entrada': '%data-entrada',
        '%data de saída': '%data_saida',
        '%classe de manutenção': '%classe',
        '% geral': '%geral'

    }
    df = df_raw.rename(rename, axis=1)
    df['data_entrada'] = pd.to_datetime(df_raw['data_entrada'])
    df = df[['cod_classe_manutencao', 'cod_sistema', 'cod_subsistema', 'cod_funcionario', 'data_entrada', 'horas']]
    return df
df = fix_columns(df_raw)
# df = df_raw.copy()

---

### Adição colunas data

In [6]:
def add_weekday_month(df):
    df['dia_semana'] = df['data_entrada'].dt.dayofweek
    df['mes'] = df['data_entrada'].dt.month
    return df
df = add_weekday_month(df)

In [7]:
c = ntuples(df)

### Colunas com estatisticas

In [8]:
def add_describe_columns(df, groupby_column, describe_column):
    """ 
    Adiciona como novas colunas os valores estatisticos 
    do .describe() ao dataframe agrupado 
    """
    # describe_values = df.groupby([groupby_column])[describe_column].describe().to_dict()
    describe_values = df.groupby([groupby_column])[describe_column].agg(['mean', 'std']).to_dict()
    for key in describe_values.keys():
        df[groupby_column + '_' + key] = df[groupby_column]
        df[groupby_column + '_' + key] = df[groupby_column + '_' + key].map(describe_values[key])
    
    return df

In [9]:
# df = add_describe_columns(df, c.cod_sistema, c.horas)
# df = add_describe_columns(df, c.cod_subsistema, c.horas)

### Colunas como categorias (e nao int64)

In [9]:
def convert_to_category(df, columns):
    for col in columns:
        df[col] = df[col].astype('category')
    return df

In [10]:
df = convert_to_category(df, [c.cod_classe_manutencao, c.cod_sistema, c.cod_subsistema, c.cod_funcionario, c.dia_semana, c.mes])

---

## Random Forest Regressor

In [11]:
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score, mean_squared_error
from sklearn.decomposition import PCA

### Pre-processamento

In [12]:
df.dropna(inplace=True)
x = df.drop(['horas', 'data_entrada'], axis=1)
y = df['horas']

### Encoding

In [13]:
def custom_onehot_encoder(df, columns):
    for col in columns:
        df = df.join(pd.get_dummies(df[col],
                                    prefix=col)
                    )
        df = df.drop(col, axis=1)

    return df

### PCA

In [14]:
# x = custom_onehot_encoder(x, [c.cod_classe_manutencao, c.cod_sistema, c.cod_subsistema])
# pca = PCA(n_components=100, random_state=22)
# pca.fit(x)
# x_image = pca.transform(x)

### Training/testing split

In [15]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=22)

In [16]:
rfr = RandomForestRegressor(n_estimators=100, random_state=22, n_jobs=-1)
rfr.fit(x_train, y_train)

RandomForestRegressor(n_jobs=-1, random_state=22)

In [17]:
rfr_df = evaluate(rfr, x_test, y_test, 'RFR')

Performance
R2: 0.6929377325492155
Erro medio (MAE): 0.728528894180341
Erro medio % (MAPE): 1.2107372948981736
Erro quad medio (MSE): 1.8109799294124775
Precisão = -21.099999999999998%

Erro medio em minutos: 43.71173365082045


In [18]:
pd.DataFrame({'feature': x.columns, 'importance': rfr.feature_importances_}).sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
2,cod_subsistema,0.280747
3,cod_funcionario,0.237368
5,mes,0.163278
0,cod_classe_manutencao,0.158765
4,dia_semana,0.118655
1,cod_sistema,0.041188


In [19]:
rfr_df.describe()

Unnamed: 0,pred,real,dif,dif_in_minutes
count,31082.0,31082.0,31082.0,31082.0
mean,1.938281,1.915932,0.728529,43.711734
std,1.992947,2.42857,1.131489,67.889324
min,0.01,0.01,0.0,0.0
25%,0.4911,0.33,0.092262,5.535715
50%,1.393329,1.03,0.34609,20.7654
75%,2.779667,2.67,0.912756,54.765346
max,51.139123,65.12,30.289105,1817.346286


---

### Hyperparameters

In [23]:
from sklearn.model_selection import RandomizedSearchCV

In [24]:
# ref: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
# Number of trees in random forest
n_estimators = [50, 100, 200, 500]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [10, 25, 50, 75, 100]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 25, 50, 75, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [50, 100, 200, 500]}


In [25]:
cv_rfr = RandomForestRegressor()
random_regressor = RandomizedSearchCV(estimator=cv_rfr,
                                      param_distributions=random_grid,
                                      n_iter=50,
                                      cv=3,
                                      verbose=3,
                                      random_state=22,
                                      n_jobs=-1)

In [25]:
# random_regressor.fit(x_train, y_train)
# best_params = random_regressor.best_params_

In [26]:
best_params = {'n_estimators': 200,
               'min_samples_split': 2,
               'min_samples_leaf': 1,
               'max_features': 'auto',
               'max_depth': 50,
               'bootstrap': True}

In [27]:
best_rfr = RandomForestRegressor(**best_params, n_jobs=-1, random_state=22)
best_rfr.fit(x_train, y_train)
best_rfr_df = evaluate(best_rfr, x_test, y_test, 'RFR-HyperParameter')

Performance
R2: 0.6926829384332449
Erro medio (MAE): 0.728023592126119
Erro medio % (MAPE): 1.211648319405504
Erro quad medio (MSE): 1.8124826442656763
Precisão = -21.2%

Erro medio em minutos: 43.68141552756714


## Comparação

In [29]:
pd.DataFrame(results)

Unnamed: 0,RFR,RFR-HyperParameter
mae,0.728529,0.728024
mape,1.210737,1.211648
mse,1.81098,1.812483
accuracy,-21.1,-21.2
erro_medio_minutos,43.711734,43.681416
