### Validando "Porcentagem de itens com preços outliers acima do histórico daquele item"

**Issue**: [#61](https://github.com/lappis-unb/salic-ml/issues/61)

### Features Extracted:

- https://github.com/lappis-unb/salic-ml/wiki/Brainstorming-de-features-para-estimar-Complexidade-Financeira

#### Recarregar automaticamente os módulos

### Planilha orçamentária

SQL Query: 


In [None]:
%load_ext autoreload
%autoreload 2

### Importing data

In [None]:
import os
import sys
import time
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats


from salicml.utils.dates import Dates
from core.utils.read_csv import read_csv_with_different_type
from salicml.utils.utils import debug
from salicml.outliers import gaussian_outlier

# Absolute path two directory levels above the current working directory.
PROJECT_ROOT = os.path.abspath(
    os.path.join(os.pardir, os.pardir)
)
# Raw data lives under <PROJECT_ROOT>/data/raw.
DATA_FOLDER = os.path.join(
    PROJECT_ROOT,
    'data',
    'raw',
)

### Dataset de comprovação

In [None]:
# File name of the budget spreadsheet CSV handed to the project's CSV reader.
dt_orcamentaria_name = 'planilha_orcamentaria.csv'

# Columns requested from the CSV.
usecols = [
    'PRONAC',
    'idPlanilhaAprovacao',
    'Item',
    'idPlanilhaItens',
    'VlUnitarioAprovado',
    'idSegmento',
    'DataProjeto',
]
# PRONAC is explicitly read as a string; other columns use the reader's defaults.
dtype = {'PRONAC': str}

dt_orcamentaria = read_csv_with_different_type(
    dt_orcamentaria_name,
    dtype,
    usecols=usecols,
)
display(dt_orcamentaria.columns)

dt_orcamentaria.head()

# Treino

In [None]:
# Build the training set: budget rows dated on/after START_DATE whose approved
# unit price is strictly positive, ordered by project date.
START_DATE = datetime(2013, 1, 1)

dt_train = dt_orcamentaria.copy()
dt_train['DataProjeto'] = pd.to_datetime(dt_train['DataProjeto'])

# Apply both filters with a single combined mask, then sort by assignment.
# Sorting in place on a frame obtained through boolean indexing can trigger
# pandas' SettingWithCopyWarning; returning a new sorted frame avoids that.
dt_train = dt_train[
    (dt_train['DataProjeto'] >= START_DATE)
    & (dt_train['VlUnitarioAprovado'] > 0.0)
].sort_values(by='DataProjeto')

display(dt_train.dtypes)
display(dt_train.head())

In [None]:
# Per-(segment, item) statistics of the approved unit price in the training set.
PRICE_COLUMNS = ['idSegmento', 'idPlanilhaItens', 'VlUnitarioAprovado']
price_groups = dt_train[PRICE_COLUMNS].groupby(by=['idSegmento', 'idPlanilhaItens'])['VlUnitarioAprovado']

# Use the built-in group reductions and name the columns explicitly instead of
# renaming the auto-generated '<lambda>' label. ddof=0 keeps the population
# standard deviation, so groups with a single price get std == 0 rather than NaN.
dt_train_agg = pd.DataFrame({
    'mean': price_groups.mean(),
    'std': price_groups.std(ddof=0),
})
dt_train_agg.sort_values(by='mean', ascending=False, inplace=True)
dt_train_agg.head()

In [None]:
def is_item_outlier(id_planilha_item, id_segmento, price):
    """Tell whether ``price`` is an outlier for the given item within a segment.

    The mean and standard deviation are looked up in ``dt_train_agg`` under the
    ``(id_segmento, id_planilha_item)`` key and passed to ``gaussian_outlier``.
    A pair that is absent from ``dt_train_agg`` is never reported as an outlier.

    The statistics, the verdict and the maximum expected value are printed for
    inspection.

    Returns:
        ``False`` when the pair is unknown, otherwise the result of
        ``gaussian_outlier.is_outlier``.
    """
    key = (id_segmento, id_planilha_item)
    if key not in dt_train_agg.index:
        return False

    # Fetch the aggregated row once and read both statistics from it.
    item_stats = dt_train_agg.loc[key]
    item_mean = item_stats['mean']
    item_std = item_stats['std']

    outlier = gaussian_outlier.is_outlier(
        x=price, mean=item_mean, standard_deviation=item_std)
    maximum_expected = gaussian_outlier.maximum_expected_value(
        mean=item_mean, standard_deviation=item_std)

    for message, value in (
        ('mean = {}', item_mean),
        ('std = {}', item_std),
        ('outlier = {}', outlier),
        ('maximum_expected = {}', maximum_expected),
    ):
        print(message.format(value))

    return outlier

# Manual check: item 1129 in segment '32' with a very large price. In a
# notebook the returned flag is shown as the cell output.
is_item_outlier(1129, '32', 12312123213131.0)

In [None]:
def get_outliers_percentage(pronac):
    """Return the fraction of a project's budget items priced as outliers.

    Selects the rows of ``dt_orcamentaria`` whose ``PRONAC`` equals ``pronac``
    and checks each one with ``is_item_outlier``. Diagnostic information is
    printed along the way.

    Args:
        pronac: PRONAC identifier compared against ``dt_orcamentaria.PRONAC``.

    Returns:
        ``outliers / number_of_items`` as a float (a ratio, not multiplied by
        100). Returns ``0.0`` when the project has no rows, instead of raising
        ``ZeroDivisionError``.
    """
    items = dt_orcamentaria[dt_orcamentaria.PRONAC == pronac]
    print('items.type = {}'.format(type(items)))
    print('items.shape = {}'.format(items.shape))

    # Walk the three needed columns in lockstep rather than building a Series
    # per row with iterrows().
    rows = zip(
        items['idPlanilhaItens'],
        items['idSegmento'],
        items['VlUnitarioAprovado'],
    )
    outliers = sum(
        1
        for item_id, segment_id, unit_value in rows
        if is_item_outlier(id_planilha_item=item_id, id_segmento=segment_id, price=unit_value)
    )
    print('outliers = {}'.format(outliers))

    total_items = items.shape[0]
    if total_items == 0:
        # Unknown PRONAC or project without budget rows: nothing can be an outlier.
        return 0.0
    return outliers / total_items


# Spot check: draw one PRONAC at random from the budget dataset and report the
# ratio of outlier items computed for it.
pronac = np.random.choice(dt_orcamentaria['PRONAC'].values)
print('pronac = {}'.format(pronac))

percentage = get_outliers_percentage(pronac)
print('percentage({}) = {}'.format(pronac, percentage))