### Validando "Porcentagem de itens com preços outliers acima do histórico daquele item"

**Issue**: [#61](https://github.com/lappis-unb/salic-ml/issues/61)

### Features Extracted:

- https://github.com/lappis-unb/salic-ml/wiki/Brainstorming-de-features-para-estimar-Complexidade-Financeira

#### Recarregar automaticamente os módulos

### Planilha orçamentária

SQL Query: 


In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded automatically before each cell executes.
%load_ext autoreload
%autoreload 2

### Importing data

In [2]:
import os
import sys
import time
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats


from salicml.utils.dates import Dates
from salicml.utils.read_csv import read_csv_with_different_type
from salicml.utils.read_csv import read_csv
from salicml.utils.utils import debug
from salicml.outliers import gaussian_outlier

# Absolute path two directory levels above the current working directory,
# used here as the project root.
_TWO_LEVELS_UP = os.path.join(os.pardir, os.pardir)
PROJECT_ROOT = os.path.abspath(_TWO_LEVELS_UP)

# <project root>/data/raw
DATA_FOLDER = os.path.join(PROJECT_ROOT, 'data', 'raw')

  return f(*args, **kwds)
  return f(*args, **kwds)


### Dataset da planilha orçamentária

In [3]:
# File name of the budget spreadsheet CSV handed to the project's `read_csv`.
dt_orcamentaria_name = 'planilha_orcamentaria.csv'

# Columns to load; everything else in the file is skipped.
usecols = [
    'PRONAC',
    'idPlanilhaAprovacao',
    'Item',
    'idPlanilhaItens',
    'VlUnitarioAprovado',
    'idSegmento',
    'DataProjeto',
]

# NOTE(review): `dtype` is not passed to the `read_csv` call below, so it has
# no effect on the loaded frame. It was only consumed by the disabled
# `read_csv_with_different_type(dt_orcamentaria_name, dtype, usecols=usecols)`
# variant -- confirm whether PRONAC is meant to be read as a string.
dtype = {'PRONAC': str}

dt_orcamentaria = read_csv(dt_orcamentaria_name, usecols=usecols)
display(dt_orcamentaria.columns)

dt_orcamentaria.head()

Index(['PRONAC', 'idPlanilhaAprovacao', 'Item', 'idPlanilhaItens',
       'VlUnitarioAprovado', 'idSegmento', 'DataProjeto'],
      dtype='object')

Unnamed: 0,PRONAC,idPlanilhaAprovacao,Item,idPlanilhaItens,VlUnitarioAprovado,idSegmento,DataProjeto
0,93004,50109,Montagem e desmontagem,88,140.0,51,2009-06-26 09:46:54
1,103228,239572,Locação de Piano,3040,1500.0,33,2010-05-06 10:49:13
2,103228,239599,Programa,2634,2.0,33,2010-05-06 10:49:13
3,1012471,86608,Transporte Local / Locação de Automóvel / Comb...,134,1.45,71,2010-12-21 17:09:22
4,93932,11387,Confecção de painéis explicativos,90,300.0,71,2009-07-27 08:40:04


# Treino

In [4]:
# Only items from projects dated on/after START_DATE are used for training.
START_DATE = datetime(2013, 1, 1)

# Build the training frame in one non-mutating chain: `assign` returns a new
# frame (so `dt_orcamentaria` is left untouched), and no `inplace=True` call is
# made on a filtered frame, which avoids SettingWithCopyWarning and keeps the
# cell idempotent on re-run.
dt_train = (
    dt_orcamentaria
    .assign(DataProjeto=lambda df: pd.to_datetime(df['DataProjeto']))
    # Keep rows dated on/after START_DATE with a strictly positive unit price.
    .loc[lambda df: (df['DataProjeto'] >= START_DATE) & (df['VlUnitarioAprovado'] > 0.0)]
    .sort_values(by='DataProjeto')
)

display(dt_train.dtypes)
display(dt_train.head())

PRONAC                          int64
idPlanilhaAprovacao             int64
Item                           object
idPlanilhaItens                 int64
VlUnitarioAprovado            float64
idSegmento                     object
DataProjeto            datetime64[ns]
dtype: object

Unnamed: 0,PRONAC,idPlanilhaAprovacao,Item,idPlanilhaItens,VlUnitarioAprovado,idSegmento,DataProjeto
1816335,130001,774836,Ensaios,2502,500.0,33,2013-01-02 10:59:07
1635245,130001,774835,Transporte Local / Locação de Automóvel / Comb...,134,800.0,33,2013-01-02 10:59:07
710100,130001,774847,Coordenação Administrativo- Financeiro,3732,3000.0,33,2013-01-02 10:59:07
755448,130001,774846,Mídia impressa,178,2500.0,33,2013-01-02 10:59:07
1007856,130001,774831,Registro videográfico,2628,600.0,33,2013-01-02 10:59:07


In [5]:
PRICE_COLUMNS = ['idSegmento', 'idPlanilhaItens', 'VlUnitarioAprovado']

# Mean and population standard deviation (ddof=0) of the approved unit price
# for every (idSegmento, idPlanilhaItens) pair in the training data.
#
# The output columns are named explicitly through (name, function) pairs
# instead of renaming the auto-generated '<lambda>' label afterwards: that
# label is version dependent (newer pandas emits '<lambda_0>' when a lambda is
# mixed with other functions), which would silently leave the 'std' column
# missing.
dt_train_agg = (
    dt_train[PRICE_COLUMNS]
    .groupby(by=['idSegmento', 'idPlanilhaItens'])['VlUnitarioAprovado']
    .agg([
        ('mean', 'mean'),
        ('std', lambda prices: np.std(prices, ddof=0)),
    ])
    # Most expensive items (by mean price) first.
    .sort_values(by='mean', ascending=False)
)
dt_train_agg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std
idSegmento,idPlanilhaItens,Unnamed: 2_level_1,Unnamed: 3_level_1
32,1129,5069422.0,47429800.0
5G,2636,3200000.0,0.0
5F,5047,2473070.0,0.0
33,36,2293842.0,40567150.0
5R,5627,2200000.0,0.0


In [6]:
def is_item_outlier(id_planilha_item, id_segmento, price):
    """Tell whether ``price`` is an outlier for an item within a segment.

    The reference mean and standard deviation are read from the module-level
    ``dt_train_agg`` frame, which is indexed by
    ``(idSegmento, idPlanilhaItens)``.

    Args:
        id_planilha_item: Item id (``idPlanilhaItens``).
        id_segmento: Segment id (``idSegmento``).
        price: Unit price to evaluate.

    Returns:
        ``False`` when the (segment, item) pair is absent from
        ``dt_train_agg``; otherwise whatever
        ``gaussian_outlier.is_outlier`` returns for ``price`` given the
        pair's mean and standard deviation.
    """
    key = (id_segmento, id_planilha_item)
    if key not in dt_train_agg.index:
        # No history for this pair, so it cannot be judged: not an outlier.
        return False

    # A single row lookup serves both statistics.
    item_stats = dt_train_agg.loc[key]
    return gaussian_outlier.is_outlier(
        x=price,
        mean=item_stats['mean'],
        standard_deviation=item_stats['std'],
    )


# Sanity check: an absurdly high price for a known (segment, item) pair.
is_item_outlier(1129, '32', 12312123213131.0)

True

In [23]:
# Budget items grouped by project (PRONAC). The grouping key is a scalar
# column name rather than a one-element list so that `get_group(pronac)` keeps
# accepting a plain scalar key across pandas versions.
pronacs_grp = dt_orcamentaria[['PRONAC', 'idPlanilhaItens', 'VlUnitarioAprovado', 'idSegmento']].groupby('PRONAC')


def get_outliers_percentage(pronac):
    """Return the share of a project's budget items flagged as price outliers.

    Args:
        pronac: Project identifier; must be a key of ``pronacs_grp``
            (``get_group`` raises ``KeyError`` otherwise).

    Returns:
        Number of items for which ``is_item_outlier`` is truthy divided by the
        project's total number of items, i.e. a fraction between 0 and 1
        (not multiplied by 100).
    """
    items = pronacs_grp.get_group(pronac)

    outliers = sum(
        1
        for row in items.itertuples()
        if is_item_outlier(
            id_planilha_item=row.idPlanilhaItens,
            id_segmento=row.idSegmento,
            price=row.VlUnitarioAprovado,
        )
    )
    return outliers / items.shape[0]


# Spot-check a single project. A locally seeded generator keeps the sampled
# PRONAC (and therefore the printed result) identical on every
# "Restart & Run All", without touching numpy's global random state.
SAMPLE_SEED = 42
sample_rng = np.random.RandomState(SAMPLE_SEED)

# Sampling from the item rows means projects with more items are more likely
# to be picked.
pronac = sample_rng.choice(dt_orcamentaria.PRONAC.values)
print('pronac = {}'.format(pronac))
percentage = get_outliers_percentage(pronac)
print('percentage({}) = {}'.format(pronac, percentage))

pronac = 137225
percentage(137225) = 0.012461059190031152


In [8]:
# Per-project counters: PRONAC -> {'outlier_items': int, 'total_items': int}.
pronac_cache = {}

print(dt_orcamentaria.shape)
for item in dt_orcamentaria.itertuples():
    # Fetch the project's counters, creating them zeroed on first sight.
    counters = pronac_cache.setdefault(item.PRONAC, {'outlier_items': 0, 'total_items': 0})

    flagged = is_item_outlier(
        id_planilha_item=item.idPlanilhaItens,
        id_segmento=item.idSegmento,
        price=item.VlUnitarioAprovado,
    )
    if flagged:
        counters['outlier_items'] += 1
    counters['total_items'] += 1

print('oi')

(2001717, 7)
oi


In [9]:
def train_segment(segment):
    """Placeholder: accepts ``segment`` but does nothing and returns ``None``."""
    pass

# (PRONAC, idSegmento) columns of the budget items, grouped by segment id.
segments_grp = dt_orcamentaria[['PRONAC', 'idSegmento']].groupby(['idSegmento'])
# The string literal below is a disabled draft kept as text; it is never
# executed. As written, it would compute the mean and standard deviation of
# `get_outliers_percentage` over the unique PRONACs of a segment, store them
# in `data[segment_id]`, and stop after the first segment because of `break`.
# Being the last expression of the cell, the string itself is shown as output.
"""
data = {}
for segment_id, group in segments_grp:
    percentages = list(map(get_outliers_percentage, group.PRONAC.unique()))
    print(percentages)
    mean = np.mean(percentages)
    std = np.std(percentages)
    data[segment_id] = {'mean': mean, 'std': std}
    break
"""

"\ndata = {}\nfor segment_id, group in segments_grp:\n    percentages = list(map(get_outliers_percentage, group.PRONAC.unique()))\n    print(percentages)\n    mean = np.mean(percentages)\n    std = np.std(percentages)\n    data[segment_id] = {'mean': mean, 'std': std}\n    break\n"

In [10]:
def is_outlier(pronac):
    """Print the ``projects_grp`` group stored under ``pronac``.

    Despite its name, this function does not classify anything yet: it only
    prints the looked-up group and implicitly returns ``None``.

    NOTE(review): ``projects_grp`` is not defined in the visible part of this
    notebook, so calling this raises ``NameError`` unless it is defined
    elsewhere -- confirm which grouping was intended.
    """
    print(projects_grp.get_group(pronac))

#pronac = 