### Utilizando LOF para detectar projetos outliers nas porcentagens de itens com preços outliers

**Issue**: [#174](https://github.com/lappis-unb/salic-ml/issues/174)

#### Recarregar automaticamente os módulos

In [1]:
# Enable the autoreload extension in mode 2 so imported modules are reloaded
# before each cell executes (edits to project modules need no kernel restart).
%load_ext autoreload
%autoreload 2

### Imports e configuração de caminhos

In [2]:
import os
import sys
import time
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats


from salicml.utils.dates import Dates

from core.utils.read_csv import read_csv
from salicml.utils.utils import debug
from salicml.outliers import gaussian_outlier

# Project root: two directory levels above the current working directory.
PROJECT_ROOT = os.path.abspath(os.path.join(os.pardir, os.pardir))
# `data/raw` folder under the project root.
DATA_FOLDER = os.path.join(PROJECT_ROOT, 'data', 'raw')

PROJECT_ROOT_ABS = /home/mandala/repos/minc/salic-ml/core/utils/../..
DATa_FOLDER = /home/mandala/repos/minc/salic-ml/core/utils/../../data/raw


In [3]:
from core.utils.read_csv import read_csv_with_different_type

### Dataset da planilha orçamentária

In [4]:
# CSV file to load and the subset of its columns used in this notebook.
dt_orcamentaria_name = 'planilha_orcamentaria.csv'
usecols = [
    'PRONAC',
    'idPlanilhaAprovacao',
    'Item',
    'idPlanilhaItens',
    'VlUnitarioAprovado',
    'idSegmento',
    'DataProjeto',
]
# PRONAC is explicitly read as a string.
dtype = {'PRONAC': str}

dt_orcamentaria = read_csv_with_different_type(
    dt_orcamentaria_name,
    dtype,
    usecols=usecols,
)

# Show the resulting column types, then a preview of the first rows.
display(dt_orcamentaria.dtypes)
dt_orcamentaria.head()

PRONAC                  object
idPlanilhaAprovacao      int64
Item                    object
idPlanilhaItens          int64
VlUnitarioAprovado     float64
idSegmento              object
DataProjeto             object
dtype: object

Unnamed: 0,PRONAC,idPlanilhaAprovacao,Item,idPlanilhaItens,VlUnitarioAprovado,idSegmento,DataProjeto
0,90226,136145,Combustível,1019,0.0,71,2009-03-13 11:25:24.430
1,90226,136156,Assessor de imprensa,142,3000.0,71,2009-03-13 11:25:24.430
2,100406,186201,Projeto de iluminação,13,25459.54,52,2010-01-26 17:46:51.453
3,100406,186222,Forros e Cimalhas - Acabamentos,2117,56560.62,52,2010-01-26 17:46:51.453
4,110895,124738,Edição de efeitos sonoros,1191,4500.0,85,2011-02-04 16:04:01.130


# Treino

In [5]:
# Only rows dated on or after START_DATE are used for training.
START_DATE = datetime(2013, 1, 1)

# Parse the project date, then keep rows that are recent enough and whose
# approved unit value is strictly positive.
dt_train = (
    dt_orcamentaria
    .assign(DataProjeto=lambda frame: pd.to_datetime(frame['DataProjeto']))
    .loc[lambda frame: (frame.DataProjeto >= START_DATE) & (frame.VlUnitarioAprovado > 0.0)]
)

display(dt_train.dtypes)
display(dt_train.head())

PRONAC                         object
idPlanilhaAprovacao             int64
Item                           object
idPlanilhaItens                 int64
VlUnitarioAprovado            float64
idSegmento                     object
DataProjeto            datetime64[ns]
dtype: object

Unnamed: 0,PRONAC,idPlanilhaAprovacao,Item,idPlanilhaItens,VlUnitarioAprovado,idSegmento,DataProjeto
154,146032,1224594,Locação de equipamentos,3684,4000.0,12,2014-04-11 18:50:56.473
155,146032,1224601,Hospedagem sem alimentação,130,180.0,12,2014-04-11 18:50:56.473
181,146032,1224599,Produtor local,4548,1500.0,12,2014-04-11 18:50:56.473
266,146032,1224604,Produtor local,4548,1500.0,12,2014-04-11 18:50:56.473
679,160506,1794621,Elaboração e Agenciamento,206,10052.0,6F,2016-02-25 17:57:15.010


# Treino1: Preço de item por segmento

In [6]:
PRICE_COLUMNS = ['idSegmento', 'idPlanilhaItens', 'VlUnitarioAprovado']

# Mean and population standard deviation (ddof=0) of the approved unit value
# for every (idSegmento, idPlanilhaItens) pair.
dt_train_agg = (
    dt_train[PRICE_COLUMNS]
    .groupby(by=['idSegmento', 'idPlanilhaItens'])
    .agg([np.mean, lambda x: np.std(x, ddof=0)])
)
# Drop the outer column level so only the statistic names remain.
dt_train_agg.columns = dt_train_agg.columns.droplevel(0)
# The anonymous aggregation is labelled '<lambda>'; give it a readable name.
dt_train_agg = dt_train_agg.rename(columns={'<lambda>': 'std'}).sort_index()
dt_train_agg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std
idSegmento,idPlanilhaItens,Unnamed: 2_level_1,Unnamed: 3_level_1
11,10,8120.219844,9506.785625
11,12,10641.954062,11203.905159
11,13,11306.18002,39124.075003
11,14,24016.666667,31566.882259
11,15,5993.625418,10189.21685


In [7]:
# Nested lookup table: {'mean' | 'std': {(idSegmento, idPlanilhaItens): value}}.
train_agg_dict = dt_train_agg.to_dict(orient='dict')

In [8]:
def is_item_outlier(id_planilha_item, id_segmento, price):
    """Tell whether ``price`` is an outlier for an item within a segment.

    The mean and standard deviation for the ``(id_segmento, id_planilha_item)``
    pair are read from ``train_agg_dict`` and passed to
    ``gaussian_outlier.is_outlier`` together with ``price``.

    Args:
        id_planilha_item: Item identifier (``idPlanilhaItens``).
        id_segmento: Segment identifier (``idSegmento``).
        price: Unit price to evaluate.

    Returns:
        The value returned by ``gaussian_outlier.is_outlier``, or ``False``
        when the pair has no entry in ``train_agg_dict``.
    """
    item_key = (id_segmento, id_planilha_item)
    if item_key not in train_agg_dict['mean']:
        # No statistics for this (segment, item) pair: nothing to compare against.
        return False

    mean = train_agg_dict['mean'][item_key]
    std = train_agg_dict['std'][item_key]
    return gaussian_outlier.is_outlier(x=price, mean=mean, standard_deviation=std)

is_item_outlier(1129, '32', 12312123213131.0)

True

In [9]:
# Group by the scalar column label (not a one-element list) so that
# `get_group` accepts a plain PRONAC value regardless of the pandas version.
pronacs_grp = dt_train.groupby(by='PRONAC')


def get_outliers_percentage(pronac):
    """Return the fraction of a project's items flagged as price outliers.

    Args:
        pronac: PRONAC value identifying the project; must be a key of
            ``pronacs_grp``.

    Returns:
        Number of items for which ``is_item_outlier`` is truthy divided by
        the project's total number of items (a ratio, not scaled to 100).
    """
    items = pronacs_grp.get_group(pronac)

    outliers = sum(
        1
        for row in items.itertuples()
        if is_item_outlier(
            id_planilha_item=row.idPlanilhaItens,
            id_segmento=row.idSegmento,
            price=row.VlUnitarioAprovado,
        )
    )
    return outliers / items.shape[0]


# Smoke check on one randomly drawn project. No seed is set in this cell,
# so the selected PRONAC (and the printed value) changes between runs.
pronac = np.random.choice(dt_train.PRONAC.values)
print('pronac = {}'.format(pronac))
percentage = get_outliers_percentage(pronac)
print('percentage({}) = {}'.format(pronac, percentage))

pronac = 165017
percentage(165017) = 0.03389830508474576


In [10]:
# Per-project counters: {PRONAC: {'outlier_items': int, 'total_items': int}}.
pronac_cache = {}

print(dt_train.shape)
for row in dt_train.itertuples():
    pronac = row.PRONAC
    item_id = row.idPlanilhaItens
    unit_value = row.VlUnitarioAprovado
    segment_id = row.idSegmento

    # Create the project's counters the first time the project is seen.
    counters = pronac_cache.setdefault(pronac, {})
    counters.setdefault('outlier_items', 0)
    counters.setdefault('total_items', 0)

    if is_item_outlier(id_planilha_item=item_id, id_segmento=segment_id, price=unit_value):
        counters['outlier_items'] += 1
    counters['total_items'] += 1

print('oi')

(1191816, 7)
oi


In [11]:
# Sanity check: a PRONAC present in the training data is also present in the cache.
print('133818' in dt_train.PRONAC.values)
print('133818' in pronac_cache)

True
True


# Treino2: porcentagem de itens outliers por projeto por segmento

In [15]:
# Group by the scalar column label (not a one-element list) so iterating the
# groups yields plain segment ids instead of 1-tuples on recent pandas.
segments_grp = dt_train[['PRONAC', 'idSegmento']].groupby('idSegmento')


def train_segment_percentages(segments=None, cache=None):
    """Compute, per segment, the mean and std of the projects' outlier ratios.

    For every segment, the ratio ``outlier_items / total_items`` of each
    distinct project in that segment is collected, and the mean and standard
    deviation (``np.std`` default, i.e. population std) of those ratios are
    stored.

    Args:
        segments: Iterable of ``(segment_id, group)`` pairs where ``group``
            exposes a ``PRONAC`` column. Defaults to the module-level
            ``segments_grp``.
        cache: Mapping ``PRONAC -> {'outlier_items': int, 'total_items': int}``.
            Defaults to the module-level ``pronac_cache``.

    Returns:
        dict mapping each segment id to ``{'mean': ..., 'std': ...}``.

    Raises:
        KeyError: if a project of a segment is missing from ``cache``.
    """
    if segments is None:
        segments = segments_grp
    if cache is None:
        cache = pronac_cache

    percentages_train = {}
    for segment_id, group in segments:
        # A groupby keyed by a one-element list yields 1-tuple keys on recent
        # pandas; unwrap them so the result is always keyed by the plain id.
        if isinstance(segment_id, tuple) and len(segment_id) == 1:
            segment_id = segment_id[0]

        percentages = [
            cache[pronac]['outlier_items'] / cache[pronac]['total_items']
            for pronac in group.PRONAC.unique()
        ]
        if not percentages:
            # No projects in this segment: leave it out of the result.
            continue

        # Statistics are computed once per segment, after every project ratio
        # has been collected (not once per project).
        percentages_train[segment_id] = {
            'mean': np.mean(percentages),
            'std': np.std(percentages),
        }
    return percentages_train

# Per-segment statistics: {idSegmento: {'mean': ..., 'std': ...}}.
percentages_train = train_segment_percentages()
print(percentages_train)

{'11': {'mean': 0.03113127756274827, 'std': 0.06365542641981929}, '12': {'mean': 0.046672607561816105, 'std': 0.07851753409136121}, '13': {'mean': 0.060509843157215776, 'std': 0.08285944864994721}, '14': {'mean': 0.06770677213817795, 'std': 0.08717838667672347}, '15': {'mean': 0.0563973063973064, 'std': 0.019360269360269362}, '17': {'mean': 0.06496349331536863, 'std': 0.09098546506593996}, '18': {'mean': 0.11181870545447882, 'std': 0.1157466442412089}, '1A': {'mean': 0.05092106837965677, 'std': 0.04537019042787697}, '1B': {'mean': 0.06951324103026889, 'std': 0.0776678061647367}, '1D': {'mean': 0.04794364861329147, 'std': 0.07524248947980539}, '1H': {'mean': 0.0, 'std': 0.0}, '1I': {'mean': 0.08024050828664261, 'std': 0.09916943716168863}, '21': {'mean': 0.08269401277758177, 'std': 0.06781630213306031}, '23': {'mean': 0.07712256509206636, 'std': 0.08485646284359348}, '26': {'mean': 0.0836461047539608, 'std': 0.1188426672071037}, '28': {'mean': 0.07224516564815209, 'std': 0.1028707876914

# Teste

In [28]:
def is_project_outlier(pronac):
    """Tell whether a project's outlier-item ratio is an outlier for its segment.

    The project's ratio ``outlier_items / total_items`` is read from
    ``pronac_cache`` and compared, through ``gaussian_outlier.is_outlier``,
    against the mean and standard deviation stored in ``percentages_train``
    for the project's segment.

    Args:
        pronac: PRONAC value identifying the project; must be present in
            ``pronac_cache`` and ``pronacs_grp``.

    Returns:
        The value returned by ``gaussian_outlier.is_outlier``.
    """
    cache = pronac_cache[pronac]
    percentage = cache['outlier_items'] / cache['total_items']

    # The segment is taken from the first item row of the project.
    items = pronacs_grp.get_group(pronac)
    segment_id = items.iloc[0]['idSegmento']

    segment_stats = percentages_train[segment_id]
    return gaussian_outlier.is_outlier(
        x=percentage,
        mean=segment_stats['mean'],
        standard_deviation=segment_stats['std'],
    )

# Smoke check on one randomly drawn project. No seed is set in this cell,
# so the selected project (and the printed result) changes between runs.
pronac = np.random.choice(dt_train.PRONAC.values)
is_outlier = is_project_outlier(pronac)
print('is_outlier = {}'.format(is_outlier))

is_outlier = False
