### Validando a detecção de anomalias para a feature "Novos distribuidores"

**Issue**: [#60](https://github.com/lappis-unb/salic-ml/issues/60)

### Features Extracted:

- https://github.com/lappis-unb/salic-ml/wiki/Brainstorming-de-features-para-estimar-Complexidade-Financeira

#### Recarregar automaticamente os módulos

### Planilha de comprovação

SQL Query: https://github.com/lappis-unb/salic-ml/blob/master/data/scripts/planilha_comprovacao2.sql


In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before
# each cell runs, so edits to the salicml package are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

### Importing data

### FEATURES 
- Novos distribuidores

In [2]:
import os
import sys
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats


from salicml.utils.dates import Dates
from salicml.utils.read_csv import read_csv
from salicml.utils.utils import debug

# Absolute path two directory levels above the current working directory
# (os.pardir applied twice); presumably the repository root when the
# notebook is run from its own folder -- TODO confirm.
PROJECT_ROOT = os.path.abspath(os.path.join(os.pardir, os.pardir))
# <PROJECT_ROOT>/data/raw
DATA_FOLDER = os.path.join(PROJECT_ROOT, 'data', 'raw')

  return f(*args, **kwds)
  return f(*args, **kwds)


### Dataset de comprovação

In [35]:
# CSV file handed to the project's read_csv helper (how the helper locates
# the file is not visible in this notebook).
dt_comprovacao_name = 'planilha_comprovacao_2.csv'

# Restrict loading to the columns this analysis reads.
# (Assign None to usecols instead to request every column.)
usecols = [
    'PRONAC',
    'nrCNPJCPF',
    'DataProjeto',
    'idPlanilhaAprovacao',
    'Item',
    'nmFornecedor',
    'idSegmento',
]

dt_comprovacao = read_csv(dt_comprovacao_name, usecols=usecols)

# Show the loaded column index, then the first rows as the cell result.
display(dt_comprovacao.columns)
dt_comprovacao.head()

Index(['PRONAC', 'idPlanilhaAprovacao', 'idSegmento', 'Item', 'DataProjeto',
       'nrCNPJCPF', 'nmFornecedor'],
      dtype='object')

Unnamed: 0,PRONAC,idPlanilhaAprovacao,idSegmento,Item,DataProjeto,nrCNPJCPF,nmFornecedor
0,1012121,18552,71,Projeto Gráfico,2010-12-14 15:04:51,13751832000191,Flag Comunicação Ltda
1,1012121,18553,71,Produção de texto,2010-12-14 15:04:51,50618057000127,Comunic Comunicadores Associados SC Ltda.
2,1012121,18554,71,"Fotografia artística (fotógrafo, tratamento, r...",2010-12-14 15:04:51,1118139000105,"M, Vitorino Comunicação Ltda. EPP"
3,1012121,18555,71,Tratamento de imagens,2010-12-14 15:04:51,11049176000154,IAC Produções Ltda.
4,1012121,18556,71,Edição de Texto,2010-12-14 15:04:51,50618057000127,Comunic Comunicadores Associados SC Ltda.


In [52]:
def train(dt=None):
    """Count, for every provider, how many distinct projects used it.

    Args:
        dt: DataFrame with at least the ``PRONAC`` and ``nrCNPJCPF``
            columns. When omitted, the notebook-level ``dt_comprovacao``
            is looked up at call time.

    Returns:
        tuple: ``(projects, providers_count)`` where ``projects`` is ``dt``
        grouped by ``PRONAC`` and ``providers_count`` is a dict mapping each
        ``nrCNPJCPF`` value to the number of projects in which it appears.
    """
    if dt is None:
        # Resolved lazily: a default bound in the signature would capture
        # whatever DataFrame existed when this cell was executed and would
        # not follow a later reload of the data.
        dt = dt_comprovacao

    projects = dt.groupby('PRONAC')
    providers_count = {}

    for _, items in projects:
        # unique() makes a provider count once per project, no matter how
        # many items it supplied to that project.
        for cnpj in items['nrCNPJCPF'].unique():
            providers_count[cnpj] = providers_count.get(cnpj, 0) + 1

    return projects, providers_count


def train_average_percentage(projects, providers_count, dt=None):
    """Compute the average share of "new" providers per segment and overall.

    A provider is "new" for a project when ``providers_count`` reports it in
    at most one project, i.e. the current project is the only one using it.
    Each project's share is ``new providers / distinct providers`` (a
    fraction between 0 and 1).

    Args:
        projects: Iterable of ``(pronac, items)`` pairs, such as a DataFrame
            grouped by ``PRONAC``; ``items`` must have the ``nrCNPJCPF`` and
            ``idSegmento`` columns.
        providers_count: Mapping from ``nrCNPJCPF`` to the number of
            projects that used that provider.
        dt: Unused; kept only for backward compatibility with callers that
            pass the source DataFrame.

    Returns:
        tuple: ``(segments_average, all_projects_average)`` where
        ``segments_average`` maps each segment id to the mean share of its
        projects and ``all_projects_average`` is the mean over all projects.
        A project is assigned to the segment found in its first row.
    """
    segment_percentages = {}
    all_projects_percentages = []

    for _, items in projects:
        cnpjs = items['nrCNPJCPF'].unique()
        # Count of 1: this project is the provider's only project.
        # Count of 0: the provider is missing from providers_count.
        new_providers = sum(
            1 for cnpj in cnpjs if providers_count.get(cnpj, 0) <= 1
        )
        providers_percent = new_providers / cnpjs.size

        id_segmento = items.iloc[0]['idSegmento']
        segment_percentages.setdefault(id_segmento, []).append(providers_percent)
        all_projects_percentages.append(providers_percent)

    segments_average = {
        segment_id: np.mean(percentages)
        for segment_id, percentages in segment_percentages.items()
    }
    all_projects_average = np.mean(all_projects_percentages)

    print('segments_average = {}'.format(segments_average))
    # The original returned the undefined name `all_segments_average`; the
    # overall mean computed above is the value callers expect.
    return segments_average, all_projects_average
        
        
    
def get_metrics(pronac, projects, providers_count, segments_average, all_segments_average):
    """Build the "new providers" report for a single project.

    Args:
        pronac: Key of the project to look up in ``projects``.
        projects: DataFrame grouped by ``PRONAC``; each group must have the
            ``nrCNPJCPF``, ``idPlanilhaAprovacao``, ``Item``,
            ``nmFornecedor`` and ``idSegmento`` columns.
        providers_count: Mapping from ``nrCNPJCPF`` to the number of
            projects that used that provider.
        segments_average: Mapping from segment id to the average share of
            new providers in that segment.
        all_segments_average: Average share of new providers over all
            projects; copied into the response unchanged.

    Returns:
        dict: with the keys
        ``new_providers`` (``{cnpj: {'name': ..., 'items': {item_id: item_name}}}``
        for providers seen in at most one project),
        ``new_providers_percentage`` (new providers / distinct providers,
        a fraction between 0 and 1),
        ``segment_average_percentage`` and
        ``all_segments_average_percentage``.

    Raises:
        KeyError: if ``pronac`` is not a group of ``projects`` or the
            project's segment is missing from ``segments_average``.
    """
    items = projects.get_group(pronac)

    new_providers = {}
    rows = zip(
        items['nrCNPJCPF'],
        items['idPlanilhaAprovacao'],
        items['Item'],
        items['nmFornecedor'],
    )
    for cnpj, item_id, item_name, provider_name in rows:
        if providers_count.get(cnpj, 0) > 1:
            # Provider also appears in other projects: not "new".
            continue
        # The first name seen for a provider is kept; later rows only add
        # items to it.
        provider = new_providers.setdefault(
            cnpj, {'name': provider_name, 'items': {}}
        )
        provider['items'][item_id] = item_name

    new_providers_percentage = len(new_providers) / len(items['nrCNPJCPF'].unique())

    # Use the first row's segment, the same rule train_average_percentage
    # applies when assigning a project to a segment. Taking the last row
    # could select a segment that has no entry in segments_average.
    pronac_segment = items['idSegmento'].iloc[0]

    return {
        'new_providers': new_providers,
        'new_providers_percentage': new_providers_percentage,
        'segment_average_percentage': segments_average[pronac_segment],
        'all_segments_average_percentage': all_segments_average,
    }
        
# Build the per-provider project counts and the average shares of new
# providers from the full dataset.
projects, providers_count = train(dt=dt_comprovacao)
segments_average, all_segments_average = train_average_percentage(
    projects,
    providers_count,
    dt_comprovacao,
)

# Pick, at random, one of the projects present in the first 15 rows and
# compute its report.
pronac = np.random.choice(dt_comprovacao.head(15)['PRONAC'].unique())
response = get_metrics(
    pronac,
    projects,
    providers_count,
    segments_average,
    all_segments_average,
)

print('pronact = {}'.format(pronac))
print('response = {}'.format(response))

segments_average = {'54': 0.6666666666666666, '13': 0.5498270395733517, '31': 0.5156067485375325, '33': 0.4475782745535542, '32': 0.3835276081397546, '44': 0.6666666666666666, '12': 0.4990208967141673, '26': 0.6729717326497135, '61': 0.6661190855927698, '42': 0.7777777777777778, '11': 0.4372123221772933, '59': 0.875, '71': 0.7801827801827802, '41': 0.5479030910609858, '5F': 0.6425794788525121, '2J': 0.44160968218713226, '6E': 0.4737667765467342, '6C': 0.49112754835962824, '6G': 0.6031746031746031, '4B': 0.5020494856232984, '5P': 0.62, '5E': 0.5240016803540452, '6D': 0.5147616153684988, '17': 0.5913188628853553, '5R': 0.9172932330827068, '68': 0.4449843977012076, '5A': 0.6583157992821922, '46': 0.744096292992368, '4G': 0.8333333333333334, '2C': 0.5112154823174657, '6A': 0.6648709315375981, '21': 0.8235294117647058, '2B': 0.0, '5K': 0.5967949510557541, '2L': 0.5, '5N': 0.4709816303099885, '5G': 0.7064764768028493, '49': 0.29656167979002623, '5J': 0.6666666666666666, '2P': 0.7638888888888