# ISOLATION FOREST FOR ANOMALY DETECTION 


Algoritmos de Isolation Forest para detecção de anomalias na Folha de Pagamento
Importação do Módulo de Pré-processamento

In [3]:
import sys
sys.path.append("C:/Users/joaoc/Documents/MT/pagamento_servidores/src")
import process_servants as ps
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error
import pandas as pd
from janitor import clean_names

## Função para pré-processamento dos dados
Utilizam-se funções importadas do arquivo process_servants.py, no qual foram pré-definidas algumas funções para o pré-processamento dos dados.

In [5]:
def process_registration_data_rendim(file_path):
    """Load the payroll spreadsheet and reshape it to one row per servant and month.

    The Excel file at ``file_path`` is read, its column names are normalised
    with ``clean_names``, and the frame is then passed through
    ``ps.convert_rendim_to_numeric`` and ``ps.process_month_column``.
    NOTE(review): both helpers live in ``process_servants``; presumably they
    make ``rendim`` numeric and normalise the ``mes`` column -- confirm there.

    Returns:
        A wide DataFrame with ``cpf_servidor`` and ``mes`` as regular columns
        plus one column per distinct ``rubrica`` holding the summed
        ``rendim`` for that servant and month.
    """
    raw = pd.read_excel(file_path)
    cleaned = clean_names(raw)
    cleaned = ps.convert_rendim_to_numeric(cleaned)
    cleaned = ps.process_month_column(cleaned)

    # Collapse repeated (servant, month, rubrica) entries into a single total
    # so that the pivot below sees unique index/column pairs.
    totals = (
        cleaned
        .groupby(['cpf_servidor', 'mes', 'rubrica'])['rendim']
        .sum()
        .reset_index()
    )

    # Long -> wide: each rubrica becomes its own column.
    wide = totals.pivot(index=['cpf_servidor', 'mes'], columns='rubrica', values='rendim')
    return wide.reset_index()

## Função para Leitura dos Dados

In [7]:
def processed_data(brute_data_file_path, file_name, output_dir=None):
    """Build the per-servant/month rubrica table and optionally export it.

    Args:
        brute_data_file_path: Path of the raw Excel file, forwarded to
            ``process_registration_data_rendim``.
        file_name: Base name (without extension) of the exported workbook.
            Only used when ``output_dir`` is given.
        output_dir: Directory where ``<file_name>.xlsx`` is written. When
            ``None`` (the default) nothing is written, which matches the
            previous behaviour where the export was commented out.

    Returns:
        The processed DataFrame returned by ``process_registration_data_rendim``.
    """
    registration_df = process_registration_data_rendim(brute_data_file_path)

    print("Processed data shapes:")
    print(f"Registration data: {registration_df.shape}")

    # Export is opt-in: the destination comes from the caller instead of a
    # hard-coded, machine-specific absolute path.
    if output_dir is not None:
        from pathlib import Path

        processed_path = Path(output_dir) / f"{file_name}.xlsx"
        registration_df.to_excel(processed_path, index=False)

    return registration_df

Importação da Base de Dados

In [9]:
# Location of the raw payroll workbook (one row per servant, month and rubrica).
# NOTE(review): machine-specific absolute path; consider a configurable data dir.
brute_file_path = (
    'C:/Users/joaoc/Documents/MT/pagamento_servidores/'
    'data/raw/servidores_mes_rubricas.xlsx'
)

Limpeza e pré-processamento dos dados para tentar identificar o caso a seguir por meio de detecção de anomalias (Anomaly Detection)
![image.png](attachment:1cf54f27-e734-4093-a92f-766002dc7ed0.png)

In [11]:
# Build the wide (servant, month) x rubrica table from the raw workbook.
registration_df = processed_data(
    brute_data_file_path=brute_file_path,
    file_name="dados_12_meses_cadastro_e_std",
)

Processed data shapes:
Registration data: (151, 25)


  warn("Workbook contains no default style, apply openpyxl's default")


## Isolation Forest - Rubricas Detalhadas

In [293]:
resultados = []

# Rows of the single servant under analysis; rubricas missing in a month
# are treated as a payment of zero.
# NOTE(review): the CPF is hard-coded personal data -- consider moving it to
# a config cell or an anonymised identifier.
dados = registration_df.loc[registration_df['cpf_servidor'] == 35042770763].fillna(0)

# Feature frame: drop the identifier and every column that is zero in all
# months for this servant.
series = (
    dados
    .drop(columns='cpf_servidor')
    .loc[:, lambda frame: (frame != 0).any(axis=0)]
)
        


In [295]:
# Use the month as the index so that only rubrica values remain as columns.
# The guard keeps the cell idempotent: once 'mes' is the index it is no longer
# a column, and a second run would otherwise raise KeyError.
if 'mes' in series.columns:
    series = series.set_index('mes')

In [242]:
# Expected share of outliers: one anomalous month among all observed months.
# NOTE(review): scikit-learn requires contamination in (0, 0.5], so this
# fails for a servant with fewer than two rows -- confirm that cannot happen.
contamination = 1 / series.shape[0]

# Fixed seed so the random trees (and the flagged months) are reproducible.
iso_forest = IsolationForest(contamination=contamination, random_state=42)

In [244]:
# Feature matrix for the forest. A plain alias of `series` would also pick up
# the 'anomalias' label column that a later cell adds to `series`, so a re-run
# would train on the labels. `drop` returns a new frame and skips the label
# column when it is already present.
x_iso_forest = series.drop(columns='anomalias', errors='ignore')

In [246]:
# Fit the isolation forest on the per-rubrica monthly values of the servant.
iso_forest.fit(x_iso_forest)

In [248]:
# Label every month with the fitted forest; -1 marks the months flagged as
# anomalous, which are the rows displayed below.
anomalias_rubricas = iso_forest.predict(x_iso_forest)

# `assign` builds a new frame instead of mutating the one `x_iso_forest` may
# alias, so the label column never leaks into the feature matrix and this
# cell can be re-run safely.
series = series.assign(anomalias=anomalias_rubricas)
series[series['anomalias'] == -1]


rubrica,AUXILIO ALIMENTACAO,SALARIO - CLT,anomalias
mes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-03-01,458.0,103.37,-1


## Isolation Forest - Remuneração Total

In [351]:
resultados = []

# Total monthly remuneration of the servant selected in `dados`.
# Built as one chain with `assign` so no column is written onto a `.loc`
# slice (which can raise SettingWithCopyWarning), and columns are picked by
# name rather than by position.
series_remun = (
    dados
    .drop(columns='cpf_servidor')
    # keep only columns with at least one non-zero value
    .loc[:, lambda frame: (frame != 0).any(axis=0)]
    # 'redim' = sum of every remaining rubrica column for the month
    .assign(redim=lambda frame: frame.drop(columns='mes').sum(axis=1))
    .set_index('mes')
    .loc[:, ['redim']]
)


In [353]:
# Show the total-remuneration series (one 'redim' value per month).
series_remun

rubrica,redim
mes,Unnamed: 1_level_1
2020-01-01,0.0
2020-02-01,0.0
2020-03-01,0.0
2020-04-01,0.0
2020-05-01,0.0
2020-06-01,0.0
2020-07-01,0.0
2020-08-01,0.0
2020-09-01,0.0
2020-10-01,0.0


In [267]:
# Expected share of outliers: one anomalous month among all observed months.
# NOTE(review): scikit-learn requires contamination in (0, 0.5], so this
# fails when the series has fewer than two rows -- confirm that cannot happen.
contamination = 1 / series_remun.shape[0]

# Fixed seed so the random trees (and the flagged months) are reproducible.
iso_forest = IsolationForest(contamination=contamination, random_state=42)

In [269]:
# Feature matrix for the forest. A plain alias of `series_remun` would also
# pick up the 'anomalias' label column that a later cell adds to it, so a
# re-run would train on the labels. `drop` returns a new frame and skips the
# label column when it is already present.
x_iso_forest = series_remun.drop(columns='anomalias', errors='ignore')

In [271]:
# Fit the isolation forest on the total monthly remuneration of the servant.
iso_forest.fit(x_iso_forest)

In [273]:
# Label every month with the fitted forest; -1 marks the months flagged as
# anomalous, which are the rows displayed below.
anomalias_remum = iso_forest.predict(x_iso_forest)

# `assign` builds a new frame instead of mutating the one `x_iso_forest` may
# alias, so the label column never leaks into the feature matrix and this
# cell can be re-run safely.
series_remun = series_remun.assign(anomalias=anomalias_remum)
series_remun[series_remun['anomalias'] == -1]

rubrica,redim,anomalias
mes,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-03-01,561.37,-1


# ARIMA (Univariado)

In [540]:
# Univariate series for ARIMA: total monthly remuneration of one servant.
# Built as one chain with `assign` so no column is written onto a `.loc`
# slice (which can raise SettingWithCopyWarning), and columns are picked by
# name rather than by position.
serie_arrima = (
    registration_df
    .loc[registration_df['cpf_servidor'] == 35042770763]
    .fillna(0)
    .drop(columns='cpf_servidor')
    # keep only columns with at least one non-zero value
    .loc[:, lambda frame: (frame != 0).any(axis=0)]
    # 'redim' = sum of every remaining rubrica column for the month
    .assign(redim=lambda frame: frame.drop(columns='mes').sum(axis=1))
    .set_index('mes')
    .loc[:, ['redim']]
)

In [542]:
# Hold out the last three rows for evaluation and train on everything before.
# Both parts are put on an explicit month-start ('MS') frequency.
train = serie_arrima.iloc[:-3].asfreq('MS')
test = serie_arrima.iloc[-3:].asfreq('MS')

In [544]:
# ARIMA(p=1, d=1, q=1) fitted on the training months only.
model = ARIMA(endog=train, order=(1, 1, 1)).fit()

In [546]:
# Point forecasts for the three steps after the training window, i.e. the
# same horizon as the three held-out rows in `test`.
forecast = model.forecast(steps=3)

In [548]:
# Confidence interval of the same three-step forecast, at the library's
# default level (presumably 95% -- confirm in the statsmodels docs).
conf_int = model.get_forecast(steps=3).conf_int()

In [550]:
# Observed values of the held-out months as a NumPy array.
test_values = test.to_numpy()

In [552]:
# First column of the interval frame (presumably the lower bound -- confirm
# against the column names of `conf_int`).
lower_values = conf_int.iloc[:, 0].to_numpy()

In [554]:
# Second column of the interval frame (presumably the upper bound -- confirm
# against the column names of `conf_int`).
upper_values = conf_int.iloc[:, 1].to_numpy()