## Feature engineering

El objetivo de este notebook es generar nuevas variables a partir de las ya analizadas. También se hará una breve exploración para determinar si estas variables pueden ser importantes para el modelo.

### Import de los paquetes

In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Ignorar todas las advertencias
warnings.filterwarnings("ignore")


%matplotlib inline

### Carga de datos

In [2]:
# Read the intermediate dataset (path is relative to the notebook's working
# directory) and show its dimensions.
# NOTE(review): the cell outputs further down show an "Unnamed: 0" column,
# which looks like an index saved by a previous to_csv call; confirm whether
# it should be kept as a regular column.
input_csv = "../data/intermediate/dataset_outliers_processed.csv"
df = pd.read_csv(input_csv)
df.shape

(3000, 11)

### Creación de nuevas variables

In [7]:
# Order rows by payer and then by diasActividadPagador so that the cumulative
# per-payer features computed next follow that order within each payer.
payer_sort_keys = ["payerId", "diasActividadPagador"]
df = df.sort_values(by=payer_sort_keys).copy()

In [8]:
# Per-payer history features. For each invoice, aggregate only the payer's
# *previous* rows, relying on the row order df already has (the preceding cell
# sorts by payerId, diasActividadPagador).
# NOTE(review): `moroso`, `mora` and `auxAmountFactura` are not created in the
# visible cells; they must already exist in df when this cell runs.
payer_history_sources = {
    "payerNroMorosidad": "moroso",
    "payerNroFacturas": "auxAmountFactura",  # appears to be a constant 1 per invoice - TODO confirm
    "payerAmount": "montoFactura",
    "payerDiasMora": "mora",
}

for feature, source in payer_history_sources.items():
    # Inclusive running total minus the current row = total over the earlier
    # rows only, so the current invoice does not contribute to its own features.
    df[feature] = df.groupby("payerId")[source].cumsum() - df[source]

# Ratios over the payer's previous invoices. On a payer's first row the
# denominator is 0, so these evaluate to 0/0 = NaN.
df["payerRateMororsidad"] = df["payerNroMorosidad"] / df["payerNroFacturas"]
df["payerAvgAmount"] = df["payerAmount"] / df["payerNroFacturas"]
df["payerAvgDiasMora"] = df["payerDiasMora"] / df["payerNroFacturas"]

# Zero-fill only the columns derived in this cell. A frame-wide fillna(0)
# would also overwrite missing values in the raw input columns, silently
# imputing them with 0.
payer_features = [
    *payer_history_sources,
    "payerRateMororsidad",
    "payerAvgAmount",
    "payerAvgDiasMora",
]
df = df.fillna(dict.fromkeys(payer_features, 0))

# Keep the average amount as an integer (astype(int) truncates the fraction).
df["payerAvgAmount"] = df["payerAvgAmount"].astype(int)

# Deviation of the current invoice from the payer's historical average. With
# no history the average is 0, so the deviation equals the invoice amount.
df["payerDesvAmount"] = df["montoFactura"] - df["payerAvgAmount"]

In [9]:
# Spot-check the engineered features on a single payer, in activity order.
payer_mask = df["payerId"] == 5015
payer_rows = df.loc[payer_mask]
payer_rows.sort_values("diasActividadPagador").head(10)

Unnamed: 0,invoiceId,businessId,payerId,montoFactura,relationDays,relationRecurrence,issuerInvoicesAmount,issuerCancelledInvoices,diasActividadPagador,Clients12Months,...,moroso,auxAmountFactura,payerNroMorosidad,payerNroFacturas,payerAmount,payerDiasMora,payerRateMororsidad,payerAvgAmount,payerAvgDiasMora,payerDesvAmount
2631,124,2729,5015,1823675,384.0,15.32,31155691,0.056013,2416.0,3,...,1,1,0,0,0,0.0,0.0,0,0.0,1823675
577,87,3132,5015,6784785,334.0,1.958824,651763252,0.050363,2420.0,742,...,1,1,1,1,1823675,14.0,1.0,1823675,14.0,4961110
2659,516,1522,5015,1213800,1189.0,5.915423,214319000,0.016761,2429.0,20,...,0,1,2,2,8608460,21.0,1.0,4304230,10.5,-3090430
591,1022,1522,5015,1011500,1198.0,5.925743,234697750,0.017531,2438.0,20,...,0,1,2,3,9822260,6.0,0.666667,3274086,2.0,-2262586
2794,379,701,5015,1385160,172.0,8.095238,210201112,0.124591,2438.0,18,...,0,1,2,4,10833760,-2.0,0.5,2708440,-0.5,-1323280
89,1120,2729,5015,1734425,409.0,15.148148,36266741,0.054863,2441.0,3,...,0,1,2,5,12218920,-16.0,0.4,2443784,-3.2,-709359
1190,1513,2391,5015,8526945,144.0,4.5,230227515,0.0,2449.0,10,...,0,1,2,6,13953345,-16.0,0.333333,2325557,-2.666667,6201388
2879,1397,701,5015,4662134,186.0,7.956522,224151529,0.119703,2452.0,17,...,1,1,2,7,22480290,-21.0,0.285714,3211470,-3.0,1450664
390,1868,3132,5015,3598560,372.0,1.927461,858767769,0.051878,2458.0,736,...,0,1,3,8,27142424,-12.0,0.375,3392803,-1.5,205757
803,2174,2729,5015,2440988,430.0,14.333333,41480429,0.051335,2462.0,3,...,0,1,3,9,30740984,-18.0,0.333333,3415664,-2.0,-974676


In [11]:
# Order rows by business and then by invoiceId so that the cumulative
# per-business features computed next follow that order within each business.
# NOTE(review): this uses invoiceId as the ordering key, whereas the payer
# features were ordered by diasActividadPagador; confirm invoiceId is chronological.
business_sort_keys = ["businessId", "invoiceId"]
df = df.sort_values(by=business_sort_keys).copy()

In [12]:
# Per-business history features. For each invoice, aggregate only the
# business's *previous* rows, relying on the row order df already has (the
# preceding cell sorts by businessId, invoiceId).
# NOTE(review): `moroso`, `mora` and `auxAmountFactura` are not created in the
# visible cells; they must already exist in df when this cell runs.
business_history_sources = {
    "businessNroMorosidad": "moroso",
    "businessNroFacturas": "auxAmountFactura",  # appears to be a constant 1 per invoice - TODO confirm
    "businessDiasMora": "mora",
}

for feature, source in business_history_sources.items():
    # Inclusive running total minus the current row = total over the earlier
    # rows only, so the current invoice does not contribute to its own features.
    df[feature] = df.groupby("businessId")[source].cumsum() - df[source]

# Ratios over the business's previous invoices. On a business's first row the
# denominator is 0, so these evaluate to 0/0 = NaN.
df["businessRateMororsidad"] = df["businessNroMorosidad"] / df["businessNroFacturas"]
df["businessAvgDiasMora"] = df["businessDiasMora"] / df["businessNroFacturas"]

# Zero-fill only the columns derived in this cell. A frame-wide fillna(0)
# would also overwrite missing values in the raw input columns, silently
# imputing them with 0.
business_features = [
    *business_history_sources,
    "businessRateMororsidad",
    "businessAvgDiasMora",
]
df = df.fillna(dict.fromkeys(business_features, 0))

In [15]:
# Show every column when rendering frames (global pandas display setting; it
# stays in effect for all later cells).
pd.options.display.max_columns = None

# Spot-check the engineered features on a single business, in invoiceId order.
business_mask = df["businessId"] == 3560
business_rows = df.loc[business_mask]
business_rows.sort_values("invoiceId").head(10)

Unnamed: 0,invoiceId,businessId,payerId,montoFactura,relationDays,relationRecurrence,issuerInvoicesAmount,issuerCancelledInvoices,diasActividadPagador,Clients12Months,mora,moroso,auxAmountFactura,payerNroMorosidad,payerNroFacturas,payerAmount,payerDiasMora,payerRateMororsidad,payerAvgAmount,payerAvgDiasMora,payerDesvAmount,businessNroMorosidad,businessNroFacturas,businessDiasMora,businessRateMororsidad,businessAvgDiasMora
321,2182,3560,5015,16856207,322.0,6.254902,422006904,0.043667,2465.0,2,-19.0,0,1,3,10,33181972,-30.0,0.3,3318197,-3.0,13538010,0,0,0.0,0.0,0.0
540,2957,3560,5015,7073312,340.0,6.203704,465179985,0.041512,2483.0,2,-1.0,0,1,3,12,54054429,-63.0,0.25,4504535,-5.25,2568777,0,1,-19.0,0.0,-19.0
1785,4924,3560,5015,1349817,376.0,6.064516,571005639,0.038231,2519.0,2,28.0,1,1,4,18,71984540,-67.0,0.222222,3999141,-3.722222,-2649324,0,2,-20.0,0.0,-10.0
2134,8449,3560,5015,691414,435.0,6.041667,735087527,0.034095,2578.0,1,9.0,1,1,10,32,130290055,16.0,0.3125,4071564,0.5,-3380150,1,3,8.0,0.333333,2.666667
2795,8559,3560,5015,8265157,438.0,6.0,743544821,0.033837,2581.0,1,0.0,0,1,11,33,130981469,25.0,0.333333,3969135,0.757576,4296022,2,4,17.0,0.5,4.25
1658,9604,3560,5015,608899,454.0,5.536585,789673114,0.035192,2597.0,1,13.0,1,1,12,38,168366447,38.0,0.315789,4430695,1.0,-3821796,2,5,17.0,0.4,3.4
1329,10248,3560,5015,397734,466.0,5.177778,829917238,0.034461,2609.0,1,14.0,1,1,14,41,172779205,57.0,0.341463,4214126,1.390244,-3816392,3,6,30.0,0.5,5.0
1308,10619,3560,5015,453890,473.0,5.031915,853194639,0.033848,2616.0,1,7.0,1,1,15,42,173176939,71.0,0.357143,4123260,1.690476,-3669370,4,7,44.0,0.571429,6.285714
2758,10800,3560,5015,3838571,475.0,4.947917,861452835,0.033573,2618.0,1,17.0,1,1,17,44,177770958,85.0,0.386364,4040249,1.931818,-201678,5,8,51.0,0.625,6.375
1101,13425,3560,5015,4641000,516.0,4.128,978624470,0.030561,2659.0,1,0.0,0,1,22,50,204074057,152.0,0.44,4081481,3.04,559519,6,9,68.0,0.666667,7.555556


In [16]:
# Persist the enriched frame to the working directory. index_label=False
# writes the index values but no header field for them.
# NOTE(review): "test.csv" looks like a placeholder name; confirm the intended
# output location before relying on this file downstream.
df.to_csv(path_or_buf="test.csv", index_label=False)