In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, classification_report,
    mean_squared_error, r2_score, f1_score
)
from sklearn.preprocessing import OneHotEncoder
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBClassifier, XGBRegressor



In [2]:
df = pd.read_csv('../dados_limpos/dados.csv')


In [3]:

df['DatGeracaoConjuntoDados'] = pd.to_datetime(df['DatGeracaoConjuntoDados'], errors='coerce')
df['DatVencimentoTitulo']     = pd.to_datetime(df['DatVencimentoTitulo'], errors='coerce')
df['DatIncidenciaMultaMora']  = pd.to_datetime(df['DatIncidenciaMultaMora'], errors='coerce')

df['Codcvnarr']      = pd.to_numeric(df['Codcvnarr'], errors='coerce').astype('Int64')

df['NumCPFCNPJ'] = (
    df['NumCPFCNPJ']
    .astype(str)
    .str.replace(r'\D+', '', regex=True)
    .replace({'', 'nan', 'None'}, np.nan)
)

string_cols = [
    'AnmArrecadacao',
    'SigNomAgente',
    'DscSituacaoArrecadacao',
    'DscSituacaoCredito'
]

for col in string_cols:
    df[col] = (
        df[col]
        .astype(str)
        .str.strip()
        .replace({'nan': np.nan, 'None': np.nan})
    )

for col in string_cols:
    df[col] = df[col].astype('category')

    
df['QtdDiasEmAtraso'] = (
    pd.to_numeric(df['QtdDiasEmAtraso'], errors='coerce')
      .astype('Int64')
)

def to_float_br(series):
    return (series.astype(str)
                  .str.strip()
                  .str.replace('.', '', regex=False)
                  .str.replace(',', '.', regex=False)
                  .replace(['', 'nan', 'None'], np.nan)
                  .astype(float))

df['VlrPcpPrvArr']      = to_float_br(df['VlrPcpPrvArr'])
df['VlrTotPvrArr']      = to_float_br(df['VlrTotPvrArr'])
df['VlrTotPagArr']      = to_float_br(df['VlrTotPagArr'])
df['VlrTotDifPvrPagArr'] = to_float_br(df['VlrTotDifPvrPagArr'])
df['VlrSelic']          = to_float_br(df['VlrSelic'])

df['VlrPcpPrvArr']       = df['VlrPcpPrvArr'].round(2)
df['VlrTotPvrArr']       = df['VlrTotPvrArr'].round(2)
df['VlrTotPagArr']       = df['VlrTotPagArr'].round(2)
df['VlrTotDifPvrPagArr'] = df['VlrTotDifPvrPagArr'].round(2)
df['VlrSelic']           = df['VlrSelic'].round(2)

In [4]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 415065 entries, 0 to 415064
Data columns (total 21 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   DatGeracaoConjuntoDados  415065 non-null  datetime64[ns]
 1   Codcvnarr                415065 non-null  Int64         
 2   AnmArrecadacao           415065 non-null  category      
 3   DatVencimentoTitulo      415065 non-null  datetime64[ns]
 4   DatIncidenciaMultaMora   415065 non-null  datetime64[ns]
 5   QtdDiasEmAtraso          415065 non-null  Int64         
 6   NumCPFCNPJ               415065 non-null  object        
 7   SigNomAgente             415065 non-null  category      
 8   DscSituacaoArrecadacao   415065 non-null  category      
 9   DscSituacaoCredito       415065 non-null  category      
 10  VlrPcpPrvArr             415065 non-null  float64       
 11  VlrTotPvrArr             415065 non-null  float64       
 12  VlrTotPagArr    

In [5]:
df.columns


Index(['DatGeracaoConjuntoDados', 'Codcvnarr', 'AnmArrecadacao',
       'DatVencimentoTitulo', 'DatIncidenciaMultaMora', 'QtdDiasEmAtraso',
       'NumCPFCNPJ', 'SigNomAgente', 'DscSituacaoArrecadacao',
       'DscSituacaoCredito', 'VlrPcpPrvArr', 'VlrTotPvrArr', 'VlrTotPagArr',
       'VlrTotDifPvrPagArr', 'VlrSelic', 'AnoArrec', 'MesArrec', 'fatura_paga',
       'fatura_atrasado', 'fatura_nao_paga', 'TrimestreVencimento'],
      dtype='object')

In [6]:
df['prop_pago'] = df['VlrTotPagArr'] / df['VlrTotPvrArr']
df['prop_pago'] = df['prop_pago'].fillna(0)


primeiro

In [12]:
cols_to_drop = [
    'fatura_atrasado',
    'fatura_paga',
    'fatura_nao_paga',
    'QtdDiasEmAtraso',
    'VlrTotPvrArr',
    'VlrTotPagArr',
    'VlrTotDifPvrPagArr',
    'DscSituacaoArrecadacao',
    'DscSituacaoCredito',
    'Codcvnarr',
    'NumCPFCNPJ',
    'DatIncidenciaMultaMora',
    'DatVencimentoTitulo',
    'DatGeracaoConjuntoDados'
]

X = df.drop(columns=cols_to_drop)
y = df['fatura_atrasado']


In [None]:
numerical = X.select_dtypes(include=['float64','int64','Int64']).columns
categorical = X.select_dtypes(include=['category']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical)
    ]
)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

In [10]:
log_reg = Pipeline(steps=[
    ('prep', preprocessor),
    ('model', LogisticRegression(max_iter=200))
])

log_reg.fit(X_train, y_train)
pred = log_reg.predict(X_test)

print("Acurácia baseline:", accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

Acurácia baseline: 0.9902337399597789
              precision    recall  f1-score   support

           0       0.79      0.52      0.63      1322
           1       0.99      1.00      1.00     81719

    accuracy                           0.99     83041
   macro avg       0.89      0.76      0.81     83041
weighted avg       0.99      0.99      0.99     83041



In [11]:
result = permutation_importance(
    log_reg, X_test, y_test, n_repeats=5, random_state=42
)

importances = result.importances_mean
indices = np.argsort(importances)[::-1]

for idx in indices[:20]:
    print(X.columns[idx], importances[idx])

SigNomAgente 0.017752676388771826
VlrSelic 0.0012692525379029984
AnoArrec 0.00038535181416410325
AnmArrecadacao 6.984501631726036e-05
VlrPcpPrvArr 4.81689767704907e-05
TrimestreVencimento 1.685914186968507e-05
MesArrec 7.225346515604691e-06
prop_pago 0.0


In [None]:
rf_cls = Pipeline(steps=[
    ('prep', preprocessor),
    ('model', RandomForestClassifier(n_estimators=180))
])

rf_cls.fit(X_train, y_train)
pred_rf = rf_cls.predict(X_test)

print("Acurácia RF:", accuracy_score(y_test, pred_rf))
print(classification_report(y_test, pred_rf))
print("F1 Score RF:", f1_score(y_test, pred_rf))

Acurácia RF: 0.9968569742657242
              precision    recall  f1-score   support

           0       0.96      0.84      0.89      1322
           1       1.00      1.00      1.00     81719

    accuracy                           1.00     83041
   macro avg       0.98      0.92      0.95     83041
weighted avg       1.00      1.00      1.00     83041

F1 Score RF: 0.9984046357251573


In [None]:
xgb_cls = Pipeline(steps=[
    ('prep', preprocessor),
    ('model', XGBClassifier(
        eval_metric='logloss',
        tree_method='hist'
    ))
])

xgb_cls.fit(X_train, y_train)
pred_xgb = xgb_cls.predict(X_test)

print("Acurácia XGBClassifier:", accuracy_score(y_test, pred_xgb))
print(classification_report(y_test, pred_xgb))

Acurácia XGBClassifier: 0.9945569056249323
              precision    recall  f1-score   support

           0       0.96      0.69      0.80      1322
           1       0.99      1.00      1.00     81719

    accuracy                           0.99     83041
   macro avg       0.98      0.84      0.90     83041
weighted avg       0.99      0.99      0.99     83041



segundo

In [None]:
colunas_para_remover = [
    'VlrTotDifPvrPagArr',
    'VlrTotPvrArr',
    'VlrTotPagArr',
    'fatura_paga',
    'fatura_atrasado',
    'fatura_nao_paga',
    'NumCPFCNPJ',
    'DatGeracaoConjuntoDados',
    'DatIncidenciaMultaMora',
    'AnoArrec',
    'MesArrec',
    'TrimestreVencimento',
]
df_sample = df.sample(frac=0.20, random_state=42)

X2 = df_sample.drop(columns=colunas_para_remover)
y2 = df_sample['VlrTotDifPvrPagArr']

In [None]:
from sklearn.decomposition import TruncatedSVD
numerical = X2.select_dtypes(include=['int64', 'float64']).columns
categorical = X2.select_dtypes(include=['object', 'category']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical),
        ('cat', Pipeline([
            ('onehot', OneHotEncoder(handle_unknown='ignore')),
            ('svd', TruncatedSVD(n_components=100))
        ]), categorical)
    ]
)

In [None]:
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.2, random_state=42
)

In [None]:
lin_reg = Pipeline(steps=[
    ('prep', preprocessor),
    ('model', LinearRegression())
])

lin_reg.fit(X2_train, y2_train)
pred = lin_reg.predict(X2_test)

print("MSE (LinearRegression):", mean_squared_error(y2_test, pred))
print("R² (LinearRegression):", r2_score(y2_test, pred))


MSE (LinearRegression): 2.745828613171009e+30
R² (LinearRegression): 0.004655729699746591


In [None]:
xgb_reg = Pipeline(steps=[
    ('prep', preprocessor),
    ('model', XGBRegressor(tree_method='hist', max_depth=5, learning_rate=0.1))
])

xgb_reg.fit(X2_train, y2_train)
pred_xgb = xgb_reg.predict(X2_test)

print("MSE (XGBRegressor):", mean_squared_error(y2_test, pred_xgb))
print("R² (XGBRegressor):", r2_score(y2_test, pred_xgb))

MSE (XGBRegressor): 2.534871958674874e+30
R² (XGBRegressor): 0.08112608780121233


Terceiro

In [None]:
colunas_leak = [
    'VlrTotPagArr',
    'VlrTotDifPvrPagArr',
    'prop_pago',
    'fatura_paga',
    'fatura_atrasado',
    'fatura_nao_paga',
    'DscSituacaoCredito',
    'DscSituacaoArrecadacao',
    'DatIncidenciaMultaMora',
    'DatGeracaoConjuntoDados',
    'AnoArrec',
    'MesArrec',
    'NumCPFCNPJ'
]
y3 = df['QtdDiasEmAtraso']
X3 = df.drop(columns=colunas_leak + ['QtdDiasEmAtraso'])

X3_train, X3_test, y3_train, y3_test = train_test_split(
    X3, y3, test_size=0.2, random_state=42
)

In [17]:
numerical = X3.select_dtypes(include=['float64','int64','Int64']).columns
categorical = X3.select_dtypes(include=['category']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical)
    ]
)

In [18]:
df.columns

Index(['DatGeracaoConjuntoDados', 'Codcvnarr', 'AnmArrecadacao',
       'DatVencimentoTitulo', 'DatIncidenciaMultaMora', 'QtdDiasEmAtraso',
       'NumCPFCNPJ', 'SigNomAgente', 'DscSituacaoArrecadacao',
       'DscSituacaoCredito', 'VlrPcpPrvArr', 'VlrTotPvrArr', 'VlrTotPagArr',
       'VlrTotDifPvrPagArr', 'VlrSelic', 'AnoArrec', 'MesArrec', 'fatura_paga',
       'fatura_atrasado', 'fatura_nao_paga', 'TrimestreVencimento',
       'prop_pago'],
      dtype='object')

In [None]:
lin_reg2 = Pipeline(steps=[
    ('prep', preprocessor),
    ('model', LinearRegression())
])

lin_reg2.fit(X3_train, y3_train)
pred = lin_reg2.predict(X3_test)

print("MSE (LinearRegression - dias atraso):", mean_squared_error(y3_test, pred))
print("R² (LinearRegression - dias atraso):", r2_score(y3_test, pred))

MSE (LinearRegression - dias atraso): 7666.393753342452
R² (LinearRegression - dias atraso): 0.562489233319342


In [None]:
xgb_reg2 = Pipeline(steps=[
    ('prep', preprocessor),
    ('model', XGBRegressor(tree_method='hist', max_depth=5, learning_rate=0.1))
])

xgb_reg2.fit(X3_train, y3_train)
pred = xgb_reg2.predict(X3_test)

print("MSE (XGBRegressor - dias atraso):", mean_squared_error(y3_test, pred))
print("R² (XGBRegressor - dias atraso):", r2_score(y3_test, pred))

MSE (XGBRegressor - dias atraso): 7038.85302734375
R² (XGBRegressor - dias atraso): 0.5983021855354309
