In [3]:
# Treina modelo baseline: Regressão Logística
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, classification_report,
    mean_squared_error, r2_score, f1_score
)
from sklearn.preprocessing import OneHotEncoder

from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBClassifier, XGBRegressor



In [4]:
# Carrega o conjunto de dados limpos
df = pd.read_csv('../dados_limpos/dados.csv')


In [5]:
# Converte datas e normaliza tipos numéricos e monetários
# Ajustes de tipos das variáveis

# Datas
df['DatGeracaoConjuntoDados'] = pd.to_datetime(df['DatGeracaoConjuntoDados'], errors='coerce')
df['DatVencimentoTitulo']     = pd.to_datetime(df['DatVencimentoTitulo'], errors='coerce')
df['DatIncidenciaMultaMora']  = pd.to_datetime(df['DatIncidenciaMultaMora'], errors='coerce')

# Numérico categórico Codcvnarr
df['Codcvnarr'] = pd.to_numeric(df['Codcvnarr'], errors='coerce').astype('Int64')

# CPF/CNPJ – mantém apenas dígitos
df['NumCPFCNPJ'] = (
    df['NumCPFCNPJ']
      .astype(str)
      .str.replace(r'\D+', '', regex=True)
      .replace({'', 'nan', 'None'}, np.nan)
)

# QtdDiasEmAtraso como inteiro (pode ter nulos)
df['QtdDiasEmAtraso'] = (
    pd.to_numeric(df['QtdDiasEmAtraso'], errors='coerce')
      .astype('Int64')
)

# Função auxiliar para converter valores monetários no formato brasileiro
def to_float_br(series):
    return (
        series.astype(str)
              .str.replace('.', '', regex=False)   # remove separador de milhar
              .str.replace(',', '.', regex=False)  # troca vírgula por ponto
              .replace(['', 'nan', 'None'], np.nan)
              .astype(float)
    )

# Converte e arredonda colunas monetárias principais
for col in ['VlrPcpPrvArr', 'VlrTotPvrArr', 'VlrTotPagArr', 'VlrTotDifPvrPagArr', 'VlrSelic']:
    df[col] = to_float_br(df[col]).round(2)



In [None]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 415203 entries, 0 to 415202
Data columns (total 21 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   DatGeracaoConjuntoDados  415203 non-null  datetime64[ns]
 1   Codcvnarr                415203 non-null  Int64         
 2   AnmArrecadacao           415203 non-null  object        
 3   DatVencimentoTitulo      415203 non-null  datetime64[ns]
 4   DatIncidenciaMultaMora   415203 non-null  datetime64[ns]
 5   QtdDiasEmAtraso          415203 non-null  Int64         
 6   NumCPFCNPJ               415203 non-null  object        
 7   SigNomAgente             415203 non-null  object        
 8   DscSituacaoArrecadacao   415203 non-null  object        
 9   DscSituacaoCredito       415203 non-null  object        
 10  VlrPcpPrvArr             415203 non-null  float64       
 11  VlrTotPvrArr             415203 non-null  float64       
 12  VlrTotPagArr    

In [None]:
df.columns


Index(['DatGeracaoConjuntoDados', 'Codcvnarr', 'AnmArrecadacao',
       'DatVencimentoTitulo', 'DatIncidenciaMultaMora', 'QtdDiasEmAtraso',
       'NumCPFCNPJ', 'SigNomAgente', 'DscSituacaoArrecadacao',
       'DscSituacaoCredito', 'VlrPcpPrvArr', 'VlrTotPvrArr', 'VlrTotPagArr',
       'VlrTotDifPvrPagArr', 'VlrSelic', 'AnoArrec', 'MesArrec', 'fatura_paga',
       'fatura_atrasado', 'fatura_nao_paga', 'TrimestreVencimento'],
      dtype='object')

In [8]:
# Cria variável derivada representando proporção paga
df['prop_pago'] = df['VlrTotPagArr'] / df['VlrTotPvrArr']
df['prop_pago'] = df['prop_pago'].fillna(0)



In [26]:
# Define colunas a remover e separa X e y para classificação
cols_to_drop = [
    'fatura_atrasado',
    'fatura_paga',
    'fatura_nao_paga',
    'QtdDiasEmAtraso',
    'VlrTotPvrArr',
    'VlrTotPagArr',
    'VlrTotDifPvrPagArr',
    'DscSituacaoArrecadacao',
    'DscSituacaoCredito',
    'Codcvnarr',
    'NumCPFCNPJ',
    'DatIncidenciaMultaMora',
    'DatVencimentoTitulo',
    'DatGeracaoConjuntoDados'
]

X = df.drop(columns=cols_to_drop)
y = df['fatura_atrasado']
X


Unnamed: 0,AnmArrecadacao,SigNomAgente,VlrPcpPrvArr,VlrSelic,AnoArrec,MesArrec,TrimestreVencimento,prop_pago
0,2005-11,WOBBEN WINDPOWER INDUSTRIA E COMERCIO LTDA,312038.0,202.0,2005,11,4,1.00000
1,2020-10,WOBBEN WINDPOWER INDUSTRIA E COMERCIO LTDA,25491.0,51.0,2020,10,4,1.00000
2,2007-01,WOBBEN WINDPOWER INDUSTRIA E COMERCIO LTDA,120508.0,18.0,2007,1,1,1.00000
3,2014-07,WOBBEN WINDPOWER INDUSTRIA E COMERCIO LTDA,156877.0,105.0,2014,7,3,1.00000
4,2014-12,WOBBEN WINDPOWER INDUSTRIA E COMERCIO LTDA,156877.0,10.0,2014,12,1,1.00000
...,...,...,...,...,...,...,...,...
415198,2017-08,AGROPEU-AGRO INDUSTRIAL DE POMPEU S/A,523561.0,67.0,2017,8,3,1.00000
415199,2017-05,AGROPEU-AGRO INDUSTRIAL DE POMPEU S/A,523561.0,7.0,2017,5,2,1.00000
415200,2017-07,AGROPEU-AGRO INDUSTRIAL DE POMPEU S/A,523561.0,68.0,2017,7,3,0.99671
415201,2017-07,AGROPEU-AGRO INDUSTRIAL DE POMPEU S/A,1728.0,68.0,2017,7,3,1.00000


In [None]:
numerical = X.select_dtypes(include=['float64','int64','Int64']).columns
categorical = X.select_dtypes(include=['category']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical)
    ]
)

In [11]:
# Divide dados em treino e teste com estratificação
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)



In [12]:
# Treina modelo baseline: Regressão Logística
log_reg = Pipeline(steps=[
    ('prep', preprocessor),
    ('model', LogisticRegression(max_iter=200))
])

log_reg.fit(X_train, y_train)
pred = log_reg.predict(X_test)

print("Acurácia baseline:", accuracy_score(y_test, pred))
print(classification_report(y_test, pred))
print("F1 Score:", f1_score(y_test, pred))



Acurácia baseline: 0.9764092436266423
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1959
           1       0.98      1.00      0.99     81082

    accuracy                           0.98     83041
   macro avg       0.49      0.50      0.49     83041
weighted avg       0.95      0.98      0.96     83041

F1 Score: 0.9880638301761484


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [None]:
# Treina modelo RandomForestClassifier para classificação
# inicialmente testamos com svm, mas demorou mais de 50 minutos, então passamos a usar o RandomForestClassifier
rf_cls = Pipeline(steps=[
    ('prep', preprocessor),
    ('model', RandomForestClassifier(n_estimators=200))
])

rf_cls.fit(X_train, y_train)
pred_rf = rf_cls.predict(X_test)

print("Acurácia RF:", accuracy_score(y_test, pred_rf))
print(classification_report(y_test, pred_rf))
print("F1 Score RF:", f1_score(y_test, pred_rf))


Acurácia RF: 0.982791633048735
              precision    recall  f1-score   support

           0       0.71      0.46      0.56      1959
           1       0.99      1.00      0.99     81082

    accuracy                           0.98     83041
   macro avg       0.85      0.73      0.78     83041
weighted avg       0.98      0.98      0.98     83041

F1 Score RF: 0.991224137613383


In [14]:
# Treina modelo XGBoostClassifier para classificação
xgb_cls = Pipeline(steps=[
    ('prep', preprocessor),
    ('model', XGBClassifier(
        eval_metric='logloss',
        tree_method='hist'
    ))
])

xgb_cls.fit(X_train, y_train)
pred_xgb = xgb_cls.predict(X_test)

print("Acurácia XGBClassifier:", accuracy_score(y_test, pred_xgb))



Acurácia XGBClassifier: 0.9858503630736624


In [15]:
# Prepara X2 e y2 para regressão da diferença prevista/paga
X2 = df.drop(columns=['VlrTotDifPvrPagArr'])
y2 = df['VlrTotDifPvrPagArr']


In [16]:
# Divide X2 e y2 em treino e teste
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.2, random_state=42
)



In [17]:
# Divide X2 e y2 em treino e teste
lin_reg = Pipeline(steps=[
    ('prep', preprocessor),
    ('model', LinearRegression())
])

lin_reg.fit(X2_train, y2_train)
pred = lin_reg.predict(X2_test)

print("MSE (LinearRegression):", mean_squared_error(y2_test, pred))
print("R² (LinearRegression):", r2_score(y2_test, pred))



MSE (LinearRegression): 3.1720447483143884e+30
R² (LinearRegression): 0.0010254984664199718


In [18]:
# Divide X2 e y2 em treino e teste
rf_reg = Pipeline(steps=[
    ('prep', preprocessor),
    ('model', RandomForestRegressor())
])

rf_reg.fit(X2_train, y2_train)
pred_rf = rf_reg.predict(X2_test)

print("MSE (RandomForestRegressor):", mean_squared_error(y2_test, pred_rf))
print("R² (RandomForestRegressor):", r2_score(y2_test, pred_rf))



MSE (RandomForestRegressor): 3.6633258589427696e+30
R² (RandomForestRegressor): -0.15369404099267747


In [19]:
# Divide X2 e y2 em treino e teste
xgb_reg = Pipeline(steps=[
    ('prep', preprocessor),
    ('model', XGBRegressor(tree_method='hist'))
])

xgb_reg.fit(X2_train, y2_train)
pred_xgb = xgb_reg.predict(X2_test)

print("MSE (XGBRegressor):", mean_squared_error(y2_test, pred_xgb))
print("R² (XGBRegressor):", r2_score(y2_test, pred_xgb))



MSE (XGBRegressor): 3.1398295609371585e+30
R² (XGBRegressor): 0.011171052298559747


In [20]:
# Prepara X3 e y3 para regressão de dias de atraso
y3 = df['QtdDiasEmAtraso']
X3 = df.drop(columns=['QtdDiasEmAtraso'])

X3_train, X3_test, y3_train, y3_test = train_test_split(
    X3, y3, test_size=0.2, random_state=42
)



In [21]:
# Prepara X3 e y3 para regressão de dias de atraso
lin_reg2 = Pipeline(steps=[
    ('prep', preprocessor),
    ('model', LinearRegression())
])

lin_reg2.fit(X3_train, y3_train)
pred = lin_reg2.predict(X3_test)

print("MSE (LinearRegression - dias atraso):", mean_squared_error(y3_test, pred))
print("R² (LinearRegression - dias atraso):", r2_score(y3_test, pred))



MSE (LinearRegression - dias atraso): 25588.469935923826
R² (LinearRegression - dias atraso): 0.005424526653712736


In [22]:
# Prepara X3 e y3 para regressão de dias de atraso
rf_reg2 = Pipeline(steps=[
    ('prep', preprocessor),
    ('model', RandomForestRegressor())
])

rf_reg2.fit(X3_train, y3_train)
pred = rf_reg2.predict(X3_test)

print("MSE (RandomForestRegressor - dias atraso):", mean_squared_error(y3_test, pred))
print("R² (RandomForestRegressor - dias atraso):", r2_score(y3_test, pred))



MSE (RandomForestRegressor - dias atraso): 12117.1644031051
R² (RandomForestRegressor - dias atraso): 0.5290287167614525


In [23]:
# Prepara X3 e y3 para regressão de dias de atraso
xgb_reg2 = Pipeline(steps=[
    ('prep', preprocessor),
    ('model', XGBRegressor(tree_method='hist'))
])

xgb_reg2.fit(X3_train, y3_train)
pred = xgb_reg2.predict(X3_test)

print("MSE (XGBRegressor - dias atraso):", mean_squared_error(y3_test, pred))
print("R² (XGBRegressor - dias atraso):", r2_score(y3_test, pred))



MSE (XGBRegressor - dias atraso): 14767.939453125
R² (XGBRegressor - dias atraso): 0.42599809169769287


In [24]:
# Divide X2 e y2 em treino e teste
param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [10, 20, None],
}

grid = GridSearchCV(
    rf_reg, param_grid, cv=3, scoring='r2', n_jobs=-1
)

grid.fit(X2_train, y2_train)

print("Melhor combinação de hiperparâmetros:", grid.best_params_)
print("Melhor R² médio (validação cruzada):", grid.best_score_)



Melhor combinação de hiperparâmetros: {'model__max_depth': 10, 'model__n_estimators': 200}
Melhor R² médio (validação cruzada): 0.0348217589352664


In [25]:
# Extrai importância das variáveis do melhor modelo
best_rf = grid.best_estimator_
rf_model = best_rf['model']
prep_best = best_rf['prep']

importances = rf_model.feature_importances_
feat_names = prep_best.get_feature_names_out()

feature_importance = pd.DataFrame({
    'Feature': feat_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

feature_importance.head(20)



Unnamed: 0,Feature,Importance
0,num__VlrPcpPrvArr,0.524041
1,num__VlrSelic,0.187746
3,num__MesArrec,0.09838
5,num__prop_pago,0.085714
2,num__AnoArrec,0.064567
4,num__TrimestreVencimento,0.039553
