In [151]:
%matplotlib inline
import numpy as np
import pandas as pd
import pandas_profiling
import matplotlib.pyplot as plt
import seaborn as sns
from quickda.explore_data import *
from quickda.clean_data import *
from quickda.explore_numeric import *
from quickda.explore_categoric import *
from quickda.explore_numeric_categoric import *
from quickda.explore_time_series import *

import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.neighbors import NearestNeighbors
from sklearn.impute import KNNImputer

# Global plot styling: seaborn "whitegrid" axes style with the "notebook" scaling context.
sns.set_style('whitegrid')
sns.set_context('notebook')

# NOTE(review): with no category/module filter this silences every warning for the
# rest of the session, including pandas deprecation and chained-assignment warnings
# that would otherwise point at problems in the cells below. Consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

In [141]:
# Read the credit-request CSV into `df`, then print its column / dtype / non-null summary.
df = pd.read_csv(filepath_or_buffer="solicitacoescredito.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8973 entries, 0 to 8972
Data columns (total 38 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   numero_solicitacao           8973 non-null   int64  
 1   razaoSocial                  8973 non-null   object 
 2   nomeFantasia                 8973 non-null   object 
 3   cnpjSemTraco                 8973 non-null   object 
 4   maiorAtraso                  8973 non-null   int64  
 5   margemBrutaAcumulada         8973 non-null   float64
 6   percentualProtestos          7475 non-null   float64
 7   primeiraCompra               8867 non-null   object 
 8   prazoMedioRecebimentoVendas  8973 non-null   int64  
 9   titulosEmAberto              8973 non-null   float64
 10  valorSolicitado              8973 non-null   float64
 11  status                       8973 non-null   object 
 12  definicaoRisco               8973 non-null   object 
 13  diferencaPercentua

In [142]:
# Keep only the two approval-date columns and the approved amount.
# The explicit .copy() makes `df_temporal` an independent frame, so the in-place
# cleaning applied to it in later cells is never performed on a selection of `df`
# (the pattern that triggers pandas' SettingWithCopyWarning).
df_temporal = df[['dataAprovadoEmComite', 'dataAprovadoNivelAnalista', 'valorAprovado']].copy()

In [143]:
# Print the non-null counts of the selected columns to see how sparse each date column is.
df_temporal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8973 entries, 0 to 8972
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   dataAprovadoEmComite       558 non-null    object 
 1   dataAprovadoNivelAnalista  7011 non-null   object 
 2   valorAprovado              7569 non-null   float64
dtypes: float64(1), object(2)
memory usage: 210.4+ KB


In [144]:
# Collapse the two approval-date columns into a single `data` column:
#   1. take the committee approval date and, where it is missing, fall back to the
#      analyst-level approval date;
#   2. drop the now-redundant analyst column;
#   3. rename the merged column to `data`;
#   4. discard rows that still lack a date or an approved amount.
# The steps are chained on new frames instead of calling
# `df_temporal[col].fillna(..., inplace=True)`: that form is chained assignment on a
# temporary Series and does not update `df_temporal` when pandas Copy-on-Write is
# active, which would silently drop every analyst-approved row in step 4.
df_temporal = (
    df_temporal
    .assign(
        dataAprovadoEmComite=lambda frame: frame['dataAprovadoEmComite'].fillna(
            frame['dataAprovadoNivelAnalista']
        )
    )
    .drop(columns=['dataAprovadoNivelAnalista'])
    .rename(columns={'dataAprovadoEmComite': 'data'})
    .dropna()
)

In [145]:
# Reduce each timestamp string to its calendar date and total the approved value per day.
# NOTE(review): the original removed the last 9 characters of every string; the dates
# shown in the output are 10 characters long (YYYY-MM-DD), so the inputs are presumably
# 19-character timestamps and keeping the first 10 characters yields the same value.
# Confirm that every timestamp in the file has that layout.
# Keeping the leading 10 characters (rather than stripping a suffix) makes the cell
# safe to re-run: a second execution leaves the already-trimmed dates unchanged.
df_temporal['data'] = df_temporal['data'].str[:10]

# One row per date, with the remaining columns summed, ordered chronologically
# (ISO dates sort correctly as plain strings).
df_ano = (
    df_temporal
    .groupby('data', as_index=False)
    .sum()
    .sort_values('data')
)

In [146]:
# Display the per-date totals; only dates that have at least one approval appear here.
df_ano

Unnamed: 0,data,valorAprovado
0,2020-02-03,50000.0
1,2020-02-04,1243000.0
2,2020-02-05,2952000.0
3,2020-02-06,5545000.0
4,2020-02-07,4685000.0
...,...,...
259,2021-02-19,7911900.0
260,2021-02-22,4272000.0
261,2021-02-23,13826500.0
262,2021-02-24,7255000.0


In [149]:
# Build a frame with one row per calendar day in the studied window, formatted as
# YYYY-MM-DD strings so the values match the string keys in `df_ano`.
# `DatetimeIndex.strftime` replaces `DatetimeIndex.format(formatter=...)`, which is
# deprecated in recent pandas releases.
df_dias = pd.DataFrame(
    {'data': pd.date_range(start='2020-02-03', end='2021-02-25').strftime('%Y-%m-%d')}
)

# Left join on the date so every calendar day is kept; days with no approvals get
# NaN in the value column. The join key is spelled out instead of relying on
# pandas' implicit "all shared column names" default.
df_data_final = pd.merge(df_dias, df_ano, how='left', on='data')

In [150]:
# Display the day-by-day frame after the left join; days without approvals show as NaN.
df_data_final

Unnamed: 0,data,valorAprovado
0,2020-02-03,50000.0
1,2020-02-04,1243000.0
2,2020-02-05,2952000.0
3,2020-02-06,5545000.0
4,2020-02-07,4685000.0
...,...,...
384,2021-02-21,
385,2021-02-22,4272000.0
386,2021-02-23,13826500.0
387,2021-02-24,7255000.0


In [152]:
# Fill the missing daily totals with scikit-learn's KNNImputer.
# NOTE(review): the imputer is fitted on a single feature column, so a row whose value
# is missing has no defined distance to any other row. Per the KNNImputer docs the
# column average is used in that situation, i.e. this is effectively mean imputation —
# confirm that is the intended treatment for days without approvals.
imputer = KNNImputer(n_neighbors=5)
valores_imputados = imputer.fit_transform(df_data_final[['valorAprovado']])

# fit_transform returns an (n_rows, 1) array; write its only column back to the frame.
df_data_final['valorAprovado'] = valores_imputados[:, 0]

In [153]:
# Display the frame again after imputation to check that no NaN values remain.
df_data_final

Unnamed: 0,data,valorAprovado
0,2020-02-03,5.000000e+04
1,2020-02-04,1.243000e+06
2,2020-02-05,2.952000e+06
3,2020-02-06,5.545000e+06
4,2020-02-07,4.685000e+06
...,...,...
384,2021-02-21,5.441439e+06
385,2021-02-22,4.272000e+06
386,2021-02-23,1.382650e+07
387,2021-02-24,7.255000e+06


In [154]:
# Write the completed daily series to CSV; index=False leaves out the row-index column.
df_data_final.to_csv(path_or_buf='valores_dia.csv', index=False)