# ETL Reporte para OBEX

* Extraer la data de los JSON
* Transformar en una tabla con los campos requeridos
* Guardar en formato CSV

In [46]:
import boto3
import json
import pandas as pd
from tqdm.notebook import trange, tqdm

In [24]:
# S3 bucket whose objects are listed and parsed as JSON further down.
bucketname = "bi-obex-study-json"
# Key of a single object used to try out the download/parse step.
test_file = "77559056-4.json"

In [26]:
# S3 client shared by every download below. No credentials or region are
# passed here, so boto3's default configuration is what applies.
client = boto3.client('s3')

In [None]:
# Smoke test: fetch one known object and check that it parses as JSON.
fileobj = client.get_object(Bucket=bucketname, Key=test_file)

# 'Body' is a stream; read() returns its raw content, which is handed to
# json.loads as-is (no explicit decode step in this cell).
filedata = fileobj['Body'].read()
test_json = json.loads(filedata)

# Uncomment to inspect the document structure:
# print(json.dumps(test_json, indent=4, sort_keys=True))

In [None]:

# Upper bound on how many objects are downloaded and parsed. Objects past the
# cap are still counted in j but are not fetched; lower it for a partial run.
MAX_FILES = 10000000


def _study_to_row(obj_json):
    """Flatten one decoded study document into a report row.

    Args:
        obj_json: dict produced by json.loads for a single bucket object.

    Returns:
        dict with the keys rut, razon_social, fecha_actuacion, fecha_estudio,
        socios, pre_aprobado, tipo_firma and apoderados.

    Raises:
        KeyError / IndexError: if the document lacks any of the fields read
        below or has an empty 'prestudios' list.
    """
    estado = obj_json['data']['estado']
    estudio = obj_json['data']['estudio']
    prestudios = estudio['prestudios']
    last_prestudio = prestudios[-1]
    administracion = last_prestudio['administracion']['valor']

    # NOTE(review): fecha_actuacion is read from the FIRST prestudio while
    # socios / tipo_firma / apoderados come from the LAST one - confirm this
    # asymmetry is intended.
    return {
        'rut': estado['rut'],
        'razon_social': estado['razonSocial'],
        'fecha_actuacion': prestudios[0]['fecha'],
        'fecha_estudio': estudio['fechaEstudio'],
        'socios': last_prestudio['accionistas']['valor'],
        'pre_aprobado': estudio['preAprobado'],
        'tipo_firma': administracion['comoAdministran'],
        'apoderados': administracion['administradores'],
    }


paginator = client.get_paginator('list_objects_v2')
pages = paginator.paginate(Bucket=bucketname)

data = []

# j counts every object listed in the bucket, parsed or not.
j = 0
for page in tqdm(pages):
    # A page with no keys carries no 'Contents' entry, so default to an empty
    # list instead of raising KeyError.
    for obj in tqdm(page.get('Contents', [])):
        if j < MAX_FILES:
            # Download the object and decode it as UTF-8 JSON.
            fileobj = client.get_object(Bucket=bucketname, Key=obj['Key'])
            filedata = fileobj['Body'].read().decode('utf-8')
            data.append(_study_to_row(json.loads(filedata)))
        j += 1

print(j)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

In [None]:
# One row per parsed document; columns come from the keys of each row dict.
df = pd.DataFrame(data)

## Transformations

In [None]:
# Preview the first 10 rows.
df.head(10)

Fechas

In [None]:
# Parse the raw 'fecha_actuacion' values into datetimes in a new column; the
# raw column is kept until the columns are reorganised further down.
# Alternative format tried earlier, kept for reference:
#df['fecha_actuacion_2'] = pd.to_datetime(df['fecha_actuacion'], format='%Y%m%d%H%M%S%f')
df['fecha_actuacion_2'] = pd.to_datetime(df['fecha_actuacion'], format='%Y-%m-%d')

In [None]:
# Check the newly added date column on the first 2 rows.
df.head(2)

Expandir Socios

In [None]:
#df.loc[0, 'socios']

In [None]:
# Length of each row's 'socios' and 'apoderados' collections.
df['n_socios'] = df['socios'].apply(len)
df['n_apoderados'] = df['apoderados'].apply(len)

In [None]:
def expand_list(series, field):
    """Spread a column of lists of dicts into one column per list position.

    Each cell of ``series`` is expected to be a list of dicts. Position ``k``
    (0-based) of every list becomes the column
    ``<field>_<series.name>_<k + 1>``, holding ``dict.get(field)`` for that
    entry. Rows whose list is shorter than the longest one get ``None`` in
    the trailing columns.

    Args:
        series: named pandas Series whose cells are lists of dicts.
        field: key to extract from every dict.

    Returns:
        DataFrame with the same index as ``series`` and one column per list
        position (no columns when every list is empty).
    """
    # Keep the caller's index so the result lines up row-for-row when it is
    # later concatenated side by side with the originating frame.
    list_df = pd.DataFrame(series.tolist(), index=series.index)

    def _extract(entry):
        # Padding cells for shorter lists are not dicts; map them to None.
        return entry.get(field) if isinstance(entry, dict) else None

    for col in list_df.columns:
        list_df[col] = list_df[col].apply(_extract)

    # Positional labels 0..n-1 become 1-based, human-readable names.
    list_df.columns = [
        field + '_' + series.name + '_' + str(col + 1)
        for col in list_df.columns
    ]
    return list_df

def clean_ruts(series):
    """Strip the dot and hyphen separators from RUT strings.

    Args:
        series: pandas Series of strings (missing values are allowed).

    Returns:
        New Series with every '.' and '-' removed from each string; missing
        values stay missing.
    """
    # regex=False makes both replacements literal. The earlier backslash-dot
    # pattern only worked while regex matching was the default; under
    # pandas 2.x it is taken literally and the dots were left in place.
    series = series.str.replace('.', '', regex=False)
    series = series.str.replace('-', '', regex=False)

    return series

In [None]:
# Expand the 'rut' of every socio / apoderado into its own column.
df_socios = expand_list(df['socios'], 'rut')
df_apoderados = expand_list(df['apoderados'], 'rut')

# Normalise every expanded RUT column in both frames.
for frame in (df_socios, df_apoderados):
    for col in frame:
        frame[col] = clean_ruts(frame[col])
    

In [None]:
# Display the expanded socios RUT columns.
df_socios

In [None]:
# Place the expanded RUT columns next to the base table (column-wise concat).
df1 = pd.concat([df, df_socios, df_apoderados], axis=1)

In [None]:
# Preview the combined table.
df1.head(2)

Organizar columnas

In [None]:
# Replace the raw date column with its parsed counterpart, keeping the
# original column name.
df1 = (
    df1.drop(columns=['fecha_actuacion'])
       .rename(columns={'fecha_actuacion_2': 'fecha_actuacion'})
)

In [None]:
# The list-valued columns are no longer needed once expanded.
df1 = df1.drop(columns=['socios', 'apoderados'])

Reorder Columns

In [None]:
# Largest per-row counts; they determine how many expanded RUT columns are
# selected below.
max_socios = df1['n_socios'].max()
max_apoderados = df1['n_apoderados'].max()

In [None]:
# Expanded column names are 1-based: <prefix>1 ... <prefix><max count>.
col_socios = ['rut_socios_' + str(pos + 1) for pos in range(max_socios)]
col_apoderados = ['rut_apoderados_' + str(pos + 1) for pos in range(max_apoderados)]

In [None]:
# List the columns currently present.
df1.columns

In [None]:
# (rows, columns) of the combined table.
df1.shape

In [None]:
# Final column order: identity fields, then each count followed by its
# expanded RUT columns.
cols = [
    'rut',
    'razon_social',
    'tipo_firma',
    'fecha_actuacion',
    'fecha_estudio',
    'pre_aprobado',
    'n_socios',
    *col_socios,
    'n_apoderados',
    *col_apoderados,
]
# cols

In [None]:
# Keep only the report columns, in the order defined by cols.
df1 = df1.loc[:, cols]

## EDA

In [None]:
# Frequency of each tipo_firma value.
df1['tipo_firma'].value_counts()

In [None]:
# Frequency of each pre_aprobado value.
df1['pre_aprobado'].value_counts()

In [None]:
# Distribution of the number of socios per row.
df1['n_socios'].value_counts()

In [None]:
# Distribution of the number of apoderados per row.
df1['n_apoderados'].value_counts()

In [None]:
# Daily volume: count of non-null 'rut' values per calendar day of
# fecha_actuacion.
vol_dia = df1.groupby(pd.Grouper(key='fecha_actuacion', freq='d'))['rut'].count()
vol_dia

## Output

In [None]:
# Reduce company names to title-cased ASCII.
# NFKD splits an accented letter into base letter + combining mark, and the
# ASCII encode with errors='ignore' then drops the marks (and any other
# non-ASCII character).
# Title-casing must run AFTER the marks are gone: a combining mark is an
# uncased character, so str.title() would treat the letter that follows it as
# the start of a new word ("Construcción" -> "ConstruccioN").
df1['razon_social'] = (
    df1['razon_social']
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
    .str.title()
)

In [None]:
#df1

In [None]:
# Write the final report. QUOTE_NONNUMERIC (the named form of the previous
# literal 2) quotes every non-numeric field.
import csv

df1.to_csv('../reports/100. Obex intelligence/corporate_intelligence_DEMO-202205.csv',
           index=False,
           encoding='utf-8',
           quoting=csv.QUOTE_NONNUMERIC)