In [1]:
import os
# Step up to the parent directory so the relative paths used later in the
# notebook resolve against it. The move is relative, so it must happen only
# once per kernel session: without the guard, re-running this cell would climb
# one more directory each time and break every relative path below.
if '_CWD_MOVED_UP' not in globals():
    os.chdir('..')
    _CWD_MOVED_UP = True

import pandas as pd
pd.set_option('display.max_rows', None)
from IPython.display import Image, display

from src import DFVC

# --- Input: raw .dfvc file and the hash later passed to compare_versions ---
raw_data_path = 'data/raw.dfvc'
raw_hash = '9a21a0d38bb4ee2b5b1d98af17072c08c9657d7031d218ad6003b03333f2baf7'

# --- Output: where the prp_s1 object is exported, and its expected hash ---
output_dir, output_filename = 'data', 'prp_s1'
output_path = '/'.join((output_dir, output_filename))
prp_s1_hash = 'd854acea4c5654937d9275b86e8e0bc1ee0ff79dd16160b9a96960f681e4a705'

In [2]:
# Load the raw dataset from its .dfvc file, show the loaded object, and hand the
# expected hash to compare_versions.
# NOTE(review): presumably compare_versions checks the object's current version
# hash against raw_hash; whether a mismatch raises or only prints is not visible
# here — confirm in src.DFVC.
raw_dfvc = DFVC.load_dfvc_file(raw_data_path)
display(raw_dfvc)
raw_dfvc.compare_versions(raw_hash)

Version integrity verified successfully.


In [None]:
# Get the tabular data held by the DFVC object and preview its first rows.
raw = raw_dfvc.get_dataframe()
raw.head()

Unnamed: 0,iframe_title,timeseries_category,timeseries_value
0,Instagram revenues - Infogram,Q1 2015,111
1,Instagram revenues - Infogram,Q2 2015,139
2,Instagram revenues - Infogram,Q3 2015,141
3,Instagram revenues - Infogram,Q4 2015,201
4,Instagram revenues - Infogram,Q1 2016,261


In [None]:
# quarter_label
# Start from an empty frame that only declares the target columns, then seed
# `quarter_label` with the distinct values of raw['timeseries_category'].
# Assigning to the empty frame creates one row per distinct label; the other
# columns stay empty here and are filled by later cells.
prp_s1_interim = pd.DataFrame(columns=['quarter', 'quarter_label', 'ig_maus', 'ig_revs', 'tk_maus', 'tk_revs'])
prp_s1_interim['quarter_label'] = raw['timeseries_category'].unique()

In [None]:
# quarter
def converter_para_data(trimestre):
    """Convert a quarter label such as ``'Q1 2015'`` to that quarter's last day.

    Parameters
    ----------
    trimestre : str
        Label made of two whitespace-separated parts: a one-character prefix
        followed by the quarter number (1-4), then the year, e.g. ``'Q3 2020'``.

    Returns
    -------
    pd.Timestamp
        31 Mar, 30 Jun, 30 Sep or 31 Dec of the given year.

    Raises
    ------
    ValueError
        If the label does not have exactly two parts, the quarter number is
        not in 1-4, or the year is not an integer.
    """
    # Quarter number -> (month, day) of the quarter's closing date.
    fim_do_trimestre = {1: (3, 31), 2: (6, 30), 3: (9, 30), 4: (12, 31)}

    partes = trimestre.split()
    if len(partes) != 2:
        raise ValueError(
            f"Invalid quarter label {trimestre!r}: expected the form 'Q<n> <year>'."
        )
    tri, ano = partes

    try:
        # Parse everything after the prefix so that e.g. 'Q10' is rejected
        # instead of being silently read as quarter 1.
        mes, dia = fim_do_trimestre[int(tri[1:])]
    except (KeyError, ValueError):
        raise ValueError(
            f"Invalid quarter label {trimestre!r}: quarter must be 1, 2, 3 or 4."
        ) from None

    return pd.Timestamp(year=int(ano), month=mes, day=dia)

# Derive the quarter-end date of every label with converter_para_data.
prp_s1_interim['quarter'] = prp_s1_interim['quarter_label'].apply(converter_para_data)

# Year in which the notebook is being executed; the cutoff below is relative to it.
# NOTE(review): the result therefore depends on the run date — re-running in a
# later year can drop rows; confirm this is intended given the hash check on the
# final frame.
current_year = pd.Timestamp.now().year

# Order chronologically, keep only quarters whose year is >= current_year - 6
# (the current year and the six calendar years before it), then renumber rows.
prp_s1_interim = (
    prp_s1_interim
    .sort_values(by='quarter')
    .loc[lambda frame: frame['quarter'].dt.year >= (current_year - 6)]
    .reset_index(drop=True)
)

In [None]:
def preencher_prp_s1_interim(raw, prp_s1_interim):
    """Copy metric values from ``raw`` into the matching rows of ``prp_s1_interim``.

    Each row of ``raw`` is routed to one target column based on substrings of
    its ``iframe_title``; its ``timeseries_value`` is then written to every row
    of ``prp_s1_interim`` whose ``quarter_label`` equals the row's
    ``timeseries_category``. Rows whose title matches no rule are ignored.

    ``prp_s1_interim`` is modified in place and also returned.
    """
    # (platform marker, ((metric marker, target column), ...)), in priority order.
    # Only the first platform found in the title is considered, and within it
    # the first metric marker found decides the column.
    routing = (
        ('Instagram', (('monthly app users', 'ig_maus'), ('revenues', 'ig_revs'))),
        ('TikTok', (('MAUs', 'tk_maus'), ('revenues', 'tk_revs'))),
    )

    for _, registro in raw.iterrows():
        titulo = registro['iframe_title']

        regras = next((metricas for plataforma, metricas in routing if plataforma in titulo), None)
        if regras is None:
            continue

        destino = next((coluna for marcador, coluna in regras if marcador in titulo), None)
        if destino is None:
            continue

        mesma_etiqueta = prp_s1_interim['quarter_label'] == registro['timeseries_category']
        prp_s1_interim.loc[mesma_etiqueta, destino] = registro['timeseries_value']

    return prp_s1_interim

# Fill prp_s1_interim with the metric values taken from raw.
prp_s1_interim = preencher_prp_s1_interim(raw, prp_s1_interim)

In [None]:
# Metric columns that must end up with an integer dtype.
columns_to_convert = ['ig_maus', 'ig_revs', 'tk_maus', 'tk_revs']
for column in columns_to_convert:
    # Report and skip any column the frame does not have.
    if column not in prp_s1_interim.columns:
        print(f"Coluna {column} não encontrada no DataFrame.")
        continue
    # Unparseable entries become NaN (errors='coerce'), NaN becomes 0, and the
    # result is cast to int.
    prp_s1_interim[column] = (
        pd.to_numeric(prp_s1_interim[column], errors='coerce')
        .fillna(0)
        .astype(int)
    )

In [None]:
# Wrap the prepared frame in a DFVC object, show it, and hand the expected hash
# to compare_versions.
# NOTE(review): presumably this verifies the frame's content hash against
# prp_s1_hash; behaviour on mismatch is not visible here — confirm in src.DFVC.
prp_s1 = DFVC(prp_s1_interim)
display(prp_s1)
prp_s1.compare_versions(prp_s1_hash)

Version integrity verified successfully.


In [None]:
# Make sure the output directory exists, then export the DFVC object.
# exist_ok=True replaces the separate os.path.exists check, so there is no gap
# between checking for the directory and creating it.
os.makedirs(output_dir, exist_ok=True)
prp_s1.export_as_dfvc_file(output_path)

DFVC object successfully saved to data/prp_s1.dfvc.
