# Setup

## Webscraping Setup

In [1]:
from playwright.async_api import async_playwright
from tqdm.asyncio import tqdm

targets_url = ['https://www.businessofapps.com/data/instagram-statistics/', 
               'https://www.businessofapps.com/data/tik-tok-statistics/']
target_data_selector = 'infogram-embed'
target_data_iframe_selector = 'iframe[title="{chart_iframe_title_goes_here}"]'

p = await async_playwright().start()
browser = await p.chromium.launch()
context = await browser.new_context()

## Data Export and Integrity Setup

In [None]:
import os

# Move up to the parent directory so `src` and the relative `data/` paths resolve.
# Guarded on the presence of `src` so that re-running this cell does not climb
# one more directory each time (a bare os.chdir('..') is not idempotent).
if not (os.path.isdir('src') or os.path.isfile('src.py')):
    os.chdir('..')

from IPython.display import Image, display
import pandas as pd
from src import DFVC

# Lift pandas' row/column display limits so frames are shown in full.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

raw_data_path = 'data/raw.dfvc'
# NOTE(review): presumably the expected SHA-256 digest of the raw data file — confirm where it is checked.
raw_hash = '9a21a0d38bb4ee2b5b1d98af17072c08c9657d7031d218ad6003b03333f2baf7'
output_dir = 'data'
output_filename = 'raw'
# Output location without a file extension.
output_path = f'{output_dir}/{output_filename}'


### Target Pages Reacher

In [None]:
ig = await context.new_page()
tk = await context.new_page()

await ig.goto(targets_url[0])
print(await ig.title())

await tk.goto(targets_url[1])
print(await tk.title())

### Target Data (charts) Locator

In [3]:
async def chart_finder(page: 'PageObject', chart_selector: str, screenshot: bool = True,
                       settle_ms: int = 2000) -> list:
    """
    Find the charts on a page and build one iframe selector per chart.

    Every element carrying the CSS class ``chart_selector`` is scrolled into view,
    then its ``data-title`` attribute is turned into ``iframe[title="<data-title>"]``.
    Elements without a ``data-title`` are skipped, since no usable selector can be
    built for them.

    Args:
        page (PageObject): Playwright page object to search.
        chart_selector (str): CSS class name (without the leading dot) that identifies the charts.
        screenshot (bool, optional): If True, print the page title and the selector and
            display a screenshot of the page for each chart found. Defaults to True.
        settle_ms (int, optional): Milliseconds to wait after scrolling each chart into
            view. Defaults to 2000.

    Returns:
        list: iframe selectors, in the order the chart elements were returned by the page.
    """
    iframe_selectors = []
    charts = await page.query_selector_all(f'.{chart_selector}')

    for chart in charts:
        await chart.scroll_into_view_if_needed()
        # NOTE(review): presumably gives the embedded chart time to render — confirm.
        await page.wait_for_timeout(settle_ms)

        title = await chart.get_attribute('data-title')
        if title is None:
            # A missing attribute would otherwise produce the bogus selector 'iframe[title="None"]'.
            continue

        iframe_selector = f'iframe[title="{title}"]'
        iframe_selectors.append(iframe_selector)

        if screenshot:
            page_title = await page.title()
            print(f'Page: {page_title} || Chart iframe: {iframe_selector}')
            display(Image(await page.screenshot()))

    return iframe_selectors

ig_iframes = await chart_finder(page=ig, chart_selector=target_data_selector)
tk_iframes = await chart_finder(page=tk, chart_selector=target_data_selector)

### Target Data Iframe Selector Constructor

In [4]:
# Show both selector lists, one list per line, to decide which charts to drop.
print(ig_iframes, tk_iframes, sep='\n')

In [5]:
# Selectors of the charts that are excluded from scraping by default.
iframe_to_remove_1 = 'iframe[title="Social App Users"]'
iframe_to_remove_2 = 'iframe[title="TikTok Quarterly Downloads"]'

def remove_iframe(iframes, to_remove=None):
    """
    Return a copy of ``iframes`` without the unwanted iframe selectors.

    The input list is left untouched, so the cell that calls this can be re-run
    safely, and every occurrence of an unwanted selector is dropped (not only
    the first one).

    Args:
        iframes (list): iframe selectors to filter.
        to_remove (iterable of str, optional): Selectors to drop. Defaults to
            ``iframe_to_remove_1`` and ``iframe_to_remove_2``.

    Returns:
        list: New list with the remaining selectors in their original order.
    """
    if to_remove is None:
        to_remove = (iframe_to_remove_1, iframe_to_remove_2)
    excluded = set(to_remove)
    return [selector for selector in iframes if selector not in excluded]

# Drop the unwanted charts from both selector lists.
ig_iframes, tk_iframes = remove_iframe(ig_iframes), remove_iframe(tk_iframes)

In [6]:
# Show both selector lists again, one list per line, to confirm the removal.
print(ig_iframes, tk_iframes, sep='\n')

### Target Data Webscraper

In [7]:
async def barchart_scraper(page: 'PageObject', target_iframes: list) -> pd.DataFrame:
    """
    Scrape the datapoints of every bar chart by hovering its bars and reading the tooltip.

    Args:
        page (PageObject): Playwright page object that hosts the chart iframes.
        target_iframes (list): iframe selectors, one per bar chart to scrape.

    Returns:
        pd.DataFrame: One row per scraped datapoint with the columns ``iframe_title``,
        ``timeseries_category`` and ``timeseries_value``, holding the text read from
        the page. The columns are present even when nothing was scraped.
    """
    result_columns = ['iframe_title', 'timeseries_category', 'timeseries_value']
    scraped_barchart_data = []

    print('Webscraping process started')
    print(f'Page: {await page.title()}')
    print()
    for barchart in target_iframes:
        # Resolve the iframe once and reuse it for every lookup inside this chart.
        frame = page.frame_locator(barchart)
        barchart_iframe_title = await frame.locator('title').inner_text()

        # Number of bars to visit in this chart.
        columns_qtd = len(await frame.locator('.igc-graph .igc-column').all())

        # Opening message, printed before the progress bar
        print(f'-> barchart ({barchart_iframe_title}):', end='')

        # One progress bar per chart
        pbar = tqdm(total=columns_qtd, desc='Progress', unit=' Datapoint', leave=True)
        try:
            # CSS :nth-child() is 1-based, hence the 1..columns_qtd range.
            for i in range(1, columns_qtd + 1):
                # Hovering a bar makes the tooltip show that bar's category and value.
                await frame.locator(f'path:nth-child({i})').hover()

                target_time_series_category = await frame.locator('.tt_text').inner_text()
                target_time_series_value = await frame.locator('.tt_value').inner_text()

                scraped_barchart_data.append({
                    'iframe_title': barchart_iframe_title,
                    'timeseries_category': target_time_series_category,
                    'timeseries_value': target_time_series_value
                })

                pbar.update(1)
        finally:
            # Close the bar even when a hover or tooltip read fails, so no dangling bar is left.
            pbar.close()

        # Completion message, printed after the progress bar is closed
        print(f'-> barchart ({barchart_iframe_title}) webscraping ended')
        print()

    print('Webscraping process ended')
    print('-'*70)
    print()

    return pd.DataFrame(scraped_barchart_data, columns=result_columns)

In [8]:
raw_scraped_data = []
raw_scraped_data.append(await barchart_scraper(page=ig, target_iframes=ig_iframes))
raw_scraped_data.append(await barchart_scraper(page=tk, target_iframes=tk_iframes))