# Setup

## Parser Setup

In [1]:
import configparser

# Build the parser first, then point it at the INI file whose sections
# ('webscraping', 'database') are looked up by the later cells.
config = configparser.ConfigParser()
file_path = 'config.ini'

# `read` returns the list of files it actually parsed; keeping the call as the
# cell's last expression makes that list show up as the cell output.
config.read(file_path)

['config.ini']

## Webscraping Setup

In [2]:
from playwright.async_api import async_playwright, Page, BrowserContext
from tqdm.asyncio import tqdm

# Define constants/webscraping parameters from config.ini
TARGETS_URL = config['webscraping']['targets_url'].split(', ')
IG_IFRAMES = config['webscraping']['ig_iframes'].split(', ')
TK_IFRAMES = config['webscraping']['tk_iframes'].split(', ')

# Start Playwright
playwright = await async_playwright().start()
browser = await playwright.chromium.launch()
context = await browser.new_context()

## Data Exportation and Integrity Setup

In [None]:
import os
from IPython.display import Image, display
import pandas as pd

# Change to the parent directory to access the DFVC module.
# The guard keeps this cell idempotent: once the working directory already
# contains the `src` package, re-running the cell must not climb another level
# (which would break the import below and every relative path built from config).
if not os.path.isdir('src'):
    os.chdir('..')
from src import DFVC

# Configure pandas display options for better readability
# (None removes the row/column limits, so frames are displayed in full).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Define constants for raw data extraction/exportation, all read from the
# [database] section of config.ini.
DB_DIR = config['database']['db_dir']
RAW_DF_NAME = config['database']['raw_data_df_name']
RAW_DATA_FILENAME = config['database']['raw_data_filename']
RAW_HASH = config['database']['raw_data_version_hash']

# Destination of the exported raw dataset: <db_dir>/<raw_data_filename>.
RAW_DATA_PATH = os.path.join(DB_DIR, RAW_DATA_FILENAME)

# Webscraping

### Target Pages Reacher

In [4]:
async def target_page_reacher(
    context: BrowserContext,
    TARGETS_URL: list[str],
    scroll_step: int = 500,
    scroll_pause_ms: int = 400,
) -> list[Page]:
    """
    Opens each research target URL in a new browser page and scrolls it to the bottom.

    Scrolling is done in fixed steps with a short pause after each one, and the page
    height is re-read after every step so that content appended while scrolling is
    also reached.

    Args:
        context (BrowserContext): The browser context to use for opening new pages.
        TARGETS_URL (list[str]): The URLs to be accessed, one page is opened per URL.
        scroll_step (int): Number of pixels to scroll down per step. Must be positive.
        scroll_pause_ms (int): Milliseconds to wait after each scroll step so dynamic
            content has time to load.

    Returns:
        list[Page]: One page per URL, in the same order as `TARGETS_URL`. Callers that
        unpack the result (e.g. `ig, tk = ...`) need exactly as many URLs as targets.

    Raises:
        ValueError: If `scroll_step` is not positive (the scroll loop would never end).
    """
    if scroll_step <= 0:
        raise ValueError(f"scroll_step must be a positive number of pixels, got {scroll_step}")

    pages = []

    for url in TARGETS_URL:
        # Open a new page
        page = await context.new_page()

        # Navigate to the URL
        await page.goto(url)
        print(f"Page reached: {await page.title()} ({url})")

        # Scroll to the bottom to ensure all content is loaded
        scroll_height = await page.evaluate("() => document.body.scrollHeight")
        current_position = 0

        while current_position < scroll_height:
            current_position += scroll_step
            await page.evaluate(f"window.scrollTo(0, {current_position})")
            await page.wait_for_timeout(scroll_pause_ms)  # Wait for dynamic content to load
            # Re-read the height: it grows if new content was loaded by the scroll
            scroll_height = await page.evaluate("() => document.body.scrollHeight")

        print(f"Page fully loaded: {await page.title()} ({url})")
        pages.append(page)

    return pages


ig, tk = await target_page_reacher(context, TARGETS_URL)

Page reached: Instagram Revenue and Usage Statistics (2024) - Business of Apps (https://www.businessofapps.com/data/instagram-statistics/)
Page fully loaded: Instagram Revenue and Usage Statistics (2024) - Business of Apps (https://www.businessofapps.com/data/instagram-statistics/)
Page reached: TikTok Revenue and Usage Statistics (2024) - Business of Apps (https://www.businessofapps.com/data/tik-tok-statistics/)
Page fully loaded: TikTok Revenue and Usage Statistics (2024) - Business of Apps (https://www.businessofapps.com/data/tik-tok-statistics/)


### Target Pages Webscraper

In [5]:
async def target_pages_webscraper(page: Page, target_iframes: list[str]) -> pd.DataFrame:
    """
    Extract specific bar chart values from the target pages within iframes.

    For every iframe selector, the iframe's <title> is read, the chart columns are
    counted, and each column's <path> element is hovered in turn so that the
    category/value texts shown for it can be read. Progress is logged to stdout.

    Args:
        page (Page): The Playwright Page object used to interact with the webpage.
        target_iframes (list[str]): A list of iframe selectors containing the bar charts to scrape.

    Returns:
        pd.DataFrame: A DataFrame containing the extracted data with columns:
                      'iframe_title', 'timeseries_category', and 'timeseries_value'.
                      The columns are present even when nothing was scraped, so the
                      result can always be concatenated with other scraping results.
    """
    result_columns = ['iframe_title', 'timeseries_category', 'timeseries_value']
    scraped_barchart_data = []

    print('Webscraping process started')
    print(f'Page Title: {await page.title()}')
    print()

    # Loop through each iframe containing bar chart data
    for barchart_selector in target_iframes:
        # Resolve the iframe once and reuse it for every lookup in this chart
        barchart_frame = page.frame_locator(barchart_selector)

        # Extract the title of the iframe
        barchart_iframe_title = await barchart_frame.locator("title").inner_text()

        # Count the number of columns in the bar chart
        columns_qtd = await barchart_frame.locator('.igc-graph .igc-column').count()

        print(f'-> Starting webscraping for barchart ({barchart_iframe_title}):')

        # The context manager closes the progress bar even if a lookup below fails
        with tqdm(total=columns_qtd, desc='Progress', unit=' Datapoint', leave=True) as pbar:
            # CSS :nth-child is 1-based, hence the 1..columns_qtd range.
            # NOTE(review): assumes one <path> per '.igc-column' element - confirm against the chart markup.
            for i in range(1, columns_qtd + 1):
                # Hovering the column makes the chart show that column's texts
                # (presumably a tooltip, given the 'tt_' class names)
                await barchart_frame.locator(f'path:nth-child({i})').hover()

                # Retrieve the category and value for the current column
                target_time_series_category = await barchart_frame.locator('.tt_text').inner_text()
                target_time_series_value = await barchart_frame.locator('.tt_value').inner_text()

                # Append the extracted data to the list
                scraped_barchart_data.append({
                    'iframe_title': barchart_iframe_title,
                    'timeseries_category': target_time_series_category,
                    'timeseries_value': target_time_series_value
                })

                # Update the progress bar
                pbar.update(1)

        # Log completion for the current iframe
        print(f'-> Completed webscraping for barchart ({barchart_iframe_title})')
        print()

    print('Webscraping process completed successfully')
    print('-' * 80)
    print()

    # Convert the scraped data into a pandas DataFrame with a fixed column layout
    return pd.DataFrame(scraped_barchart_data, columns=result_columns)


raw_data_interim = []
raw_data_interim.append(await target_pages_webscraper(page=ig, target_iframes=IG_IFRAMES))
raw_data_interim.append(await target_pages_webscraper(page=tk, target_iframes=TK_IFRAMES))

Webscraping process started
Page Title: Instagram Revenue and Usage Statistics (2024) - Business of Apps

-> Starting webscraping for barchart (Instagram revenues - Infogram):


Progress: 100%|██████████| 39/39 [00:03<00:00, 12.50 Datapoint/s]


-> Completed webscraping for barchart (Instagram revenues - Infogram)

-> Starting webscraping for barchart (Instagram monthly app users - Infogram):


Progress: 100%|██████████| 47/47 [00:03<00:00, 14.16 Datapoint/s]


-> Completed webscraping for barchart (Instagram monthly app users - Infogram)

Webscraping process completed successfully
--------------------------------------------------------------------------------

Webscraping process started
Page Title: TikTok Revenue and Usage Statistics (2024) - Business of Apps

-> Starting webscraping for barchart (TikTok quarterly revenues - Infogram):


Progress: 100%|██████████| 31/31 [00:02<00:00, 13.30 Datapoint/s]


-> Completed webscraping for barchart (TikTok quarterly revenues - Infogram)

-> Starting webscraping for barchart (TikTok MAUs - Infogram):


Progress: 100%|██████████| 27/27 [00:01<00:00, 13.86 Datapoint/s]

-> Completed webscraping for barchart (TikTok MAUs - Infogram)

Webscraping process completed successfully
--------------------------------------------------------------------------------






# Data Exportation and Integrity Verification

In [6]:
# Data preparation to export - Initialize the DFVC object with the raw data.
# The combined frame gets its own name: rebinding `raw_data_interim` (a list of
# frames) to the concatenated DataFrame would make this cell fail when re-run.
raw_data_df = pd.concat(raw_data_interim)
raw = DFVC(raw_data_df, RAW_DF_NAME)
display(raw)

# Integrity verification against the version hash stored in config.ini
# NOTE(review): the export below runs regardless of this call's outcome unless
# compare_versions raises on mismatch - confirm against the DFVC implementation.
raw.compare_versions(RAW_HASH)

# Finally, export the raw data
raw.export_as_dfvc_file(RAW_DATA_PATH)

Version integrity verified successfully.
DFVC object successfully saved to data/1_raw.dfvc.
