In [1]:
from webscrape_utils import read_credentials
import os
import json
from datetime import datetime
import boto3
import pandas as pd
from multiprocessing import Pool

# Fetch JSONs Helper Functions

def initialize_worker(credentials_file, bucket_name):
    """Pool initializer: build this worker's S3 client and remember the bucket.

    The client and bucket name are stored in the module-level globals
    ``s3_client`` and ``bucket`` so that ``fetch_object_from_s3`` can use
    them without receiving them as arguments.

    Args:
        credentials_file: Credentials file path, relative to the user's
            home directory.
        bucket_name: Name of the S3 bucket the worker will read from.
    """
    global s3_client, bucket

    credentials_path = os.path.join(os.path.expanduser('~'), credentials_file)
    creds = read_credentials(credentials_path)

    # The session is only needed to create the client, so it is not kept.
    s3_client = boto3.Session(
        aws_access_key_id=creds['AWS_ACCESS_KEY_ID'],
        aws_secret_access_key=creds['AWS_SECRET_ACCESS_KEY'],
        region_name=creds['AWS_DEFAULT_REGION'],
    ).client('s3')
    bucket = bucket_name

def fetch_object_from_s3(key):
    """Download one S3 object and parse it as JSON.

    Relies on the module-level ``s3_client`` and ``bucket`` globals.

    Args:
        key: Key of the object to download.

    Returns:
        The parsed JSON value, or ``None`` if anything went wrong (the
        error is printed rather than raised, so one bad object does not
        abort the whole batch).
    """
    try:
        raw_body = s3_client.get_object(Bucket=bucket, Key=key)['Body'].read()
        parsed = json.loads(raw_body.decode('utf-8'))
    except Exception as e:
        print(f"Error fetching {key}: {e}")
        return None
    else:
        return parsed

def fetch_jsons_from_s3(credentials_file, bucket_name, prefix, num_processes=16):
    """Fetch every ``.json`` object under an S3 prefix into a DataFrame.

    Keys are listed in the calling process; the objects themselves are
    downloaded in parallel by a multiprocessing pool whose workers are set
    up with ``initialize_worker`` and run ``fetch_object_from_s3``.

    Args:
        credentials_file: Credentials file path, relative to the user's
            home directory.
        bucket_name: Name of the S3 bucket to read from.
        prefix: Key prefix to list; only keys ending in ``.json`` are fetched.
        num_processes: Number of worker processes in the pool.

    Returns:
        A ``pandas.DataFrame`` built from the successfully parsed objects
        (objects that failed to download or parse are skipped). The frame
        is empty when no ``.json`` key matches the prefix.
    """
    # Create an S3 client to list objects
    home_directory = os.path.expanduser('~')
    credentials = read_credentials(os.path.join(home_directory, credentials_file))
    session = boto3.Session(
        aws_access_key_id=credentials['AWS_ACCESS_KEY_ID'],
        aws_secret_access_key=credentials['AWS_SECRET_ACCESS_KEY'],
        region_name=credentials['AWS_DEFAULT_REGION']
    )
    s3 = session.client('s3')

    # A single list_objects_v2 call returns at most 1,000 keys, so walk every
    # page instead of silently dropping the rest of a large prefix.
    paginator = s3.get_paginator('list_objects_v2')
    keys = []
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        # 'Contents' is missing from the response when nothing matches.
        for obj in page.get('Contents', []):
            if obj['Key'].endswith('.json'):
                keys.append(obj['Key'])

    # Nothing to download: skip the cost of starting worker processes.
    if not keys:
        return pd.DataFrame([])

    # Initialize multiprocessing pool
    with Pool(
        processes=num_processes,
        initializer=initialize_worker,
        initargs=(credentials_file, bucket_name)
    ) as pool:
        all_data = pool.map(fetch_object_from_s3, keys)

    # Filter out None values (failed fetches) and convert to DataFrame
    all_data = [data for data in all_data if data is not None]
    return pd.DataFrame(all_data)

# Saving and Plotting Functions

def save_to_json(dataframe, output_file):
    """Save a DataFrame to a JSON file as a list of row records.

    Args:
        dataframe: The ``pandas.DataFrame`` to serialize; each row becomes
            one JSON object keyed by column name.
        output_file: Destination path. Missing parent directories are
            created so a long download is not lost to a missing folder.
    """
    parent_directory = os.path.dirname(output_file)
    # dirname is '' for a bare filename, and os.makedirs('') would raise.
    if parent_directory:
        os.makedirs(parent_directory, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(dataframe.to_dict(orient='records'), f, indent=4)
    print(f"Data saved to {output_file}")

# Main Function
# The guard matters because fetch_jsons_from_s3 starts a multiprocessing Pool:
# under the "spawn" start method each worker re-imports the main module, and
# unguarded top-level code would run the whole download again in every worker.
if __name__ == "__main__":
    # This path is joined onto the user's home directory by the fetch helpers.
    credentials_file = "credentials/aws.credentials"  # Replace with your credentials file path
    bucket_name = 'prices-for-inflation-estimation'
    prefix = 'inflation/'

    # Fetch JSONs into a DataFrame
    dataframe = fetch_jsons_from_s3(credentials_file, bucket_name, prefix)

    # Save DataFrame to JSON
    output_file = 'data/inflation_argentina_bigmac_and_yerba_mate.json'
    save_to_json(dataframe, output_file)

Data saved to data/inflation_argentina_bigmac_and_yerba_mate.json


In [5]:
import altair as alt
import pandas as pd

def plot_big_mac_over_time(dataframe):
    """Build a dual-axis Altair chart of Big Mac price and cumulative % change.

    The input is filtered to rows whose ``product`` is ``'Big Mac'``, ordered
    by ``ts``, and the price is plotted alongside its percentage change
    relative to the earliest valid price.

    Args:
        dataframe: A ``pandas.DataFrame`` with ``ts``, ``product`` and
            ``price`` columns. It is not modified.

    Returns:
        A layered Altair chart with independent y scales: a solid line for
        the price and a dashed line for the cumulative percentage change.
        If there are no Big Mac rows the chart is simply empty.
    """
    # Work on a copy of the filtered rows so the caller's frame is untouched.
    big_mac_data = dataframe[dataframe['product'] == 'Big Mac'].copy()

    # Ensure timestamp is in datetime format, then order chronologically
    big_mac_data['ts'] = pd.to_datetime(big_mac_data['ts'])
    big_mac_data = big_mac_data.sort_values('ts')

    # Convert price to numeric (if it's a string); bad values become NaN
    big_mac_data['price'] = pd.to_numeric(big_mac_data['price'], errors='coerce')

    # Use the earliest *valid* price as the baseline: a NaN first price would
    # otherwise turn the whole series into NaN, and an empty frame would make
    # a positional lookup raise IndexError.
    valid_prices = big_mac_data['price'].dropna()
    baseline = valid_prices.iloc[0] if not valid_prices.empty else float('nan')

    # Calculate cumulative percentage
    big_mac_data['cumulative_percent'] = (
        (big_mac_data['price'] - baseline) / baseline
    ) * 100

    # Altair base chart: shared x axis, tooltip and sizing for both layers
    base = alt.Chart(big_mac_data).encode(
        x=alt.X('ts:T', title='Timestamp', axis=alt.Axis(format='%b %Y')),
        tooltip=[
            alt.Tooltip('ts:T', title='Date'),
            alt.Tooltip('price:Q', title='Price (ARS)', format=','),
            alt.Tooltip('cumulative_percent:Q', title='Cumulative % Change', format='.2f')
        ]
    ).properties(
        title='Big Mac Price Over Time',
        width=900,
        height=500
    )

    # Price Line Chart
    price_line = base.mark_line(color='steelblue', strokeWidth=6).encode(
        y=alt.Y('price:Q', title='Price (ARS)', axis=alt.Axis(titleColor='steelblue'))
    )

    # Cumulative Percentage Line Chart
    cumulative_percent_line = base.mark_line(color='orange', strokeWidth=3, strokeDash=[5, 5]).encode(
        y=alt.Y(
            'cumulative_percent:Q',
            title='Cumulative % Change',
            axis=alt.Axis(titleColor='orange'),
        )
    )

    # Combine the layers; independent y scales give each line its own axis
    combined_chart = alt.layer(price_line, cumulative_percent_line).resolve_scale(
        y='independent'
    )

    return combined_chart

# Example Usage
# Load the saved records and coerce 'price' to numeric in one step
# (values that cannot be parsed become NaN).
big_mac_df = pd.read_json(
    'data/inflation_argentina_bigmac_and_yerba_mate.json'
).assign(price=lambda frame: pd.to_numeric(frame['price'], errors='coerce'))

# Build the chart and display it
chart = plot_big_mac_over_time(big_mac_df)
chart.show()
