In [2]:
# Import
import sys
import logging
import os
import time
import io

from google.colab import drive

In [3]:
# configure ->

# Constant values
bucket_name = 'gcp_latam_twitter'
folder_name = 'raw'
zip_file_name = 'tweets.json.zip'

# ID de ejemplo del archivo de Google Drive
file_id = '1ig2ngoXFTxP5Pa8muXo02mDTFexZzsis'
gcloud_url = f"gs://{bucket_name}/{folder_name}/"
start_time = str(time.time())

# Record the start time
start_time = str(time.time())


# Define local file paths
drive_mount_point = '/content/drive/MyDrive'
source_path = 'leonardora/de/latam-challenge/'

# Set Google Cloud project and dataset info
project_id = "latam-challenge-leonardora"
project_name = "latam-challenge-leonardora"
dataset = "latam_tweets_dataset"
table ="tweets"

# Logging
logging_level = str(logging.DEBUG)
logging.basicConfig(level=int(logging_level))

In [4]:
# Reloads all modules automatically before executing a new line, so your latest changes are always available
%load_ext autoreload
%autoreload 2

In [5]:
# test.py ->
from google.colab import drive
import os
import logging
import subprocess

drive.mount('/content/drive', force_remount=True)

drive_mount_point = '/content/drive/MyDrive'
source_path = 'leonardora/de/latam-challenge/'
target_dir = os.path.join(drive_mount_point, source_path)

if not os.path.exists(target_dir):
    os.makedirs(target_dir)
    print(f"Directorio creado: {target_dir}")
else:
    print(f"Directorio existente: {target_dir}")

os.chdir(target_dir)
print(f"Directorio actual: {os.getcwd()}")

if os.path.exists(os.path.join(target_dir, ".git")):
    print("Repositorio ya existe. Haciendo pull de los últimos cambios...")
    !git checkout develop
    !git pull origin develop
else:
    repo_url = "https://github.com/leoengufmg/latam-challenge.git"
    print("Clonando el repositorio...")
    !git clone {repo_url} .

    !git checkout develop


Mounted at /content/drive
Directorio existente: /content/drive/MyDrive/leonardora/de/latam-challenge/
Directorio actual: /content/drive/MyDrive/leonardora/de/latam-challenge
Repositorio ya existe. Haciendo pull de los últimos cambios...
M	requirements.txt
Already on 'develop'
Your branch and 'origin/develop' have diverged,
and have 5 and 30 different commits each, respectively.
  (use "git pull" to merge the remote branch into yours)
From https://github.com/leoengufmg/latam-challenge
 * branch            develop    -> FETCH_HEAD
[33mhint: You have divergent branches and need to specify how to reconcile them.[m
[33mhint: You can do so by running one of the following commands sometime before[m
[33mhint: your next pull:[m
[33mhint: [m
[33mhint:   git config pull.rebase false  # merge (the default strategy)[m
[33mhint:   git config pull.rebase true   # rebase[m
[33mhint:   git config pull.ff only       # fast-forward only[m
[33mhint: [m
[33mhint: You can replace "git confi

In [6]:
# Test ->
from google.colab import auth
auth.authenticate_user()

In [7]:
# Functions.py ->
import importlib.util
import subprocess
import sys

def install_requirements(requirements_path: str = "./requirements.txt") -> bool:
    """
    Installs libraries listed in the requirements file if they are not already installed.

    Args:
        requirements_path (str): Path to the requirements.txt file. Defaults to "./requirements.txt".

    Returns:
        bool: True if installation is successful or libraries are already installed, False if an error occurs.
    """
    try:
        with open(requirements_path, 'r') as file:
            requirements = [line.strip() for line in file if line.strip()]

        for requirement in requirements:
            print(f"Installing {requirement}...")
            subprocess.run([sys.executable, "-m", "pip", "install", requirement], check=True)
            print(f"{requirement} installed successfully.")

        print("All required libraries were installed successfully.")
        return True

    except subprocess.CalledProcessError as e:
        print(f"Error installing libraries: {e}")
        return False
    except FileNotFoundError:
        print(f"Requirements file not found at: {requirements_path}")
        return False
    except Exception as e:
        print(f"Unexpected error: {e}")
        return False

In [8]:
# test -> installing requirements
if install_requirements():
    print("Procediendo a descargar datos...")
    # Código para descargar datos u otras acciones adicionales
else:
    print("Error al instalar los requisitos. Abortando acciones adicionales.")

Installing memory-profiler==0.61.0...
memory-profiler==0.61.0 installed successfully.
Installing google_cloud_bigquery==3.20.1...
google_cloud_bigquery==3.20.1 installed successfully.
Installing line_profiler==4.1.2...
line_profiler==4.1.2 installed successfully.
All required libraries were installed successfully.
Procediendo a descargar datos...


In [9]:
####
# Function.py -> setup bucket
from google.cloud import storage
from google.api_core.exceptions import Conflict

def create_bucket_if_not_exists(bucket_name: str, project_id: str, location: str = "US") -> storage.Bucket:
    """Creates a new bucket in Google Cloud Storage if it doesn't already exist.

    Args:
        bucket_name (str): The unique name of the bucket to create.
        project_id (str): The ID of the Google Cloud project.
        location (str): The location where the bucket will be created (default is "US").

    Returns:
        storage.Bucket: The bucket object, either newly created or already existing.
    """
    # Initialize the Google Cloud Storage client
    storage_client = storage.Client(project=project_id)

    # Check if the bucket already exists
    try:
        bucket = storage_client.get_bucket(bucket_name)
        print(f"Bucket {bucket_name} already exists.")
        return bucket
    except Exception:
        print(f"Bucket {bucket_name} does not exist, attempting to create it.")

    # Create the bucket if it does not exist
    try:
        bucket = storage_client.create_bucket(bucket_name, location=location)
        print(f"Bucket {bucket.name} created in {location}.")
        return bucket
    except Conflict:
        print(f"Bucket {bucket_name} already exists (Conflict error).")
        return storage_client.get_bucket(bucket_name)
    except Exception as e:
        print(f"Error creating bucket: {e}")
        raise

new_bucket = create_bucket_if_not_exists(bucket_name, project_id)

Bucket gcp_latam_twitter already exists.


In [10]:
# functions.py ->
import io
import logging
from google.colab import auth
from google.colab import drive
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from google.cloud import storage  # For Google Cloud Storage access
from typing import Any

# Configura el log
logging.basicConfig(level=logging.INFO)

# Autentica en Google Colab
def authenticate_google_drive() -> None:
    """Authenticate the user with Google Drive."""
    auth.authenticate_user()

def mount_google_drive(mount_point: str = '/content/drive') -> None:
    """Mounts Google Drive to a specified mount point."""
    drive.mount(mount_point, force_remount=True)

def download_file_from_drive(drive_service: Any, file_id: str) -> io.BytesIO:
    """Downloads a file from Google Drive and returns it as a BytesIO object."""
    downloaded = io.BytesIO()
    try:
        request = drive_service.files().get_media(fileId=file_id)
        downloader = MediaIoBaseDownload(downloaded, request)

        done = False
        while not done:
            status, done = downloader.next_chunk()
            print(f'Downloading {int(status.progress() * 100)}%')

        downloaded.seek(0)
        return downloaded
    except Exception as e:
        logging.error(f"Error downloading file: {e}")
        raise

def upload_drive_file_to_cloud_storage(bucket: storage.Bucket, folder_name: str, file_data: io.BytesIO, file_name: str) -> storage.Blob:
    """Uploads a file to Google Cloud Storage."""
    blob = bucket.blob(f"{folder_name}/{file_name}")
    file_data.seek(0)  # Resetea el puntero del archivo
    blob.upload_from_file(file_data)
    return blob


def extract_zip_file_conditionally(
    bucket: storage.Bucket, folder_name: str, zip_file_name: str
) -> str:
    """Extracts a ZIP file in Google Cloud Storage conditionally.

    Checks if the ZIP file exists and is identical to the local version before
    extracting its contents to avoid redundant operations.

    Args:
        bucket (storage.Bucket): The Google Cloud Storage bucket object.
        folder_name (str): The name of the folder containing the ZIP file.
        zip_file_name (str): The name of the ZIP file to extract.

    Returns:
        str: The name of the extracted JSON file, or an empty string if no
              extraction occurred.
    """
    json_file_name = ''
    blob_name = ''

    try:
        # Verify ZIP file existence in the bucket
        zip_blob = bucket.blob(f'{folder_name}/{zip_file_name}')
        if not zip_blob.exists():
            print(f"ZIP file '{zip_file_name}' does not exist in bucket '{bucket.name}'.")
            return False

        # Open the ZIP archive in memory for efficient processing
        with zipfile.ZipFile(io.BytesIO(zip_blob.download_as_string()), 'r') as z:
            for file_info in z.infolist():  # Iterate through each file in the ZIP archive
                blob_name = f'{folder_name}/{file_info.filename}'  # Construct blob path
                json_file_name = file_info.filename  # Store the JSON file name
                json_blob = bucket.blob(blob_name)

                # Download as string and get the size if the JSON blob exists in the bucket
                if json_blob.exists():
                    existing_blob_data: str = json_blob.download_as_string()
                    existing_blob_size: int = len(existing_blob_data)

                # Check for file existence and size match for conditional extraction
                if json_blob.exists() and existing_blob_size == file_info.file_size:
                    print(f"File '{json_file_name}' already exists on cloud storage with exact matching size, skipping extraction.")
                else:
                    # Extract and upload the file if conditions are not met
                    with z.open(file_info) as file:
                        json_blob.upload_from_file(file)  # Upload extracted file

                    print(f'ZIP File extracted to gs://{bucket.name}/{blob_name}')

    except zipfile.BadZipFile:
        logging.warning(f'Invalid ZIP file: gs://{bucket.name}/{folder_name}/{zip_file_name}')
    except Exception as e:
        logging.error(f'Error extracting ZIP file: {e}')

    finally:
        return json_file_name

In [11]:
# Inicializa el objeto para el archivo descargado
downloaded: io.BytesIO = io.BytesIO()

try:
    # Autenticación y montaje de Google Drive
    authenticate_google_drive()
    mount_google_drive()
    drive_service: Any = build('drive', 'v3')

    # Acceso a Google Cloud Storage
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    downloaded = download_file_from_drive(drive_service, file_id)

    # Verificar si el archivo tiene contenido
    if downloaded.getbuffer().nbytes == 0:
        logging.info("Skipping upload as the file is empty.")
    else:
        # Cargar el archivo en Google Cloud Storage
        uploaded_blob = upload_drive_file_to_cloud_storage(bucket, folder_name, downloaded, zip_file_name)

        # Descomprimir el archivo si es un ZIP
        json_file_name: str = extract_zip_file_conditionally(bucket, folder_name, zip_file_name)

    logging.info("File transfer successful!")

except Exception as e:
    logging.error(f"An error occurred: {e}")

finally:
    downloaded.close()
    print("File transfer process completed.")

Mounted at /content/drive
Downloading 100%
File transfer process completed.


In [1]:
# Functions
import logging
from google.cloud import bigquery
from google.api_core.exceptions import NotFound


def authenticate_bigquery(project_id: str) -> bigquery.Client:
    """
    Authenticates to BigQuery and returns the client object.
    """
    try:
        client = bigquery.Client(project=project_id)
        logging.info(f"Authenticated to BigQuery using project ID '{project_id}'.")
        return client
    except Exception as e:
        logging.error(f"Authentication to BigQuery failed: {e}")
        raise


def create_dataset(client: bigquery.Client, dataset_id: str, mode: str = "create") -> None:
    """
    Creates a BigQuery dataset, with options for existence checks and overwriting.
    """
    dataset_ref = client.dataset(dataset_id)

    try:
        client.get_dataset(dataset_ref)
        if mode == "overwrite":
            logging.info(f"Overwriting dataset '{dataset_id}'...")
            client.delete_dataset(dataset_ref, delete_contents=True)
            client.create_dataset(dataset_ref)
            logging.info(f"Dataset '{dataset_id}' overwritten.")
        else:
            logging.info(f"Dataset '{dataset_id}' already exists. Skipping creation.")
    except NotFound:
        logging.info(f"Dataset '{dataset_id}' not found, creating...")
        client.create_dataset(dataset_ref)
        logging.info(f"Dataset '{dataset_id}' created.")
    except Exception as e:
        logging.error(f"Error managing dataset '{dataset_id}': {e}")
        raise


def create_table(client: bigquery.Client, dataset_id: str, table_name: str, mode: str = "create") -> None:
    """
    Creates a BigQuery table, with options for existence checks and overwriting.
    """
    table_ref = client.dataset(dataset_id).table(table_name)

    try:
        client.get_table(table_ref)
        if mode == "overwrite":
            logging.info(f"Overwriting table '{table_name}'...")
            client.delete_table(table_ref)
            client.create_table(bigquery.Table(table_ref))
            logging.info(f"Table '{table_name}' overwritten.")
        else:
            logging.info(f"Table '{table_name}' already exists. Skipping creation.")
    except NotFound:
        logging.info(f"Table '{table_name}' not found, creating...")
        client.create_table(bigquery.Table(table_ref))
        logging.info(f"Table '{table_name}' created.")
    except Exception as e:
        logging.error(f"Error managing table '{table_name}': {e}")
        raise


def load_data_from_storage(
    client: bigquery.Client,
    source_uri: str,
    dataset_name: str,
    table_name: str,
    json_file_name: str
) -> None:

    full_source_uri = source_uri + json_file_name
    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
        autodetect=True,
        ignore_unknown_values=True
    )
    load_job = client.load_table_from_uri(
        full_source_uri,
        client.dataset(dataset_name).table(table_name),
        job_config=job_config
    )

    try:
        load_job.result()
        logging.info(f"Data loaded from '{full_source_uri}' to table '{dataset_name}.{table_name}'.")
    except Exception as e:
        logging.error(f"Error loading data: {e}")
        raise

In [23]:
# Import
import sys
import logging
import os
import time
import io

from google.colab import drive

# Constant values
bucket_name = 'gcp_latam_twitter'
folder_name = 'raw'
zip_file_name = 'tweets.json.zip'

# ID de ejemplo del archivo de Google Drive
file_id = '1ig2ngoXFTxP5Pa8muXo02mDTFexZzsis'
gcloud_url = f"gs://{bucket_name}/{folder_name}/"
start_time = str(time.time())

# Record the start time
start_time = str(time.time())


# Define local file paths
drive_mount_point = '/content/drive/MyDrive'
source_path = 'leonardora/de/latam-challenge/'

# Set Google Cloud project and dataset info
project_id = "latam-challenge-leonardora"
project_name = "latam-challenge-leonardora"
dataset = "latam_tweets_dataset"
table ="tweets"
json_file_name = "farmers-protest-tweets-2021-2-4.json"
# Logging
logging_level = str(logging.DEBUG)
logging.basicConfig(level=int(logging_level))

In [24]:
# test
# Authenticate to BigQuery
bigquery_client: bigquery.Client = authenticate_bigquery(project_id)

# Create dataset (overwrite if needed)
create_dataset(bigquery_client, dataset, mode='overwrite')
print(f"Overwrite dataset {dataset}")

# Create table (overwrite if needed)
create_table(bigquery_client, dataset, table, mode='overwrite')
print(f"Overwrite table {dataset}.{table}")

# Load data from Cloud Storage
load_data_from_storage(bigquery_client, gcloud_url, dataset, table, json_file_name)
print(f"Store {gcloud_url}{json_file_name} into BigQuery {dataset}.{table}")

print("Data storage completed!")


Overwrite dataset latam_tweets_dataset
Overwrite table latam_tweets_dataset.tweets
Store gs://gcp_latam_twitter/raw/farmers-protest-tweets-2021-2-4.json into BigQuery latam_tweets_dataset.tweets
Data storage completed!


In [25]:
from google.api_core.exceptions import BadRequest, NotFound
from google.cloud import bigquery
from typing import List, Tuple, Any


def launch_bigquery(client: bigquery.Client, query: str) -> List[Tuple[Any, Any]]:
    data_extracted : List[Tuple[Any, Any]] = []

    try:
        query_job = client.query(query)
        results = query_job.result()

        if results.total_rows == 0:
            raise NotFound("No results found for the query.")

        data_extracted  = [(row[0], row[1]) for row in results]

    except BadRequest as e:
        print(f"BigQuery BadRequest error: {e}")
        raise
    except NotFound as e:
        print(f"Query returned no results: {e}")
        raise
    except Exception as e:
        print(f"Unexpected error: {e}")
        raise

    return data_extracted


In [35]:

top_dates_with_top_users = r"""
    WITH
    TopDates AS (
        SELECT
            CAST(date AS DATE) AS tweets_date,
            COUNT(id) AS tweet_count
        FROM latam_tweets_dataset.tweets
        WHERE id IS NOT NULL
        GROUP BY tweets_date
        ORDER BY tweet_count DESC
        LIMIT 10
    ),
    TopUsersDate AS (
        SELECT
            TD.tweets_date,
            TW.user.username,
            MAX(TD.tweet_count) AS max_tweet_count,
            COUNT(TW.id) AS user_tweet_count,
            ROW_NUMBER() OVER (
                PARTITION BY TD.tweets_date
                ORDER BY MAX(TD.tweet_count) DESC, COUNT(*) DESC
            ) AS row_number
        FROM latam_tweets_dataset.tweets AS TW
        INNER JOIN TopDates AS TD
            ON TD.tweets_date = CAST(TW.date AS DATE)
        WHERE TW.id IS NOT NULL
        GROUP BY
            TD.tweets_date,
            TW.user.username
        ORDER BY
            max_tweet_count DESC,
            user_tweet_count DESC,
            TW.user.username ASC
    )

    SELECT
        tweets_date,
        username
    FROM TopUsersDate
    WHERE row_number = 1
"""

"""
Top 10 most used emojis with their respective counts.
Assumption: The query does not contain duplicate entries, as the tweet with 'id = 1362813218952007687' contains two heart and two fist emojis.
Assumption: A valid tweet has an id
"""
top_emojis = r"""
    WITH
    ExtractedEmojis AS (
        SELECT
            REGEXP_EXTRACT_ALL(
                content,
                FORMAT(
                    r"(?:[\x{1F300}-\x{1F5FF}]|[\x{1F900}-\x{1F9FF}]|[\x{1F600}-\x{1F64F}]|[\x{1F680}-\x{1F6FF}]" ||
                    r"|[\x{2600}-\x{26FF}]\x{FE0F}?|[\x{2700}-\x{27BF}]\x{FE0F}?|\x{24C2}\x{FE0F}?|[\x{1F1E6}-\x{1F1FF}]{1,2}" ||
                    r"|[\x{1F170}\x{1F171}\x{1F17E}\x{1F17F}\x{1F18E}\x{1F191}-\x{1F19A}]\x{FE0F}?" ||
                    r"|[\\x{0023}\x{002A}\x{0030}-\x{0039}]\x{FE0F}?\x{20E3}|[\x{2194}-\x{2199}\x{21A9}-\x{21AA}]\x{FE0F}?" ||
                    r"|[\x{2B05}-\x{2B07}\x{2B1B}\x{2B1C}\x{2B50}\x{2B55}]\x{FE0F}?|[\x{2934}\x{2935}]\x{FE0F}?" ||
                    r"|[\x{3297}\x{3299}]\x{FE0F}?|[\x{1F201}\x{1F202}\x{1F21A}\x{1F22F}\x{1F232}\x{1F23A}\x{1F250}\x{1F251}]\x{FE0F}?" ||
                    r"|[\x{203C}-\x{2049}]\x{FE0F}?|[\x{00A9}-\x{00AE}]\x{FE0F}?|[\x{2122}\x{2139}]\x{FE0F}?" ||
                    r"|\x{1F004}\x{FE0F}?|\x{1F0CF}\x{FE0F}?|[\x{231A}\x{231B}\x{2328}\x{23CF}\x{23E9}\x{23F3}\x{23F8}\x{23FA}]\x{FE0F}?)"
                )
            ) AS emojis
        FROM latam_tweets_dataset.tweets
        WHERE id IS NOT NULL
    )

    SELECT
        emoji,
        COUNT(emoji) AS count
    FROM ExtractedEmojis
    CROSS JOIN UNNEST(emojis) AS emoji
    GROUP BY emoji
    ORDER BY count DESC
    LIMIT 10
"""

"""
Top 10 all-time most influential users (username) based on the count of mentions (@) each of them receives.
Assumption: A valid tweet has an id
"""
top_influential_users = r"""
    WITH
    MentionedUsersCount AS (
        SELECT
            user.username AS username,
            COUNT(user.username) AS count
        FROM
            latam_tweets_dataset.tweets as TW,
            UNNEST(mentionedUsers) AS user
        WHERE TW.id IS NOT NULL
        GROUP BY username
    )

    SELECT
        username,
        count AS mention_count
    FROM MentionedUsersCount
    ORDER BY count DESC
    LIMIT 10
"""

In [27]:
import datetime
from typing import List, Tuple
from google.cloud import bigquery
import line_profiler


@line_profiler.profile
def q1_time(client: bigquery.Client, query: str) -> List[Tuple[datetime.date, str]]:
  return launch_bigquery(client, query)

In [28]:
from line_profiler import LineProfiler

bigquery_client: bigquery.Client = authenticate_bigquery(project_id)

print("LATAM Challenge Time - Top 10 Dates with more Tweets and the Username with more Tweets for each Day")

profiler = LineProfiler()
profiler.add_function(q1_time)
profiler.enable_by_count()
q1_time_tuple = q1_time(bigquery_client, top_dates_with_top_users)
profiler.print_stats()
display(q1_time_tuple)


LATAM Challenge Time - Top 10 Dates with more Tweets and the Username with more Tweets for each Day
Timer unit: 1e-09 s

Total time: 1.85814 s
File: <ipython-input-27-7b5add99afba>
Function: q1_time at line 7

Line #      Hits         Time  Per Hit   % Time  Line Contents
     7                                           @line_profiler.profile
     8                                           def q1_time(client: bigquery.Client, query: str) -> List[Tuple[datetime.date, str]]:
     9         1 1858141975.0    2e+09    100.0    return launch_bigquery(client, query)



[(datetime.date(2021, 2, 12), 'RanbirS00614606'),
 (datetime.date(2021, 2, 13), 'MaanDee08215437'),
 (datetime.date(2021, 2, 17), 'RaaJVinderkaur'),
 (datetime.date(2021, 2, 16), 'jot__b'),
 (datetime.date(2021, 2, 14), 'rebelpacifist'),
 (datetime.date(2021, 2, 18), 'neetuanjle_nitu'),
 (datetime.date(2021, 2, 15), 'jot__b'),
 (datetime.date(2021, 2, 20), 'MangalJ23056160'),
 (datetime.date(2021, 2, 23), 'Surrypuria'),
 (datetime.date(2021, 2, 19), 'Preetm91')]

In [29]:
from typing import List, Tuple
from google.cloud import bigquery
import line_profiler

@line_profiler.profile
def q2_time(client: bigquery.Client, query: str) -> List[Tuple[str, int]]:
    return launch_bigquery(client, query)


In [30]:
from line_profiler import LineProfiler

bigquery_client: bigquery.Client = authenticate_bigquery(project_id)

print("LATAM Challenge Time - Top 10 Dates Emojis")

profiler = LineProfiler()
profiler.add_function(q2_time)
profiler.enable_by_count()
q2_time_tuple = q2_time(bigquery_client, top_emojis)
profiler.print_stats()
display(q2_time_tuple)

LATAM Challenge Time - Top 10 Dates Emojis
Timer unit: 1e-09 s

Total time: 1.48123 s
File: <ipython-input-29-dc45e6cba86f>
Function: q2_time at line 5

Line #      Hits         Time  Per Hit   % Time  Line Contents
     5                                           @line_profiler.profile
     6                                           def q2_time(client: bigquery.Client, query: str) -> List[Tuple[str, int]]:
     7         1 1481227660.0    1e+09    100.0      return launch_bigquery(client, query)



[('✊', 2402),
 ('❤️', 1382),
 ('❤', 397),
 ('☮️', 316),
 ('♂️', 179),
 ('✌️', 168),
 ('♀️', 148),
 ('✌', 106),
 ('‼️', 74),
 ('♥️', 73)]

In [31]:
from typing import List, Tuple
from google.cloud import bigquery
import line_profiler

@line_profiler.profile
def q3_time(client: bigquery.Client, query: str) -> List[Tuple[str, int]]:
    return launch_bigquery(client, query)


In [32]:
from line_profiler import LineProfiler

bigquery_client: bigquery.Client = authenticate_bigquery(project_id)

print("LATAM Challenge Time - Top 10 Influential Users")
profiler = LineProfiler()
profiler.add_function(q3_time)
profiler.enable_by_count()
q3_time_tuple = q3_time(bigquery_client, top_influential_users)
profiler.print_stats()
display(q3_time_tuple)

LATAM Challenge Time - Top 10 Influential Users
Timer unit: 1e-09 s

Total time: 1.17013 s
File: <ipython-input-31-32f9b243065b>
Function: q3_time at line 5

Line #      Hits         Time  Per Hit   % Time  Line Contents
     5                                           @line_profiler.profile
     6                                           def q3_time(client: bigquery.Client, query: str) -> List[Tuple[str, int]]:
     7         1 1170131743.0    1e+09    100.0      return launch_bigquery(client, query)



[('narendramodi', 2265),
 ('Kisanektamorcha', 1840),
 ('RakeshTikaitBKU', 1644),
 ('PMOIndia', 1427),
 ('RahulGandhi', 1146),
 ('GretaThunberg', 1048),
 ('RaviSinghKA', 1019),
 ('rihanna', 986),
 ('UNHumanRights', 962),
 ('meenaharris', 926)]

In [36]:
import datetime
from typing import List, Tuple
from google.cloud import bigquery
import memory_profiler

@memory_profiler.profile
def q1_memory(client: bigquery.Client, query: str) -> List[Tuple[datetime.date, str]]:
    return launch_bigquery(client, query)

In [37]:
bigquery_client: bigquery.Client = authenticate_bigquery(project_id)
print("LATAM Challenge Memory - Top 10 Dates with more Tweets and the Username with more Tweets for each Day")
q1_memory(bigquery_client, top_dates_with_top_users)

LATAM Challenge Memory - Top 10 Dates with more Tweets and the Username with more Tweets for each Day
ERROR: Could not find file <ipython-input-36-2e1c5205567b>
NOTE: %mprun can only be used on functions defined in physical files, and not in the IPython environment.


[(datetime.date(2021, 2, 12), 'RanbirS00614606'),
 (datetime.date(2021, 2, 13), 'MaanDee08215437'),
 (datetime.date(2021, 2, 17), 'RaaJVinderkaur'),
 (datetime.date(2021, 2, 16), 'jot__b'),
 (datetime.date(2021, 2, 14), 'rebelpacifist'),
 (datetime.date(2021, 2, 18), 'neetuanjle_nitu'),
 (datetime.date(2021, 2, 15), 'jot__b'),
 (datetime.date(2021, 2, 20), 'MangalJ23056160'),
 (datetime.date(2021, 2, 23), 'Surrypuria'),
 (datetime.date(2021, 2, 19), 'Preetm91')]

In [38]:
from typing import List, Tuple
from google.cloud import bigquery
import memory_profiler

@memory_profiler.profile
def q2_memory(client: bigquery.Client, query: str) -> List[Tuple[str, int]]:
    try:
        results = launch_bigquery(client, query)
        return [(row[0], int(row[1])) for row in results]
    except ValueError as e:
        print(f"Error converting data to string and integer pairs: {e}")
        return []


In [39]:
# Print a title describing the memory usage analysis
print("LATAM Challenge Memory - Top 10 Dates Emojis")
q2_memory(bigquery_client, top_emojis)

LATAM Challenge Memory - Top 10 Dates Emojis
ERROR: Could not find file <ipython-input-38-419bc5c9ce2b>
NOTE: %mprun can only be used on functions defined in physical files, and not in the IPython environment.


[('✊', 2402),
 ('❤️', 1382),
 ('❤', 397),
 ('☮️', 316),
 ('♂️', 179),
 ('✌️', 168),
 ('♀️', 148),
 ('✌', 106),
 ('‼️', 74),
 ('♥️', 73)]

In [40]:
from typing import List, Tuple
from google.cloud import bigquery
import memory_profiler

@memory_profiler.profile
def q3_memory(client: bigquery.Client, query: str) -> List[Tuple[str, int]]:
   try:
       results = launch_bigquery(client, query)
       formatted_results = [(row[0], int(row[1])) for row in results]
       if len(formatted_results) > 1000:
           print("Warning: Returning a large dataset. Consider using streaming or pagination for memory optimization.")
       return formatted_results
   except ValueError as e:
       print(f"Error converting data to string and integer pairs: {e}")
       return []


In [41]:
print("LATAM Challenge Memory - Top 10 Influential Users")
q3_memory(bigquery_client, top_influential_users)

LATAM Challenge Memory - Top 10 Influential Users
ERROR: Could not find file <ipython-input-40-629405867ca4>
NOTE: %mprun can only be used on functions defined in physical files, and not in the IPython environment.


[('narendramodi', 2265),
 ('Kisanektamorcha', 1840),
 ('RakeshTikaitBKU', 1644),
 ('PMOIndia', 1427),
 ('RahulGandhi', 1146),
 ('GretaThunberg', 1048),
 ('RaviSinghKA', 1019),
 ('rihanna', 986),
 ('UNHumanRights', 962),
 ('meenaharris', 926)]