In [38]:
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions
import os
from apache_beam import window
from apache_beam.transforms.trigger import AfterWatermark, AfterProcessingTime, AccumulationMode, AfterCount
from datetime import datetime
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

In [39]:
# --- Project settings -------------------------------------------------------
# The project id is read from the process environment.
# NOTE(review): if GCP_PROJECT_ID is unset this is None, and the resource
# addresses below will then contain the literal text "None".
GCP_PROJECT_ID = os.environ.get('GCP_PROJECT_ID')
PROJECT_ID = GCP_PROJECT_ID

# --- Credentials ------------------------------------------------------------
# Path to the service-account key file; change it to point at your own key.
SERVICE_ACCOUNT_KEY_PATH = "key.json"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = SERVICE_ACCOUNT_KEY_PATH
print("Service account file : ", SERVICE_ACCOUNT_KEY_PATH)

# --- Pub/Sub resources ------------------------------------------------------
# Subscription the pipeline reads from and topic it publishes results to.
INPUT_SUBSCRIPTION_ADDRESS = f"projects/{GCP_PROJECT_ID}/subscriptions/poc-apache-beam-sub"
OUTPUT_TOPIC_ADDRESS = f"projects/{GCP_PROJECT_ID}/topics/apache-beam-bounded"


Service account file :  key.json


In [40]:
# Build the pipeline options and switch the standard options to streaming mode.
options = PipelineOptions()
standard_options = options.view_as(StandardOptions)
standard_options.streaming = True

# Pipeline object that the transform chain further down is attached to.
p = beam.Pipeline(options=options)

def calculateProfit(elements):
    """Append a profit value (revenue minus cost) to a row.

    Args:
        elements: Row as a list of fields. Index 6 is parsed as the revenue
            and index 5 as the cost, both via ``int``.

    Returns:
        A new list equal to ``elements`` with one extra trailing item: the
        integer profit, or ``0`` when either field is missing (IndexError)
        or cannot be parsed as an integer (ValueError).
    """
    try:
        # Revenue is parsed before cost, so a failure on either is handled
        # by the same fallback below.
        revenue_value, cost_value = int(elements[6]), int(elements[5])
    except (IndexError, ValueError):
        margin = 0  # best-effort default for short or malformed rows
    else:
        margin = revenue_value - cost_value
    return elements + [margin]

def custom_timestamp(elements):
    """Wrap a row in a TimestampedValue taken from its second-to-last field.

    Args:
        elements: Row as a sequence of fields; ``elements[-2]`` is parsed
            with ``int`` and used as the timestamp.

    Returns:
        ``beam.window.TimestampedValue`` holding the unchanged row. The
        timestamp falls back to ``0`` when the field is missing
        (IndexError) or is not an integer (ValueError).
    """
    try:
        event_time = int(elements[-2])
    except (IndexError, ValueError):
        # NOTE(review): 0 presumably means the Unix epoch, so such rows end
        # up far in the past -- confirm this fallback is intended.
        event_time = 0
    return beam.window.TimestampedValue(elements, event_time)

def encode_byte_string(elements):
    """Turn an element into UTF-8 bytes of its ``str()`` representation.

    The element is also printed to stdout before being encoded.

    Args:
        elements: Any object; it is converted with ``str``.

    Returns:
        ``bytes``: the UTF-8 encoding of ``str(elements)``.
    """
    print(elements)
    text = str(elements)
    return text.encode('utf-8')

In [None]:
# Stage 1 - ingest: read raw payloads from the input subscription, decode them
# as UTF-8 text (stripping surrounding whitespace) and split each row on commas.
parsed_rows = (
    p
    | 'Read from pub sub' >> beam.io.ReadFromPubSub(subscription=INPUT_SUBSCRIPTION_ADDRESS)
    | 'Decode Byte Data' >> beam.Map(lambda data: data.decode('utf-8').strip())
    | 'Split Row' >> beam.Map(lambda row: row.split(','))
)

# Stage 2 - filter and enrich: keep rows that have a second field equal to
# "Mumbai" or "Bangalore", append the profit column, then attach the timestamp.
enriched_rows = (
    parsed_rows
    | 'Filter By Country' >> beam.Filter(
        lambda elements: len(elements) > 1 and elements[1] in ["Mumbai", "Bangalore"])
    | 'Create Profit Column' >> beam.Map(calculateProfit)
    | 'Apply custom timestamp' >> beam.Map(custom_timestamp)
)

# Stage 3 - aggregate: key each row by its first field; the value is the last
# field as an int when the row has more than 8 fields, otherwise 0. Values are
# summed per key inside FixedWindows(20).
summed_by_key = (
    enriched_rows
    | 'Form Key Value pair' >> beam.Map(
        lambda elements: (elements[0], int(elements[-1]) if len(elements) > 8 else 0))
    | 'Window' >> beam.WindowInto(window.FixedWindows(20))
    | 'Sum values' >> beam.CombinePerKey(sum)
)

# Stage 4 - publish: encode every (key, sum) pair to bytes and write it to the
# output topic.
pubsub_data = (
    summed_by_key
    | 'Encode to byte string' >> beam.Map(encode_byte_string)
    | 'Write to pub sub' >> beam.io.WriteToPubSub(OUTPUT_TOPIC_ADDRESS)
)

# Start the pipeline and wait on it.
# NOTE(review): with streaming enabled this call presumably does not return
# until the job is cancelled -- confirm before running inside a notebook.
result = p.run()
result.wait_until_finish()

