# Python based CCloud Producer Demo¶

How can I get some Wikipedia pageview data for timeseries analysis using OpenTSx into my Kafka Cluster?
https://wikitech.wikimedia.org/wiki/Analytics/AQS/Pageviews


In [1]:
import sys

In [4]:
!pip install "confluent-kafka[avro,json,protobuf]"

Collecting confluent-kafka[avro,json,protobuf]
  Downloading confluent_kafka-1.7.0-cp36-cp36m-manylinux2010_x86_64.whl (2.7 MB)
[K     |████████████████████████████████| 2.7 MB 22.7 MB/s eta 0:00:01
Collecting avro-python3==1.10.0
  Using cached avro-python3-1.10.0.tar.gz (37 kB)
Building wheels for collected packages: avro-python3
  Building wheel for avro-python3 (setup.py) ... [?25ldone
[?25h  Created wheel for avro-python3: filename=avro_python3-1.10.0-py3-none-any.whl size=43734 sha256=aae6181c1342ec0c5bae90e6d14671f91a7fc0759576abb095538d2224bc65a8
  Stored in directory: /home/ec2-user/.cache/pip/wheels/92/2f/1b/ea81375735558c2dd9b957cf5cba81d4a49ee6aba03a3fec60
Successfully built avro-python3
Installing collected packages: confluent-kafka, avro-python3
Successfully installed avro-python3-1.10.0 confluent-kafka-1.7.0
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [5]:
!pip install --no-binary :all: argparse
!pip install --no-binary :all: requests
!pip install --no-binary :all: certifi
!pip install mwviews
!pip install git+https://github.com/Commonists/pageview-api.git

Collecting argparse
  Downloading argparse-1.4.0.tar.gz (70 kB)
[K     |████████████████████████████████| 70 kB 12.2 MB/s eta 0:00:01
[?25hSkipping wheel build for argparse, due to binaries being disabled for it.
Installing collected packages: argparse
    Running setup.py install for argparse ... [?25ldone
[?25hSuccessfully installed argparse-1.4.0
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m
Collecting mwviews
  Downloading mwviews-0.1.0.tar.gz (5.8 kB)
Collecting mwcli
  Downloading mwcli-0.0.3-py2.py3-none-any.whl (8.4 kB)
Collecting mwapi
  Downloading mwapi-0.5.1-py2.py3-none-any.whl (10 kB)
Collecting para
  Downloading para-0.0.8-py

In [6]:
import argparse

from confluent_kafka import avro, KafkaError
from confluent_kafka.admin import AdminClient, NewTopic
from uuid import uuid4

In [7]:
class Wikipage(object):
    """
        Wikipage stores the meta-data for a Wikipage used in the analysis Kafka key.
    """

    # Use __slots__ to explicitly declare all data members.
    __slots__ = ["pagename", "id"]

    def __init__(self, pagename=None):
        self.pagename = pagename
        # Unique id used to track produce request success/failures.
        # Do *not* include in the serialized object.
        self.id = uuid4()

    @staticmethod
    def dict_to_name(obj, ctx):
        return Wikipage(obj['pagename'])

    @staticmethod
    def name_to_dict(name, ctx):
        return Wikipage.to_dict(name)

    def to_dict(self):
        """
            We must provide a dict representation of our class for serialization.
        """
        return dict(pagename=self.pagename)

In [8]:
class Count(object):
    """
        Count stores the pageview counts.
    """

    # Use __slots__ to explicitly declare all data members.
    __slots__ = ["count", "id"]

    def __init__(self, count=None):
        self.count = count
        # Unique id used to track produce request success/failures.
        # Do *not* include in the serialized object.
        self.id = uuid4()

    @staticmethod
    def dict_to_count(obj, ctx):
        return Count(obj['count'])

    @staticmethod
    def count_to_dict(count, ctx):
        return Count.to_dict(count)

    def to_dict(self):
        """
            We must provide a dict representation of our class for serialization.
        """
        return dict(count=self.count)

In [9]:
#
# Helper function to read CCloud configuration.
#
def read_ccloud_config(config_file):
    """Read Confluent Cloud configuration for librdkafka clients"""

    conf = {}
    with open(config_file) as fh:
        for line in fh:
            line = line.strip()
            if len(line) != 0 and line[0] != "#":
                parameter, value = line.strip().split('=', 1)
                conf[parameter] = value.strip()

    return conf

#
# Helper function to create Kafka Topic.
#
def create_topic(conf, topic):
    """
        Create a topic if needed
        Examples of additional admin API functionality:
        https://github.com/confluentinc/confluent-kafka-python/blob/master/examples/adminapi.py
    """

    a = AdminClient({
           'bootstrap.servers': conf['bootstrap.servers'],
           'sasl.mechanisms': 'PLAIN',
           'security.protocol': 'SASL_SSL',
           'sasl.username': conf['sasl.username'],
           'sasl.password': conf['sasl.password']
    })
    fs = a.create_topics([NewTopic(
         topic,
         num_partitions=1,
         replication_factor=3
    )])
    for topic, f in fs.items():
        try:
            f.result()  # The result itself is None
            print("Topic {} created".format(topic))
        except Exception as e:
            # Continue if error code TOPIC_ALREADY_EXISTS, which may be true
            # Otherwise fail fast
            if e.args[0].code() != KafkaError.TOPIC_ALREADY_EXISTS:
                print("Failed to create topic {}: {}".format(topic, e))
                sys.exit(1)

In [11]:

# =============================================================================
#
# Produce messages to Confluent Cloud
#
# Using Confluent Python Client for Apache Kafka
# Writes JSON data, no integration with Confluent Cloud Schema Registry
#
# =============================================================================
from confluent_kafka import Producer
from confluent_kafka.serialization import StringSerializer

import json

In [19]:



# Define arguments and configurations and initialize

#config_file = "/Users/mkaempf/.confluent/python.config"
#
# The current path is inside the cloned project ... so we can use relative paths.
#
config_file = "./config/private/ccloud.props"

topic = "test_sagemaker"

conf = read_ccloud_config(config_file)
create_topic( conf, topic )

print( conf )

#
# The producer doesn't like some properties:
#
# for full list of configurations, see:
#  https://docs.confluent.io/platform/current/clients/confluent-kafka-python/#serializingproducer
conf.pop( 'schema.registry.url' )
conf.pop( 'basic.auth.credentials.source' )
conf.pop( 'basic.auth.user.info' )
conf.pop( 'key.serializer' )
conf.pop( 'value.serializer' )



{'request.timeout.ms': '20000', 'retry.backoff.ms': '500', 'key.serializer': 'org.apache.kafka.common.serialization.StringSerializer', 'value.serializer': 'io.confluent.kafka.serializers.KafkaAvroSerializer', 'bootstrap.servers': 'pkc-zm3p0.eu-north-1.aws.confluent.cloud:9092', 'security.protocol': 'SASL_SSL', 'sasl.mechanisms': 'PLAIN', 'sasl.username': '2JMCB3VQUXPPZM7D', 'sasl.password': '1WiprUIWgGmNAgQV1GW0rQi0S3szeGHDSGTLsQDZ0giMbzXXkQU6MjZlpYgDf6T+', 'schema.registry.url': 'https://psrc-xm8wx.eu-central-1.aws.confluent.cloud', 'basic.auth.credentials.source': 'USER_INFO', 'basic.auth.user.info': 'QI747SYN7RCNNNI5:4OCvylbx/64yXCEf5UFsY4jMiZ5krg9O80bu3e5R4T2Zm4Bad99PhfVbYmLD60oM'}


'io.confluent.kafka.serializers.KafkaAvroSerializer'

In [20]:
producer = Producer(conf)

delivered_records = 0

In [21]:
# Optional per-message on_delivery handler (triggered by poll() or flush())
# when a message has been successfully delivered or
# permanently failed delivery (after retries).
def acked(err, msg):
    global delivered_records
    """Delivery report handler called on
    successful or failed delivery of message
    """
    if err is not None:
        print("Failed to deliver message: {}".format(err))
    else:
        delivered_records += 1
        print("Produced record to topic {} partition [{}] @ offset {}"
              .format(msg.topic(), msg.partition(), msg.offset()))
        

In [28]:
pages = ["Paris", "London", "Berlin", "New-York" ]

import pageviewapi

from datetime import datetime

# datetime object containing current date and time
tStart = datetime.now()
 
print("now =", tStart)

# dd/mm/YY H:M:S
dt_string = tStart.strftime("%d/%m/%Y %H:%M:%S")
print("date and time =", dt_string)

n = 1
for pn in pages:
    n = n+1
    page_object = Wikipage()
    page_object.pagename = pn
    count_object = Count()
    count_object.count = n
    
    print("Producing JSON record: {}\t{}".format(page_object.pagename, count_object.count))
    
    data = pageviewapi.per_article('en.wikipedia', p, '20200101', '20201231',
                        access='all-access', agent='all-agents', granularity='daily')
    
    producer.produce(topic="topic1", key=json.dumps(page_object.to_dict()), value=json.dumps(data), on_delivery=acked)

    producer.poll(0)

producer.flush()

print("{} messages were produced to topic {}!".format(delivered_records, topic))

tEnd = datetime.now()
 
print("now =", tEnd)

print("loadtime => ", tEnd - tStart)

now = 2021-11-20 10:39:05.906939
date and time = 20/11/2021 10:39:05
Producing JSON record: Paris	2
Producing JSON record: London	3
Producing JSON record: Berlin	4
Producing JSON record: New-York	5
Produced record to topic topic1 partition [0] @ offset 12
Produced record to topic topic1 partition [0] @ offset 13
Produced record to topic topic1 partition [0] @ offset 14
Produced record to topic topic1 partition [0] @ offset 15
4 messages were produced to topic test_sagemaker!
now = 2021-11-20 10:39:07.245270
loadtime =>  0:00:01.338331


In [29]:

pageviewapi.per_article('en.wikipedia', 'Paris', '20151106', '20151120',
                        access='all-access', agent='all-agents', granularity='daily')

AttrDict({'items': [{'project': 'en.wikipedia', 'article': 'Paris', 'granularity': 'daily', 'timestamp': '2015110600', 'access': 'all-access', 'agent': 'all-agents', 'views': 9168}, {'project': 'en.wikipedia', 'article': 'Paris', 'granularity': 'daily', 'timestamp': '2015110700', 'access': 'all-access', 'agent': 'all-agents', 'views': 7939}, {'project': 'en.wikipedia', 'article': 'Paris', 'granularity': 'daily', 'timestamp': '2015110800', 'access': 'all-access', 'agent': 'all-agents', 'views': 8337}, {'project': 'en.wikipedia', 'article': 'Paris', 'granularity': 'daily', 'timestamp': '2015110900', 'access': 'all-access', 'agent': 'all-agents', 'views': 9355}, {'project': 'en.wikipedia', 'article': 'Paris', 'granularity': 'daily', 'timestamp': '2015111000', 'access': 'all-access', 'agent': 'all-agents', 'views': 9485}, {'project': 'en.wikipedia', 'article': 'Paris', 'granularity': 'daily', 'timestamp': '2015111100', 'access': 'all-access', 'agent': 'all-agents', 'views': 9140}, {'projec

In [30]:
pageviewapi.legacy_pagecounts('fr.wikipedia', '2010010100', '2011010100', granularity='daily')

AttrDict({'items': [{'project': 'fr.wikipedia', 'access-site': 'all-sites', 'granularity': 'daily', 'timestamp': '2010010100', 'count': 14939284}, {'project': 'fr.wikipedia', 'access-site': 'all-sites', 'granularity': 'daily', 'timestamp': '2010010200', 'count': 15944604}, {'project': 'fr.wikipedia', 'access-site': 'all-sites', 'granularity': 'daily', 'timestamp': '2010010300', 'count': 15641373}, {'project': 'fr.wikipedia', 'access-site': 'all-sites', 'granularity': 'daily', 'timestamp': '2010010400', 'count': 14295617}, {'project': 'fr.wikipedia', 'access-site': 'all-sites', 'granularity': 'daily', 'timestamp': '2010010500', 'count': 13814118}, {'project': 'fr.wikipedia', 'access-site': 'all-sites', 'granularity': 'daily', 'timestamp': '2010010600', 'count': 17645546}, {'project': 'fr.wikipedia', 'access-site': 'all-sites', 'granularity': 'daily', 'timestamp': '2010010700', 'count': 17253156}, {'project': 'fr.wikipedia', 'access-site': 'all-sites', 'granularity': 'daily', 'timestamp'