In [1]:
import pandas as pd
from pymongo import MongoClient
from tqdm import tqdm

def fetch_data_in_batches_with_progress(mongo_uri, db_name, collection_name, username, password, batch_size=100000):
    """Load an entire MongoDB collection into one DataFrame, showing progress.

    Args:
        mongo_uri: Host part of the connection string, e.g. ``'host:27017'``
            (without the ``mongodb://`` scheme and without credentials).
        db_name: Name of the database to read from.
        collection_name: Name of the collection to read from.
        username: MongoDB user name.
        password: MongoDB password.
        batch_size: Maximum number of documents converted into each
            intermediate DataFrame before the final concatenation.

    Returns:
        A DataFrame with one row per document. An empty DataFrame is returned
        when the collection contains no documents.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    # Function-scope imports keep this definition self-contained.
    from itertools import islice
    from urllib.parse import quote_plus

    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Credentials must be percent-encoded, otherwise characters such as
    # '@', ':' or '/' in a password produce an invalid connection string.
    connection_uri = f"mongodb://{quote_plus(username)}:{quote_plus(password)}@{mongo_uri}"

    data_batches = []  # One DataFrame per fetched batch

    # Context managers guarantee the client, progress bar and cursor are
    # released even if the fetch fails part-way through.
    with MongoClient(connection_uri) as client:
        collection = client[db_name][collection_name]

        # Total document count is only used to size the progress bar.
        total_documents = collection.count_documents({})

        with tqdm(total=total_documents, desc='Fetching Data', unit='doc') as pbar:
            # Stream a single cursor in chunks instead of issuing repeated
            # skip()/limit() queries: skip() has to walk past all skipped
            # documents every time, and separate unsorted queries are not
            # guaranteed to see the documents in the same order.
            with collection.find() as cursor:
                while True:
                    batch = list(islice(cursor, batch_size))
                    if not batch:
                        break
                    data_batches.append(pd.DataFrame(batch))
                    pbar.update(len(batch))

    # pd.concat raises on an empty list, so handle the empty collection here.
    if not data_batches:
        return pd.DataFrame()

    return pd.concat(data_batches, ignore_index=True)

# Fetch the whole collection with progress tracking.
# Connection settings are read from the environment so that real credentials
# never have to be committed with the notebook; the fallbacks are the values
# of the local development stack this notebook was originally run against.
import os

data = fetch_data_in_batches_with_progress(
    os.environ.get('MONGO_HOST', 'mongodb:27017'),
    os.environ.get('MONGO_DB', 'Epsymolo'),
    os.environ.get('MONGO_COLLECTION', 'power_flow_data'),
    os.environ.get('MONGO_USERNAME', 'root'),
    os.environ.get('MONGO_PASSWORD', 'root'),
)

Fetching Data: 100%|██████████| 1655424/1655424 [00:11<00:00, 142131.94doc/s]


In [2]:
# Sanity check: (rows, columns) of the frame fetched from MongoDB.
data.shape

(1655424, 5)

In [3]:
# Column dtypes as delivered by MongoDB, before any conversion.
data.dtypes

_id                object
index               int64
Timestamp          object
PowerLineID         int64
PowerFlowValue    float64
dtype: object

In [4]:
# Preview the first rows of the raw data.
data.head()

Unnamed: 0,_id,index,Timestamp,PowerLineID,PowerFlowValue
0,65802e6a95367e1134e9ea1b,14698731,2010-09-02 00:00:00,1237,0.0
1,65802e6a95367e1134e9ea1c,13915587,2010-09-02 00:00:00,361,0.0
2,65802e6a95367e1134e9ea1d,15135897,2010-09-02 00:00:00,1726,0.0
3,65802e6a95367e1134e9ea1e,14559267,2010-09-02 00:00:00,1081,-110.94
4,65802e6a95367e1134e9ea1f,14342025,2010-09-02 00:00:00,838,-4.3


In [3]:
# Take a real copy: a plain assignment would only create a second name for the
# same DataFrame, so later edits would also change `data`.
data_copy = data.copy()

In [4]:
from datetime import timedelta

# Work on an explicit copy so the column assignments below can never leak back
# into `data_copy` / `data` (pd.DataFrame(frame) does not make an independent
# copy of the underlying data).
df = data_copy.copy()
# Convert 'Timestamp' to datetime
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# Target date that the earliest day in the data is re-based onto.
start_date = pd.to_datetime('2023-12-15')
# NOTE(review): end_date lies two days *before* start_date and is not used in
# this cell; it is kept in case a later cell relies on it. Confirm the
# intended window (the original comment mentioned a 5-day period).
end_date = start_date - timedelta(days=2)

# Shift every timestamp by a whole number of days so that the first day of the
# data becomes start_date; the time of day is preserved.
days_to_shift = (start_date - df['Timestamp'].min().normalize()).days
df['Timestamp'] = df['Timestamp'] + pd.DateOffset(days=days_to_shift)

# Group by PowerLineID and the Timestamp rounded down to the hour, and calculate the average PowerFlowValue.
# 'h' is used instead of 'H' because the upper-case alias is deprecated in recent pandas releases.
df_grouped = df.groupby(['PowerLineID', df['Timestamp'].dt.floor('h')])['PowerFlowValue'].mean().reset_index()
df_grouped['PowerLineID'] = df_grouped['PowerLineID'].astype(str)
# Display the first few rows of the final DataFrame with the full datetime information
df_grouped.head()

Unnamed: 0,PowerLineID,Timestamp,PowerFlowValue
0,0,2023-12-15 00:00:00,360.853333
1,0,2023-12-15 01:00:00,386.47
2,0,2023-12-15 02:00:00,400.74
3,0,2023-12-15 03:00:00,401.080833
4,0,2023-12-15 04:00:00,419.024167


In [13]:
# (rows, columns) of the hourly aggregate: one row per power line and hour.
df_grouped.shape

(137952, 3)

In [38]:
# Confirm the dtypes of the aggregated frame.
df_grouped.dtypes

PowerLineID               object
Timestamp         datetime64[ns]
PowerFlowValue           float64
dtype: object

In [7]:
# Hourly series of a single power line: keep only rows whose PowerLineID is
# the string '0'.
df_powerline_0 = df_grouped.loc[df_grouped['PowerLineID'].eq('0')]

df_powerline_0

Unnamed: 0,PowerLineID,Timestamp,PowerFlowValue
0,0,2023-12-15 00:00:00,360.853333
1,0,2023-12-15 01:00:00,386.470000
2,0,2023-12-15 02:00:00,400.740000
3,0,2023-12-15 03:00:00,401.080833
4,0,2023-12-15 04:00:00,419.024167
...,...,...,...
67,0,2023-12-17 19:00:00,245.624167
68,0,2023-12-17 20:00:00,287.956667
69,0,2023-12-17 21:00:00,257.184167
70,0,2023-12-17 22:00:00,230.953333


In [6]:
pip install confluent-kafka

Collecting confluent-kafka
  Downloading confluent_kafka-2.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Downloading confluent_kafka-2.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.0/4.0 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: confluent-kafka
Successfully installed confluent-kafka-2.3.0
Note: you may need to restart the kernel to use updated packages.


In [8]:
from confluent_kafka import Producer
import json
import numpy as np



def delivery_report(err, msg):
    """Delivery callback: print the outcome of one produced message.

    Args:
        err: Error reported for the message, or ``None`` on success.
        msg: Message object; its ``topic()`` and ``partition()`` are printed
            on success.
    """
    if err is None:
        print(f"Message delivered to {msg.topic()} [{msg.partition()}]")
        return
    print(f"Delivery failed for message: {err}")
def send_data_to_kafka(bootstrap_servers, topic, df, batch_size=10000):
    """Publish every row of ``df`` to a Kafka topic as a JSON message.

    Each row is serialised as a JSON object keyed by column name, with
    ``PowerLineID`` converted to ``str`` and ``Timestamp`` converted to whole
    seconds since the Unix epoch. Rows are sent in batches; the producer is
    flushed after every batch.

    Args:
        bootstrap_servers: Value for the producer's ``bootstrap.servers``
            setting.
        topic: Name of the destination topic.
        df: DataFrame to publish; must contain ``PowerLineID`` and
            ``Timestamp`` columns.
        batch_size: Number of rows sent between two flushes.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    producer = Producer({'bootstrap.servers': bootstrap_servers})

    # Slice by position rather than np.array_split(df, ...): splitting a
    # DataFrame through NumPy is deprecated in recent pandas releases.
    for start in range(0, len(df), batch_size):
        chunk = df.iloc[start:start + batch_size]
        print(f"Sending batch of {len(chunk)} rows")

        # to_dict('records') yields one plain dict per row, which avoids the
        # per-row Series construction (and mutation) of iterrows().
        for record in chunk.to_dict('records'):
            try:
                # Ensure 'PowerLineID' is a string and convert Timestamp
                record['PowerLineID'] = str(record['PowerLineID'])
                record['Timestamp'] = int(record['Timestamp'].timestamp())
                payload = json.dumps(record).encode('utf-8')
            except Exception as e:
                # Best effort: report the bad row and continue with the next.
                print(f"An error occurred: {e}")
                continue

            while True:
                try:
                    producer.produce(topic=topic, value=payload, callback=delivery_report)
                    break
                except BufferError:
                    # Local producer queue is full. Serve delivery callbacks
                    # to free space and retry instead of dropping the message.
                    producer.poll(1)
                except Exception as e:
                    print(f"An error occurred: {e}")
                    break

            # Serve delivery callbacks for messages already sent.
            producer.poll(0)

        # Block until every message of this batch is delivered or has failed.
        producer.flush()
        print(f"\nFinished sending batch of {len(chunk)} rows to Kafka topic {topic}")

# Entry point: publish the hourly aggregates to Kafka.

if __name__ == "__main__":
    BOOTSTRAP_SERVER = 'kafka:9092'  # Kafka bootstrap server passed to the producer
    TOPIC = 'Epsymolo'  # Destination Kafka topic

    # df_grouped is the aggregated DataFrame built in an earlier cell
    send_data_to_kafka(BOOTSTRAP_SERVER, TOPIC, df_grouped)


Sending batch of 10000 rows
Message delivered to Epsymolo [0]
Message delivered to Epsymolo [0]
Message delivered to Epsymolo [0]
Message delivered to Epsymolo [0]
Message delivered to Epsymolo [0]
Message delivered to Epsymolo [0]
Message delivered to Epsymolo [0]
Message delivered to Epsymolo [0]
Message delivered to Epsymolo [0]
Message delivered to Epsymolo [0]
Message delivered to Epsymolo [0]
Message delivered to Epsymolo [0]
Message delivered to Epsymolo [0]
Message delivered to Epsymolo [0]
Message delivered to Epsymolo [0]
Message delivered to Epsymolo [0]
Message delivered to Epsymolo [0]
Message delivered to Epsymolo [0]
Message delivered to Epsymolo [0]
Message delivered to Epsymolo [0]
Message delivered to Epsymolo [0]
Message delivered to Epsymolo [0]
Message delivered to Epsymolo [0]
Message delivered to Epsymolo [0]
Message delivered to Epsymolo [0]
Message delivered to Epsymolo [0]
Message delivered to Epsymolo [0]
Message delivered to Epsymolo [0]
Message delivered to