In [1]:
!pip install "pymongo[srv]"
import os

import pandas as pd
import pymongo
from pymongo import MongoClient



In [2]:
# Connect to the MongoDB server using MongoClient.
# The connection string embeds the username and password, so it is read from
# the MONGODB_URI environment variable instead of being written in the
# notebook, where it would leak to anyone the notebook is shared with.
# Expected shape of the value:
#   mongodb+srv://<user>:<password>@<cluster-host>/?retryWrites=true&w=majority&appName=Cluster0
# NOTE(review): credentials that were previously hardcoded in this cell should be rotated.
mongodb_uri = os.environ.get("MONGODB_URI")
if not mongodb_uri:
    # Fail early with an actionable message rather than an opaque driver error.
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection "
        "string before running this notebook."
    )
client = MongoClient(mongodb_uri)

# Access the 'blancco' database
db = client["blancco"]

# Access the 'all_reports' collection within the 'blancco' database
all_reports_collection = db["all_reports"]

# 'all_reports_collection' can now be used for operations such as find, insert, update, or aggregate

In [7]:
# Overall average duration (in seconds) of successful erasures.
# The figure is the unweighted mean of the per-month mean durations: erasures
# are first averaged within each (year, month), then those monthly averages
# are averaged together.


def _elapsed_time_component(position):
    """Build the expression for one integer component of 'erasures.elapsed_time'.

    The field is treated as an "hh:mm:ss" string: it is split on ":" and the
    element at `position` (0 = hours, 1 = minutes, 2 = seconds) is cast to int.
    """
    return {
        "$toInt": {
            "$arrayElemAt": [{"$split": ["$erasures.elapsed_time", ":"]}, position]
        }
    }


# One output document per element of the 'erasures' array.
unwind_erasures = {"$unwind": "$erasures"}

# Keep only erasures that finished successfully.
only_successful = {"$match": {"erasures.state": "Successful"}}

# Convert 'elapsed_time' to total seconds; carry 'date' along for the next stage.
to_total_seconds = {
    "$project": {
        "elapsed_time_seconds": {
            "$add": [
                {"$multiply": [_elapsed_time_component(0), 3600]},  # hours -> seconds
                {"$multiply": [_elapsed_time_component(1), 60]},    # minutes -> seconds
                _elapsed_time_component(2),                         # seconds as-is
            ]
        },
        "date": "$date",
    }
}

# Derive the calendar year and month from 'date'.
extract_year_month = {
    "$project": {
        "elapsed_time_seconds": 1,
        "year": {"$year": "$date"},
        "month": {"$month": "$date"},
    }
}

# Average elapsed time per (year, month).
monthly_average = {
    "$group": {
        "_id": {"year": "$year", "month": "$month"},
        "average_erasure_time": {"$avg": "$elapsed_time_seconds"},
    }
}

# Flatten the grouping key into top-level 'year' / 'month' fields and drop '_id'.
flatten_monthly = {
    "$project": {
        "year": "$_id.year",
        "month": "$_id.month",
        "average_erasure_time": "$average_erasure_time",
        "_id": 0,
    }
}

# Chronological ordering of the monthly rows (ordering only; the next stage
# collapses all rows into a single document).
chronological = {"$sort": {"year": 1, "month": 1}}

# Average of the monthly averages, as a single document.
overall_average = {
    "$group": {
        "_id": None,
        "overall_average_erasure_time": {"$avg": "$average_erasure_time"},
    }
}

# Return only the overall figure.
only_overall = {"$project": {"_id": 0, "overall_average_erasure_time": 1}}

pipeline = [
    unwind_erasures,
    only_successful,
    to_total_seconds,
    extract_year_month,
    monthly_average,
    flatten_monthly,
    chronological,
    overall_average,
    only_overall,
]

# Run the aggregation pipeline and materialise the cursor into a list
result = list(all_reports_collection.aggregate(pipeline))
result

[{'overall_average_erasure_time': 4541.637590845506}]