In [1]:
!pip install "pymongo[srv]"
import os

import pandas as pd
import pymongo
from pymongo import MongoClient



In [2]:
# Database access: establish a connection to the MongoDB server using MongoClient.
# The connection string (which embeds the username and password) is read from
# the MONGODB_URI environment variable so that no credentials are stored in the
# notebook or leaked when it is shared or committed.
# Expected form:
#   mongodb+srv://<user>:<password>@<cluster-host>/?retryWrites=true&w=majority
# NOTE(review): a connection string with real credentials used to be written
# inline here; those credentials should be rotated.
MONGODB_URI = os.environ.get("MONGODB_URI")
if not MONGODB_URI:
    # Fail early with an actionable message instead of an opaque driver error.
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection "
        "string before running this notebook."
    )
client = MongoClient(MONGODB_URI)

# Access the 'blancco' database
db = client["blancco"]

# Access the 'all_reports' collection within the 'blancco' database
all_reports_collection = db["all_reports"]

# 'all_reports_collection' can now be used for find, insert, update, or aggregate operations

In [7]:
# Divisor used to convert a byte count into the "capacity_gb" output field.
# 1073741824 is 2**30, so the unit is strictly a binary gigabyte (GiB).
BYTES_PER_GB = 1073741824
SECONDS_PER_HOUR = 3600
SECONDS_PER_MINUTE = 60


def _elapsed_time_part(index):
    """Build the aggregation expression for one component of the elapsed time.

    "$erasures.elapsed_time" is split on ":" and the element at ``index`` is
    converted to an integer (the sample output shows values such as
    "00:05:01", i.e. index 0 = hours, 1 = minutes, 2 = seconds).

    ``$convert`` with ``onError: None`` is used instead of ``$toInt`` so that a
    component that cannot be parsed yields null rather than aborting the whole
    aggregation; the resulting null total is removed by the later
    ``elapsed_time_seconds > 0`` match stage.

    Args:
        index: Zero-based position of the component in the split string.

    Returns:
        A dict holding the MongoDB aggregation expression.
    """
    return {
        "$convert": {
            "input": {
                "$arrayElemAt": [
                    {"$split": ["$erasures.elapsed_time", ":"]},
                    index,
                ]
            },
            "to": "int",
            "onError": None,
        }
    }


# Total elapsed time in seconds: hours * 3600 + minutes * 60 + seconds.
elapsed_time_seconds_expr = {
    "$add": [
        {"$multiply": [_elapsed_time_part(0), SECONDS_PER_HOUR]},
        {"$multiply": [_elapsed_time_part(1), SECONDS_PER_MINUTE]},
        _elapsed_time_part(2),
    ]
}

# Erasure rate in GB per minute. The preceding match stage already guarantees a
# positive elapsed time; the condition is kept as a defensive guard against
# division by zero should that stage ever be changed.
erasure_rate_expr = {
    "$cond": {
        "if": {"$gt": ["$elapsed_time_seconds", 0]},
        "then": {
            "$divide": [
                "$capacity_gb",
                # Convert elapsed time from seconds to minutes
                {"$divide": ["$elapsed_time_seconds", SECONDS_PER_MINUTE]},
            ]
        },
        "else": None,
    }
}

pipeline = [
    # Unwind the disks array to create a document for each disk
    {"$unwind": "$disks"},

    # Unwind the erasures array to create a document for each erasure.
    # NOTE(review): unwinding both arrays pairs every disk with every erasure of
    # the same report. If a report can contain several disks and erasures, rates
    # would be computed for unrelated disk/erasure pairs -- confirm against the
    # collection schema whether erasures must be joined to their disk by a key.
    {"$unwind": "$erasures"},

    # Keep only disks with a non-null interface type whose erasure succeeded
    {
        "$match": {
            "disks.interface_type": {"$ne": None},
            "erasures.state": "Successful",
        }
    },

    # Project the fields needed by the following stages
    {
        "$project": {
            "interface_type": "$disks.interface_type",
            "model": "$disks.model",
            "serial_number": "$disks.serial",
            "raw_capacity": "$disks.capacity",  # Raw byte count, kept for debugging
            "capacity_gb": {"$divide": ["$disks.capacity", BYTES_PER_GB]},
            "raw_elapsed_time": "$erasures.elapsed_time",  # Raw string, kept for debugging
            "elapsed_time_seconds": elapsed_time_seconds_expr,
        }
    },

    # Keep only documents with positive raw capacity and positive elapsed time
    # (this also drops documents whose elapsed time could not be computed)
    {
        "$match": {
            "raw_capacity": {"$gt": 0},
            "elapsed_time_seconds": {"$gt": 0},
        }
    },

    # Final projection: pass the fields through and add the erasure rate
    {
        "$project": {
            "interface_type": 1,
            "model": 1,
            "serial_number": 1,
            "raw_capacity": 1,
            "capacity_gb": 1,
            "raw_elapsed_time": 1,
            "elapsed_time_seconds": 1,
            "erasure_rate": erasure_rate_expr,
        }
    },

    # Sort the results by interface type and serial number for clarity
    {
        "$sort": {"interface_type": 1, "serial_number": 1}
    },
]

# Run the pipeline and materialize the cursor so the cell displays the documents
results = all_reports_collection.aggregate(pipeline)
list(results)

[{'_id': ObjectId('66e8140c8bad1c495d53b79a'),
  'interface_type': 'EMMC',
  'model': 'MMC64G',
  'serial_number': '0x00022c02',
  'raw_capacity': 62537072640,
  'capacity_gb': 58.2421875,
  'raw_elapsed_time': '00:05:01',
  'elapsed_time_seconds': 301,
  'erasure_rate': 11.609738372093023},
 {'_id': ObjectId('66e8140c8bad1c495d53eb17'),
  'interface_type': 'EMMC',
  'model': 'HAG2e',
  'serial_number': '0x011f6a02',
  'raw_capacity': 15762194432,
  'capacity_gb': 14.6796875,
  'raw_elapsed_time': '00:05:00',
  'elapsed_time_seconds': 300,
  'erasure_rate': 2.9359375},
 {'_id': ObjectId('66e8140c8bad1c495d53b9e9'),
  'interface_type': 'EMMC',
  'model': 'HAG2e',
  'serial_number': '0x0532893e',
  'raw_capacity': 15762194432,
  'capacity_gb': 14.6796875,
  'raw_elapsed_time': '00:05:09',
  'elapsed_time_seconds': 309,
  'erasure_rate': 2.8504247572815533},
 {'_id': ObjectId('66e8140c8bad1c495d53cde6'),
  'interface_type': 'EMMC',
  'model': 'SEM16G',
  'serial_number': '0x0544b0ad',
  '