In [35]:
import cobra_db
from cobra_db import Connector, StudyDao

# Record which cobra_db release produced the outputs in this notebook
print(cobra_db.__version__)

# Connection settings; note that no password is stored in this mapping
connector_kwargs = {
    "host": "cluster0.ftnca3b.mongodb.net",
    "port": 27017,
    "db_name": "my_tutorial_0",
    "username": "test",
}

# NOTE(review): get_pass is given everything except a password, so it
# presumably asks for it interactively - confirm against the cobra_db docs
connector = Connector.get_pass(**connector_kwargs)

0.2.4


In [36]:
# Data access object built on the shared connector
study_dao = StudyDao(connector)

# An empty filter matches every document, so this is the total number of studies
study_count = study_dao.collection.count_documents({})

print(f"Total number of studies: {study_count}")


Total number of studies: 20


In [37]:
from cobra_db import ImageMetadataDao
# Data access object for the image metadata collection
im_dao = ImageMetadataDao(connector)

# Stage 1: bucket the documents by the first entry of their SOPClassUID value
# and count how many documents land in each bucket.
count_per_sop_class = {
    "$group": {
        "_id": {"$first": "$dicom_tags.SOPClassUID.Value"},
        "n": {"$sum": 1},
    }
}
# Stage 2: most frequent SOP Class first.
most_frequent_first = {"$sort": {"n": -1}}

pipeline = [count_per_sop_class, most_frequent_first]
sop_classes = list(im_dao.collection.aggregate(pipeline, allowDiskUse=True))

print(sop_classes)

[{'_id': '1.2.840.10008.5.1.4.1.1.128', 'n': 2956}, {'_id': '1.2.840.10008.5.1.4.1.1.2', 'n': 852}, {'_id': '1.2.840.10008.5.1.4.1.1.7', 'n': 210}, {'_id': '1.2.840.10008.5.1.4.1.1.4', 'n': 92}, {'_id': '1.2.840.10008.5.1.4.1.1.1.1', 'n': 42}, {'_id': '1.2.840.10008.5.1.4.1.1.1', 'n': 16}, {'_id': '1.2.840.10008.5.1.4.1.1.1.2', 'n': 1}]


In [59]:
import pandas as pd
from pydicom._uid_dict import UID_dictionary
# Label every aggregation result with the name registered for its UID in
# pydicom's UID dictionary; UIDs that are not registered get 'Unknown'.
# The dicts inside sop_classes are updated in place.
for sop_class in sop_classes:
    uid_entry = UID_dictionary.get(sop_class['_id'], ['Unknown'])
    sop_class['sop_class_name'] = uid_entry[0]
pd.DataFrame(sop_classes)

Unnamed: 0,_id,n,sop_class_name
0,1.2.840.10008.5.1.4.1.1.128,2956,Positron Emission Tomography Image Storage
1,1.2.840.10008.5.1.4.1.1.2,852,CT Image Storage
2,1.2.840.10008.5.1.4.1.1.7,210,Secondary Capture Image Storage
3,1.2.840.10008.5.1.4.1.1.4,92,MR Image Storage
4,1.2.840.10008.5.1.4.1.1.1.1,42,Digital X-Ray Image Storage - For Presentation
5,1.2.840.10008.5.1.4.1.1.1,16,Computed Radiography Image Storage
6,1.2.840.10008.5.1.4.1.1.1.2,1,Digital Mammography X-Ray Image Storage - For ...


In [56]:
from typing import Any, Dict

def improve_analyse_sop_class(uid: str) -> Dict[str, Any]:
    """
    Summarise the images of one SOP Class, grouped by several DICOM identifiers.

    The images whose SOPClassUID equals ``uid`` are grouped in turn by
    SOPInstanceUID, SeriesInstanceUID, StudyInstanceUID and PatientID. For each
    tag three figures are reported: the number of distinct groups
    (``n_<tag>``) and the size of the smallest and largest group
    (``min_n_images_per_<tag>`` / ``max_n_images_per_<tag>``).

    Relies on the notebook-level names ``im_dao`` (queried through its
    ``collection``) and ``UID_dictionary`` (used to look up the class name).

    Args:
        uid: SOP Class UID.

    Returns:
        A dictionary with ``SOPClassUID``, ``class_name`` (``"Unknown"`` when
        the UID is not in ``UID_dictionary``), ``n_images`` and the three
        statistics per tag. The statistics are always present: when no image
        matches ``uid``, ``n_<tag>`` is 0 and the min/max values are ``None``.
    """
    uid_filter = {"dicom_tags.SOPClassUID.Value": uid}

    def create_pipeline(tag: str):
        # Output field names for this tag.
        n_key = f"n_{tag}"
        min_key = f"min_n_images_per_{tag}"
        max_key = f"max_n_images_per_{tag}"
        return [
            {"$match": uid_filter},
            {
                # One group per distinct (first) value of the tag.
                "$group": {
                    "_id": {"$first": f"$dicom_tags.{tag}.Value"},
                    "n": {"$sum": 1},
                },
            },
            {
                # Run three independent sub-pipelines over the groups.
                "$facet": {
                    n_key: [{"$count": "n"}],
                    min_key: [{"$sort": {"n": 1}}, {"$limit": 1}],
                    max_key: [{"$sort": {"n": -1}}, {"$limit": 1}],
                }
            },
            {
                # Each facet is an array of at most one document; unwrap it.
                "$project": {
                    n_key: {"$first": f"${n_key}.n"},
                    min_key: {"$first": f"${min_key}.n"},
                    max_key: {"$first": f"${max_key}.n"},
                }
            },
        ]

    def group_and_count(tag: str) -> Dict[str, Any]:
        # Indexing [0] matches the original code; per the MongoDB docs $facet
        # emits a single document even when nothing matched.
        stats = list(
            im_dao.collection.aggregate(create_pipeline(tag), allowDiskUse=True)
        )[0]
        # With no matching images every facet is empty, $first yields no value
        # and $project drops the field. Fill defaults so the keys are stable.
        stats.setdefault(f"n_{tag}", 0)
        stats.setdefault(f"min_n_images_per_{tag}", None)
        stats.setdefault(f"max_n_images_per_{tag}", None)
        return stats

    class_name = UID_dictionary.get(uid, ["Unknown"])[0]
    n_images = im_dao.collection.count_documents(uid_filter)

    ans = {"SOPClassUID": uid, "class_name": class_name, "n_images": n_images}
    for tag in ("SOPInstanceUID", "SeriesInstanceUID", "StudyInstanceUID", "PatientID"):
        ans.update(group_and_count(tag))

    return ans


In [57]:
from pprint import pprint
# Collect the UID ('_id') of every SOP Class in sop_classes, then analyse each one
sop_class_uids = [doc['_id'] for doc in sop_classes]
sop_classes_analysis = list(map(improve_analyse_sop_class, sop_class_uids))
pprint(sop_classes_analysis)

[{'SOPClassUID': '1.2.840.10008.5.1.4.1.1.128',
  'class_name': 'Positron Emission Tomography Image Storage',
  'max_n_images_per_PatientID': 1582,
  'max_n_images_per_SOPInstanceUID': 8,
  'max_n_images_per_SeriesInstanceUID': 1582,
  'max_n_images_per_StudyInstanceUID': 1582,
  'min_n_images_per_PatientID': 174,
  'min_n_images_per_SOPInstanceUID': 1,
  'min_n_images_per_SeriesInstanceUID': 174,
  'min_n_images_per_StudyInstanceUID': 174,
  'n_PatientID': 4,
  'n_SOPInstanceUID': 1028,
  'n_SeriesInstanceUID': 5,
  'n_StudyInstanceUID': 4,
  'n_images': 2956},
 {'SOPClassUID': '1.2.840.10008.5.1.4.1.1.2',
  'class_name': 'CT Image Storage',
  'max_n_images_per_PatientID': 723,
  'max_n_images_per_SOPInstanceUID': 6,
  'max_n_images_per_SeriesInstanceUID': 723,
  'max_n_images_per_StudyInstanceUID': 723,
  'min_n_images_per_PatientID': 1,
  'min_n_images_per_SOPInstanceUID': 1,
  'min_n_images_per_SeriesInstanceUID': 1,
  'min_n_images_per_StudyInstanceUID': 1,
  'n_PatientID': 5,
  '

In [54]:
# We define a function that will be used with each SOP Class
def analyse_sop_class(uid: str) -> dict:
    """
    Count the images of one SOP Class and how they spread over DICOM identifiers.

    Uses the notebook-level ``im_dao`` (its ``collection`` is queried) and
    ``UID_dictionary`` (to look up the class name for ``uid``).

    Args:
        uid: SOP Class UID used to filter ``dicom_tags.SOPClassUID.Value``.

    Returns:
        A dict with ``SOPClassUID``, ``class_name`` (``"Unknown"`` if the UID
        is not in ``UID_dictionary``) and ``n_images``. For each of
        SOPInstanceUID, SeriesInstanceUID, StudyInstanceUID and PatientID it
        also holds ``n_<tag>`` (number of distinct groups) and
        ``min_n_images_per_<tag>`` / ``max_n_images_per_<tag>`` (smallest /
        largest group size). These statistics are always present: when nothing
        matches ``uid`` the count is 0 and min/max are ``None``.
    """
    sop_class_filter = {"dicom_tags.SOPClassUID.Value": uid}

    class_name = UID_dictionary.get(uid, ["Unknown"])[0]
    n_images = im_dao.collection.count_documents(sop_class_filter)

    def group_and_count(tag: str) -> dict:
        n_key = f"n_{tag}"
        min_key = f"min_n_images_per_{tag}"
        max_key = f"max_n_images_per_{tag}"

        # Group the matching images by the first value of the tag.
        group_by_tag = {
            "$group": {
                "_id": {"$first": f"$dicom_tags.{tag}.Value"},
                "n": {"$sum": 1},
            },
        }
        # Split the pipeline to count different things over the same groups.
        split_counts = {
            "$facet": {
                n_key: [{"$count": "n"}],
                min_key: [{"$sort": {"n": 1}}, {"$limit": 1}],
                max_key: [{"$sort": {"n": -1}}, {"$limit": 1}],
            }
        }
        # Every facet is an array holding at most one document; keep its "n".
        unwrap_counts = {
            "$project": {
                n_key: {"$first": f"${n_key}.n"},
                min_key: {"$first": f"${min_key}.n"},
                max_key: {"$first": f"${max_key}.n"},
            }
        }

        cursor = im_dao.collection.aggregate(
            [{"$match": sop_class_filter}, group_by_tag, split_counts, unwrap_counts],
            allowDiskUse=True,
        )
        facet_doc = next(iter(cursor), {})
        # With no matching images the facets are empty and $project omits the
        # fields. Default them so callers always see the same keys.
        return {
            n_key: facet_doc.get(n_key, 0),
            min_key: facet_doc.get(min_key),
            max_key: facet_doc.get(max_key),
        }

    summary = {"SOPClassUID": uid, "class_name": class_name, "n_images": n_images}
    for tag in ("SOPInstanceUID", "SeriesInstanceUID", "StudyInstanceUID", "PatientID"):
        summary.update(group_and_count(tag))
    return summary

In [55]:
from pprint import pprint
# Analyse every SOP Class, identified by the '_id' of its entry in sop_classes
sop_classes_analysis = list(
    map(lambda doc: analyse_sop_class(doc['_id']), sop_classes)
)
pprint(sop_classes_analysis)

[{'SOPClassUID': '1.2.840.10008.5.1.4.1.1.128',
  'class_name': 'Positron Emission Tomography Image Storage',
  'max_n_images_per_PatientID': 1582,
  'max_n_images_per_SOPInstanceUID': 8,
  'max_n_images_per_SeriesInstanceUID': 1582,
  'max_n_images_per_StudyInstanceUID': 1582,
  'min_n_images_per_PatientID': 174,
  'min_n_images_per_SOPInstanceUID': 1,
  'min_n_images_per_SeriesInstanceUID': 174,
  'min_n_images_per_StudyInstanceUID': 174,
  'n_PatientID': 4,
  'n_SOPInstanceUID': 1028,
  'n_SeriesInstanceUID': 5,
  'n_StudyInstanceUID': 4,
  'n_images': 2956},
 {'SOPClassUID': '1.2.840.10008.5.1.4.1.1.2',
  'class_name': 'CT Image Storage',
  'max_n_images_per_PatientID': 723,
  'max_n_images_per_SOPInstanceUID': 6,
  'max_n_images_per_SeriesInstanceUID': 723,
  'max_n_images_per_StudyInstanceUID': 723,
  'min_n_images_per_PatientID': 1,
  'min_n_images_per_SOPInstanceUID': 1,
  'min_n_images_per_SeriesInstanceUID': 1,
  'min_n_images_per_StudyInstanceUID': 1,
  'n_PatientID': 5,
  '

In [64]:
# Pick the analysis entry with the largest image count
most_common_sop_class = max(
    sop_classes_analysis,
    key=lambda analysis: analysis['n_images'],
)

print(f"Most common SOP Class: {most_common_sop_class['class_name']}")

Most common SOP Class: Positron Emission Tomography Image Storage


### pymongo

In [31]:
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

In [32]:
import os
from getpass import getpass
from urllib.parse import quote_plus

# Credentials must not be stored in the notebook. The password is read from
# the MONGODB_PASSWORD environment variable, with an interactive prompt as
# the fallback.
atlas_user = "test"
atlas_host = "cluster0.ftnca3b.mongodb.net"
atlas_password = os.environ.get("MONGODB_PASSWORD") or getpass(
    f"MongoDB password for {atlas_user}: "
)

# Percent-encode the credentials so reserved characters such as '@', ':' or
# '/' cannot corrupt the connection string.
# NOTE: uri now contains the password - do not print or display it.
uri = (
    f"mongodb+srv://{quote_plus(atlas_user)}:{quote_plus(atlas_password)}"
    f"@{atlas_host}/?retryWrites=true&w=majority"
)
del atlas_password  # keep the plain-text password out of the kernel namespace

# Set the Stable API version when creating a new client
client = MongoClient(uri, server_api=ServerApi('1'))

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    # Deliberately best-effort: report the failure without stopping the notebook
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [33]:
# Ask the server for the names of the databases available through this client
databases = client.list_database_names()
print("Databases:", databases)


Databases: ['my_tutorial_0', 'admin', 'local']


In [34]:
# Name of the database to inspect, and a handle on it
db_name = "my_tutorial_0"
db = client[db_name]

# Fetch the names of the collections stored in that database
collections = db.list_collection_names()
print("Collections in", db_name, ":", collections)


Collections in my_tutorial_0 : ['ImageMetadata', 'RadiologicalSeries', 'Patient', 'RadiologicalStudy']


In [None]:
#