In [1]:
from pathlib import Path
import pandas as pd
from pprint import pprint

from pymongo import MongoClient
from pymongo.errors import ConnectionFailure


from datetime import datetime,timezone

%reload_ext autoreload
%autoreload 2

In [2]:
from csv_containerisation_mongodb.test.test import DataIntegrityChecker

In [3]:
# Connect to the local MongoDB instance and verify the server is reachable.
client = MongoClient("mongodb://localhost:27017/")
try:
    # MongoClient connects lazily, so issue a cheap command to force a round-trip.
    # 'ping' replaces the deprecated 'ismaster' command.
    client.admin.command('ping')
    print("MongoDB connection: Successful")
except ConnectionFailure as e:
    # f-string so the actual error is shown instead of the literal text "{e}"
    print(f"MongoDB connection: Failed {e}")

MongoDB connection: Successful


In [4]:
# Handle to the 'medical_records' database
db = client.medical_records
# Handle to the 'healthcare_data' collection inside it
collection = db.healthcare_data

In [5]:
# Read the cleaned healthcare CSV, renaming 'Date of Admission' to 'Admission Date'
cleaned_data_dir = Path('../data/cleaned/')
data_path = cleaned_data_dir / 'healthcare_cleaned.csv'
df = (
    pd.read_csv(data_path)
    .rename(columns={'Date of Admission': 'Admission Date'})
)
df.head()

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Admission Date,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Bobby Jackson,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.281306,328,Urgent,2024-02-02,Paracetamol,Normal
1,Leslie Terry,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327287,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,Danny Smith,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.096079,205,Emergency,2022-10-07,Aspirin,Normal
3,Andrew Watts,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.78241,450,Elective,2020-12-18,Ibuprofen,Abnormal
4,Adrienne Bell,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.317814,458,Urgent,2022-10-09,Penicillin,Abnormal


## Strategy: convert the flat CSV into an organised, nested document

In [6]:
# Turn the frame into a list of flat dicts (one per CSV row) and inspect the first one
records = df.to_dict('records')
first_record = records[0]
pprint(first_record)

{'Admission Date': '2024-01-31',
 'Admission Type': 'Urgent',
 'Age': 30,
 'Billing Amount': 18856.281305978155,
 'Blood Type': 'B-',
 'Discharge Date': '2024-02-02',
 'Doctor': 'Matthew Smith',
 'Gender': 'Male',
 'Hospital': 'Sons and Miller',
 'Insurance Provider': 'Blue Cross',
 'Medical Condition': 'Cancer',
 'Medication': 'Paracetamol',
 'Name': 'Bobby Jackson',
 'Room Number': 328,
 'Test Results': 'Normal'}


In [7]:
# Keep only the first 5 rows as the migration sample.
# NOTE(review): this overwrites the full frame loaded above; re-run the load
# cell to get all rows back.
df = df.head(5)

In [8]:
def transform_row_to_mongodb(row):
    """Convert one flat CSV row into a nested MongoDB document.

    Args:
        row: Mapping-like row (e.g. a pandas Series or a dict) indexed by the
            CSV column names: 'Name', 'Age', 'Gender', 'Blood Type',
            'Medical Condition', 'Medication', 'Test Results',
            'Admission Date', 'Admission Type', 'Room Number',
            'Discharge Date', 'Hospital', 'Doctor', 'Insurance Provider'
            and 'Billing Amount'.

    Returns:
        dict: Document with the sub-documents ``patient_info``,
        ``medical_details``, ``admission_details``, ``hospital_info``,
        ``billing`` and ``metadata``.

    Raises:
        KeyError: If one of the expected columns is missing from ``row``.
    """
    # Take the timestamp once so created_at and updated_at are identical on a
    # freshly migrated document; two separate now() calls can differ by
    # a few microseconds.
    migrated_at = datetime.now(timezone.utc)

    document = {
        "patient_info": {
            "name": row['Name'],
            # int()/float() turn numpy scalars into plain Python numbers
            "age": int(row['Age']),
            "gender": row['Gender'],
            "blood_type": row['Blood Type']
        },

        "medical_details": {
            "medical_condition": row['Medical Condition'],
            "medication": row['Medication'],
            "test_results": row['Test Results']
        },

        "admission_details": {
            # Dates are passed through unchanged (strings in the source CSV)
            "admission_date": row['Admission Date'],
            "admission_type": row['Admission Type'],
            "room_number": int(row['Room Number']),
            "discharge_date": row['Discharge Date']
        },

        "hospital_info": {
            "hospital": row['Hospital'],
            "doctor": row['Doctor']
        },

        "billing": {
            "insurance_provider": row['Insurance Provider'],
            # Amounts are rounded to 2 decimal places
            "billing_amount": float(round(row['Billing Amount'], 2))
        },

        "metadata": {
            "created_at": migrated_at,
            "updated_at": migrated_at,
            "data_source": "CSV_migration",
            "migrated_by": "Hope - DataSoluTech"
        }
    }

    return document

## Verification plan
**Tests to carry out:**

**Before Migration (CSV):**
1. Row count
2. Column count and names
3. Data types for each column
4. Missing values per column
5. Duplicate rows count
6. Sample values (first 5 rows)

**After Migration (MongoDB):**
1. Document count
2. Field count and names (check document structure)
3. Data types for each field
4. Missing/null values per field
5. Duplicate documents count
6. Sample values (first 5 documents)

**Comparison:**
1. CSV row count = MongoDB document count?
2. All columns mapped to fields?
3. Data types preserved?
4. Values match (sample verification)?
5. No data loss?

These checks are implemented in `DataIntegrityChecker` and are run further down, after the sample documents have been inserted.

In [9]:
# Insert up to 5 sample documents.
# Empty the collection first so re-running this cell does not accumulate duplicates.
collection.delete_many({})
# head(5) instead of range(5) + iloc: a frame with fewer than 5 rows no longer
# raises IndexError.
sample_docs = [transform_row_to_mongodb(row) for _, row in df.head(5).iterrows()]
# insert_many sends all documents in one call but rejects an empty list,
# so only call it when there is something to insert.
if sample_docs:
    collection.insert_many(sample_docs)

In [10]:
# Pretty-print up to five documents currently stored in the collection
banner = "=" * 70
print(banner)
print("SAMPLE DOCUMENTS")
print(banner)

sample_cursor = collection.find().limit(5)
for idx, doc in enumerate(sample_cursor, start=1):
    print(f"\nDocument {idx}:")
    print("-" * 70)
    pprint(doc, indent=2, width=70)

SAMPLE DOCUMENTS

Document 1:
----------------------------------------------------------------------
{ '_id': ObjectId('695d1f9f0261df9776de8714'),
  'admission_details': { 'admission_date': '2024-01-31',
                         'admission_type': 'Urgent',
                         'discharge_date': '2024-02-02',
                         'room_number': 328},
  'billing': { 'billing_amount': 18856.28,
               'insurance_provider': 'Blue Cross'},
  'hospital_info': { 'doctor': 'Matthew Smith',
                     'hospital': 'Sons and Miller'},
  'medical_details': { 'medical_condition': 'Cancer',
                       'medication': 'Paracetamol',
                       'test_results': 'Normal'},
  'metadata': { 'created_at': datetime.datetime(2026, 1, 6, 14, 43, 43, 40000),
                'data_source': 'CSV_migration',
                'migrated_by': 'Hope - DataSoluTech',
                'updated_at': datetime.datetime(2026, 1, 6, 14, 43, 43, 40000)},
  'patient_info': { 'age

## VERIFICATION COMPLETE AND STRATEGY SUCCESSFUL
**Tests to carry out:**

**Before Migration (CSV):**
1. Row count
2. Column count and names
3. Data types for each column
4. Missing values per column
5. Duplicate rows count
6. Sample values (first 5 rows)

**After Migration (MongoDB):**
1. Document count
2. Field count and names (check document structure)
3. Data types for each field
4. Missing/null values per field
5. Duplicate documents count
6. Sample values (first 5 documents)

**Comparison:**
1. CSV row count = MongoDB document count?
2. All columns mapped to fields?
3. Data types preserved?
4. Values match (sample verification)?
5. No data loss?

The cells below run these checks with `DataIntegrityChecker`.

In [11]:
# Document integrity check: count every document in the collection
# (the empty filter {} matches all documents) and report the total.
total_docs = collection.count_documents({})
print(f"Total documents in collection: {total_docs}")

Total documents in collection: 5


## Run the data integrity tests

In [12]:
# Build the checker for medical_records.healthcare_data, passing the first
# 5 rows of df as the reference data.
# NOTE(review): df was already cut to 5 rows in an earlier cell, so head(5) is redundant here.
data_Integrity = DataIntegrityChecker(db_name='medical_records', collection_name='healthcare_data', df=df.head(5))

[INFO] Connected to medical_records.healthcare_data


In [14]:
# Check that the number of MongoDB documents matches the expected count
data_Integrity.test_document_count()

[PASS] Document count: 5 matches expected: 5


In [16]:
# Compare the per-field missing-value percentages between MongoDB and the CSV data
data_Integrity.test_missing_values()

MISSING VALUES VALIDATION TEST
                                   MongoDB Missing (%)  CSV Missing (%)  Match
Admission Date                                     0.0              0.0   True
Admission Type                                     0.0              0.0   True
Age                                                0.0              0.0   True
Billing Amount                                     0.0              0.0   True
Blood Type                                         0.0              0.0   True
Discharge Date                                     0.0              0.0   True
Doctor                                             0.0              0.0   True
Gender                                             0.0              0.0   True
Hospital                                           0.0              0.0   True
Insurance Provider                                 0.0              0.0   True
Medical Condition                                  0.0              0.0   True
Medication           

In [17]:
# Compare the duplicate counts of the CSV rows and the MongoDB documents
data_Integrity.test_duplicates()


DUPLICATE VALIDATION TEST

DUPLICATE COUNT COMPARISON:
  CSV duplicate rows: 0/5
  MongoDB duplicate docs: 0/5
  [PASS] Status: MATCH


In [None]:
# Print each field's MongoDB type next to the expected DataFrame dtype.
# NOTE(review): this cell has no execution count and its recorded output names
# project_5.data.migration_test.DataIntegrityChecker, not the module imported
# above; it looks stale. Confirm mongo_type_verification still exists, or drop
# this cell in favour of test_data_types().
data_Integrity.mongo_type_verification()

DATA TYPE COMPARISON
Field                     MongoDB         Expected DF    
------------------------------------------------------------------------------------------
name                      str             object         
age                       int             int64          
gender                    str             object         
blood_type                str             object         
medical_condition         str             object         
medication                str             object         
test_results              str             object         
admission_date            str             object         
admission_type            str             object         
room_number               int             int64          
discharge_date            str             object         
hospital                  str             object         
doctor                    str             object         
insurance_provider        str             object         
billing_amount    

<project_5.data.migration_test.DataIntegrityChecker at 0x110312900>

In [18]:
# Validate each field's MongoDB type against the expected DataFrame dtype
data_Integrity.test_data_types()

DATA TYPE VALIDATION TEST
Field                     MongoDB Type    Expected Type  
------------------------------------------------------------------------------------------
name                      str             object         
age                       int             int64          
gender                    str             object         
blood_type                str             object         
medical_condition         str             object         
medication                str             object         
test_results              str             object         
admission_date            str             object         
admission_type            str             object         
room_number               int             int64          
discharge_date            str             object         
hospital                  str             object         
doctor                    str             object         
insurance_provider        str             object         
billing_amoun

In [20]:
# Validate that the MongoDB fields correspond to the expected CSV columns.
# NOTE(review): the recorded table pairs fields by position and rows 5-14 differ,
# yet the test reports PASS. Presumably the comparison ignores order; confirm
# in DataIntegrityChecker.
data_Integrity.test_field_structure()

FIELD STRUCTURE VALIDATION TEST
         MongoDB Field      Expected (CSV)
0                 Name                Name
1                  Age                 Age
2               Gender              Gender
3           Blood Type          Blood Type
4    Medical Condition   Medical Condition
5           Medication      Admission Date
6         Test Results              Doctor
7       Admission Date            Hospital
8       Admission Type  Insurance Provider
9          Room Number      Billing Amount
10      Discharge Date         Room Number
11            Hospital      Admission Type
12              Doctor      Discharge Date
13  Insurance Provider          Medication
14      Billing Amount        Test Results
----------------------------------------------------------------------
[PASS] Field structure validation passed
