In [1]:
!pip install faker gtts requests

Collecting faker
  Downloading Faker-26.0.0-py3-none-any.whl.metadata (15 kB)
Collecting gtts
  Downloading gTTS-2.5.2-py3-none-any.whl.metadata (4.1 kB)
Downloading Faker-26.0.0-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading gTTS-2.5.2-py3-none-any.whl (29 kB)
Installing collected packages: gtts, faker
Successfully installed faker-26.0.0 gtts-2.5.2


In [5]:
!pip install faker requests pillow



In [6]:
!pip install pydicom

Collecting pydicom
  Downloading pydicom-2.4.4-py3-none-any.whl.metadata (7.8 kB)
Downloading pydicom-2.4.4-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydicom
Successfully installed pydicom-2.4.4


In [16]:
import json
import os
import shutil
from datetime import datetime

import numpy as np
import pydicom
import requests
from faker import Faker
from gtts import gTTS
from pydicom.dataset import Dataset, FileDataset
from pydicom.uid import generate_uid, ExplicitVRLittleEndian, SecondaryCaptureImageStorage

In [17]:
# Fixed seed so Restart & Run All draws the same fake values from Faker.
# (Date fields may still shift, since they are sampled relative to the current date.)
SEED = 42

faker = Faker()
Faker.seed(SEED)

# Root of the generated data lake (Colab's writable /content directory).
MAIN_FOLDER = '/content/dataLake-db'

# Define the folder structure
folders = [
    'unstructured/text_documents/patient_notes',
    'unstructured/medical_images/x-rays',
    'unstructured/audio_files/patient_interviews',
    'unstructured/sensor_data/heart_rate',
    'unstructured/log_files/system_logs'
]

# Create the main folder and subdirectories; exist_ok keeps the cell re-runnable.
for folder in folders:
    os.makedirs(os.path.join(MAIN_FOLDER, folder), exist_ok=True)

print(f"Created folder structure under {MAIN_FOLDER}")

Created folder structure under /content/dataLake-db


In [14]:
def generate_patient_notes(num):
    """Build ``num`` fake patient-note records.

    Every record is a dict with a random ``patient_id`` in 1..1000, a short
    ``notes`` paragraph (``nb_sentences=3``) and an ISO-formatted ``date``
    from the current decade, all drawn from the module-level ``faker``.

    Args:
        num: Number of records to create; zero or less yields an empty list.

    Returns:
        list[dict]: The generated records, in creation order.
    """
    notes = []
    for _ in range(num):
        record = {}
        record['patient_id'] = faker.random_int(min=1, max=1000)
        record['notes'] = faker.paragraph(nb_sentences=3)
        record['date'] = faker.date_this_decade().isoformat()
        notes.append(record)
    return notes

def generate_medical_images(num):
    """Write ``num`` placeholder DICOM files and return their metadata.

    Each file is a 256x256, 16-bit MONOCHROME2 secondary-capture image whose
    pixels are a simple ``np.arange`` ramp. Files are saved as
    ``image_<i>.dcm`` under ``MAIN_FOLDER/unstructured/medical_images/x-rays``.

    Args:
        num: Number of images to create; zero or less writes nothing.

    Returns:
        list[dict]: One record per image with ``patient_id``, ``file``,
        ``description`` and ``date`` (ISO format).
    """
    records = []
    for index in range(num):
        # Draw the fake values first, in a fixed order.
        pid = faker.random_int(min=1, max=1000)
        dcm_name = f'image_{index}.dcm'
        caption = faker.sentence()
        iso_date = faker.date_this_decade().isoformat()
        dicom_date = iso_date.replace("-", "")  # DICOM dates are YYYYMMDD

        # Empty dataset with a blank 128-byte preamble and empty file meta.
        dataset = FileDataset(dcm_name, {}, file_meta=Dataset(), preamble=b"\0" * 128)

        # Patient / study identification.
        dataset.PatientName = f'Patient {pid}'
        dataset.PatientID = str(pid)
        dataset.StudyInstanceUID = generate_uid()
        dataset.SeriesInstanceUID = generate_uid()
        dataset.SOPInstanceUID = generate_uid()
        dataset.SOPClassUID = SecondaryCaptureImageStorage
        dataset.Modality = 'OT'  # Other
        dataset.StudyDate = dicom_date
        dataset.ContentDate = dicom_date

        # Image geometry and pixel layout.
        side = 256
        dataset.Rows = side
        dataset.Columns = side
        dataset.BitsAllocated = 16
        dataset.BitsStored = 16
        dataset.HighBit = 15
        dataset.SamplesPerPixel = 1
        dataset.PixelRepresentation = 0
        dataset.PhotometricInterpretation = 'MONOCHROME2'

        # Encoding flags used by the writer (explicit VR, little endian).
        dataset.is_little_endian = True
        dataset.is_implicit_VR = False

        # Placeholder pixel data: a uint16 ramp 0..65535 laid out row by row.
        ramp = np.arange(side * side, dtype=np.uint16).reshape(side, side)
        dataset.PixelData = ramp.tobytes()

        # File Meta Information elements.
        meta = dataset.file_meta
        meta.TransferSyntaxUID = ExplicitVRLittleEndian
        meta.MediaStorageSOPClassUID = SecondaryCaptureImageStorage
        meta.MediaStorageSOPInstanceUID = dataset.SOPInstanceUID
        meta.ImplementationClassUID = generate_uid()

        destination = os.path.join(MAIN_FOLDER, 'unstructured/medical_images/x-rays', dcm_name)
        pydicom.filewriter.dcmwrite(destination, dataset, write_like_original=False)

        records.append({
            'patient_id': pid,
            'file': dcm_name,
            'description': caption,
            'date': iso_date
        })

    return records

def generate_audio_files(num):
    """Synthesize ``num`` short English MP3 clips and return their metadata.

    Each clip is produced with gTTS from a sentence naming the fake patient
    ID followed by a random description, and saved as ``audio_<i>.mp3`` under
    ``MAIN_FOLDER/unstructured/audio_files/patient_interviews``.

    Args:
        num: Number of clips to create; zero or less writes nothing.

    Returns:
        list[dict]: One record per clip with ``patient_id``, ``file``,
        ``description`` and ``date`` (ISO format).
    """
    records = []
    for index in range(num):
        pid = faker.random_int(min=1, max=1000)
        mp3_name = f'audio_{index}.mp3'
        caption = faker.sentence()
        iso_date = faker.date_this_decade().isoformat()

        # Speak the script and store the resulting MP3.
        script = f'This is a sample audio file for patient ID {pid}. {caption}'
        destination = os.path.join(MAIN_FOLDER, 'unstructured/audio_files/patient_interviews', mp3_name)
        gTTS(text=script, lang='en').save(destination)

        records.append({
            'patient_id': pid,
            'file': mp3_name,
            'description': caption,
            'date': iso_date
        })

    return records

def write_json_to_file(file_path, data):
    """Serialize ``data`` as 2-space-indented JSON and write it to ``file_path``.

    The payload is encoded to a string before the file is opened, so data
    that cannot be serialized raises without truncating a file that already
    exists at ``file_path``.

    Args:
        file_path: Destination path; an existing file is overwritten.
        data: Any JSON-serializable object.

    Raises:
        TypeError: If ``data`` contains a value ``json`` cannot encode.
        ValueError: If ``data`` contains a circular reference.
        OSError: If the file cannot be opened for writing.
    """
    payload = json.dumps(data, indent=2)
    # Explicit encoding keeps the output independent of the platform default.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(payload)

def main(num_entries=10):
    """Generate every sample dataset and write its JSON metadata file.

    Runs the patient-note, medical-image and audio generators in that order,
    each with ``num_entries`` records, and stores each result as JSON under
    ``MAIN_FOLDER``.

    Args:
        num_entries: Number of records to create per dataset (default 10).
    """
    # (metadata path relative to MAIN_FOLDER, generator), in execution order.
    jobs = (
        ('unstructured/text_documents/patient_notes/patient_notes.json', generate_patient_notes),
        ('unstructured/medical_images/x-rays/medical_images.json', generate_medical_images),
        ('unstructured/audio_files/patient_interviews/audio_files.json', generate_audio_files),
    )

    for relative_path, generate in jobs:
        write_json_to_file(os.path.join(MAIN_FOLDER, relative_path), generate(num_entries))

    print('Sample data generation completed.')

# Run with the default entry count when the cell/script is executed directly.
if __name__ == "__main__":
    main()


Created folder structure under /content/dataLake-db
Sample data generation completed.


In [15]:
# Zip the whole data folder; the archive is written next to it as MAIN_FOLDER + '.zip'.
# Uses the MAIN_FOLDER constant so the archive always tracks the generated tree.
shutil.make_archive(MAIN_FOLDER, 'zip', MAIN_FOLDER)
print('Created zip file of the data folder.')

Created zip file of the data folder.
