# 1. Synthetic Data Generation

## 1.1 Imports

In [2]:
import numpy as np
from faker import Faker
import pandas as pd
import random
import datetime

## 1.2 Generate Data

In [35]:
# Output schema: one list per column, filled row by row below.
# The 'Anonimized' spelling is kept verbatim because it is the column key
# written to the generated data set.
COLUMNS = ['Patient ID', 'Name', 'Ethnicity', 'Age', 'Location', 'Total PSA (ng/ml)', 'Clinically Significant', 'PI-RAD', 'MRI Date', 'Tumor Marker Results', 'Conclusions', 'Anonimized']
ETHNICITIES = ["White", "Black", "Hispanic", "Asian", "Native American", "Mixed", "Other"]

# Generation parameters, named once so the age sample and the row loop can
# never drift out of sync.
N_RECORDS = 10000               # number of synthetic patients
ANONYMIZED_RATE = 0.3           # share of records flagged as anonymised
BLANK_FIELD_RATE = 0.5          # per-field chance of blanking in such records
PSA_SIGNIFICANT_THRESHOLD = 4   # total PSA (ng/ml) above which a case is significant

# The locale is selected through the constructor; assigning `fake.locale`
# after construction does not reconfigure the generator.
fake = Faker('en_US')
data = {col: [] for col in COLUMNS}

# Ages: normal(mean=50, sd=15) clamped to [20, 90], then truncated to int.
# Clamping moves out-of-range draws onto the two boundary values.
ages = np.random.normal(50, 15, size=N_RECORDS)
ages = np.clip(ages, 20, 90)
data['Age'] = [int(age) for age in ages]


def _blank_field(anonymized):
    """Return True when an identifying field of this record should be empty.

    Non-anonymised records never blank a field and, thanks to the
    short-circuit, do not consume a random draw either.
    """
    return anonymized and random.random() < BLANK_FIELD_RATE


for i in range(N_RECORDS):

    # --- Identity fields -------------------------------------------------
    anonymized = random.random() < ANONYMIZED_RATE

    # Each identifying field is blanked independently, so an anonymised
    # record may keep some of them.
    # NOTE(review): a record can be flagged 'Yes' with no field blanked at
    # all -- confirm this is intended.
    data['Patient ID'].append('' if _blank_field(anonymized) else f'PRO{i:08d}')
    data['Name'].append('' if _blank_field(anonymized) else fake.first_name_male() + ' ' + fake.last_name())
    data['Ethnicity'].append('' if _blank_field(anonymized) else random.choice(ETHNICITIES))
    # Faker addresses are multi-line; flatten them to a single CSV-friendly line.
    data['Location'].append('' if _blank_field(anonymized) else fake.address().replace('\n', ', '))
    data['Anonimized'].append('Yes' if anonymized else 'No')

    data['MRI Date'].append(fake.date_between(start_date='-1y', end_date='today'))

    # --- Clinical fields -------------------------------------------------
    # Start from the benign defaults and escalate based on the PSA value.
    total_psa = round(random.uniform(0, 25), 2)
    clinically_significant = 'No'
    pi_rad = random.randint(1, 2)
    tumor_marker_results = 'Low'
    conclusions = 'WE SUGGEST TO DISCARD MALIGNANCY.'

    if total_psa > PSA_SIGNIFICANT_THRESHOLD:
        clinically_significant = 'Yes'
        pi_rad = random.randint(3, 5)

    # Only PI-RAD 4 and 5 raise the tumor marker above 'Low'.
    if pi_rad > 3:
        tumor_marker_results = random.choice(['Moderate', 'High'])

    # The conclusion text follows directly from the tumor marker level.
    if tumor_marker_results == 'High':
        conclusions = 'POSSIBLE MALIGNANCY. BIOPSY RECOMMENDED.'
    elif tumor_marker_results == 'Moderate':
        conclusions = 'WE SUGGEST TO FOLLOW UP IN 6 MONTHS.'

    data['Total PSA (ng/ml)'].append(total_psa)
    data['PI-RAD'].append(pi_rad)
    data['Tumor Marker Results'].append(tumor_marker_results)
    data['Clinically Significant'].append(clinically_significant)
    data['Conclusions'].append(conclusions)

## 1.3 Store in CSV File

In [36]:
# Assemble the per-column lists into a table; columns keep the dict's order.
df = pd.DataFrame.from_dict(data)
# Write to the working directory. The row index is written too, as an
# unnamed leading column.
df.to_csv(path_or_buf='data.csv', index=True)