# Dataset Generation

In [1]:
import random
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta

In [2]:
# Seed the standard-library and NumPy global generators so that every run of
# the notebook draws the same synthetic values.
random.seed(42)
np.random.seed(42)

In [3]:
# Number of synthetic projects to generate; every per-project list built
# below is sized from this value.
n_projects = 150

In [4]:
# Identifiers run PROJ-001, PROJ-002, ... with the counter zero-padded to
# three digits.
project_ids = [f"PROJ-{number:03d}" for number in range(1, n_projects + 1)]
print(project_ids)

# Phase labels and their sampling probabilities; phase III is drawn most often.
phases = ['I', 'II', 'III']
phase_weights = [0.25, 0.30, 0.45]

# One phase label per project, drawn in a single call from NumPy's global
# generator.
trial_phases = np.random.choice(phases, p=phase_weights, size=n_projects)

['PROJ-001', 'PROJ-002', 'PROJ-003', 'PROJ-004', 'PROJ-005', 'PROJ-006', 'PROJ-007', 'PROJ-008', 'PROJ-009', 'PROJ-010', 'PROJ-011', 'PROJ-012', 'PROJ-013', 'PROJ-014', 'PROJ-015', 'PROJ-016', 'PROJ-017', 'PROJ-018', 'PROJ-019', 'PROJ-020', 'PROJ-021', 'PROJ-022', 'PROJ-023', 'PROJ-024', 'PROJ-025', 'PROJ-026', 'PROJ-027', 'PROJ-028', 'PROJ-029', 'PROJ-030', 'PROJ-031', 'PROJ-032', 'PROJ-033', 'PROJ-034', 'PROJ-035', 'PROJ-036', 'PROJ-037', 'PROJ-038', 'PROJ-039', 'PROJ-040', 'PROJ-041', 'PROJ-042', 'PROJ-043', 'PROJ-044', 'PROJ-045', 'PROJ-046', 'PROJ-047', 'PROJ-048', 'PROJ-049', 'PROJ-050', 'PROJ-051', 'PROJ-052', 'PROJ-053', 'PROJ-054', 'PROJ-055', 'PROJ-056', 'PROJ-057', 'PROJ-058', 'PROJ-059', 'PROJ-060', 'PROJ-061', 'PROJ-062', 'PROJ-063', 'PROJ-064', 'PROJ-065', 'PROJ-066', 'PROJ-067', 'PROJ-068', 'PROJ-069', 'PROJ-070', 'PROJ-071', 'PROJ-072', 'PROJ-073', 'PROJ-074', 'PROJ-075', 'PROJ-076', 'PROJ-077', 'PROJ-078', 'PROJ-079', 'PROJ-080', 'PROJ-081', 'PROJ-082', 'PROJ-083', 'PR

In [5]:
# A project may start on any day inside this window (both bounds inclusive).
start_date_base = datetime(2020, 1, 1)
end_date_base = datetime(2025, 12, 1)
date_range = (end_date_base - start_date_base).days

# One random.randint draw per project, in project order, turned into a
# calendar date by offsetting from the start of the window.
day_offsets = [random.randint(0, date_range) for _ in range(n_projects)]
start_dates = [start_date_base + timedelta(days=offset) for offset in day_offsets]

# Filled in by the end-date cell below.
end_dates = []

print(start_dates)

[datetime.datetime(2021, 4, 1, 0, 0), datetime.datetime(2020, 4, 12, 0, 0), datetime.datetime(2023, 1, 31, 0, 0), datetime.datetime(2022, 9, 30, 0, 0), datetime.datetime(2022, 7, 3, 0, 0), datetime.datetime(2021, 7, 25, 0, 0), datetime.datetime(2021, 2, 23, 0, 0), datetime.datetime(2020, 12, 22, 0, 0), datetime.datetime(2024, 9, 24, 0, 0), datetime.datetime(2020, 5, 10, 0, 0), datetime.datetime(2020, 5, 2, 0, 0), datetime.datetime(2021, 1, 18, 0, 0), datetime.datetime(2022, 6, 14, 0, 0), datetime.datetime(2022, 8, 10, 0, 0), datetime.datetime(2025, 8, 31, 0, 0), datetime.datetime(2020, 4, 18, 0, 0), datetime.datetime(2022, 3, 25, 0, 0), datetime.datetime(2024, 9, 14, 0, 0), datetime.datetime(2022, 6, 21, 0, 0), datetime.datetime(2025, 1, 13, 0, 0), datetime.datetime(2023, 2, 13, 0, 0), datetime.datetime(2020, 1, 27, 0, 0), datetime.datetime(2021, 10, 15, 0, 0), datetime.datetime(2024, 9, 27, 0, 0), datetime.datetime(2023, 10, 25, 0, 0), datetime.datetime(2023, 2, 12, 0, 0), datetime.da

In [None]:
# Start from an empty list on every run. Appending to the list created in the
# start-date cell meant that executing this cell a second time grew it past
# n_projects entries, which later breaks the DataFrame construction.
end_dates = []

for phase, start_date in zip(trial_phases, start_dates):
    # Trial length in whole months, drawn from a phase-specific inclusive range.
    if phase == 'I':
        duration_months = random.randint(12, 24)
    elif phase == 'II':
        duration_months = random.randint(18, 36)
    else:  # Phase III
        duration_months = random.randint(24, 60)

    # Calendar-aware month arithmetic; an end date that would fall after
    # end_date_base is clamped to end_date_base.
    end_date = min(start_date + relativedelta(months=duration_months), end_date_base)
    end_dates.append(end_date)

# Initialised here because the budget cell below appends to this list.
budgets = []

In [7]:
# Reset on every run so that re-executing this cell cannot append a second
# batch of values to the list created in the previous cell.
budgets = []

for phase, start_date, end_date in zip(trial_phases, start_dates, end_dates):
    # Approximate trial length in months, treating every month as 30 days.
    duration_months = (end_date - start_date).days / 30

    # Base budget drawn uniformly from a phase-specific range.
    if phase == 'I':
        base_budget = np.random.uniform(5, 15)
    elif phase == 'II':
        base_budget = np.random.uniform(15, 50)
    else:  # Phase III
        base_budget = np.random.uniform(50, 200)

    # Longer trials cost more: the base grows by 20% for every 24 months.
    budget = base_budget * (1 + duration_months / 24 * 0.2)
    budgets.append(round(budget, 2))

In [8]:
# Sanity check: the count printed here should equal n_projects.
print(len(budgets))

150


In [9]:
# Staffing need per project: a phase-dependent base headcount, scaled up with
# the project's budget and rounded to a whole number.
fte_required = []
for phase, budget in zip(trial_phases, budgets):
    # Bounds of the uniform draw; any label other than 'I' or 'II' uses the
    # phase III bounds.
    if phase == 'I':
        low, high = 10, 30
    elif phase == 'II':
        low, high = 25, 60
    else:
        low, high = 50, 150
    base_headcount = np.random.uniform(low, high)

    # 10% more headcount for every 100 units of budget.
    scaled_headcount = base_headcount * (1 + budget / 100 * 0.1)
    fte_required.append(int(round(scaled_headcount)))

In [10]:
# Every column must have the same number of entries before the DataFrame is
# assembled; print each length, in column order, for a visual check.
for column_values in (project_ids, trial_phases, start_dates,
                      end_dates, budgets, fte_required):
    print(len(column_values))


150
150
150
150
150
150


In [11]:
# Assemble one row per project, order the table chronologically by start
# date, and renumber the index from zero.
df = (
    pd.DataFrame({
        'Project_ID': project_ids,
        'Trial_Phase': trial_phases,
        'Start_Date': start_dates,
        'End_Date': end_dates,
        'Budget': budgets,
        'FTE_Required': fte_required,
    })
    .sort_values('Start_Date')
    .reset_index(drop=True)
)

In [None]:
# Quick look at the generated table before it is written to disk.
print(f"\nDimensions: {df.shape[0]} x {df.shape[1]} columns")
print(df.head(10))
print(df.describe())
print("\nPhase distribution:")
print(df['Trial_Phase'].value_counts())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() emitted a stray "None" line after the report.
df.info()


# Create the target folder first: to_csv raises OSError when the directory
# it is asked to write into does not exist.
output_path = Path('data/1_raw_pharma_trials_dataset.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)


Dimensions: 150 x 6 columns
  Project_ID Trial_Phase Start_Date   End_Date  Budget  FTE_Required
0   PROJ-127         III 2020-01-03 2022-06-03  211.33           114
1   PROJ-115         III 2020-01-14 2022-12-14  235.33           140
2   PROJ-022           I 2020-01-27 2021-06-27    5.91            24
3   PROJ-106           I 2020-02-17 2021-02-17   11.64            28
4   PROJ-130          II 2020-03-20 2023-03-20   55.89            31
5   PROJ-002         III 2020-04-12 2024-03-12  120.07           116
6   PROJ-016           I 2020-04-18 2021-04-18   14.71            24
7   PROJ-011           I 2020-05-02 2021-05-02    9.56            19
8   PROJ-010         III 2020-05-10 2024-02-10  219.87            94
9   PROJ-072           I 2020-05-11 2021-05-11   11.77            10
                Start_Date             End_Date      Budget  FTE_Required
count                  150                  150  150.000000    150.000000
mean   2022-09-30 06:14:24  2024-09-21 11:40:48   79.425600     