In [None]:
#Don't forget to save your work to your local computer and push to GitHub.
#File -> Download as -> Notebook
#Move file from download folder to local repo clone
#git add filename.ipynb
#git commit -m "put a specific and informative message here of what you worked on"
#git push

# Import and Prepare Dataset - Custom Concept Set for Birth and Preterm Birth 
In this section:  
Data is in long format  
Rename to long_data  
Remove irrelevant values for "condition" (e.g. "second trimester pregnancy" is too vague)  
Convert datetime cells to date only  
(For ease of reading, I have commented out most visualizations of dataframes)  

In [1]:
import os
import pandas
import numpy as np

# This query represents dataset "Concept Set Exploration" for domain "condition" and was generated for 
#All of Us Controlled Tier Dataset v7

dataset_95902282_condition_sql = """
    SELECT
        c_occurrence.person_id,
        c_standard_concept.concept_name as standard_concept_name,
        c_occurrence.condition_start_datetime 
    FROM
        ( SELECT
            * 
        FROM
            `""" + os.environ["WORKSPACE_CDR"] + """.condition_occurrence` c_occurrence 
        WHERE
            (
                condition_concept_id IN  (
                    SELECT
                        DISTINCT c.concept_id 
                    FROM
                        `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` c 
                    JOIN
                        (
                            select
                                cast(cr.id as string) as id 
                            FROM
                                `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` cr 
                            WHERE
                                concept_id IN (
                                    36712702, 4014295, 432441, 44784550, 45757175, 45757176
                                ) 
                                AND full_text LIKE '%_rank1]%'
                        ) a 
                            ON (
                                c.path LIKE CONCAT('%.',
                            a.id,
                            '.%') 
                            OR c.path LIKE CONCAT('%.',
                            a.id) 
                            OR c.path LIKE CONCAT(a.id,
                            '.%') 
                            OR c.path = a.id) 
                        WHERE
                            is_standard = 1 
                            AND is_selectable = 1
                        )
                )  
                AND (
                    c_occurrence.PERSON_ID IN (
                        SELECT
                            distinct person_id  
                        FROM
                            `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_person` cb_search_person  
                        WHERE
                            cb_search_person.person_id IN (
                                SELECT
                                    criteria.person_id 
                                FROM
                                    (SELECT
                                        DISTINCT person_id,
                                        entry_date,
                                        concept_id 
                                    FROM
                                        `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_all_events` 
                                    WHERE
                                        (
                                            concept_id IN (
                                                SELECT
                                                    DISTINCT c.concept_id 
                                                FROM
                                                    `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` c 
                                                JOIN
                                                    (
                                                        select
                                                            cast(cr.id as string) as id 
                                                        FROM
                                                            `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` cr 
                                                        WHERE
                                                            concept_id IN (36712702, 45757176, 4014295) 
                                                            AND full_text LIKE '%_rank1]%'
                                                    ) a 
                                                        ON (
                                                            c.path LIKE CONCAT('%.',
                                                        a.id,
                                                        '.%') 
                                                        OR c.path LIKE CONCAT('%.',
                                                        a.id) 
                                                        OR c.path LIKE CONCAT(a.id,
                                                        '.%') 
                                                        OR c.path = a.id) 
                                                    WHERE
                                                        is_standard = 1 
                                                        AND is_selectable = 1
                                                    ) 
                                                    AND is_standard = 1 
                                            )
                                        ) criteria 
                                    ) ))
                        ) c_occurrence 
                    LEFT JOIN
                        `""" + os.environ["WORKSPACE_CDR"] + """.concept` c_standard_concept 
                            ON c_occurrence.condition_concept_id = c_standard_concept.concept_id"""

dataset_95902282_condition_df = pandas.read_gbq(
    dataset_95902282_condition_sql,
    dialect="standard",
    use_bqstorage_api=("BIGQUERY_STORAGE_API_ENABLED" in os.environ),
    progress_bar_type="tqdm_notebook")

dataset_95902282_condition_df.sample(3)

Downloading:   0%|          | 0/390805 [00:00<?, ?rows/s]

Unnamed: 0,person_id,standard_concept_name,condition_start_datetime
367637,1603140,Third trimester pregnancy,2016-10-11 12:45:00+00:00
147093,2514138,Finding of length of gestation,2017-02-10 05:07:16+00:00
67868,3325672,Second trimester pregnancy,2021-07-13 05:00:00+00:00


In [2]:
#Rename and visualize dataset - data in long format (multiple rows per person_id)

long_data = dataset_95902282_condition_df

#long_data.info

In [3]:
#Review values of standard_concept_name and remove irrelevant entries (e.g. too vague, not used in analysis)

#unique_values = long_data['standard_concept_name'].unique()

#print("List of all possible values for column 'standard_concept_name':")
#for value in unique_values:
    #print(value)

#Prepare list of values to drop    
values_to_drop = ['Gestation less than 20 weeks', 'Gestation less than 9 weeks']
values_to_drop += ['Finding of length of gestation', 'Gestation less than 24 weeks']
values_to_drop += ['First trimester pregnancy', 'Second trimester pregnancy', 'Third trimester pregnancy']

# Use the 'drop' method to remove rows with specific values in 'standard_concept_name'
drop_long_data = long_data[~long_data['standard_concept_name'].isin(values_to_drop)]
#This filtered df is created by making a Boolean list of values_to_drop found in the column 'standard_concept_name'
#The tilde switches this from True to False so that we are applying the opposite of the Boolean list to the df long_data
#in order to create the new df

#print("Dataframe after dropping rows with specific values in 'standard_concept_name':")
#print(drop_long_data)

In [4]:
#Convert condition_start_datetime cells to contain only date information and not time, as time is irrelevant to our question
#Make a new copy of the dataset and drop the old datetime column, retaining new date column

import  pandas as pd

# Convert the datetime column to a datetime type
drop_long_data.loc[:,'condition_start_datetime'] = pd.to_datetime(drop_long_data['condition_start_datetime'])

#Create a new date only column
drop_long_data.loc[:,'condition_start_date'] = drop_long_data['condition_start_datetime'].dt.date

# Drop the datetime_column
date_only_long_data = drop_long_data.drop(columns=['condition_start_datetime']).copy()

# Display new dataframe with all columns except datetime_column
print("Date_only_long dataframe; time information has been removed:")
date_only_long_data.sample(3)



Date_only_long dataframe; time information has been removed:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


Unnamed: 0,person_id,standard_concept_name,condition_start_date
246716,2001356,Single live birth,2010-10-12
6919,1896275,"Gestation period, 38 weeks",2021-03-12
112820,1904111,"Gestation period, 12 weeks",2017-03-30


# Remove duplicate entries and explore remaining entries
Duplicates can be defined in many ways. I am defining duplicates here to be those entries that are the same person_id, 
standard_concept_name, and condition_start_date. I'm calling them triple duplicates to differentiate them from duplicates
that are the same person and concept only (for example someone who had two births a year apart).  

Note that it is very important to keep the first instance, not drop both. (keep='first')  

Exploring data to understand type of duplicate entries remaining in the data set (e.g. instances of multiple births per person) 

In [5]:
#Remove duplicates that have the same person_id, standard_concept_name, and condition_start_date
#(Note that this will retain duplicates of same person_id and standard_concept_name but different condition_start_date)

#Drop rows with the same 'person_id', 'standard_concept_name', and 'condition_start_date', keeping first instance
date_only_long_data_no_triple_duplicates = date_only_long_data.drop_duplicates(subset=['person_id', 'standard_concept_name', 
                                                                                       'condition_start_date'], keep='first')

print("Dataframe after dropping duplicate rows:")
date_only_long_data_no_triple_duplicates.sample(3)

# Calculate the number of dropped entries
dropped_entries = len(date_only_long_data) - len(date_only_long_data_no_triple_duplicates)
print("Number of entries dropped:", dropped_entries)


#Double checked work for duplicates
#check_duplicates = date_only_long_data_no_triple_duplicates.duplicated(subset=['person_id', 'standard_concept_name', 
                                                                               #'condition_start_date'], keep='first')
#Print any duplicate rows
#duplicate_rows_check = date_only_long_data_no_triple_duplicates[check_duplicates]
#print("Duplicate Rows in the Resulting dataframe:")
#print(duplicate_rows_check)

Dataframe after dropping duplicate rows:
Number of entries dropped: 39013


In [6]:
#Explore duplicate entries that differ in condition_start_datetime 

duplicates = date_only_long_data_no_triple_duplicates.duplicated(subset=['person_id', 'standard_concept_name'], keep=False)

#subset=['person_id', 'standard_concept_name'], keep=False is a boolean phrase that marks all duplicates as True, therefore
#the series that is returned contains all duplicates. keep=first will return only the first duplicate. We do not want to drop
#this data, just understand it and decide if some of it we want to drop (e.g. if a birth is observed a few days apart)

different_date_duplicates = date_only_long_data_no_triple_duplicates[duplicates]
print("No. of rows that are same 'person_id' and 'standard_concept_name' but have different 'condition_start_date':", 
      len(different_date_duplicates))


# Create a dataframe of the duplicate rows
duplicate_rows = date_only_long_data_no_triple_duplicates[duplicates]

# Group by 'person_id' and 'standard_concept_name' and count the duplicates
duplicate_count = duplicate_rows.groupby(['person_id', 'standard_concept_name']).size()

# Find the combination with the most duplicates
max_duplicates = duplicate_count.idxmax()
max_count = duplicate_count.max()

print("Combination with the Most Duplicates:")
print("person_id:", max_duplicates[0])
print("standard_concept_name:", max_duplicates[1])
print("Number of Duplicates:", max_count)

No. of rows that are same 'person_id' and 'standard_concept_name' but have different 'condition_start_date': 82422
Combination with the Most Duplicates:
person_id: 1302966
standard_concept_name: Gestation period, 17 weeks
Number of Duplicates: 18


In [7]:
#view all entries for a specific person_id to explore duplicates, etc

#person_df = df[df['person_id'] == 5433458]
#person_df

#person_df = date_only_long_data_no_triple_duplicates[date_only_long_data_no_triple_duplicates['person_id'] == 1302966]
#person_df

person_df = date_only_long_data_no_triple_duplicates[date_only_long_data_no_triple_duplicates['person_id'] == 1934256	]
person_df = person_df.sort_values(by='condition_start_date')

print("The person below (1934256) we can see has some erroneous entries as it shows multiple births within one month.\
One thing that commonly shows up is single live birth given a couple days apart. We need our code to choose one of these \
and throw out the others.")
person_df


The person below (1934256) we can see has some erroneous entries as it shows multiple births within one month.One thing that commonly shows up is single live birth given a couple days apart. We need our code to choose one of these and throw out the others.


Unnamed: 0,person_id,standard_concept_name,condition_start_date
231362,1934256,Single live birth,2013-12-17
238917,1934256,Single live birth,2013-12-18
269336,1934256,Single live birth,2013-12-23
178621,1934256,Preterm labor with preterm delivery,2015-12-10
178638,1934256,Preterm labor with preterm delivery,2015-12-23
178683,1934256,Preterm labor with preterm delivery,2015-12-24
178640,1934256,Preterm labor with preterm delivery,2016-01-06
178667,1934256,Preterm labor with preterm delivery,2016-01-22
22461,1934256,"Gestation period, 21 weeks",2016-01-23
22168,1934256,"Gestation period, 21 weeks",2016-01-26


# Create a loop to calcuate conception date based on observations of gestational length.
We can use various entries of 'standard_concept_name' to calculate date of conception. (ex. Gestation 8 weeks, Gestation 38 weeks).  

Create a dictionary to view how many times different 'standard_concept_name's are used in the dataset to see which we will need to use for calculations.

Create a loop to calculate conception date for every instance of Gestation period entry in 'standard_concept_name' and populate new column called "conception_date".

Explore conception date variances / reasons / how to handle discrepancies. 

Through this process I found that because of issues with medical records, our most accurate approach is to start with the most recent medical record of gestation period, e.g. 40 weeks, then look for 39 weeks, and so on. If we start with 8 weeks for example, then we sometimes label a birth as over-term by weeks or months because the instance of 8 week gestation length was for a miscarriage, and then no further records available until the next birth. 


In [8]:
#view all entries for standard_concept_name

#Create a dictionary to store concept names
concept_dict = {}

#Iterate through column 'standard_concept_name' 
for concept in date_only_long_data_no_triple_duplicates['standard_concept_name']:
    if concept in concept_dict:
        concept_dict[concept] += 1
    else:
        concept_dict[concept] = 1

#print nicely
for key, value in concept_dict.items():
    print(f'{key}: {value}')


Preterm labor in second trimester with preterm delivery in second trimester: 150
Gestation period, 27 weeks: 1998
Gestation period, 11 weeks: 2930
Gestation period, 38 weeks: 7400
Gestation period, 9 weeks: 34
Gestation period, 42 weeks: 15
Gestation period, 18 weeks: 2703
Gestation period, 13 weeks: 1862
Term pregnancy: 3
Gestation period, 21 weeks: 2105
Gestation period, 23 weeks: 1692
Gestation period, 26 weeks: 1875
Gestation period, 22 weeks: 2023
Preterm spontaneous labor with preterm delivery: 425
Gestation period, 35 weeks: 4700
Gestation period, 36 weeks: 6376
Gestation period, 41 weeks: 1123
Gestation period, 14 weeks: 1157
Gestation period, 16 weeks: 1508
Gestation period, 15 weeks: 1104
Gestation period, 20 weeks: 3157
Gestation period, 12 weeks: 3479
Gestation period, 37 weeks: 7332
Gestation period, 40 weeks: 4741
Gestation period, 39 weeks: 9860
Gestation period, 17 weeks: 1339
Gestation period, 33 weeks: 3714
Gestation period, 32 weeks: 3719
Preterm labor in third trime

In [9]:
#Create code to calculate conception date from standard_concept_name that gives gestational age

from datetime import timedelta

# Define a function to calculate the conception date
def calculate_conception(row):
    # Split the 'standard_concept_name' to extract the number of weeks
    concept_name_parts = row['standard_concept_name'].split()
    if len(concept_name_parts) == 4 and concept_name_parts[0] == "Gestation":
        try:
            weeks = int(concept_name_parts[2])
            # Calculate the conception date by subtracting 'weeks' from 'condition_start_datetime'
            conception_date = row['condition_start_date'] - timedelta(weeks=weeks)
            return conception_date
        except ValueError:
            pass
    return None

# Apply the function to create the 'conception_date' column
date_only_long_data_no_triple_duplicates.loc[:,'conception_date'] = date_only_long_data_no_triple_duplicates.apply(calculate_conception, axis=1).copy()

date_only_long_data_no_triple_duplicates.sample(10)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


Unnamed: 0,person_id,standard_concept_name,condition_start_date,conception_date
125902,1270043,"Gestation period, 37 weeks",2019-03-12,2018-06-26
119788,5201023,"Gestation period, 37 weeks",2017-09-14,2016-12-29
143145,2001945,"Gestation period, 39 weeks",2015-10-16,2015-01-16
178559,1346791,"Gestation period, 34 weeks",2019-11-11,2019-03-18
38665,1432753,"Gestation period, 36 weeks",2020-11-11,2020-03-04
233666,1569773,Single live birth,2020-06-06,
127120,2205965,"Gestation period, 40 weeks",2018-02-08,2017-05-04
261739,1977450,Single live birth,2019-02-14,
244573,1037983,Single live birth,2018-10-28,
10795,2068499,"Gestation period, 38 weeks",2020-04-14,2019-07-23


In [86]:
#Explore instance of person with differing conception dates
person_df = date_only_long_data_no_triple_duplicates[date_only_long_data_no_triple_duplicates['person_id'] == 5433458]
person_df = person_df.sort_values(by='condition_start_date')

print("If we visit person_id 5433458, we will see that we calculated a conception date, but there is some variance.\
Some entries seem wrong (9/9 conception when all the others are august, but others range from 8/6 - 8/18 which can often \
be contributed to the fact that appointment week #s are given by week not days, so less precise) \
It seems like 8-11 weeks might be most reliable in the data as this is their first appointment, and after that weeks 42,41, \
39, etc.")
person_df


#person_df.dropna(subset=['conception_date'])

If we visit person_id 5433458, we will see that we calculated a conception date, but there is some variance.Some entries seem wrong (9/9 conception when all the others are august, but others range from 8/6 - 8/18 which can often be contributed to the fact that appointment week #s are given by week not days, so less precise) It seems like 8-11 weeks might be most reliable in the data as this is their first appointment, and after that weeks 42,41, 39, etc.


Unnamed: 0,person_id,standard_concept_name,condition_start_date,conception_date
298494,5433458,"Gestation period, 8 weeks",2021-10-01,2021-08-06
110761,5433458,"Gestation period, 12 weeks",2021-11-04,2021-08-12
17903,5433458,"Gestation period, 13 weeks",2021-11-12,2021-08-13
18527,5433458,"Gestation period, 13 weeks",2021-12-09,2021-09-09
145170,5433458,"Gestation period, 17 weeks",2021-12-09,2021-08-12
24128,5433458,"Gestation period, 21 weeks",2022-01-06,2021-08-12
144907,5433458,"Gestation period, 17 weeks",2022-01-06,2021-09-09
28246,5433458,"Gestation period, 26 weeks",2022-02-08,2021-08-10
167101,5433458,"Gestation period, 30 weeks",2022-03-09,2021-08-11
162221,5433458,"Gestation period, 32 weeks",2022-03-23,2021-08-11


#Notes on calculating conception:

The earlier the ultrasound, the more accurate (before 14 weeks, and especially at 8 weeks). American College of Obstetricians and Gynecologists. 

8 weeks is a common time in US healthcare to get US and measurements, and we have n = 7k.

If we can't calculate single live birth from the 8 week entry, then let's choose 10, 12, 13, weeks etc and prioritize
#which entries are used to calculate.

I'm not yet sure how to handle instances like the person above who the first two lines are both week 13, but at appointments 
4 weeks apart; one is clearly wrong. Fortunately in this case we would use her week 8 data anyway, but we need to find more reliable measures.

# Create loop to calculate Gestational Age
Prepare dataset by removing instances of Single live birth for which we cannot calculate gestational length. This code takes a minute to run.

Will calculate gestational_age_at_birth by subtracting condition_start_time for "Single live birth" from conception_date from measurements most accurate for estimating conception date (i.e. 8 weeks, followed by 9, etc).  

The loop takes a few minutes to run so please be patient.   

Next is exploration of person_ids that have gestational ages that are unreasonably high or low (indicating that conception date was pulled from another person or another pregnancy). 

**We also need to screen for births that occur too close together (because of duplicate medical record entries). I haven't attempted this step yet.**


## Loop to remove incalculable instances of Single live birth 

In [110]:
#Remove rows that have Single live birth that cannot be calculated as gestational length data is missing. 
#This will shorten loop time on next step.
#Accomplish this by removing every instance of Single live birth that:
#1. Occurs as the very first line for a given person_id (e.g. no data available before that time)
#2. Occurs immediately following a Single live birth (e.g. no data available in between births to calculate gest length)

import time
start_time = time.time()

#Data must be sorted first
df = date_only_long_data_no_triple_duplicates.copy()
df = df.sort_values(by=['person_id', 'condition_start_date'])

# Initialize first_birth_in_cluster tracker as True. 
# Will be marked False after a Single live birth is found (row is kept).
# If the next row is also a Single live birth, that row will be dropped.
# Set as True after finding a row that does not have Single live birth in it (row is kept and next instance of SLB kept).
first_birth_in_cluster = True

prev_person_id = None

# Create a dataframe to hold the rows that will be dropped
dropped_rows_df = pd.DataFrame(columns=df.columns)

# Iterate through the dataframe
for idx, row in df.iterrows():  # Iterate through index/row in the dataframe
    if row['person_id'] != prev_person_id:  # If on a row for a new person (thus the first row)
        # If it is the first row for a specific person_id
        if row['standard_concept_name'] == 'Single live birth':  # If the first row is a single live birth
            dropped_rows_df = dropped_rows_df.append(row)  # Drop the row
            first_birth_in_cluster = False  # Set the tracker to False
        else:
            first_birth_in_cluster = True  # If the first row is not a single live birth, the tracker is set to True
    else:
        # If it is not the first row for a specific person_id
        if row['standard_concept_name'] == 'Single live birth':  # If the row is not the first row and is a single live birth
            if first_birth_in_cluster == True:  # If the tracker is True, keep the row
                first_birth_in_cluster = False  # Set the tracker to False
            else:  # If the tracker is False already, then drop the row
                dropped_rows_df = dropped_rows_df.append(row)
        else:  # If the row is not the first row and not a single live birth
            first_birth_in_cluster = True  # Keep the row and reset the tracker to True

    prev_person_id = row['person_id']  # Set the prev_person_id to the person_id of the row just finished

# Drop the rows from the original DataFrame
df = df.drop(dropped_rows_df.index)

end_time = time.time()
elapsed_time = end_time - start_time
print(f"End time: {end_time}")
print(f"Execution time: {elapsed_time} seconds")

print("Number of dropped rows:", len(dropped_rows_df))
# View the dropped rows
dropped_rows_df



End time: 1698909309.0451927
Execution time: 55.34494185447693 seconds
Number of dropped rows: 14877


Unnamed: 0,person_id,standard_concept_name,condition_start_date,conception_date
240912,1000104,Single live birth,2005-06-20,
262183,1000104,Single live birth,2005-06-23,
270492,1000109,Single live birth,2000-03-13,
272224,1000109,Single live birth,2001-11-13,
241948,1000109,Single live birth,2001-11-16,
...,...,...,...,...
243624,9978035,Single live birth,2012-05-10,
231645,9978035,Single live birth,2015-02-20,
268096,9983360,Single live birth,2009-05-08,
242179,9989704,Single live birth,2007-11-02,


## Calculate gestational age at birth for instances of Single live birth 

In [140]:
#Apply loop to calculate gestional_age_at_birth to instances of Single live birth that have gestational data available.
df_copy = df.copy()

import time
import pandas as pd
import numpy as np

# Sort the DataFrame by person_id and condition_start_date
df_copy = df_copy.sort_values(by=['person_id', 'condition_start_date'])

# Initialize a dictionary to store the most recent "Gestation period" and conception date for each person_id
gestational_info = {}

# Initialize a list to store birth dates
birth_dates = []

# Start measuring execution time
start_time = time.time()

# Iterate through the DataFrame
for idx, row in df_copy.iterrows():
    person_id = row['person_id']

    if person_id not in gestational_info:
        gestational_info[person_id] = {'gestational_period': None, 'conception_date': None}

    current_gestational_period = gestational_info[person_id]['gestational_period']
    conception_date = gestational_info[person_id]['conception_date']

    if row['standard_concept_name'].startswith('Gestation period'):
        # Update the current gestational period
        gestational_info[person_id]['gestational_period'] = row['standard_concept_name']
        gestational_info[person_id]['conception_date'] = row['conception_date']

    if row['standard_concept_name'] == 'Single live birth':
        if conception_date is not None:
            birth_dates.append((idx, row['condition_start_date']))
            last_idx, last_birth_date = birth_dates[-1]
            gestational_age = (pd.to_datetime(last_birth_date) - pd.to_datetime(conception_date)).days / 7
            df_copy.at[last_idx, 'gestational_age_at_birth'] = gestational_age
            conception_date = None

# End measuring execution time
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Execution time: {elapsed_time} seconds")

# Display a sample of the updated DataFrame
df_copy.sample(5)

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Execution time: {elapsed_time} seconds")  

Execution time: 18.772106647491455 seconds
Execution time: 18.776557207107544 seconds


In [141]:
added_gestational_ages = df_copy['gestational_age_at_birth'].count()
print("Number of gestational ages added:", added_gestational_ages)

Number of gestational ages added: 20568


## Check numbers to track improvements/debugging of loop 

In [143]:
#Checking numbers to see how the loop is working....

#Missing gestation_age_at_birth (for example, birth occurred but no previous records attached)
missing_gestational_age = len(df_copy[df_copy['standard_concept_name'] == 'Single live birth'][df_copy['gestational_age_at_birth'].isna()])
print("Number of 'Single live birth' entries with missing gestational_age_at_birth:", missing_gestational_age)

# Filter and delete entries with negative or greater than 45 weeks gestational age
negative_gestational_age = len(df_copy[df_copy['gestational_age_at_birth'] < 0])
greater_than_45_weeks = len(df_copy[df_copy['gestational_age_at_birth'] > 45])


#Filter the DataFrame for 'Single live birth' with gestational age less than 37 weeks and full term
filtered_df_copy = df_copy[(df_copy['gestational_age_at_birth'] >= 0) & (df_copy['gestational_age_at_birth'] <= 45)]
single_live_births_less_than_37 = len(filtered_df_copy[(filtered_df_copy['standard_concept_name'] == 'Single live birth') & (filtered_df_copy['gestational_age_at_birth'] < 37)])
single_live_births_full_term = len(filtered_df_copy[(filtered_df_copy['standard_concept_name'] == 'Single live birth') & (filtered_df_copy['gestational_age_at_birth'] >= 37)])

# Display the counts of entries deleted
print("Number of entries with negative gestational age deleted:", negative_gestational_age)
print("Number of entries with gestational age greater than 45 weeks deleted:", greater_than_45_weeks)
print("Number of single live births that are full term >= 37 weeks:", single_live_births_full_term)
print("Number of single live births that are preterm < 37 weeks:", single_live_births_less_than_37)
print("Percentage of dataset that is preterm:", single_live_births_less_than_37 / single_live_births_full_term * 100)


'''
previous numbers before loop used to clean up incalculable Single live birth
Number of 'Single live birth' entries with missing gestational_age_at_birth: 16132
Number of entries with negative gestational age deleted: 0
Number of entries with gestational age greater than 45 weeks deleted: 952
Number of single live births that are full term >= 37 weeks: 16859
Number of single live births that are preterm < 37 weeks: 1542
Percentage of dataset that is preterm: 9.146449967376476
'''

Number of 'Single live birth' entries with missing gestational_age_at_birth: 40
Number of entries with negative gestational age deleted: 0
Number of entries with gestational age greater than 45 weeks deleted: 9
Number of single live births that are full term >= 37 weeks: 17823
Number of single live births that are preterm < 37 weeks: 2736
Percentage of dataset that is preterm: 15.35095101834708


  after removing the cwd from sys.path.


"\nprevious numbers before loop used to clean up incalculable SLB\nNumber of 'Single live birth' entries with missing gestational_age_at_birth: 16132\nNumber of entries with negative gestational age deleted: 0\nNumber of entries with gestational age greater than 45 weeks deleted: 952\nNumber of single live births that are full term >= 37 weeks: 16859\nNumber of single live births that are preterm < 37 weeks: 1542\nPercentage of dataset that is preterm: 9.146449967376476\n"

In [144]:
# Filter the dataframe to select rows with "Single live birth" and terms to explore (e.g. why neg ages, why over-term etc)
explore_gestational_age_df = df_copy[(df_copy['standard_concept_name'] == 'Single live birth') & (df_copy['gestational_age_at_birth'] > 45)]

# Get the unique person_id values for those rows
person_ids_with_expl_values = explore_gestational_age_df['person_id'].unique()

# Display the list of person_id values
print("person_id values with explore gestational ages in 'Single live birth' entries:")
print(person_ids_with_expl_values)


person_id values with explore gestational ages in 'Single live birth' entries:
<IntegerArray>
[1243605, 1261217, 1395771, 1399080, 1426151, 2851465, 2903700, 3892859,
 9765491]
Length: 9, dtype: Int64


In [149]:
#Why does this person have very long term.... shouldn't be calculated this way

#person_check = date_only_long_data_no_triple_duplicates[date_only_long_data_no_triple_duplicates['person_id'] == 1001034]
#person_check

#person resolved; gestation periods not prioritizing correctly
#person_df = df[df['person_id'] == 1001034]
#person_df

#person resolved; using 8 week measurement from pregnancy loss
#person_df = df[df['person_id'] == 9930591]
#person_df


person_df = df_copy[df_copy['person_id'] == 9765491]
person_df

print("Able to see that these person_id with very long gestation are because there are no entries for prenatal care \
and we don't know how long the term was.")

Unnamed: 0,person_id,standard_concept_name,condition_start_date,conception_date,gestational_age_at_birth
27748,9765491,"Gestation period, 26 weeks",2016-09-29,2016-03-31,
259949,9765491,Single live birth,2020-08-05,,226.857143
8550,9765491,"Gestation period, 38 weeks",2021-11-26,2021-03-05,
237872,9765491,Single live birth,2021-11-26,,38.0


## Check for duplicate births: same person_id and multiple records of same birth over a few days

In [162]:
#Look for births that occur within 42 weeks of each other for the same person_id to check for duplicate births
# in medical records 

start_time = time.time()

# Sort the DataFrame by person_id and condition_start_date
df_copy = df_copy.sort_values(by=['person_id', 'condition_start_date'])

# Create a new column 'next_birth_date' to store the next "Single live birth" date for each person
df_copy['next_birth_date'] = df_copy[df_copy['standard_concept_name'] == 'Single live birth'].groupby('person_id')['condition_start_date'].shift(-1)

# Calculate the time difference in weeks between consecutive births
df_copy['time_diff_weeks'] = (df_copy['next_birth_date'] - df_copy['condition_start_date']).dt.days / 7

# Filter rows where "Single live births" occur less than 42 weeks apart
result = df_copy[(df_copy['standard_concept_name'] == 'Single live birth') & (df_copy['time_diff_weeks'] < 20)]

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Execution time: {elapsed_time} seconds")  

result = result.sort_values(by=['person_id', 'condition_start_date'])
result


Execution time: 0.2838318347930908 seconds


Unnamed: 0,person_id,standard_concept_name,condition_start_date,conception_date,gestational_age_at_birth,next_birth_date,time_diff_weeks
228079,1000195,Single live birth,2021-08-10,,40.000000,2021-08-11,0.142857
244388,1001207,Single live birth,2020-02-15,,40.000000,2020-02-16,0.142857
232619,1001591,Single live birth,2019-08-21,,38.000000,2019-08-24,0.428571
244713,1001946,Single live birth,2020-05-11,,37.000000,2020-05-15,0.571429
242630,1002157,Single live birth,2016-08-10,,39.142857,2016-08-11,0.142857
...,...,...,...,...,...,...,...
268945,9930591,Single live birth,2018-04-30,,38.000000,2018-05-01,0.142857
259847,9946365,Single live birth,2021-07-30,,37.000000,2021-08-01,0.285714
232924,9961630,Single live birth,2020-09-25,,39.000000,2020-09-26,0.142857
266571,9977453,Single live birth,2022-05-05,,39.000000,2022-05-18,1.857143


In [81]:
#The above numbers for preterm births are those listed as single live birth that we then calculated preterm age based
#on gestational findings earlier in records. Now we can convert term and preterm into categories in another column, and
#add preterm birth findings and see if that increases our percentage of dataset that is preterm.

## Classify birth as preterm/term using gestational_age_at_birth
Can also try to classify very preterm etc as well


df.loc[(df['gestational_age_at_birth'] >= 37), 'birth_class'] = 'Term'
df.loc[(df['gestational_age_at_birth'] >= 32 and df['gestational_age_at_birth'] < 37 ), 'birth_class'] = 'Late Preterm'
df.loc[(df['gestational_age_at_birth'] < 28, 'birth_class'] = 'Very Preterm'


In [166]:
# Create a new column "birth_class" and set it to 'Unknown' initially
df_copy['birth_class'] = 'Unknown'
#Once we have also addressed standard_concept_name containing preterm birth, then we can delete all rows
#with birth_class "Unknown". This will give us a wide format data set (x rows and columns person_id, condition_start_date,
#and birth_class)

# Update "birth_class" for rows with "Single live birth" based on gestational age
df_copy.loc[(df_copy['standard_concept_name'] == 'Single live birth') & (df_copy['gestational_age_at_birth'] >= 37), 'birth_class'] = 'Term'
df_copy.loc[(df_copy['standard_concept_name'] == 'Single live birth') & (df_copy['gestational_age_at_birth'] < 37), 'birth_class'] = 'Preterm'

# Display the updated DataFrame
df_copy.sample(20)


# Sort the DataFrame by person_id and condition_start_date
df_copy = df_copy.sort_values(by=['person_id', 'condition_start_date'])

# Create a new column 'next_birth_date' to store the next "Single live birth" date for each person
df_copy['next_birth_date'] = df_copy[df_copy['standard_concept_name'] == 'Single live birth'].groupby('person_id')['condition_start_date'].shift(-1)

# Calculate the time difference in weeks between consecutive births
df_copy['time_diff_weeks'] = (df_copy['next_birth_date'] - df_copy['condition_start_date']).dt.days / 7

# Filter rows where "Single live births" occur less than 42 weeks apart
result = df_copy[(df_copy['birth_class'] == 'Preterm') & (df_copy['time_diff_weeks'] > 20)]

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Execution time: {elapsed_time} seconds")  

result = result.sort_values(by=['person_id', 'condition_start_date'])
result

#The results so far are indicating medical record issues, not instances of consecutive preterm births

Execution time: 244.9045639038086 seconds


Unnamed: 0,person_id,standard_concept_name,condition_start_date,conception_date,gestational_age_at_birth,next_birth_date,time_diff_weeks,birth_class
273643,1003625,Single live birth,2012-11-25,,32.428571,2015-12-12,158.857143,Preterm
237881,1009394,Single live birth,2017-09-28,,30.000000,2020-09-30,156.857143,Preterm
262187,1009531,Single live birth,2018-10-08,,36.000000,2019-09-12,48.428571,Preterm
271511,1011700,Single live birth,2016-06-18,,36.000000,2019-03-18,143.285714,Preterm
235130,1013051,Single live birth,2018-08-02,,36.000000,2020-01-01,73.857143,Preterm
...,...,...,...,...,...,...,...,...
267400,9561683,Single live birth,2018-07-12,,22.000000,2019-07-28,54.428571,Preterm
232339,9613057,Single live birth,2019-01-11,,23.000000,2022-01-21,158.000000,Preterm
263677,9642533,Single live birth,2016-02-23,,34.000000,2021-01-19,256.000000,Preterm
248008,9684358,Single live birth,2020-07-16,,19.000000,2022-01-12,77.857143,Preterm


## Examine entries related to preterm birth in standard_concept_name


# Debugging

The cells below were used for debugging.

In [78]:
#Troubleshooting
#the list appears correct and in correct order
#print(gest_test)

#All conception_date values are correctly formatted.
#try:
    #pd.to_datetime(df['conception_date'], errors='raise')
    #print("All values in the 'conception_date' column are valid dates.")
#except ValueError:
    #print("There are non-date values in the 'conception_date' column.")
    
#Debugged with 1000195 (for gestation periods being used in correct order of priority)    
#Create mini dataset only person  
#df = date_only_long_data_no_triple_duplicates.copy()
df = df.sort_values(by=['person_id', 'condition_start_date'])

person_df = df[df['person_id'] == 1001211]
person_df

person_df = person_df.reset_index(drop=True)
df = person_df

import numpy as np
import pandas as pd

# Initialize variables to store the most recent "Gestation period" and birth dates
most_accurate_conception_date = None
birth_dates = []

# Initialize gest_test as an empty list
gest_test = []

# Initialize current_person_id
current_person_id = None

# Iterate through the DataFrame
for idx, row in df.iterrows():
    if row['person_id'] != current_person_id:
        # Update the current person_id
        current_person_id = row['person_id']
        most_accurate_conception_date = None  # Reset for the new person_id

        # Create a dynamic gest_test list based on the unique values in standard_concept_name
        # Scan all rows for the current person_id and update gest_test
        gest_test = df[df['person_id'] == current_person_id]['standard_concept_name'].unique()

    # Print the current row information
    print(f"Examining row {idx}, person_id: {row['person_id']}, standard_concept_name: {row['standard_concept_name']}")

    for test_period in gest_test:
        # Check if the row matches the current gest_test item
        if row['standard_concept_name'] == test_period:
            most_accurate_conception_date = row['conception_date']
            print(f"Assigned most_accurate_conception_date: {most_accurate_conception_date}")

    if 'gestational_age_at_birth' not in df.columns:
        df['gestational_age_at_birth'] = np.nan
        print("Created gestational_age_at_birth column")

    if row['standard_concept_name'] == 'Single live birth' and pd.isna(df.at[idx, 'gestational_age_at_birth']):
        birth_dates.append((idx, row['condition_start_date']))

        if most_accurate_conception_date is not None:
            last_idx, last_birth_date = birth_dates[-1]
            gestational_age = (pd.to_datetime(last_birth_date) - pd.to_datetime(most_accurate_conception_date)).days / 7
            df.at[last_idx, 'gestational_age_at_birth'] = gestational_age
            most_accurate_conception_date = None  # Reset date after calculating gestational age
            print(f"Calculated gestational age: {gestational_age}")

df


Examining row 0, person_id: 1001211, standard_concept_name: Gestation period, 40 weeks
Assigned most_accurate_conception_date: 2015-12-14
Examining row 1, person_id: 1001211, standard_concept_name: Single live birth
Assigned most_accurate_conception_date: None
Examining row 2, person_id: 1001211, standard_concept_name: Gestation period, 38 weeks
Assigned most_accurate_conception_date: 2018-03-07
Examining row 3, person_id: 1001211, standard_concept_name: Single live birth
Assigned most_accurate_conception_date: None
Examining row 4, person_id: 1001211, standard_concept_name: Gestation period, 38 weeks
Assigned most_accurate_conception_date: 2019-06-15
Examining row 5, person_id: 1001211, standard_concept_name: Gestation period, 38 weeks
Assigned most_accurate_conception_date: 2019-06-25
Examining row 6, person_id: 1001211, standard_concept_name: Single live birth
Assigned most_accurate_conception_date: None


Unnamed: 0,person_id,standard_concept_name,condition_start_date,conception_date,gestational_age_at_birth
0,1001211,"Gestation period, 40 weeks",2016-09-19,2015-12-14,
1,1001211,Single live birth,2016-09-19,,40.0
2,1001211,"Gestation period, 38 weeks",2018-11-28,2018-03-07,
3,1001211,Single live birth,2018-11-28,,38.0
4,1001211,"Gestation period, 38 weeks",2020-03-07,2019-06-15,
5,1001211,"Gestation period, 38 weeks",2020-03-17,2019-06-25,
6,1001211,Single live birth,2020-03-17,,38.0


In [56]:
#Just a double check...
#How many entries containing "preterm" also have a conception date (should be none, I think)
#import pandas as pd

# Filter rows where 'standard_concept_name' contains 'preterm' and 'conception_date' is not null
#preterm_with_conception_date = df[(df['standard_concept_name'].str.contains('preterm', case=False)) & ~df['conception_date'].isnull()]

# Get the count of such rows
#count = len(preterm_with_conception_date)

#print(f"Number of 'preterm' entries with a 'conception_date': {count}")


Number of 'preterm' entries with a 'conception_date': 0


## Preterm birth entries categorization
Entries containing "preterm birth" from standard_concept_name should be categorized as Preterm in column birth_class. 
Alternatively, we can first calculate gestational_age for this set, and then classify as very preterm, preterm, early term, and term. 