<a href="https://colab.research.google.com/github/hansbrunner/safety_data/blob/main/Clozapine/Convert_AE_PT_to_SOC_Clozapine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
In this analysis, I aim to explore the use of data analysis techniques within the field of drug safety surveillance.
First, I will create a file that maps reported Clozapine adverse events (AE) from MedDRA Preferred Terms (PT) to System Organ Classes (SOC) using ChatGPT-3.5.
The mapping is not 100% accurate (based on manual evaluation), but the result is acceptable.
"""

In [2]:
# get Clozapine AE reports from FDA
import time
import json
import requests
base_url = 'https://api.fda.gov/drug/event.json'

# Parameters for the API request
limit = 1000  # Fetch 1000 reports at a time
total_reports_to_fetch = 10000  # The total number of reports to get
total_reports_fetched = 0  # Counter; also used as the `skip` offset of the next page
data = []  # Store all reports

# Page through the endpoint until enough reports are collected or the API
# has nothing more to return.
while total_reports_fetched < total_reports_to_fetch:
    # Only ask for what is still missing so the target is never overshot
    batch_size = min(limit, total_reports_to_fetch - total_reports_fetched)
    url = f'{base_url}?search=clozapine&limit={batch_size}&skip={total_reports_fetched}'
    # A timeout keeps a stalled connection from hanging the cell forever
    response = requests.get(url, timeout=60)

    # openFDA answers a query without (further) matches with HTTP 404;
    # treat that as the end of the result set rather than as a failure.
    if response.status_code == 404:
        break
    # Any other HTTP error is a real problem and must not be silently ignored
    response.raise_for_status()

    _data = response.json()
    batch = _data.get('results', [])
    if not batch:
        # Without this guard the counter would never advance and the loop
        # would spin forever on an empty page.
        break

    # Add the fetched results to the overall list and advance the offset
    data.extend(batch)
    total_reports_fetched += len(batch)

    # Sleep between requests to stay friendly to the API
    time.sleep(1)

In [3]:
# Get all unique AE as PT
# Set of unique Preferred Terms (PT) found across all downloaded reports
unique_pts = set()

# Loop through each report in the data
for report in data:
    # Reports without a patient section or without reactions contribute nothing
    reactions = (report.get('patient') or {}).get('reaction') or []
    for reaction in reactions:
        event = reaction.get('reactionmeddrapt')  # preferred term
        if event is not None:
            unique_pts.add(event)

# Convert the set to a sorted list. Iteration order of a set of strings changes
# between interpreter runs (hash randomisation), which would make the row order
# of everything derived from this list differ from run to run.
unique_pts_list = sorted(unique_pts)

In [4]:
# Use chatGPT to map PTs to SOCs
# Takes some time and costs money (Not a lot though)
import openai
import numpy as np
import pandas as pd

# Read the OpenAI API key from a local text file and hand it to the client
key_path = 'openai_key.txt'
with open(key_path, 'r') as key_file:
    raw_key = key_file.read()

# Surrounding whitespace / trailing newline is not part of the key
openai_api_key = raw_key.strip()
openai.api_key = openai_api_key
# Ask GPT-3.5 to map PTs to SOCs
def map_pt_to_soc(pt):
    """Ask GPT-3.5 which MedDRA System Organ Class a Preferred Term belongs to.

    Parameters
    ----------
    pt
        The Preferred Term that is inserted into the prompt.

    Returns
    -------
    The text of the first completion choice with surrounding whitespace
    stripped, or ``np.nan`` when the request or the response handling raises.
    The error is printed together with the offending PT.
    """
    try:
        # Build the two-message conversation: generic system prompt + question
        conversation = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"What is the medDra System Organ Class (SOC) for the following medDra Preferred Term (PT): '{pt}'? Only write the SOC."},
        ]
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=conversation,
            max_tokens=50,
            n=1,
            temperature=0.0,
        )
        # Only one choice is requested (n=1); take its message text
        first_choice = completion['choices'][0]
        answer = first_choice['message']['content'].strip()
    except Exception as err:
        # Best effort: report the failure and leave a missing value in the table
        print(f"Error for PT '{pt}': {err}")
        answer = np.nan
    return answer

# Create a DataFrame with one row per unique PT (list from before)
pt_to_soc = pd.DataFrame({'PT': unique_pts_list})

# Map PTs to SOCs using GPT-3.5 (one API request per row). The column is
# created directly from the result; a NaN placeholder beforehand is not needed
# because map_pt_to_soc already returns NaN for failed requests.
pt_to_soc['SOC'] = pt_to_soc['PT'].apply(map_pt_to_soc)

In [8]:
# Responses are not all in same format, clean!
import re
import pandas as pd

# MedDRA System Organ Class (SOC) names: 27 entries in alphabetical order.
# normalize_soc() compares GPT answers against these names and returns the
# matching entry verbatim, so the spelling here defines the canonical labels.
# The list is scanned front to back and the first hit wins.
meddra_socs = [
    "Blood and lymphatic system disorders",
    "Cardiac disorders",
    "Congenital, familial and genetic disorders",
    "Ear and labyrinth disorders",
    "Endocrine disorders",
    "Eye disorders",
    "Gastrointestinal disorders",
    "General disorders and administration site conditions",
    "Hepatobiliary disorders",
    "Immune system disorders",
    "Infections and infestations",
    "Injury, poisoning and procedural complications",
    "Investigations",
    "Metabolism and nutrition disorders",
    "Musculoskeletal and connective tissue disorders",
    "Neoplasms benign, malignant and unspecified (incl cysts and polyps)",
    "Nervous system disorders",
    "Pregnancy, puerperium and perinatal conditions",
    "Product issues",
    "Psychiatric disorders",
    "Renal and urinary disorders",
    "Reproductive system and breast disorders",
    "Respiratory, thoracic and mediastinal disorders",
    "Skin and subcutaneous tissue disorders",
    "Social circumstances",
    "Surgical and medical procedures",
    "Vascular disorders"
]

# Strip punctuation and lowercase so strings can be compared loosely
def clean_string(s):
    """Return ``s`` without punctuation, trimmed and lowercased.

    Every character that is neither a word character nor whitespace is
    removed; inner whitespace is left as it is.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', s)
    trimmed = without_punctuation.strip()
    return trimmed.lower()

# Match SOCs by cleaning and comparing the strings
def normalize_soc(soc_response):
    """Map a free-text GPT answer onto an official MedDRA SOC name.

    Both the answer and each official name are passed through ``clean_string``;
    the first entry of ``meddra_socs`` whose cleaned form occurs as a substring
    of the cleaned answer is returned.

    Parameters
    ----------
    soc_response
        The raw answer. Non-string values (for example the NaN that
        ``map_pt_to_soc`` stores for a failed request) are treated as
        "no answer".

    Returns
    -------
    The matching entry of ``meddra_socs``, or ``None`` when the input is not a
    string or no official name is contained in it.
    """
    # Failed requests are recorded as NaN (a float); clean_string would raise
    # a TypeError on it, so bail out before touching the regex.
    if not isinstance(soc_response, str):
        return None

    cleaned_response = clean_string(soc_response)

    # Compare the cleaned response with the cleaned MedDRA SOC list
    for official_soc in meddra_socs:
        if clean_string(official_soc) in cleaned_response:
            return official_soc  # Return the official SOC if matched

    return None  # Return None if match is not found

# Work on a copy so the raw GPT answers in pt_to_soc stay untouched
df = pt_to_soc.copy()
# Match SOCs: official name per row, None where nothing matched
df['SOC_normalized'] = df['SOC'].apply(normalize_soc)

# Save
df.to_csv('pt_soc_mapping.csv')

# Download the file to your desktop when running on Google Colab. Outside
# Colab the module does not exist; the CSV then simply stays in the working
# directory instead of the cell failing after the file was already written.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download('pt_soc_mapping.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [12]:
# Visualize mappings with Sankey diagram
import pandas as pd
import plotly.graph_objects as go
df = pd.read_csv("pt_soc_mapping.csv", index_col=0)

# Rename missing values to 'Undefined'. SOC_normalized is the column that is
# plotted, and unmatched answers are read back from the CSV as NaN, so it has
# to be filled too. Assigning the result replaces the chained
# `df[col].fillna(..., inplace=True)` form, which pandas deprecates.
df = df.fillna({'SOC': 'Undefined', 'SOC_normalized': 'Undefined'})

# Select up to 50 random rows for the plot (all rows if the table is smaller)
df_subset = df.sample(n=min(50, len(df)), random_state=1)

# Lists of unique PTs and SOCs
unique_pts = list(df_subset['PT'].unique())
unique_socs = list(df_subset['SOC_normalized'].unique())

# Combine lists: PT nodes come first, SOC nodes follow
all_nodes = unique_pts + unique_socs

# Separate lookups per side, so a PT that is spelled exactly like a SOC still
# gets its own node instead of linking a node to itself.
pt_index = {pt: i for i, pt in enumerate(unique_pts)}
soc_index = {soc: len(unique_pts) + i for i, soc in enumerate(unique_socs)}
# Combined lookup kept for later cells; on a name clash the SOC position wins
node_indices = {**pt_index, **soc_index}

# Mappings for the Sankey diagram
sources = [pt_index[pt] for pt in df_subset['PT']]
targets = [soc_index[soc] for soc in df_subset['SOC_normalized']]

# Plot the Sankey diagram
fig = go.Figure(go.Sankey(
    node = dict(
        pad = 15,
        thickness = 20,
        line = dict(color = "black", width = 0.5),
        label = all_nodes
    ),
    link = dict(
        source = sources,  # PTs
        target = targets,  # SOCs
        value = [1]*len(df_subset)  # every PT row contributes one unit of flow
    )
))

# Tidy
fig.update_layout(
    title_text="MedDRA PT to SOC Mapping",
    font_size=12,
    autosize=False,
    width=1200,
    height=700,
    margin=dict(l=200, r=200, t=50, b=50)
)
fig.show()

