# PFAS Dataset Exploratory Data Analysis

This notebook explores two datasets:
1. **EPA PFAS Master List V2**: A comprehensive list of PFAS chemicals.
2. **Tox21 Supporting Information**: A dataset containing toxicity screening data and predicted properties.

## Goal
Understand the structure of these datasets and analyze the overlap of chemicals (via `DTXSID`) to see how much toxicity data is available for known PFAS.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Notebook-wide display settings.
# pandas: never truncate the column list when rendering a DataFrame.
pd.set_option("display.max_columns", None)
# seaborn: apply the "whitegrid" theme to every figure drawn below.
sns.set_theme(style="whitegrid")

## 1. Load Datasets

In [None]:
# Workbook locations, resolved relative to the notebook's working directory.
pfas_file = "EPA PFAS Master List V2.xlsx"
tox21_file = "tx0c00264_si_003.xlsx"

print("Loading EPA PFAS Master List...")
df_pfas = pd.read_excel(pfas_file, sheet_name='EPA PFAS Master List V2')
print(f"PFAS Master List Shape: {df_pfas.shape}")

# Two sheets are read from the Tox21 workbook. Opening it once through
# pd.ExcelFile and parsing both sheets from that handle avoids opening and
# parsing the .xlsx twice; the context manager closes the handle afterwards.
with pd.ExcelFile(tox21_file) as tox21_book:
    print("Loading Tox21 Data (Sheet S2 - Chemical Info)...")
    df_tox21_info = pd.read_excel(tox21_book, sheet_name='S2.TOX21S')
    print(f"Tox21 Chemical Info Shape: {df_tox21_info.shape}")

    print("Loading Tox21 Data (Sheet S4 - Predicted Properties)...")
    df_tox21_props = pd.read_excel(tox21_book, sheet_name='S4.Predicted properties')
    print(f"Tox21 Properties Shape: {df_tox21_props.shape}")

## 2. Inspect Data Structures

In [None]:
# Preview the first three rows of each chemical table, each under its own banner.
head_previews = [
    ("--- EPA PFAS Master List Head ---", df_pfas),
    ("\n--- Tox21 Chemical Info Head ---", df_tox21_info),
]
for banner, frame in head_previews:
    print(banner)
    display(frame.head(3))

## 3. Overlap Analysis
We will identify common chemicals using the `DTXSID` identifier.

In [None]:
# Unique, non-null DTXSIDs present in each dataset.
pfas_ids = set(df_pfas['DTXSID'].dropna())
tox21_ids = set(df_tox21_info['DTXSID'].dropna())

# Identifiers that appear in both datasets.
common_ids = pfas_ids.intersection(tox21_ids)
print(f"Unique PFAS IDs: {len(pfas_ids)}")
print(f"Unique Tox21 IDs: {len(tox21_ids)}")
print(f"Common IDs (Overlap): {len(common_ids)}")

# Guard the ratio: an empty PFAS ID set (e.g. an all-null DTXSID column)
# would otherwise raise ZeroDivisionError and abort the cell.
if pfas_ids:
    print(f"Percentage of PFAS List with Tox21 Data: {(len(common_ids)/len(pfas_ids))*100:.2f}%")
else:
    print("Percentage of PFAS List with Tox21 Data: n/a (no PFAS IDs found)")

In [None]:
# Bar chart of the overlap: shared chemicals vs. those found in only one dataset.
n_common = len(common_ids)

labels = ['Common (Overlap)', 'PFAS Only (No Tox21)', 'Tox21 Only (Non-PFAS)']
sizes = [n_common, len(pfas_ids) - n_common, len(tox21_ids) - n_common]
colors = ['#ff9999','#66b3ff','#99ff99']

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(labels, sizes, color=colors)
ax.set_title('Dataset Overlap (Count of Chemicals)')
ax.set_ylabel('Count')
# Log scale because overlap is very small compared to totals
ax.set_yscale('log')
# Write the exact overlap count just above the first ("Common") bar.
ax.text(0, n_common, f"{n_common}", ha='center', va='bottom', fontsize=12)
plt.show()

## 4. Analysis of Overlapping Chemicals
Let's extract the property predictions for the common chemicals identified above (36 at the time of writing).

In [None]:
# Keep only the predicted-property rows whose DTXSID is in both datasets.
common_props = df_tox21_props[df_tox21_props['DTXSID'].isin(common_ids)].copy()

# Attach chemical names for readability. The lookup is first reduced to one
# row per DTXSID: a left merge against a repeated key would duplicate property
# rows and inflate the count printed below.
pfas_names = df_pfas[['DTXSID', 'Chemical Name']].drop_duplicates(subset='DTXSID')
common_props = common_props.merge(pfas_names, on='DTXSID', how='left')

print(f"Properties loaded for {len(common_props)} common chemicals.")
display(common_props.head())

In [None]:
# Visualize Predicted Mutagenicity (AMES test prediction)
ames_col = 'AMES_MUTAGENICITY_TEST_PRED'
if ames_col in common_props.columns:
    plt.figure(figsize=(10, 6))
    sns.histplot(common_props[ames_col], bins=10, kde=True)
    plt.title('Distribution of Predicted Mutagenicity for Overlapping PFAS')
    plt.xlabel('Predicted Probability (0-1)')
    plt.ylabel('Count')
    plt.show()
else:
    # Say why nothing was drawn instead of leaving the cell silently empty,
    # so a renamed or missing column is noticed.
    print(f"Column '{ames_col}' not found in common_props; skipping mutagenicity plot.")

In [None]:
# Visualize Predicted LD50 (Oral Rat)
ld50_col = 'ORAL_RAT_LD50_MOL/KG_TEST_PRED'
if ld50_col in common_props.columns:
    plt.figure(figsize=(10, 6))
    # LD50 is often log-transformed for plotting; the raw predicted values
    # are plotted here as-is.
    sns.histplot(common_props[ld50_col], bins=10, kde=True, color='orange')
    plt.title('Distribution of Predicted Oral Rat LD50 (mol/kg) for Overlapping PFAS')
    plt.xlabel('LD50 (mol/kg)')
    plt.ylabel('Count')
    plt.show()
else:
    # Say why nothing was drawn instead of leaving the cell silently empty,
    # so a renamed or missing column is noticed.
    print(f"Column '{ld50_col}' not found in common_props; skipping LD50 plot.")