In [2]:
from anthropic import Anthropic
from dotenv import load_dotenv
import os

# Pull any variables defined in a local .env file into the process environment.
load_dotenv()

# The API key is read from the environment rather than written into the notebook.
api_key = os.environ.get("ANTHROPIC_API_KEY")

# Client used for all requests in this cell.
client = Anthropic(api_key=api_key)

# One-turn conversation: a single user message, reply capped at 200 tokens.
prompt = "Hello Claude! Can you summarize why dotenv is useful?"
response = client.messages.create(
    messages=[{"role": "user", "content": prompt}],
    max_tokens=200,
    model="claude-3-7-sonnet-20250219",
)

# Show the text of the first content block of the reply.
first_block = response.content[0]
print(first_block.text)


# Why dotenv is Useful

Dotenv is a popular tool that simplifies environment variable management in applications. Here's why it's valuable:

## Key Benefits

- **Configuration separation**: Keeps sensitive information (API keys, database credentials) out of your code
- **Environment-specific settings**: Easily manage different configurations for development, testing, and production
- **Simple implementation**: Just create a `.env` file and the library loads variables into your application
- **Security improvement**: Prevents credentials from being committed to version control
- **Development workflow**: Team members can maintain their own local configuration
- **Consistent interface**: Access environment variables the same way across different environments

Dotenv works by loading variables from a `.env` file into your application's environment, making them accessible through your language's standard environment variable interface (like `process.env` in Node.js).


In [4]:
from pathlib import Path
import pandas as pd, torch, os, gc
from interplm.sae.inference import load_sae_from_hf
import matplotlib.pyplot as plt
import numpy as np
# Use the GPU when one is visible to torch; otherwise fall back to CPU so the
# notebook still runs on machines without CUDA.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Directory holding the SAE outputs produced/consumed by this notebook
# (created in the current working directory if it does not exist yet).
DATA_DIR = Path("esm_sae_results")
DATA_DIR.mkdir(exist_ok=True)

# Root of the InterPLM checkout. Defaults to the original EC2 location but can
# be redirected by setting the INTERPLM_ROOT environment variable.
INTERPLM_ROOT = Path(os.environ.get("INTERPLM_ROOT", "/home/ec2-user/InterPLM"))

# NOTE: despite the *_DIR suffix, both of these point at single files.
SEQUENCES_DIR = INTERPLM_ROOT / "data" / "uniprot" / "subset_25k.csv"
# ANNOTATIONS_DIR = Path("uniprotkb_swissprot_annotations.tsv.gz")
ANNOTATIONS_DIR = INTERPLM_ROOT / "subset_annotations.tsv.gz"


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import glob

# Collect the per-rank result pickles in a deterministic (sorted) order.
part_paths = sorted(glob.glob(str(DATA_DIR / "sae_features_rank*.final.pkl")))
if not part_paths:
    # Without this guard pd.concat([]) fails with an unhelpful
    # "No objects to concatenate" error.
    raise FileNotFoundError(
        f"No 'sae_features_rank*.final.pkl' files found in {DATA_DIR.resolve()}"
    )

# NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
parts = [pd.read_pickle(p) for p in part_paths]

# Stack all parts and keep the first row seen for each uniprot_id.
features_all = pd.concat(parts, ignore_index=True).drop_duplicates(subset=["uniprot_id"])

# Persist the merged table next to the per-rank files, then show its shape.
features_all.to_pickle(DATA_DIR / "sae_features_all.pkl")
features_all.shape


(40000, 6)

In [6]:
# Preview the first five rows of the merged feature table.
features_all.head(5)

Unnamed: 0,uniprot_id,length,features,max_activation,n_active_features,reconstruction_mse
0,Q9GL23,50,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.002...",1.265625,1876,45.19838
1,Q6GZU6,50,"[0.00023197175, 0.0, 0.0, 0.0, 0.0013056946, 0...",0.843262,2168,13.467114
2,P9WJG6,50,"[0.0, 0.00057144166, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.935059,1740,12.720748
3,P18924,51,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.000...",0.956543,1799,11.394856
4,Q08076,52,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.000...",1.139648,1772,24.694654


In [7]:
# Load the gzip-compressed, tab-separated annotation table into a DataFrame.
annotations_df = pd.read_csv(
    ANNOTATIONS_DIR,
    compression="gzip",
    sep="\t",
)

In [8]:
# Preview the first five rows of the annotation table.
annotations_df.head(5)

Unnamed: 0,Entry,Reviewed,Protein names,Length,Sequence,EC number,Active site,Binding site,Cofactor,Disulfide bond,...,Helix,Turn,Beta strand,Coiled coil,Domain [CC],Compositional bias,Domain [FT],Motif,Region,Zinc finger
0,A0A009IHW8,reviewed,2' cyclic ADP-D-ribose synthase AbTIR (2'cADPR...,269,MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENA...,3.2.2.-; 3.2.2.6,"ACT_SITE 208; /evidence=""ECO:0000255|PROSITE-P...","BINDING 143; /ligand=""NAD(+)""; /ligand_id=""ChE...",,,...,"HELIX 143..145; /evidence=""ECO:0007829|PDB:7UW...","TURN 146..149; /evidence=""ECO:0007829|PDB:7UWG...","STRAND 135..142; /evidence=""ECO:0007829|PDB:7U...","COILED 31..99; /evidence=""ECO:0000255""",DOMAIN: The TIR domain mediates NAD(+) hydrola...,,"DOMAIN 133..266; /note=""TIR""; /evidence=""ECO:0...",,,
1,A0A059WI14,reviewed,Trivalent organoarsenical cleaving enzyme (EC ...,161,MKYAHVGLNVTNLEKSIEFYSKLFGAEPVKVKPDYAKFLLESPGLN...,1.13.11.-,,"BINDING 5; /ligand=""Fe(2+)""; /ligand_id=""ChEBI...",COFACTOR: Name=Fe(2+); Xref=ChEBI:CHEBI:29033;...,,...,,,,,DOMAIN: The thiolates of the vicinal cysteine ...,,"DOMAIN 2..119; /note=""VOC""; /evidence=""ECO:000...",,,
2,A0A067XGX8,reviewed,"Phospho-2-dehydro-3-deoxyheptonate aldolase 2,...",512,MALTATATTRGGSALPNSCLQTPKFQSLQKPTFISSFPTNKKTKPR...,2.5.1.54,,"BINDING 126; /ligand=""Mn(2+)""; /ligand_id=""ChE...",COFACTOR: Name=Mn(2+); Xref=ChEBI:CHEBI:29035;...,,...,,,,,,,,,"REGION 37..57; /note=""Disordered""; /evidence=""...",
3,A0A067XH53,reviewed,"Phospho-2-dehydro-3-deoxyheptonate aldolase 1,...",533,MALSTNSTTSSLLPKTPLVQQPLLKNASLPTTTKAIRFIQPISAIH...,2.5.1.54,,"BINDING 145; /ligand=""Mn(2+)""; /ligand_id=""ChE...",COFACTOR: Name=Mn(2+); Xref=ChEBI:CHEBI:29035;...,,...,,,,,,"COMPBIAS 47..56; /note=""Polar residues""; /evid...",,,"REGION 47..70; /note=""Disordered""; /evidence=""...",
4,A0A0A1H8I4,reviewed,Aconitate isomerase (AI) (EC 5.3.3.7),262,MFPRLPTLALGALLLASTPLLAAQPVTTLTVLSSGGIMGTIREVAP...,5.3.3.7,,,,,...,,,,,,,,,,


In [17]:
import numpy as np
import pandas as pd
import random

# Parameters
N_FEATURES = 1200                  # how many SAE features to build a dataset for
BINS = np.linspace(0.0, 1.0, 11)   # activation bin edges 0.0, 0.1, ..., 1.0
SEED = 42                          # seeds both the feature selection and the row sampling
N_PER_BIN = 2                      # proteins sampled from each ordinary activation bin
N_TOP_BIN = 10                     # proteins sampled from the highest activation bin
N_ZERO = 10                        # extra proteins sampled with activation exactly 0.0

# Randomly select feature ids. A dedicated seeded generator makes the selection
# reproducible across kernel restarts without touching the global `random` state.
all_feature_ids = list(range(len(features_all.iloc[0].features)))
print("num features", len(all_feature_ids))
selected_features = random.Random(SEED).sample(
    all_feature_ids, min(N_FEATURES, len(all_feature_ids))
)

print(f"Selected {len(selected_features)} features out of {len(all_feature_ids)}")

# Build dataset for each feature
feature_datasets = {}

# Predefine bin labels ("0.0-0.1", ..., "0.9-1.0"); the last one gets the larger sample.
bin_labels = [f"{BINS[i]:.1f}-{BINS[i+1]:.1f}" for i in range(len(BINS)-1)]
top_bin_label = bin_labels[-1]

# Pull the selected feature columns out of every protein's feature vector once,
# instead of re-walking all rows in Python for each feature.
# Resulting layout: one row per protein, one column per entry of selected_features.
activation_matrix = np.stack(
    [np.asarray(vec)[selected_features] for vec in features_all["features"]]
)

for col, fid in enumerate(selected_features):
    # Activations of this feature, aligned row-for-row with features_all
    activations = activation_matrix[:, col]
    df = pd.DataFrame({
        "uniprot_id": features_all["uniprot_id"],
        "activation": activations
    })

    # Assign bins. include_lowest=True puts exact zeros in the first bin;
    # activations above BINS[-1] get no bin (NaN) and are never sampled below.
    df["bin"] = pd.cut(df["activation"], bins=BINS, labels=bin_labels, include_lowest=True)

    sampled = []

    # Sample proteins per bin (only bins that actually occur for this feature)
    for b in df["bin"].dropna().unique():
        bin_df = df[df["bin"] == b]
        n = N_TOP_BIN if b == top_bin_label else N_PER_BIN
        sampled.extend(bin_df.sample(min(len(bin_df), n), random_state=SEED).to_dict(orient="records"))

    # Add random zero-activation proteins. Zeros are also members of the first
    # bin, so skip ids already drawn above to avoid duplicate rows in the dataset.
    already_sampled = {record["uniprot_id"] for record in sampled}
    zero_df = df[(df["activation"] == 0.0) & ~df["uniprot_id"].isin(already_sampled)]
    if len(zero_df) > 0:
        sampled.extend(zero_df.sample(min(len(zero_df), N_ZERO), random_state=SEED).to_dict(orient="records"))

    # Merge with metadata from annotations_df. Passing the columns explicitly keeps
    # the merge key present even if nothing was sampled for this feature.
    sampled_df = pd.DataFrame(sampled, columns=df.columns)
    merged = sampled_df.merge(annotations_df, left_on="uniprot_id", right_on="Entry", how="left")

    feature_datasets[fid] = merged

# Example feature dataset
example_fid = selected_features[0]
feature_datasets[example_fid].head()


num features 10240
Selected 1200 features out of 10240


Unnamed: 0,uniprot_id,activation,bin,Entry,Reviewed,Protein names,Length,Sequence,EC number,Active site,...,Helix,Turn,Beta strand,Coiled coil,Domain [CC],Compositional bias,Domain [FT],Motif,Region,Zinc finger
0,B1IQD2,0.005489,0.0-0.1,B1IQD2,reviewed,Adenosine deaminase (EC 3.5.4.4) (Adenosine am...,333,MIDTTLPLTDIHRHLDGNIRPQTILELGRQYNISLPAQSLETLIPH...,3.5.4.4,"ACT_SITE 200; /note=""Proton donor""; /evidence=...",...,,,,,,,,,,
1,P0AAE7,3e-06,0.0-0.1,P0AAE7,reviewed,Putative arginine/ornithine antiporter,460,MEKKLGLSALTALVLSSMLGAGVFSLPQNMAAVASPAALLIGWGIT...,,,...,,,,,,,,,,
2,Q8BMG1,0.107273,0.1-0.2,Q8BMG1,reviewed,ATR-interacting protein (ATM and Rad3-related-...,785,MAGTPAPNSHRKQSGGLEPFPGLSRSIENPPSKRARSFSETTVPDP...,,,...,,,,"COILED 108..209; /evidence=""ECO:0000255""",DOMAIN: The EEXXXDDL motif is required for the...,,,"MOTIF 763..770; /note=""EEXXXDL motif""","REGION 1..53; /note=""Disordered""; /evidence=""E...",
3,Q9N077,0.118354,0.1-0.2,Q9N077,reviewed,ATR-interacting protein (ATM and Rad3-related-...,655,LIKNGEIKILRDSLHQTESVLEEQRRSHFLLEQEKTQALSDKEKEF...,,,...,,,,"COILED 6..82; /evidence=""ECO:0000255""",DOMAIN: The EEXXXDDL motif is required for the...,"COMPBIAS 138..157; /note=""Basic and acidic res...",,"MOTIF 633..640; /note=""EEXXXDL motif""","REGION 120..157; /note=""Disordered""; /evidence...",
4,Q18FB4,0.0,0.0-0.1,Q18FB4,reviewed,A-type ATP synthase subunit E,193,MSLDTVVEDIRDEAQARASEIQADADERAEKIIEEAEADAEDILEE...,,,...,,,,,,,,,,


In [18]:
# Number of sampled proteins (rows) in the example feature's dataset.
feature_datasets[example_fid].shape[0]

14