In [1]:
import pandas as pd
import os
import subprocess

## Join Tiles

I'll join the mutation effect data from all tiles (with stop codons) into a single dataframe.

In [2]:
# Gather the per-tile mutation-effect tables (stop codons included) that live at
# results/tile_<N>/muteffects_withStops/virus_muteffects.csv and stack them into
# one dataframe, tagging every row with the tile it came from.
tile_dfs = []
# Sort the listing so tiles are loaded in a deterministic order
# (os.listdir returns entries in arbitrary order).
for subdir in sorted(os.listdir("results")):
    # Require the name to START with "tile_"; a plain substring test would also
    # accept names like "old_tile_1" and then fail when parsing the tile number.
    if not subdir.startswith("tile_"):
        continue
    # The tile number is the second underscore-separated field of the name
    try:
        tile_number = int(subdir.split("_")[1])
    except ValueError:
        # Not a numbered tile directory (e.g. "tile_notes"); skip it
        continue
    # Build the full file path
    file_path = os.path.join("results", subdir, "muteffects_withStops", "virus_muteffects.csv")
    # Tiles without a mutation-effects file are skipped
    if not os.path.exists(file_path):
        continue
    # Load the file and record which tile it belongs to
    df = pd.read_csv(file_path)
    df["tile"] = tile_number
    tile_dfs.append(df)

# pd.concat raises an uninformative "No objects to concatenate" error on an
# empty list, so fail early with a message that says what was being looked for.
if not tile_dfs:
    raise ValueError(
        "No mutation-effect files found under "
        "results/tile_*/muteffects_withStops/virus_muteffects.csv"
    )

# Concatenate all dataframes into one, ordered by tile and then by site
all_tiles_df = pd.concat(tile_dfs, ignore_index=True).sort_values(by=['tile', 'site']).reset_index(drop=True)
# Human-readable tile label, e.g. "Tile 1"
all_tiles_df['tile_name'] = 'Tile ' + all_tiles_df['tile'].astype(str)

# Write the dataframe to a CSV file in the dms_viz directory of the results directory;
# exist_ok=True makes the directory creation safe to re-run.
os.makedirs("results/dms_viz", exist_ok=True)
all_tiles_df.to_csv("results/dms_viz/virus_muteffects.csv", index=False)

all_tiles_df.head()

Unnamed: 0,site,wildtype,mutant,mutation,effect,log2effect,tile,tile_name
0,1,R,*,R1*,0.001401,-9.4799,1,Tile 1
1,1,R,A,R1A,0.003133,-8.3181,1,Tile 1
2,1,R,C,R1C,0.007928,-6.9788,1,Tile 1
3,1,R,D,R1D,0.008225,-6.9258,1,Tile 1
4,1,R,E,R1E,0.004463,-7.8079,1,Tile 1


## Make the Sitemap

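I need to map the sites from the reference to the protein structure. For now, I'll assume the mapping is one-to-one: the sequential site equals the reference site, and the protein-structure site is the reference site shifted by a constant offset of 131.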

In [38]:
# Build the sitemap: one row per distinct site in the joined tile data, with
# the columns reference_site, sequential_site and protein_site.

# Constant subtracted from the reference site number to obtain the protein site.
# NOTE(review): presumably the numbering difference between the reference and
# the protein structure — confirm where 131 comes from.
PROTEIN_SITE_OFFSET = 131

# Distinct sites, in order of first appearance in all_tiles_df
unique_sites = all_tiles_df['site'].unique()
# Reference and sequential numbering are taken to be identical
sitemap_df = pd.DataFrame({
    'reference_site': unique_sites,
    'sequential_site': unique_sites,
})
# Shift the reference numbering by the fixed offset to get protein numbering
sitemap_df['protein_site'] = sitemap_df['reference_site'] - PROTEIN_SITE_OFFSET

# Write the dataframe to a CSV file in the dms_viz directory of the results directory;
# exist_ok=True makes the directory creation safe to re-run.
os.makedirs("results/dms_viz", exist_ok=True)
sitemap_df.to_csv("results/dms_viz/sitemap.csv", index=False)

sitemap_df.head()

Unnamed: 0,reference_site,sequential_site,protein_site
0,1,1,-130
1,2,2,-129
2,3,3,-128
3,4,4,-127
4,5,5,-126


## Make the `dms-viz` JSON

In [43]:
# Shell command line for `configure-dms-viz format`, assembled from one
# fragment per flag. The fragments are joined with three spaces and wrapped in
# a leading and trailing newline, giving a single-line command.
_dms_viz_fragments = (
    'configure-dms-viz format',
    '--input "results/dms_viz/virus_muteffects.csv"',
    '--sitemap "results/dms_viz/sitemap.csv"',
    '--output "results/dms_viz/zikv-ns3.json"',
    '--name "ZIKV NS3"',
    '--metric "effect"',
    '--metric-name "Mutation Effect"',
    '--condition "tile_name"',
    '--condition-name "Tile"',
    '--structure "6KK6"',
    '--included-chains "B"',
    # Quoted so the shell passes a literal asterisk instead of expanding a glob
    '--exclude-amino-acids "*"',
    '--description "Deep mutational scanning on the ZIKV NS3 [PDB: 6KK6]"',
    '--title "Zika Virus NS3 DMS [PDB: 6KK6]"',
)
command = "\n" + "   ".join(_dms_viz_fragments) + "\n"

In [44]:
# Execute the command string through the shell; check=True makes a non-zero
# exit status raise CalledProcessError instead of passing silently.
subprocess.run(args=command, check=True, shell=True)


Formatting data for visualization using the 'effect' column from 'results/dms_viz/virus_muteffects.csv'...

Using sitemap from 'results/dms_viz/sitemap.csv'.

Success! The visualization JSON was written to 'results/dms_viz/zikv-ns3.json'


CompletedProcess(args='\nconfigure-dms-viz format   --input "results/dms_viz/virus_muteffects.csv"   --sitemap "results/dms_viz/sitemap.csv"   --output "results/dms_viz/zikv-ns3.json"   --name "ZIKV NS3"   --metric "effect"   --metric-name "Mutation Effect"   --condition "tile_name"   --condition-name "Tile"   --structure "6KK6"   --included-chains "B"   --exclude-amino-acids "*"   --description "Deep mutational scanning on the ZIKV NS3 [PDB: 6KK6]"   --title "Zika Virus NS3 DMS [PDB: 6KK6]"\n', returncode=0)