# Combine all tiles

The NS2B/NS3 library was mutagenized in three roughly equal length tiles. We analyzed each tile separately using [`dms_tools2`](https://jbloomlab.github.io/dms_tools2/index.html). In this notebook, we join the resulting data files together.

In [20]:
import os
import pandas as pd

In [None]:
# `snakemake` is not defined in this notebook; presumably it is injected by
# Snakemake when the notebook runs as a workflow rule -- TODO confirm.

# Names of the tiles whose per-tile results are combined below.
tiles = snakemake.params.tiles

# Destination paths for the two combined CSV files this notebook writes.
with_stops_csv = snakemake.output.with_stops_csv
without_stops_csv = snakemake.output.without_stops_csv

## Mutation effects without stop codons

Combine the mutation effects from all tiles.

In [23]:
# Each tile's mutation effects are read from results/<tile>/muteffects/virus_muteffects.csv.
muteffects_paths = {tile: os.path.join("results", tile, 'muteffects', 'virus_muteffects.csv') for tile in tiles}

# Read every tile's table and tag its rows with the tile they came from.
# A comprehension keeps the scratch names (df, tile, path) out of the notebook namespace.
muteffects_dfs = [
    pd.read_csv(path).assign(tile=tile)
    for tile, path in muteffects_paths.items()
]

# ignore_index=True gives the combined frame a single unique 0..n-1 index instead of
# repeating each tile's own 0-based row labels, so label-based lookups stay unambiguous.
muteffects_df = pd.concat(muteffects_dfs, ignore_index=True)
muteffects_df.head()

Unnamed: 0,site,wildtype,mutant,mutation,effect,log2effect,tile
0,1,R,A,R1A,0.003142,-8.3143,tile_1
1,1,R,C,R1C,0.007931,-6.9783,tile_1
2,1,R,D,R1D,0.008227,-6.9253,tile_1
3,1,R,E,R1E,0.004469,-7.8057,tile_1
4,1,R,F,R1F,0.009128,-6.7754,tile_1


## Mutation Preferences without stop codons

Combine the average mutation preferences from each tile.

In [16]:
# Each tile's preferences are read from results/<tile>/prefs/prefs_virus.csv.
preferences_paths = {tile: os.path.join("results", tile, 'prefs', 'prefs_virus.csv') for tile in tiles}

# The preference files are wide: a 'site' column, with every other column treated as
# a mutant. Melt each one to long form (one row per site/mutant pair) so it shares
# the (site, mutant, tile) key used by the mutation-effects table.
preferences_dfs = [
    pd.read_csv(path)
    .melt(id_vars=['site'], var_name='mutant', value_name='preference')
    .sort_values(by=['site', 'mutant'])
    .assign(tile=tile)
    for tile, path in preferences_paths.items()
]

# ignore_index=True renumbers the rows 0..n-1 across all tiles. Without it every tile
# would contribute its own 0-based labels and the combined index would contain
# duplicates; it also makes a per-tile reset_index unnecessary.
preferences_df = pd.concat(preferences_dfs, ignore_index=True)
preferences_df.head()

Unnamed: 0,site,mutant,preference,tile
0,1,A,0.00265,tile_1
1,1,C,0.00669,tile_1
2,1,D,0.00694,tile_1
3,1,E,0.00377,tile_1
4,1,F,0.0077,tile_1


## Merged effects and preferences without stop codons

We'll merge the effects and preferences for each tile and mutation into a single file. We'll also add information about whether a site is in NS2B or NS3.

In [38]:
# Join effects and preferences on the mutation identity within each tile.
# validate='one_to_one' makes pandas raise MergeError if a (site, mutant, tile) key
# is duplicated on either side. Row counts alone cannot prove that, because a
# duplicated key can offset a dropped row and leave the count unchanged.
effects_and_preferences_df = pd.merge(
    muteffects_df,
    preferences_df,
    on=['site', 'mutant', 'tile'],
    how='inner',
    validate='one_to_one',
)

# With unique keys guaranteed above, equal row counts mean the inner join kept
# every row from both inputs.
assert len(effects_and_preferences_df) == len(muteffects_df), (
    "some mutation effects have no matching preference"
)
assert len(effects_and_preferences_df) == len(preferences_df), (
    "some preferences have no matching mutation effect"
)

In [39]:
# Load the site map; its 'sequential_site' column is the join key used when
# annotating the combined mutation tables.
sitemap = pd.read_csv(filepath_or_buffer='data/sitemap.csv')
sitemap.head(5)

Unnamed: 0,reference_site,sequential_site,protein_site,protein
0,(NS2B) 0,1,0,NS2B
1,(NS2B) 1,2,1,NS2B
2,(NS2B) 2,3,2,NS2B
3,(NS2B) 3,4,3,NS2B
4,(NS2B) 4,5,4,NS2B


In [40]:
# Annotate every mutation row with the sitemap columns for its site.
# how='left' keeps all mutation rows; a site absent from the sitemap simply gets
# NaN annotations. validate='many_to_one' raises MergeError if 'sequential_site'
# is duplicated in the sitemap, which would otherwise silently duplicate mutation rows.
sitemap_effects_and_preferences_df = pd.merge(
    effects_and_preferences_df,
    sitemap,
    left_on='site',
    right_on='sequential_site',
    how='left',
    validate='many_to_one',
)
sitemap_effects_and_preferences_df.head()

Unnamed: 0,site,wildtype,mutant,mutation,effect,log2effect,tile,preference,reference_site,sequential_site,protein_site,protein
0,1,R,A,R1A,0.003142,-8.3143,tile_1,0.00265,(NS2B) 0,1,0,NS2B
1,1,R,C,R1C,0.007931,-6.9783,tile_1,0.00669,(NS2B) 0,1,0,NS2B
2,1,R,D,R1D,0.008227,-6.9253,tile_1,0.00694,(NS2B) 0,1,0,NS2B
3,1,R,E,R1E,0.004469,-7.8057,tile_1,0.00377,(NS2B) 0,1,0,NS2B
4,1,R,F,R1F,0.009128,-6.7754,tile_1,0.0077,(NS2B) 0,1,0,NS2B


In [41]:
# Write the combined, annotated table (without stop codons); the row index is omitted.
sitemap_effects_and_preferences_df.to_csv(path_or_buf=without_stops_csv, index=False)

## Mutation effects **with** stop codons

Combine the mutation effects from all tiles.

In [None]:
# Each tile's mutation effects (stop codons included) are read from
# results/<tile>/muteffects_withStops/virus_muteffects.csv.
muteffects_with_stops_paths = {tile: os.path.join("results", tile, 'muteffects_withStops', 'virus_muteffects.csv') for tile in tiles}

# Read every tile's table and tag its rows with the tile they came from.
# A comprehension keeps the scratch names (df, tile, path) out of the notebook namespace.
muteffects_with_stops_dfs = [
    pd.read_csv(path).assign(tile=tile)
    for tile, path in muteffects_with_stops_paths.items()
]

# ignore_index=True gives the combined frame a single unique 0..n-1 index instead of
# repeating each tile's own 0-based row labels.
muteffects_with_stops_df = pd.concat(muteffects_with_stops_dfs, ignore_index=True)
muteffects_with_stops_df.head()

Unnamed: 0,site,wildtype,mutant,mutation,effect,log2effect,tile
0,1,R,A,R1A,0.003142,-8.3143,tile_1
1,1,R,C,R1C,0.007931,-6.9783,tile_1
2,1,R,D,R1D,0.008227,-6.9253,tile_1
3,1,R,E,R1E,0.004469,-7.8057,tile_1
4,1,R,F,R1F,0.009128,-6.7754,tile_1


## Mutation Preferences **with** stop codons

Combine the average mutation preferences from each tile.

In [None]:
# Each tile's preferences (stop codons included) are read from
# results/<tile>/prefs_withStops/prefs_virus.csv.
preferences_with_stops_paths = {tile: os.path.join("results", tile, 'prefs_withStops', 'prefs_virus.csv') for tile in tiles}

# The preference files are wide: a 'site' column, with every other column treated as
# a mutant. Melt each one to long form (one row per site/mutant pair) so it shares
# the (site, mutant, tile) key used by the mutation-effects table.
preferences_with_stops_dfs = [
    pd.read_csv(path)
    .melt(id_vars=['site'], var_name='mutant', value_name='preference')
    .sort_values(by=['site', 'mutant'])
    .assign(tile=tile)
    for tile, path in preferences_with_stops_paths.items()
]

# ignore_index=True renumbers the rows 0..n-1 across all tiles. Without it every tile
# would contribute its own 0-based labels and the combined index would contain
# duplicates; it also makes a per-tile reset_index unnecessary.
preferences_with_stops_df = pd.concat(preferences_with_stops_dfs, ignore_index=True)
preferences_with_stops_df.head()

Unnamed: 0,site,mutant,preference,tile
0,1,A,0.00265,tile_1
1,1,C,0.00669,tile_1
2,1,D,0.00694,tile_1
3,1,E,0.00377,tile_1
4,1,F,0.0077,tile_1


## Merged effects and preferences **with** stop codons

We'll merge the effects and preferences for each tile and mutation into a single file. We'll also add information about whether a site is in NS2B or NS3.

In [None]:
# Merge effects and preferences on the mutation identity within each tile.
# validate='one_to_one' makes pandas raise MergeError if a (site, mutant, tile) key
# is duplicated on either side. Row counts alone cannot prove that, because a
# duplicated key can offset a dropped row and leave the count unchanged.
effects_and_preferences_with_stops_df = pd.merge(
    muteffects_with_stops_df,
    preferences_with_stops_df,
    on=['site', 'mutant', 'tile'],
    how='inner',
    validate='one_to_one',
)

# With unique keys guaranteed above, equal row counts mean the inner join kept
# every row from both inputs.
assert len(effects_and_preferences_with_stops_df) == len(muteffects_with_stops_df), (
    "some mutation effects (with stops) have no matching preference"
)
assert len(effects_and_preferences_with_stops_df) == len(preferences_with_stops_df), (
    "some preferences (with stops) have no matching mutation effect"
)

# Annotate with the sitemap and write out.
# how='left' keeps all mutation rows; validate='many_to_one' raises MergeError if
# 'sequential_site' is duplicated in the sitemap, which would otherwise silently
# duplicate mutation rows in the output file.
sitemap_effects_and_preferences_with_stops_df = pd.merge(
    effects_and_preferences_with_stops_df,
    sitemap,
    left_on='site',
    right_on='sequential_site',
    how='left',
    validate='many_to_one',
)
sitemap_effects_and_preferences_with_stops_df.to_csv(with_stops_csv, index=False)