# BICS - Genome Analysis and Visualization

In [25]:
# Connect to Google Drive (make sure you have `metadata.tsv` in your Google Drive "My Drive" directory.)
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from tqdm import tqdm
import re

###### metadata.tsv 733 MB

In [3]:
data = pd.read_csv("/content/drive/MyDrive/metadata.tsv", sep="\t")
with open("/content/drive/MyDrive/reference.gb") as f:
  reference_full_string = f.read()

FileNotFoundError: [Errno 2] File /content/drive/MyDrive/metadata.tsv does not exist: '/content/drive/MyDrive/metadata.tsv'

In [4]:
# Preview data
data.head(n=3)

NameError: name 'data' is not defined

In [5]:
len(data)

NameError: name 'data' is not defined

In [6]:
data.describe()

NameError: name 'data' is not defined

In [7]:
full_data = data
data = full_data[::4]  # quarter of the entire dataset

NameError: name 'data' is not defined

# Reorganize Into Substitution Table
## Make a table (dataframe), where each row represents a single AA substitution

In [8]:
column_dict = {
    "name": "Virus name",
    "date": "Collection date",
    "age": "Patient age",
    "gender": "Gender",
    "version": "Pangolin version",
}

In [9]:
def parse_substitutions(subs):
  """
  Arguments
  - subs: "(NSP15_A283V,NSP12_P323L,Spike_D614G)"
  Returns
  - List of individual substitutions: [
    ("NSP15_A283V", "NSP15", "A", "V", 283),
    ...
  ]
  """
  if pd.isna(subs):  # nan value
    return []

  subs = subs.replace("(", "").replace(")", "")
  subs = subs.split(",")
  result = []
  for s in subs:
    try:
      feature, mut = s.split("_")
      before = mut[0]
      after = mut[-1]
      index = int(mut[1:-1])
    except:  # other types of substitutions
      continue
    result.append((s, feature, before, after, index))
  return result

In [10]:
sub_data = defaultdict(list)
for i, row in tqdm(data.iterrows(), total=len(data)):
  # Identify mutations
  subs = parse_substitutions(row["AA Substitutions"])
  location_info = [s.strip() for s in row["Location"].split("/")]
  continent = location_info[0]
  country = location_info[1]
  city = None
  territory = None
  if len(location_info) > 2:
    territory = location_info[2]
    if len(location_info) > 3:
      city = location_info[3]

  # Copy over columns from the column dict
  for new_column, old_column in column_dict.items():
    sub_data[new_column].extend([row[old_column]] * len(subs))

  for full_sub, feature, before, after, index in subs:
    # Fill mutation columns
    sub_data["sub"].append(full_sub)
    sub_data["sub_feature"].append(feature)
    sub_data["sub_before"].append(before)
    sub_data["sub_after"].append(after)
    sub_data["sub_index"].append(index)

    # Fill location columns
    sub_data["loc_continent"].append(continent)
    sub_data["loc_country"].append(country)
    sub_data["loc_territory"].append(territory)
    sub_data["loc_city"].append(city)

sub_df = pd.DataFrame(sub_data)
sub_df.head()

NameError: name 'data' is not defined

In [11]:
len(data)

NameError: name 'data' is not defined

In [12]:
len(sub_df)

NameError: name 'sub_df' is not defined

In [13]:
sub_df.loc_city.unique()

NameError: name 'sub_df' is not defined

In [14]:
sub_df["sub"].head()

NameError: name 'sub_df' is not defined

In [15]:
unique_subs = sub_df["sub"].unique().tolist()
unique_subs[::5000]

NameError: name 'sub_df' is not defined

In [16]:
len(unique_subs)

NameError: name 'unique_subs' is not defined

In [17]:
sub_df["sub"].value_counts()

NameError: name 'sub_df' is not defined

In [18]:
print(sorted(sub_df["sub_feature"].unique().tolist()))

NameError: name 'sub_df' is not defined

# Mutation Index
Let's figure out where each mutation occured, within the full genome sequence. Let's look at this key: NSP12_D291N.

This means that the mutation occured in NSP12 and the exact location is index 291 (relative to the start of NSP12).

We can use this information to find the exact location of each substituion.

## Find Feature Locations From GenBank File
We need to find the start index of each feature (including proteins, peptides, etc.).

In [19]:
def parse_feature_locations(genbank_path):
  with open(genbank_path) as f:
    gb = f.read()
  gb_oneline = gb.replace("\n", "")
  gen_regex = r'gene.+?(\d*)\.\.(\d*).+?\/gene="(.+?)"'
  pep_regex = r'mat_peptide.+?(\d*)\.\.(\d*).+?/gene="(.+?)".+?/product="(.+?)".+?/note="(.{10})'
  gen_data = re.findall(gen_regex, gb_oneline)
  pep_data = re.findall(pep_regex, gb_oneline)

  # Gene Ranges
  range_dict = dict()
  letters = ["E", "M", "N", "S"]
  for start, end, name in gen_data:
      start = int(start)
      for letter in letters:
          if letter in name:
              range_dict[name] = (start - 1, end)

  # Peptide Ranges
  candidates = []
  for i in range(1, 17):
      candidates.append("nsp{}".format(i))
  candidates.reverse()  # e.g., check nsp11 before nsp1
  for start, end, _, name, note in pep_data:
      start = int(start)
      end = int(end)
      for c in candidates:
          if c in name.lower() or c in note.lower():
              range_dict[c.upper()] = (start - 1, end)
              
  # Spike Protein Ranges
  range_dict["Spike"] = range_dict["S"]
  del range_dict["S"]

  return dict(range_dict)

In [20]:
range_dict = parse_feature_locations("/content/drive/MyDrive/reference.gb")
print(range_dict)

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/reference.gb'

## Filter Features
### Filter rows that contain features for which we have the starting index

In [21]:
full_sub_df = sub_df

sub_df = sub_df[sub_df["sub_feature"].isin(range_dict.keys())]
sub_df

NameError: name 'sub_df' is not defined

## Add Mutation Index To Table
Add the following columns based on the ranges obtained above:

* sub_feature_start
* sub_full_index

In [23]:
start_dict = {key: value[0] for key, value in range_dict.items()}
map_start_index = (lambda feature: start_dict[feature])

sub_df["sub_feature_start"] = sub_df["sub_feature"].apply(map_start_index)
sub_df["sub_full_index"] = sub_df["sub_feature_start"] + sub_df["sub_index"] * 3
# assumes that sub index is the AA index

NameError: name 'range_dict' is not defined

In [24]:
sub_df

NameError: name 'sub_df' is not defined

## Congrats!
We've successfully pre-processed the data into the substitution table. This should help us analyze and visualize stats, related to gene substitutions, much more easily.