# ProWave - WaveNet-based Protein Generation

Authors: Hans Jakob Damsgaard & Lucas Balling

02456 Deep Learning project: ProGen

## Initialization

Run the command below if you have not yet installed the [TAPE project](https://github.com/songlab-cal/tape).

In [None]:
#!pip install tape_proteins

#### Importing needed packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.nn.parameter import Parameter
import tape

#### Import the data

We were unable to make the data download script, `download_data.sh`, run from Jupyter, so instead we ran it manually and simply placed the resulting files in the right folder for TAPE to find them. We import all the data in the LMDB format as it is most easily worked with in Python.

In [None]:
from tape.datasets import LanguageModelingDataset

# Data stored under `<data-path>/data`; switch the active path per machine
#data_path = '/Users/lucasballing/Desktop/DeepLearningProject/prowave-main/data/'
data_path = 'E:/Pfam/data/'

# Load the three splits in order: train, valid, holdout
train_data, valid_data, holdout_data = (
    LanguageModelingDataset(data_path, split_name)
    for split_name in ('train', 'valid', 'holdout')
)

#### Understanding data features

To get a good understanding of the data provided in the imported dataset, we provide plots of certain features and their ranges. TAPE has already split the data into the three required subsets (train, validation, and holdout), so it is also interesting to understand this split.

In [None]:
# Split sizes: (number of entries, number of fields in the first entry)
for _label, _subset in (
    ('Training', train_data),
    ('Validation', valid_data),
    ('Holdout', holdout_data),
):
    print(f'{_label} data has shape ({len(_subset)}, {len(_subset[0])})')

# Original data columns: peek at one raw record from the LMDB file, then
# drop the handle again since it is not needed afterwards
from tape.datasets import LMDBDataset
raw_train = LMDBDataset(data_path + 'pfam/pfam_train.lmdb')
print(f'File data entries look like this: {raw_train[0]}')
del raw_train

# Encoded data columns - every subset comes from the same overall dataset, so
# the columns are shared. Combining what LMDBDataset and
# LanguageModelingDataset tell us, the columns are
# - IUPAC-encoded protein string
# - Input mask (for masked-token prediction)
# - Protein clan
# - Protein family
# The protein ID (i.e., its number within its clan and family) is left out
print(f'Encoded data entries look like this: {train_data[0]}')

In [None]:
def setify(data):
    """
    Produces summary statistics for the provided datasets.

    Every entry of every dataset is read by integer index; the element at
    position 2 of an entry is used as its clan ID and the element at
    position 3 as its family ID.

    Args:
     `data`: a list of datasets, each supporting `len()` and integer indexing

    Returns a tuple of three lists;
     `uniques` two lists (clans first, families second), each holding one set
               of IDs per dataset
     `perclan` one `dict.items()` view per dataset of (clan ID, protein count)
     `perfam`  one `dict.items()` view per dataset of (family ID, protein count)
    """
    # Imported locally so the function does not rely on another cell's imports
    from collections import Counter

    clan_counts = []
    fam_counts = []
    for d in data:
        clans, fams = Counter(), Counter()
        # Index explicitly: only `len(d)` and `d[i]` are assumed of a dataset
        for i in range(len(d)):
            row = d[i]
            clans[row[2]] += 1
            fams[row[3]] += 1
        clan_counts.append(clans)
        fam_counts.append(fams)

    # The keys of each counter are exactly the unique IDs seen in that dataset
    uniques = [[set(c) for c in clan_counts], [set(f) for f in fam_counts]]
    return uniques, [c.items() for c in clan_counts], [f.items() for f in fam_counts]


In [None]:
# Summarise all three splits in one pass; the first element of the result
# holds the per-split ID sets, clans first and families second
results = setify([train_data, valid_data, holdout_data])
clans, families = results[0]

# Number of distinct clans, then distinct families, in each split
for _kind, _per_split in (('clans', clans), ('families', families)):
    for _name, _ids in zip(('training', 'validation', 'holdout'), _per_split):
        print(f'Unique {_kind} in {_name} data {len(_ids)}')

# PRINTS:
# Unique clans in training data 623
# Unique clans in validation data 623
# Unique clans in holdout data 8
# Unique families in training data 17737
# Unique families in validation data 15974
# Unique families in holdout data 28


We will now plot histograms of the number of proteins in each clan and family across all three splits.

In [None]:
# Histograms of protein counts per clan and per family, one figure per split.
# results[1] holds the per-clan (ID, count) pairs and results[2] the
# per-family pairs, each ordered as (training, validation, holdout).
for _group, _per_split in (('Clan', results[1]), ('Family', results[2])):
    for _split, _counts in zip(('Training', 'Validation', 'Holdout'), _per_split):
        df = pd.DataFrame(_counts, columns=[_group, 'Count'])
        # Each row is one unique ID, so an unweighted histogram would count
        # every ID once; weighting by 'Count' makes the bar heights show the
        # number of proteins falling in each bin instead
        sns.displot(df, x=_group, weights='Count')
        # Title names the grouping actually plotted (clan or family)
        plt.title(f'{_split} - Protein count vs {_group}')
        plt.show()
