# Takes tables with VCF-derived variant data for two levels and plots the variants onto the protein sequence of each isoform

In [None]:
import os

import matplotlib.colors as mcolors
import pandas as pd
from matplotlib import pyplot as plt, patches
from matplotlib.gridspec import GridSpec
from matplotlib.patches import Rectangle
from numpy import nan
from pyteomics import fasta

## 1. Table with all the variants ('level 1 table')

Creating a table that contains transcript and protein IDs along with gene names, protein sequences (unmodified) and protein descriptions taken from the FASTA file. For this purpose, the canonical proteome FASTA with all isoforms, downloaded from Ensembl, is used. 

Also creating an 'info' table that contains the number of transcripts and the number of variants belonging to each gene name. This is useful in order to estimate what each plot will look like in the end. 

In [None]:
# Load the level-1 variant table and give the transcript column the name used for merging.
table = pd.read_table(
    'results/MD_var_level1_unfiltered.tsv')
table = table.rename(columns={"transcriptID": "transcript_id"})
transcripts = table['transcript_id'].tolist()
# A set gives constant-time membership tests while scanning the FASTA records.
wanted_transcripts = set(transcripts)

# One annotation record per FASTA entry whose transcript ID occurs in the table.
data_names = []
with fasta.read(
    'input/ensembl_reference_proteinDB_tagged.fa') as f:
    for prot in f:
        description = prot.description
        # Entries lacking either tag can be neither matched nor named, so skip them.
        if 'transcript:' not in description or 'gene_symbol:' not in description:
            continue
        # Transcript ID is the text after 'transcript:' up to the first '.'.
        transcript_ID = description.split('transcript:')[1].split('.')[0]
        if transcript_ID not in wanted_transcripts:
            continue
        gene_name = description.split('gene_symbol:')[1].split(' ')[0]
        # Protein ID is the second field once '.' and '|' are both treated as separators.
        proteinID = description.replace('.', '|').split('|')[1]
        data_names.append({'transcript_id': transcript_ID,
                           'gene_name': gene_name,
                           'description': description,
                           'unmodified_sequence': prot.sequence,
                           'protein_id': proteinID})

# Fixing the columns keeps the merge working even when no FASTA record matched.
name_frame = pd.DataFrame(
    data_names,
    columns=['transcript_id', 'gene_name', 'description',
             'unmodified_sequence', 'protein_id'],
).drop_duplicates().reset_index(drop=True)
table_names = pd.merge(table, name_frame, how='outer', on='transcript_id').replace(nan, '')
# Keep only the variants whose transcript received a gene name from the FASTA file.
var_gene_names = table_names[table_names['gene_name'] != ''].reset_index(drop=True)

#######
# Per-gene summary: number of distinct transcripts and distinct protein changes.
genes = var_gene_names['gene_name'].unique()
info_df = (
    var_gene_names
    .groupby('gene_name', sort=False)
    .agg(transcripts=('transcript_id', 'nunique'),
         variants=('protein_change', 'nunique'))
    .reset_index()
    .rename(columns={'gene_name': 'gene'})
)
# Written under 'intermediate/' because the plotting section reads the files from there.
os.makedirs('intermediate', exist_ok=True)
info_df.to_csv(
    'intermediate/info_level1_prot.csv',
    header=True, index=False)
#######

var_gene_names.to_csv(
    'intermediate/all_variants_gene_names_prot_level1.csv',
    header=True, index=False)


In [None]:
# Display the level-1 variants that were matched to a gene name.
var_gene_names

In [None]:
# Display the level-1 rows whose reading_frame value equals -1.
var_gene_names.loc[var_gene_names['reading_frame'].eq(-1)]

In [None]:
# Display the per-gene transcript and variant counts for level 1.
info_df

## 2. The same for the 'level 2' more confident filtered variants

In [None]:
# Load the level-2 variant table and give the transcript column the name used for merging.
table2 = pd.read_table('results/MD_var_level2_unfiltered.tsv')
table2 = table2.rename(columns={"transcriptID": "transcript_id"})
transcripts2 = table2['transcript_id'].tolist()
# A set gives constant-time membership tests while scanning the FASTA records.
wanted_transcripts2 = set(transcripts2)

# One annotation record per FASTA entry whose transcript ID occurs in the level-2 table.
data_names2 = []
with fasta.read('input/ensembl_reference_proteinDB_tagged.fa') as f:
    for prot in f:
        description = prot.description
        # Entries lacking either tag can be neither matched nor named, so skip them.
        if 'transcript:' not in description or 'gene_symbol:' not in description:
            continue
        # Transcript ID is the text after 'transcript:' up to the first '.'.
        transcript_ID = description.split('transcript:')[1].split('.')[0]
        if transcript_ID not in wanted_transcripts2:
            continue
        gene_name = description.split('gene_symbol:')[1].split(' ')[0]
        # Protein ID is the second field once '.' and '|' are both treated as separators.
        proteinID = description.replace('.', '|').split('|')[1]
        data_names2.append({'transcript_id': transcript_ID,
                            'gene_name': gene_name,
                            'description': description,
                            'unmodified_sequence': prot.sequence,
                            'protein_id': proteinID})

# Fixing the columns keeps the merge working even when no FASTA record matched.
name_frame2 = pd.DataFrame(
    data_names2,
    columns=['transcript_id', 'gene_name', 'description',
             'unmodified_sequence', 'protein_id'],
).drop_duplicates().reset_index(drop=True)
table_names2 = pd.merge(table2, name_frame2, how='outer', on='transcript_id').replace(nan, '')
# Keep only the variants whose transcript received a gene name from the FASTA file.
var_gene_names2 = table_names2[table_names2['gene_name'] != ''].reset_index(drop=True)

#######
# Per-gene summary for level 2. It must be derived from the level-2 table
# (var_gene_names2); using the level-1 table here would just duplicate the
# level-1 counts in the level-2 info file.
genes2 = var_gene_names2['gene_name'].unique()
info_df2 = (
    var_gene_names2
    .groupby('gene_name', sort=False)
    .agg(transcripts=('transcript_id', 'nunique'),
         variants=('protein_change', 'nunique'))
    .reset_index()
    .rename(columns={'gene_name': 'gene'})
)
os.makedirs('intermediate', exist_ok=True)
info_df2.to_csv('intermediate/info_level2_prot.csv', header=True, index=False)
#######

var_gene_names2.to_csv('intermediate/all_variants_gene_names_prot_level2.csv', header=True, index=False)


## 3. Plotting data from two tables in one plot

In [None]:
# Reload the level-1 and level-2 tables from the CSV files under 'intermediate/'
# (presumably so that plotting can start here without re-running the sections
# above -- the original note was left unfinished).
info_df, var_gene_names, info_df2, var_gene_names2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'intermediate/info_level1_prot.csv',
        'intermediate/all_variants_gene_names_prot_level1.csv',
        'intermediate/info_level2_prot.csv',
        'intermediate/all_variants_gene_names_prot_level2.csv',
    )
)

In [None]:
# Genes present in both levels: level-2 genes that also occur in the level-1 table.
# The explicit intersection protects the two-row plotting loop, which looks every
# gene up in the level-1 tables.
in_level1 = var_gene_names2['gene_name'].isin(var_gene_names['gene_name'])
genes = var_gene_names2.loc[in_level1, 'gene_name'].unique()
genes

In [None]:
# Creating plots for the genes present in both levels. These plots will have two rows of dots: one for each level.
# code for all the genes

def _variant_positions(changes):
    """Return the sorted, de-duplicated plot positions for protein changes.

    Each change is expected to look like '<int>:<rest>'; the plotted position
    is that integer plus one (presumably converting a 0-based offset to a
    1-based residue number -- confirm against the table format). Entries that
    start with '-' and missing/empty entries are ignored.
    """
    return sorted({
        int(change.split(':')[0]) + 1
        for change in changes
        if isinstance(change, str) and change and not change.startswith('-')
    })


os.makedirs('figures', exist_ok=True)

for gene in genes:
    gene_df = var_gene_names[var_gene_names['gene_name'] == gene]
    gene_df2 = var_gene_names2[var_gene_names2['gene_name'] == gene]

    # One subplot row per level-1 transcript of this gene.
    transcript_counts = info_df.loc[info_df['gene'] == gene, 'transcripts']
    if transcript_counts.empty:
        print(gene, 'skipped: gene not found in the level-1 info table')
        continue
    # .iloc[0] avoids the deprecated int() conversion of a one-element Series.
    number_of_transcr = int(transcript_counts.iloc[0])
    fig = plt.figure(figsize=(55, (number_of_transcr * 2 - 1)), constrained_layout=True)
    gs = GridSpec(number_of_transcr, 1, figure=fig)

    pos_of_subplot = 0
    for proteinID in gene_df2['protein_id'].unique().tolist():
        single_prot_df = gene_df[gene_df['protein_id'] == proteinID]
        single_prot_df2 = gene_df2[gene_df2['protein_id'] == proteinID]

        ### position1 / position2
        sequence_pos = _variant_positions(single_prot_df['protein_change'].drop_duplicates())
        sequence_pos2 = _variant_positions(single_prot_df2['protein_change'].drop_duplicates())
        # Nothing to draw when either level has no plottable position
        # (this also covers a protein that is absent from the level-1 table).
        if not sequence_pos or not sequence_pos2:
            continue

        # Level-1 dots sit on y=1, level-2 dots on y=2.
        y_int_number_list = [1] * len(sequence_pos)
        y_numbers_2_list = [2] * len(sequence_pos2)

        ax = fig.add_subplot(gs[pos_of_subplot, :])
        ax.scatter(sequence_pos, y_int_number_list, s=20, label="all_var", color='darkcyan')
        ax.scatter(sequence_pos2, y_numbers_2_list, s=20, label="short_list", color='navy')
        # NOTE(review): the bar ends at the last level-1 variant position, not at
        # the full protein length -- confirm this is the intended extent.
        ax.add_patch(Rectangle((0, 1.2), max(sequence_pos), 0.6, color='gold'))
        ax.yaxis.set_visible(False)
        ax.spines[["left", "top", "right", "bottom"]].set_visible(False)
        ax.text(0.5, 0.5, proteinID, fontsize=18)
        ax.tick_params(labelsize=18)
        pos_of_subplot += 1

    fig.savefig(f'figures/{gene}_protein.png', dpi=300, format='png')
    print(gene)
    plt.close('all')

## 4. Plotting the rest of genes from the level1 table without the level2 variants

These plots will have just one row of dots

In [None]:
# Genes that occur in the level-1 table but not in the level-2 table.
level1_gene_set = set(var_gene_names['gene_name'].unique())
level1_only_genes = level1_gene_set.difference(var_gene_names2['gene_name'].unique())
level1_only_genes

In [None]:
# One figure per level-1-only gene, with one subplot row per protein isoform
# and a single row of dots (level-1 variants).
os.makedirs('figures', exist_ok=True)

for gene in level1_only_genes:
    gene_df = var_gene_names[var_gene_names['gene_name'] == gene]

    # One subplot row per level-1 transcript of this gene.
    transcript_counts = info_df.loc[info_df['gene'] == gene, 'transcripts']
    if transcript_counts.empty:
        print(gene, 'skipped: gene not found in the level-1 info table')
        continue
    # .iloc[0] avoids the deprecated int() conversion of a one-element Series.
    number_of_transcr = int(transcript_counts.iloc[0])
    fig = plt.figure(figsize=(55, (number_of_transcr * 2 - 1)), constrained_layout=True)
    gs = GridSpec(number_of_transcr, 1, figure=fig)

    pos_of_subplot = 0
    for proteinID in gene_df['protein_id'].unique().tolist():
        single_prot_df = gene_df[gene_df['protein_id'] == proteinID]
        prot_change = single_prot_df['protein_change'].drop_duplicates().tolist()

        ### position1
        # Plot position is the integer before ':' plus one; entries starting
        # with '-' and missing/empty entries are ignored. A sorted set yields
        # unique positions in ascending order.
        sequence_pos = sorted({
            int(change.split(':')[0]) + 1
            for change in prot_change
            if isinstance(change, str) and change and not change.startswith('-')
        })
        if not sequence_pos:
            continue

        # All level-1 dots sit on y=1.
        y_int_number_list = [1] * len(sequence_pos)

        ax = fig.add_subplot(gs[pos_of_subplot, :])
        ax.scatter(sequence_pos, y_int_number_list, s=20, label="all_var", color='darkcyan')
        # NOTE(review): the bar ends at the last variant position, not at the
        # full protein length -- confirm this is the intended extent.
        ax.add_patch(Rectangle((0, 1.2), max(sequence_pos), 0.6, color='gold'))
        ax.yaxis.set_visible(False)
        ax.spines[["left", "top", "right", "bottom"]].set_visible(False)
        ax.text(0.5, 0.5, proteinID, fontsize=18)
        ax.tick_params(labelsize=18)
        pos_of_subplot += 1

    fig.savefig(f'figures/{gene}_protein.png', dpi=300, format='png')
    print(gene)
    plt.close('all')
