# Analysis of the distribution of pathogenicity categories in consequence types

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## 1. Import unfiltered Ensembl exon variant table that contains only variants from dbSNP

In [None]:
# Unfiltered Ensembl variants.
# This file has been produced by the '1_Reference_Ensembl_table' notebook.
# The first 15 columns (by position) are read as plain strings.
ens_csv_path = 'MD_genes_exon_variants_Ens_unfiltered_dbSNP.csv'
string_converters = dict.fromkeys(range(15), str)
ens = pd.read_csv(ens_csv_path, converters=string_converters)
ens

In [None]:
# Keep only the variants whose clinical significance is not the empty
# list literal '[]', and renumber the remaining rows from 0.
has_clinical_significance = ens['clinical_significance'] != '[]'
clin_sign = ens[has_clinical_significance].reset_index(drop=True)
clin_sign

## 2. Work with clinical significance types

In [None]:
# List the distinct consequence types, in order of first appearance.
pd.unique(clin_sign['consequence_type'])

In [None]:
# Replace the stringified lists in 'clinical_significance' with real lists:
# delete every quote and square bracket, then split on ', '.
list_markup_remover = str.maketrans('', '', "'[]")
new_clin_sig = [
    raw_value.translate(list_markup_remover).split(', ')
    for raw_value in clin_sign['clinical_significance']
]
clin_sign['clinical_significance'] = new_clin_sig
clin_sign

## 3. Work with pathogenicity categories

In [None]:
# Collect every distinct clinical-significance label, keeping the order in
# which the labels first appear (dict keys preserve insertion order).
clin_sig_types = list(dict.fromkeys(
    label
    for labels in clin_sign['clinical_significance']
    for label in labels
))

In [None]:
# Display the collected clinical-significance labels.
clin_sig_types

In [None]:
# Distinct consequence types; they become the columns of the count matrix.
consequence_types = pd.unique(clin_sign['consequence_type'])

In [None]:
# Creating and checking the components for the matrix.
# For every significance label, select the variants whose significance list
# consists solely of that label, then print how many of them fall into each
# consequence type.

# The set of consequence types does not change inside the loop, so it is
# computed once here instead of once per significance label.
all_consequence_types = clin_sign['consequence_type'].unique()

sig_dfs = []
for sig_type in clin_sig_types:
    df_one_sig_type = clin_sign[clin_sign.clinical_significance.apply(
        lambda lst: all(x == sig_type for x in lst))].reset_index(drop=True)
    sig_dfs.append(df_one_sig_type)
    for cons_type in all_consequence_types:
        print(sig_type, cons_type, len(df_one_sig_type[df_one_sig_type['consequence_type'] == cons_type]))

In [None]:
# Build the significance x consequence count matrix.
# One row per significance label; a variant is counted for a label only when
# every entry of its significance list equals that label.

matrix_colnames = ['significance', *consequence_types]
print(matrix_colnames)

sig_dfs = []
matrix_rows = []
for sig_type in clin_sig_types:
    only_this_label = clin_sign['clinical_significance'].apply(
        lambda labels: all(label == sig_type for label in labels))
    df_one_sig_type = clin_sign[only_this_label].reset_index(drop=True)
    sig_dfs.append(df_one_sig_type)

    consequence_column = df_one_sig_type['consequence_type']
    per_consequence_counts = [
        int((consequence_column == cons_type).sum())
        for cons_type in consequence_types
    ]
    matrix_rows.append([sig_type, *per_consequence_counts])

matrix_df = pd.DataFrame(matrix_rows, columns=matrix_colnames)
matrix_df

## 4. Heat map

In [None]:
# Use the significance label as the row index so that only the numeric
# count columns remain as data for the heat maps.
matrix = matrix_df.set_index('significance')
matrix

In [None]:
# Row labels of the matrix as a plain Python list.
matrix.index.tolist()

In [None]:
# Heat map of the absolute counts for all categories.
sns.heatmap(data=matrix, cmap="YlGnBu")

In [None]:
# Normalized by consequence: every column is rescaled to percentages of its
# own total (multiply by 100 first, then divide by the column sum).

column_totals = matrix.sum()
df_norm_row = matrix * 100 / column_totals

sns.heatmap(data=df_norm_row, cmap="YlGnBu")

In [None]:
# Getting rid of the variants with uncertain significance and of the
# categories with almost no values: keep only these four rows, in this order.
kept_significance_rows = ['pathogenic', 'likely pathogenic', 'likely benign', 'benign']
short = matrix.loc[kept_significance_rows]
short

In [None]:
# Select the consequence columns in a fixed order chosen for display.
ordered_consequences = [
    'protein_altering_variant',
    'coding_sequence_variant',
    'missense_variant',
    'frameshift_variant',
    'stop_gained',
    'splice_donor_variant',
    'splice_acceptor_variant',
    'splice_region_variant',
    'splice_polypyrimidine_tract_variant',
    'splice_donor_5th_base_variant',
    '3_prime_UTR_variant',
    '5_prime_UTR_variant',
    'stop_lost',
    'start_lost',
    'stop_retained_variant',
    'inframe_deletion',
    'inframe_insertion',
    'non_coding_transcript_exon_variant',
    'synonymous_variant',
]
short_short = short[ordered_consequences]
short_short

In [None]:
# Heat map of the absolute counts for the reduced table.
sns.heatmap(data=short_short, cmap="YlGnBu")

In [None]:
# Normalized by consequence: express each column of the reduced table as
# percentages of that column's total.

short_column_totals = short_short.sum()
df_norm_row = short_short * 100 / short_column_totals

sns.heatmap(data=df_norm_row, cmap="YlGnBu")

In [None]:
# Draw the normalized heat map again and write the current figure to SVG.
sns.heatmap(data=df_norm_row, cmap="YlGnBu")

ensembl_svg_path = 'pathogenicity_Ensembl_heatmat_norm.svg'
plt.savefig(ensembl_svg_path, bbox_inches="tight", format='svg')

# Analysis of the distribution of pathogenicity categories in consequence types in the dataset of the variants associated with any phenotype

## 5. Now the same procedure, but for the variants exported from ClinVar

These variants are exported from ClinVar for all phenotypes

In [None]:
# Load the ClinVar export with pandas' default type inference
# (no converters are passed here, unlike the Ensembl table).
ClinVar = pd.read_csv('All_ClinVar.csv')
ClinVar

In [None]:
# Inspect the distinct raw values of the 'Molecular consequences' column.
pd.unique(ClinVar['Molecular consequences'])

In [None]:
# Split every comma-separated 'Molecular consequences' string into a list,
# dropping the empty pieces that consecutive or trailing commas produce.
# The original loop also built this list once more per row as a bare
# expression and discarded it; that dead statement has been removed.
# NOTE(review): assumes every value is a string - a missing value (NaN)
# would raise AttributeError on .split; confirm against the data.
new_cons = [
    [i for i in cons.split(',') if i]
    for cons in ClinVar['Molecular consequences']
]
new_cons

In [None]:
# Give the two columns used below snake_case names, then replace the raw
# consequence strings with the lists prepared in new_cons.
column_aliases = {
    "Molecular consequences": "molecular_consequences",
    "Most severe clinical significance": "clinical_significance",
}
ClinVar = ClinVar.rename(columns=column_aliases)
ClinVar['molecular_consequences'] = new_cons
ClinVar

In [None]:
# Distinct molecular consequences across all variants, in order of first
# appearance (dict keys preserve insertion order).
cons_types = list(dict.fromkeys(
    consequence
    for consequences in ClinVar['molecular_consequences']
    for consequence in consequences
))
cons_types

In [None]:
# Distinct clinical-significance values in the ClinVar table; this rebinds
# clin_sig_types, replacing the list built for the Ensembl data.
clin_sig_types = pd.unique(ClinVar['clinical_significance'])
clin_sig_types

In [None]:
# Build the count table for ClinVar: one row per consequence type, one
# column per clinical significance. A variant is counted for a consequence
# only when every entry of its consequence list equals that consequence.
# The table is then transposed so that rows are significances, matching the
# layout used for the Ensembl matrix.
matrix_colnames = ['consequence', *clin_sig_types]
print(matrix_colnames)

sig_dfs = []
matrix_rows = []
for cons_type in cons_types:
    only_this_consequence = ClinVar['molecular_consequences'].apply(
        lambda consequences: all(item == cons_type for item in consequences))
    df_one_cons_type = ClinVar[only_this_consequence].reset_index(drop=True)
    sig_dfs.append(df_one_cons_type)

    significance_column = df_one_cons_type['clinical_significance']
    per_significance_counts = [
        int((significance_column == clin_sig_type).sum())
        for clin_sig_type in clin_sig_types
    ]
    matrix_rows.append([cons_type, *per_significance_counts])

matrix_df = pd.DataFrame(matrix_rows, columns=matrix_colnames)
matrix = matrix_df.set_index('consequence').transpose()
matrix

In [None]:
# Heat map of the absolute ClinVar counts.
sns.heatmap(data=matrix, cmap="YlGnBu")

In [None]:
# Normalized by consequence: each column becomes percentages of its total.

clinvar_column_totals = matrix.sum()
df_norm_row = matrix * 100 / clinvar_column_totals
sns.heatmap(data=df_norm_row, cmap="YlGnBu")

In [None]:
# Keep only the listed significance rows of the ClinVar matrix, in this order.
kept_clinvar_rows = [
    'pathogenic',
    'likely-pathogenic',
    'likely-benign',
    'benign',
    'risk-factor',
    'conflicting-interpretations-of-pathogenicity',
]
short_CV = matrix.loc[kept_clinvar_rows]
short_CV

In [None]:
# Select the ClinVar consequence columns in a fixed display order.
short_short_CV = short_CV[['missense variant',
                     'frameshift variant',
                     'splice donor variant',
                     'splice acceptor variant',
                     'nonsense (stop gained)',
                     'stop lost',
                     '3 prime UTR variant',
                     '5 prime UTR variant',
                     'nc transcript variant',
                     '2KB upstream variant',
                     '500B downstream variant',
                     'synonymous variant']]
# Show the table that was just built. The cell previously displayed
# short_short, which is the reduced Ensembl table, not this ClinVar one.
short_short_CV

In [None]:
# Normalized by consequence: express each column of the reduced ClinVar
# table as percentages of that column's total.

short_cv_column_totals = short_short_CV.sum()
df_norm_row = short_short_CV * 100 / short_cv_column_totals
sns.heatmap(data=df_norm_row, cmap="YlGnBu")

In [None]:
# Draw the normalized ClinVar heat map again and write the current figure
# to SVG.
sns.heatmap(data=df_norm_row, cmap="YlGnBu")

clinvar_svg_path = 'pathogenicity_ClinVar_heatmat_normalized.svg'
plt.savefig(clinvar_svg_path, bbox_inches="tight", format='svg')