# Part 0: Load in libraries

## Runs in the `graph_env` environment (requires d3blocks)

In [1]:
import numpy as np
import pandas as pd
import seaborn as sbn
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import cm
import re
import math
import scipy
from scipy.stats import ks_2samp
from matplotlib.patches import Rectangle
import pathlib
from matplotlib import font_manager
from scipy.stats import fisher_exact


# Register a local copy of Arial with matplotlib and make it the default
# sans-serif face. The font file is looked up under `~/fonts`; change the
# directory or file name below if the font lives somewhere else.
fonts_path = pathlib.Path.home() / 'fonts'
font_filename = 'ARIAL.TTF'
font_path = fonts_path / font_filename
print(font_path)

font_manager.fontManager.addfont(str(font_path))
prop = font_manager.FontProperties(fname=font_path)

# Font type 42 keeps the text editable in exported PDF/PS figures.
plt.rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': prop.get_name(),
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})

# Palette keyed by groups of amino acids: every key is a string of
# one-letter residue codes that all share the same hex color.
COLOR_MAP = {
    'A':      '#8c8c8c',
    'CHNQST': '#077a15',
    'DE':     '#e23a36',
    'FWY':    '#f58220',
    'G':      '#86bd8d',
    'ILMV':   '#231f20',
    'KR':     '#1725c2',
    'P':      '#c25fc9',
}

# Expand the grouped palette into a per-residue lookup (one entry per
# single-letter code); this is the internal color map used for styling.
aacolor_map = {}
for residue_group, group_color in COLOR_MAP.items():
    aacolor_map.update(dict.fromkeys(residue_group, group_color))

/home/research/kiersten.ruff/fonts/ARIAL.TTF


# Part 1: Connect spliceosome processes to IDR grammars

## 1.1 - Load in spliceosome complex information

In [2]:
# Read the complex/cluster table and order its rows by general process.
dfcompclust = pd.read_excel(
    'data/Nuclear_speckles_complexes_w_clusters_IDRs_gte_70_and_nonlinkers_gte_50.xlsx',
    engine='openpyxl',
).sort_values(by=['General Process'])
print(dfcompclust)

# Pull each column of interest out as a plain list (row order = sorted order).
ccIDR = dfcompclust['IDR'].tolist()
cccluster18 = dfcompclust['Cluster 18'].tolist()
cccluster26 = dfcompclust['Cluster 26'].tolist()
ccclusterany = dfcompclust['Cluster 18, or 26'].tolist()
ccgene = dfcompclust['Gene'].tolist()
ccacc = dfcompclust['Accession'].tolist()

# Relabel a few entries. The trailing "1" on the process labels presumably
# keeps them distinct from identically named categories -- TODO confirm.
process_relabel = {
    'Splicing factors': 'Other splicing factors1',
    'ASAP complex': 'ASAP complex1',
}
category_relabel = {
    'Other pre-mRNA splicing factors': 'Other splicing factors',
}
ccgeneralprocess = [
    process_relabel.get(label, label)
    for label in dfcompclust['General Process'].tolist()
]
cccategory = [
    category_relabel.get(label, label)
    for label in dfcompclust['Category'].tolist()
]

# Distinct labels, kept in order of first appearance.
uniccgeneralprocess = list(dict.fromkeys(ccgeneralprocess))
unicccategory = list(dict.fromkeys(cccategory))

print(uniccgeneralprocess)
print(unicccategory)
print(len(unicccategory))

     Unnamed: 0 Accession     Gene    General Process  \
286         286    O00422    SAP18       ASAP complex   
285         285    Q15287    RNPS1       ASAP complex   
284         284    Q9UKV3    ACIN1       ASAP complex   
0             0    Q13838   DDX39B  Major Spliceosome   
170         170    Q15459    SF3A1  Major Spliceosome   
..          ...       ...      ...                ...   
365         365    Q9Y247   FAM50B   Splicing factors   
366         366    Q8N1B3     CCNQ   Splicing factors   
367         367    Q06787     FMR1   Splicing factors   
401         401    O00425  IGF2BP3   Splicing factors   
530         530    Q15695  ZRSR2P1   Splicing factors   

                               Category  IDR Cluster 18 Cluster 26  \
286                        ASAP complex   No         No         No   
285                        ASAP complex  Yes         No        Yes   
284                        ASAP complex  Yes        Yes        Yes   
0                Spliceosomal E com

## 1.2 - Create data for plotting

In [3]:
# Edge lists for the Sankey diagram: entry k of `source`, `target` and
# `weight` together describe one link.
source, target, weight = [], [], []


def _matching_rows(values, label):
    """Return the positions in `values` whose entry equals `label`."""
    return [pos for pos, value in enumerate(values) if value == label]


# First layer: general process -> category, weighted by the number of rows
# that carry both labels. Both label lists are walked from last to first.
for i in reversed(range(len(uniccgeneralprocess))):
    print(i)
    process_rows = set(_matching_rows(ccgeneralprocess, uniccgeneralprocess[i]))
    for category in reversed(unicccategory):
        shared_rows = process_rows.intersection(_matching_rows(cccategory, category))
        if shared_rows:
            source.append(uniccgeneralprocess[i])
            target.append(category)
            weight.append(len(shared_rows))

# Second layer: category -> IDR/cluster summary node.
for category in reversed(unicccategory):
    category_rows = _matching_rows(cccategory, category)
    n_idr = sum(1 for row in category_rows if ccIDR[row] == 'Yes')
    n_18 = sum(1 for row in category_rows if cccluster18[row] == 'Yes')
    n_26 = sum(1 for row in category_rows if cccluster26[row] == 'Yes')

    if n_idr == 0:
        # A fixed weight of 1 is used when the category has no IDR rows.
        links = [('No IDRs in Complex', 1)]
    elif n_18 + n_26 == 0:
        links = [('IDRs in complex not 18 or 26', n_idr)]
    else:
        # One link per cluster that has at least one member in this category.
        links = [
            (label, count)
            for label, count in (
                ('IDRs in complex in 18', n_18),
                ('IDRs in complex in 26', n_26),
            )
            if count > 0
        ]

    for label, count in links:
        source.append(category)
        target.append(label)
        weight.append(count)


4
3
2
1
0


## 1.3 - Plot data

In [4]:
# Assemble the link table (one row per Sankey edge) from the edge lists
# built in the previous cell.
dftmp = pd.DataFrame({'source': source, 'target': target, 'weight': weight})

print(dftmp)
print(dftmp[dftmp['source']=='NineTeen complex'])


from d3blocks import D3Blocks
d3 = D3Blocks(chart='Sankey', frame=True)

# Palette kept for reference; it is not used by the calls in this cell.
tmpcolorlist=['#3d2663', '#872f74', '#cb3f73', '#ff6361','#276026', '#008770', '#00aabe', '#64c8ff']

# Explicit colors for the process and IDR summary nodes. Keys must match the
# node labels in `dftmp` exactly. The "not 18 or 26" key previously read
# 'IDRs in complex not in 7, 18, or 23', which matches no node produced by
# the edge-building cell, so that node never received its grey color.
node_colors = {
    'IDRs in complex in 18': '#000000',
    'IDRs in complex in 26': '#1d9bd7',
    'IDRs in complex not 18 or 26': '#dddddd',
    'No IDRs in Complex': '#dddddd',
    'Major Spliceosome': '#276026',
    'Other splicing factors1': '#3d2663',
    'RNA Modification': '#cb3f73',
    'Minor Spliceosome': '#872f74',
    'ASAP complex1': '#ff6361',
}

d3.set_node_properties(dftmp, padding=1, color=node_colors)
# Links are styled (color and opacity) according to their target node.
d3.set_edge_properties(dftmp, color='target', opacity='target')
d3.show(figsize=[500, 800])

                                            source  \
0                          Other splicing factors1   
1                          Other splicing factors1   
2                                 RNA Modification   
3                                 RNA Modification   
4                                Minor Spliceosome   
5                                Major Spliceosome   
6                                Major Spliceosome   
7                                Major Spliceosome   
8                                Major Spliceosome   
9                                Major Spliceosome   
10                               Major Spliceosome   
11                               Major Spliceosome   
12                               Major Spliceosome   
13                               Major Spliceosome   
14                               Major Spliceosome   
15                               Major Spliceosome   
16                               Major Spliceosome   
17                          

[d3blocks] >INFO> Cleaning edge_properties and config parameters..
[d3blocks] >INFO> Initializing [Sankey]
[d3blocks] >INFO> filepath is set to [/tmp/d3blocks/sankey.html]
[d3blocks] >INFO> Convert to DataFrame.
[d3blocks] >INFO> Node properties are set.
[d3blocks] >INFO> Edge properties are set.
[d3blocks] >INFO> Set [figsize]: [500, 800]
[d3blocks] >INFO> File already exists and will be overwritten: [/tmp/d3blocks/sankey.html]
[d3blocks] >INFO> Open browser: /tmp/d3blocks/sankey.html
