# Part 0: Load in libraries

## runs using graph_env - needs d3blocks

In [1]:
import numpy as np
import pandas as pd
import seaborn as sbn
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import cm
import re
import math
import scipy
from scipy.stats import ks_2samp
from matplotlib.patches import Rectangle
import pathlib
from matplotlib import font_manager
from scipy.stats import fisher_exact


# Setup the fonts
fonts_path = pathlib.Path.home().joinpath('fonts') # i.e. `~/fonts` (update as needed)
font_filename = 'ARIAL.TTF'
font_path = fonts_path.joinpath(font_filename)
print(font_path)

font_manager.fontManager.addfont(str(font_path))
prop = font_manager.FontProperties(fname=font_path)

plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = prop.get_name()
plt.rcParams['pdf.fonttype'] = 42 # Makes text editiable
plt.rcParams['ps.fonttype'] = 42

COLOR_MAP = dict()
COLOR_MAP['A']         = '#8c8c8c'
COLOR_MAP['CHNQST']    = '#077a15'
COLOR_MAP['DE']        = '#e23a36'
COLOR_MAP['FWY']       = '#f58220'
COLOR_MAP['G']         = '#86bd8d'
COLOR_MAP['ILMV']      = '#231f20'
COLOR_MAP['KR']        = '#1725c2'
COLOR_MAP['P']         = '#c25fc9'


# Using the global color map, we convert the colormap of amino acid
# groups, to a map against individual amino acids. This
# serves as our internal color map which is used for styling.
aacolor_map=dict()
for amino_acids in COLOR_MAP:
    for amino_acid in amino_acids:
        aacolor_map[amino_acid] = COLOR_MAP[amino_acids]

/home/research/kiersten.ruff/fonts/ARIAL.TTF


# Part 1: Connect nucleoplasm complexes to IDR grammars

## 1.1 - Load in nucleoli complex information

In [2]:
dfcompclust=pd.read_excel('data/Nucleoplasm_processes_w_clusters_IDRs_gte_70_and_nonlinkers_gte_50.xlsx', engine='openpyxl')
dfcompclust=dfcompclust.sort_values(by=['General Process'])
print(dfcompclust)

ccgeneralprocess=dfcompclust['General Process'].tolist()
cccategory=dfcompclust['Category'].tolist()
ccIDR=dfcompclust['IDR'].tolist()
cccluster10=dfcompclust['Cluster 10'].tolist()
cccluster11=dfcompclust['Cluster 11'].tolist()
cccluster28=dfcompclust['Cluster 28'].tolist()
ccclusterany=dfcompclust['Cluster 10, 11, or 28'].tolist()
ccgene=dfcompclust['Gene'].tolist()
ccacc=dfcompclust['Accession'].tolist()

ccgeneralprocess = [x if x != 'Splicing factors' else 'Other splicing factors1' for x in ccgeneralprocess]
ccgeneralprocess = [x if x != 'ASAP complex' else 'ASAP complex1' for x in ccgeneralprocess]
cccategory = [x if x != 'CREBBP/EP301' else 'CREBBP/EP300' for x in cccategory]

uniccgeneralprocess=[]
unicccategory=[]
for s in range(0,len(ccgeneralprocess)):
    if ccgeneralprocess[s] not in uniccgeneralprocess:
        uniccgeneralprocess.append(ccgeneralprocess[s])
    if cccategory[s] not in unicccategory:
        unicccategory.append(cccategory[s])

print(uniccgeneralprocess)
print(unicccategory)
print(len(unicccategory))

     Unnamed: 0 Accession     Gene                           General Process  \
84           84    P51532  SMARCA4              Chromatin remodeling complex   
76           76    O94805   ACTL6B              Chromatin remodeling complex   
75           75    O96019   ACTL6A              Chromatin remodeling complex   
74           74    P60709     ACTB              Chromatin remodeling complex   
73           73    Q969G3  SMARCE1              Chromatin remodeling complex   
..          ...       ...      ...                                       ...   
166         166    Q9NPA8     ENY2  RNA polymerase II-mediated transcription   
167         167    Q92830    KAT2A  RNA polymerase II-mediated transcription   
169         169    Q96ES7    SGF29  RNA polymerase II-mediated transcription   
157         157    P53803   POLR2K  RNA polymerase II-mediated transcription   
0             0    P24863     CCNC  RNA polymerase II-mediated transcription   

                       Category  \
84  

## 1.2 - Create data for plotting

In [3]:
source=[]
target=[]
weight=[]

for i in range(len(uniccgeneralprocess)-1,-1,-1): #range(0,len(uniccgeneralprocess)):
    print(i)
    for j in range(len(unicccategory)-1,-1,-1): #range(0,len(unicccategory)):
        pos1=[m for m, x in enumerate(ccgeneralprocess) if x == uniccgeneralprocess[i]]
        pos2=[m for m, x in enumerate(cccategory) if x == unicccategory[j]]
        posinter=list(set(pos1) & set(pos2))


        if posinter:
            source.append(uniccgeneralprocess[i])
            target.append(unicccategory[j])
            weight.append(len(posinter))

for j in range(len(unicccategory)-1,-1,-1): #range(0,len(unicccategory)):
    pos2=[m for m, x in enumerate(cccategory) if x == unicccategory[j]]        
    tmpidr=0
    tmp10=0
    tmp11=0
    tmp28=0
    for p in pos2:
        if ccIDR[p]=='Yes':
            tmpidr=tmpidr+1
        if cccluster10[p]=='Yes':
            tmp10=tmp10+1
        if cccluster11[p]=='Yes':
            tmp11=tmp11+1
        if cccluster28[p]=='Yes':
            tmp28=tmp28+1

    if tmpidr==0:
        source.append(unicccategory[j])
        target.append('No IDRs in Complex')
        weight.append(1)

    elif tmpidr>0 and tmp10+tmp11+tmp28==0:
        source.append(unicccategory[j])
        target.append('IDRs in complex not in 10, 11, or 28')
        weight.append(tmpidr)

    elif tmpidr>0 and tmp10+tmp11+tmp28>0:
        if tmp11>0:
            source.append(unicccategory[j])
            target.append('IDRs in complex in 11')
            weight.append(tmp11)
        if tmp28>0:
            source.append(unicccategory[j])
            target.append('IDRs in complex in 28')
            weight.append(tmp28)
        if tmp10>0:
            source.append(unicccategory[j])
            target.append('IDRs in complex in 10')
            weight.append(tmp10)


4
3
2
1
0


## 1.3 - Plot data

In [6]:
dftmp=pd.DataFrame()

dftmp['source']=source
dftmp['target']=target
dftmp['weight']=weight

print(dftmp)
print(dftmp[dftmp['source']=='NineTeen complex'])


from d3blocks import D3Blocks
d3 = D3Blocks(chart='Sankey', frame=True)

tmpcolorlist=['#3d2663', '#872f74', '#cb3f73', '#ff6361','#276026', '#008770', '#00aabe', '#64c8ff']


#d3.set_node_properties(dftmp)
d3.set_node_properties(dftmp, padding=1, color={'IDRs in complex in 10':'#1d9bd7','IDRs in complex in 11':'#218342', 'IDRs in complex in 28':'#f58220','IDRs in complex not in 10, 11, or 28':'#dddddd','No IDRs in Complex':'#dddddd', \
                                     'RNA binding':'#276026', 'Enhanceosomes':'#3d2663', 'Chromatin remodeling complex':'#cb3f73', 'Histone modifying complex':'#872f74', 'RNA polymerase II-mediated transcription':'#ff6361'})
#d3.set_node_properties(dftmp, color=mycolorsdict)
d3.set_edge_properties(dftmp, color='target', opacity='target')
#d3.show(figsize=[600, 600])
d3.show(figsize=[500, 800])

[d3blocks] >INFO> Cleaning edge_properties and config parameters..
[d3blocks] >INFO> Initializing [Sankey]
[d3blocks] >INFO> filepath is set to [/tmp/d3blocks/sankey.html]
[d3blocks] >INFO> Convert to DataFrame.
[d3blocks] >INFO> Node properties are set.
[d3blocks] >INFO> Edge properties are set.
[d3blocks] >INFO> Set [figsize]: [500, 800]
[d3blocks] >INFO> File already exists and will be overwritten: [/tmp/d3blocks/sankey.html]


                                            source  \
0         RNA polymerase II-mediated transcription   
1         RNA polymerase II-mediated transcription   
2         RNA polymerase II-mediated transcription   
3         RNA polymerase II-mediated transcription   
4         RNA polymerase II-mediated transcription   
5         RNA polymerase II-mediated transcription   
6                                      RNA binding   
7                        Histone modifying complex   
8                        Histone modifying complex   
9                        Histone modifying complex   
10                       Histone modifying complex   
11                                   Enhanceosomes   
12                                   Enhanceosomes   
13                    Chromatin remodeling complex   
14                    Chromatin remodeling complex   
15                    Chromatin remodeling complex   
16                    Chromatin remodeling complex   
17                    Chroma

[d3blocks] >INFO> Open browser: /tmp/d3blocks/sankey.html


## 1.4 - Save data to excel file

In [5]:
with pd.ExcelWriter('../Supplementary_Tables/Table_S5.xlsx', engine='openpyxl',mode='a') as writer:
    dftmp.to_excel(writer,sheet_name='Fig_5D')