# Admin

In [24]:
stages=['ee','l3','stage_inv']
cutoffs=[51]
sets=['set2', 'set4', 'set5', 'set6', 'set7']
domain_types=['all','active','regulated']
operon_options=['no','with']
worms=['briggsae', 'inopinata']

In [25]:
import pandas as pd
import numpy as np
import pickle
import scipy
from scipy.stats import chi2_contingency
import matplotlib.pyplot as plt
import statistics
import seaborn as sns
import statistics

In [26]:
# hopefully this should make the script work wherever the 'to_transfer' folder is
root_path= !pwd
root_path=root_path[0]

In [27]:
# genes files you'll need

genes_files={}

for worm in worms:
    genes_files[worm]={}
    genes_files[worm]['e_orths_no_ops_path']=root_path+'/genesets/'+worm+'_e_orths_no_ops.bed'
    genes_files[worm]['b_orths_no_ops_path']=root_path+'/genesets/'+worm+'_b_orths_no_ops.bed'
    genes_files[worm]['e_orths_with_ops_path']=root_path+'/genesets/'+worm+'_e_orths_with_ops.bed'
    genes_files[worm]['b_orths_with_ops_path']=root_path+'/genesets/'+worm+'_b_orths_with_ops.bed'
    
# 'b' refers to 'b worm' which is either briggsae or inopinata
# 'e' refers to elegans

In [29]:
bed_4_columns=['chrom','chromStart','chromEnd','name']

In [30]:
# customise plotting aesthetics

SMALL_SIZE = 12
MEDIUM_SIZE = 14
BIGGER_SIZE = 16

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

plt.rcParams.update({'text.color' : "black",
                     'axes.labelcolor' : "black"})
plt.style.use('ggplot')
#plt.style.available

# Code

In [42]:
with open(root_path+'/made_by_script/location_dicts/final_info_NEW.pickle', 'rb') as handle:
    final_info=pickle.load(handle)

In [43]:
final_info

{'briggsae': {'with': {'all': {51: {'ee': {'set5': {'dd': ['chromV_10814112',
        'chromV_10579308',
        'chromIV_7493843',
        'chromIV_7498054',
        'chromIV_7493843',
        'chromIV_7497850',
        'chromIII_2487768',
        'chromIII_2503422',
        'chromIII_2508817',
        'chromIII_2511736',
        'chromIII_2494454',
        'chromIII_2507972',
        'chromIII_2511736',
        'chromIII_9443830',
        'chromIII_9456562',
        'chromIII_9462996',
        'chromIII_9449931',
        'chromIII_9462996',
        'chromIV_548301',
        'chromIV_559223',
        'chromIV_568361',
        'chromIV_572719',
        'chromIV_583812',
        'chromIV_589029',
        'chromIV_606354',
        'chromIV_615543',
        'chromIV_627536',
        'chromIV_553299',
        'chromIV_568148',
        'chromIV_570875',
        'chromIV_583784',
        'chromIV_587829',
        'chromIV_592485',
        'chromIV_614026',
        'chromIV_620925',
        '

In [45]:
intergen_lengths=[]
intergen_types=[]

for worm in worms:
    for dd_or_dp in ['dd','dp']:
        for domain_type in ['active','regulated']:
            
            df=pd.read_csv(genes_files[worm]['e_orths_no_ops_path'], sep='\t', names=bed_4_columns)
            locs_of_interest=final_info[worm]['with'][domain_type][51]['l3']['set5'][dd_or_dp]
            
            for index, row in df[1:-1].iterrows():
                
                start_loc_name='chrom'+row['chrom']+'_'+str(row['chromStart'])
                end_loc_name='chrom'+row['chrom']+'_'+str(row['chromEnd'])

                # if start loc is a dp/dd loc, what's the intergenic legnth between that loc and neighbouring end loc
                if start_loc_name in locs_of_interest:
                    this_start=df['chromStart'][index]
                    previous_end=df['chromEnd'][index-1]
                    distance=this_start-previous_end
                    if distance>=0: 
                        intergen_lengths.append(distance)
                        intergen_types.append(worm+'_'+domain_type+'_'+dd_or_dp)

                # if end loc is a dp/dd loc, what's the intergenic length between that loc and theneighbouring start loc
                if end_loc_name in locs_of_interest:
                    this_end=df['chromEnd'][index]
                    next_start=df['chromStart'][index+1]
                    distance=next_start-this_end
                    if distance>=0: 
                        intergen_lengths.append(distance)
                        intergen_types.append(worm+'_'+domain_type+'_'+dd_or_dp)

In [47]:
plot_df=pd.DataFrame({'Intergenic Region Length':intergen_lengths, 'Intergenic Region Type':intergen_types})

In [48]:
plot_df

Unnamed: 0,Intergenic Region Length,Intergenic Region Type
0,16153,briggsae_active_dd
1,2023,briggsae_active_dd
2,2023,briggsae_active_dd
3,337,briggsae_active_dd
4,337,briggsae_active_dd
...,...,...
9865,0,inopinata_regulated_dp
9866,5043,inopinata_regulated_dp
9867,0,inopinata_regulated_dp
9868,0,inopinata_regulated_dp


In [None]:
fig, ax = plt.subplots(figsize=(15,10))
ax.ylim=[-5000, 130000]

ax = sns.violinplot(x="Intergenic Region Type", y="Intergenic Region Length", data=plot_df)
plt.xticks(rotation=45)
plt.show()

In [51]:
for worm in worms:
    
    active_dd_subdf=plot_df[plot_df['Intergenic Region Type']==worm+'_'+'active_dd']
    active_dp_subdf=plot_df[plot_df['Intergenic Region Type']==worm+'_'+'active_dp']
    
    active_dd_median=statistics.median(active_dd_subdf['Intergenic Region Length'])
    active_dp_median=statistics.median(active_dp_subdf['Intergenic Region Length'])

    active_expected_preservation_ratio=active_dp_median/active_dd_median
    
    reg_dd_subdf=plot_df[plot_df['Intergenic Region Type']==worm+'_'+'regulated_dd']
    reg_dp_subdf=plot_df[plot_df['Intergenic Region Type']==worm+'_'+'regulated_dp']
    
    reg_dd_median=statistics.median(reg_dd_subdf['Intergenic Region Length'])
    reg_dp_median=statistics.median(reg_dp_subdf['Intergenic Region Length'])
    
    reg_expected_preservation_ratio=reg_dp_median/reg_dd_median
    
    print(worm)
    
    print('active_dd_median', active_dd_median)
    print('reg_dd_median', reg_dd_median)
    print('active_dp_median', active_dp_median)
    print('reg_dp_median', reg_dp_median)

    print('active expected preservation ratio:', active_expected_preservation_ratio)
    print('regulated expected preservation ratio:', reg_expected_preservation_ratio)

briggsae
active_dd_median 556
reg_dd_median 837
active_dp_median 728.0
reg_dp_median 961
active expected preservation ratio: 1.3093525179856116
regulated expected preservation ratio: 1.1481481481481481
inopinata
active_dd_median 460
reg_dd_median 874.0
active_dp_median 668.5
reg_dp_median 1016.5
active expected preservation ratio: 1.4532608695652174
regulated expected preservation ratio: 1.1630434782608696
