# Comparing BUSCO results for assemblies

* **Eudicot BUSCOs comparing purge_dups v1 & v3**
    * Total assembly sumamary
    * Comparing duplicates
    * Comparing primary assemblies
    * Comparing primary and alternate assemblies
    
* **Viridiplatae BUSCOs comparing v1-4** 
    * total assembly summary

* **Embryophata BUSCOs comparing v1-v4**
    * Total assembly sumamary
    * Comparing primary assemblies
    * Comparing primary and alternate assemblies

In [1]:
%matplotlib inline

In [2]:
import os
import matplotlib.pyplot as plt
import pandas as pd

## purge_dups Eudicot buscos assembly summary comparison

In [3]:
buscoinfn = "/home/jemimah/analysis/busco/20200602/Walli.v1.purge_dups.eudicots_odb10/short_summary.specific.eudicots_odb10.Walli.v1.purge_dups.eudicots_odb10.txt"
bheader=['v1', 'category']
busco_df = pd.read_csv(buscoinfn, sep = '\t', header = None, skiprows = 8, usecols = [1,2], names=bheader)
busco_df = busco_df.set_index("category")

temp_infn = "/home/jemimah/analysis/busco/20200602/Walli.v3.purge_dups.eudicots_odb10/short_summary.specific.eudicots_odb10.Walli.v3.purge_dups.eudicots_odb10.txt"
temp_header=['v3', "category"]
temp_df = pd.read_csv(temp_infn, sep = '\t', header = None, skiprows = 8, usecols = [1, 2], names=temp_header)
temp_df = temp_df.set_index("category")
busco_df["v3"] = temp_df["v3"]

In [4]:
origonal_headings = list(busco_df.index.values)
new_heading = [origonal_headings[-1]] + origonal_headings[:3] + ['Complete and two copy BUSCOs'] + origonal_headings[3:-1]

In [5]:
def to_percents(temp_both):
    temp_percents = ((temp_both/temp_both.loc["Total BUSCO groups searched"])*100).round(2)
    rename_percents = temp_percents.rename(index=lambda s: s + "%")
    df_both_percents = temp_both[:1].append(rename_percents[1:])
    return df_both_percents

In [6]:
%%bash
cd ~/analysis/busco/20200602/Walli.v1.purge_dups.eudicots_odb10/run_eudicots_odb10/busco_sequences/multi_copy_busco_sequences/
echo v1
for file in *.fna; do grep -c ">" $file; done | sort | uniq -c
cd ~/analysis/busco/20200602/Walli.v3.purge_dups.eudicots_odb10/run_eudicots_odb10/busco_sequences/multi_copy_busco_sequences/
echo v3
for file in *.fna; do grep -c ">" $file; done | sort | uniq -c

v1
   1667 2
     95 3
      7 4
      2 5
      1 6
      1 7
v3
   1801 2
     95 3
      8 4
      2 5
      1 6
      1 7


#### BUSCO Results comparison - numbers

In [7]:
true_duplicates = {'category': "Complete and two copy BUSCOs", 'v1':[1667], 'v3':[1801]}
df_true_duplicates = pd.DataFrame(true_duplicates).set_index("category")
busco_df = (busco_df[-1:]).append(busco_df[0:3]).append(df_true_duplicates).append(busco_df[3:-1])
busco_df

Unnamed: 0_level_0,v1,v3
category,Unnamed: 1_level_1,Unnamed: 2_level_1
Total BUSCO groups searched,2326,2326
Complete BUSCOs (C),2095,2210
Complete and single-copy BUSCOs (S),322,302
Complete and duplicated BUSCOs (D),1773,1908
Complete and two copy BUSCOs,1667,1801
Fragmented BUSCOs (F),15,17
Missing BUSCOs (M),216,99


#### BUSCO results comparison - percents

In [8]:
to_percents(busco_df)

Unnamed: 0_level_0,v1,v3
category,Unnamed: 1_level_1,Unnamed: 2_level_1
Total BUSCO groups searched,2326.0,2326.0
Complete BUSCOs (C)%,90.07,95.01
Complete and single-copy BUSCOs (S)%,13.84,12.98
Complete and duplicated BUSCOs (D)%,76.23,82.03
Complete and two copy BUSCOs%,71.67,77.43
Fragmented BUSCOs (F)%,0.64,0.73
Missing BUSCOs (M)%,9.29,4.26


## purge_dups buscos of primary vs alternate

### Collecting Info

In [9]:
buscoseqinfn_v1 = "/home/jemimah/analysis/busco/20200602/Walli.v1.purge_dups.eudicots_odb10/run_eudicots_odb10/full_table.tsv"
buscoseqinfn_v3 = "/home/jemimah/analysis/busco/20200602/Walli.v3.purge_dups.eudicots_odb10/run_eudicots_odb10/full_table.tsv"

In [10]:
eudicot_header = "Busco_id      Status  Sequence        Gene_Start      Gene_End        Score   Length".split()
embryophyta_header = "Busco_id      Status  Sequence        Gene_Start      Gene_End        Score   Length  OrthoDB_url     Description ".split()

In [11]:
#function takes an infile and a way to identify which contigs are 
#from the alternate assembly
#also now takes a header because the eudicots have fewer columnsthan the embryophata??
#returns a df or can save as own file
def purged_buscos(infn, identifier, seqheader):
    #read infile
    buscoseq_df = pd.read_csv(infn, sep = '\t', header = None, names=seqheader, comment = "#")
    #create a df of just duplicated buscos
#Duplicates Only
#    duplicated_df = buscoseq_df[buscoseq_df["Status"] =='Duplicated']  
#Whole Assembly
    duplicated_df = buscoseq_df[buscoseq_df["Status"] !='Missing']
    #create a list of duplicated buscos ids
    dup_ids = list(duplicated_df["Busco_id"].unique())
    
    #prep for the for loop
    tig_seqs = []
    tig_count = []
    tig_status = []
    hap_tig_seqs = []
    hap_tig_count = []
    hap_tig_status = []
    total_count = []

    #for each busco, sort the contigs into primary or alternate assembly
    for busco in dup_ids:
        temp_df = duplicated_df[duplicated_df["Busco_id"] == busco]
        total_count += [len(temp_df)]
        temp_seqs = list(temp_df["Sequence"])
        temp_hap = []
        temp_tig = []
        temp_hap_stat = []
        temp_tig_stat = []
        for seq in temp_seqs:
            if identifier in seq:
                temp_hap += [seq]
                temp_hap_stat += list(temp_df[temp_df["Sequence"] == seq]["Status"])
            else:
                temp_tig += [seq]
                temp_tig_stat += list(temp_df[temp_df["Sequence"] == seq]["Status"])
        tig_seqs += [temp_tig]
        tig_count += [len(temp_tig)]
        tig_status += [temp_tig_stat]
        hap_tig_seqs += [temp_hap]
        hap_tig_count += [len(temp_hap)]
        hap_tig_status += [temp_hap_stat]

    #collate result
    data = list(zip(dup_ids, tig_seqs, tig_status, tig_count, hap_tig_seqs, hap_tig_status, hap_tig_count, total_count))
    cols = ['Busco_id','primary_seqs', 'pri_status','primary_count', 'alternate_seqs', 'alt_status', 'alternate_count', 'total_count']
    
    #create df from that
    dup_count_df = pd.DataFrame(data, columns = cols)
    #save as own file
#Duplicates Only:
#    outpath = infn[:-14]+"duplicates_assembly_sorting.tsv"
#Whole assembly:
#    outpath = infn[:-14]+"all_buscos_assembly_sorting.tsv"
#    dup_count_df.to_csv(outpath, sep='\t', header = True, index = False)


In [12]:
purged_buscos(buscoseqinfn_v3, "hap", eudicot_header)
purged_buscos(buscoseqinfn_v1, "hap", eudicot_header)

In [13]:
buscodupsinfn_v1 = "/home/jemimah/analysis/busco/20200602/Walli.v1.purge_dups.eudicots_odb10/run_eudicots_odb10/duplicates_assembly_sorting.tsv"
buscodupsinfn_v3 = "/home/jemimah/analysis/busco/20200602/Walli.v3.purge_dups.eudicots_odb10/run_eudicots_odb10/duplicates_assembly_sorting.tsv"

dups_samples = {"v1":buscodupsinfn_v1, "v3":buscodupsinfn_v3 }

In [14]:
#dup_count_df = pd.read_csv(buscodupsinfn_v3, sep = '\t')
#dup_count_df = dup_count_df.set_index("Busco_id")
#dup_count_df

In [15]:
headings = ["Two-Copy Duplicates %", "Two Copy (one in each assembly) %", "Two Copy (both in primary assembly) %", "Two Copy (both in alternate assembly) %", "More Than Two-Copy Duplicates %", "More Than Two-Copy Duplicates (in both assemblies) %"]
df_dict = {'category': headings}

for v in dups_samples:
    dup_count_df = pd.read_csv(dups_samples[v], sep = '\t')
    dup_count_df = dup_count_df.set_index("Busco_id")

    a = len(dup_count_df[(dup_count_df["total_count"] == 2)])/busco_df.loc["Total BUSCO groups searched", v] *100
    b = len(dup_count_df[(dup_count_df["total_count"] == 2) & (dup_count_df["alternate_count"] == 1)])/busco_df.loc["Total BUSCO groups searched", v] *100 
    c = len(dup_count_df[(dup_count_df["total_count"] == 2) & (dup_count_df["alternate_count"] == 0)])/busco_df.loc["Total BUSCO groups searched", v] *100 
    d = len(dup_count_df[(dup_count_df["total_count"] == 2) & (dup_count_df["alternate_count"] == 2)])/busco_df.loc["Total BUSCO groups searched", v] *100
    e = len(dup_count_df[dup_count_df["total_count"] > 2])/busco_df.loc["Total BUSCO groups searched", v] *100  
    f = len(dup_count_df[(dup_count_df["total_count"] > 2) & (dup_count_df["alternate_count"] >= 1) & (dup_count_df["primary_count"] >= 1)])/busco_df.loc["Total BUSCO groups searched", v] *100 
    
    list_ = [a, b, c, d, e, f]
    df_dict[v] = list_
    
df_dups = pd.DataFrame(df_dict)
df_pri = df_dups.set_index("category")

### Duplicates Buscos comparison

In [16]:
df_pri

Unnamed: 0_level_0,v1,v3
category,Unnamed: 1_level_1,Unnamed: 2_level_1
Two-Copy Duplicates %,71.6681,77.429063
Two Copy (one in each assembly) %,60.361135,66.809974
Two Copy (both in primary assembly) %,10.83405,10.146174
Two Copy (both in alternate assembly) %,0.472915,0.472915
More Than Two-Copy Duplicates %,4.55718,4.600172
More Than Two-Copy Duplicates (in both assemblies) %,4.213242,4.299226


this table shows the % of all buscos which meet the category requirements. eg. for v3 77% of all buscos are 2 copy duplicates. This matches the result for "complete and 2 copy buscos" from the earlier table, shown again below.

In [17]:
to_percents(busco_df)

Unnamed: 0_level_0,v1,v3
category,Unnamed: 1_level_1,Unnamed: 2_level_1
Total BUSCO groups searched,2326.0,2326.0
Complete BUSCOs (C)%,90.07,95.01
Complete and single-copy BUSCOs (S)%,13.84,12.98
Complete and duplicated BUSCOs (D)%,76.23,82.03
Complete and two copy BUSCOs%,71.67,77.43
Fragmented BUSCOs (F)%,0.64,0.73
Missing BUSCOs (M)%,9.29,4.26


### Buscos of Primary Assemblies ONLY

In [18]:
buscoallinfn_v1 = "/home/jemimah/analysis/busco/20200602/Walli.v1.purge_dups.eudicots_odb10/run_eudicots_odb10/all_buscos_assembly_sorting.tsv"
buscoallinfn_v3 = "/home/jemimah/analysis/busco/20200602/Walli.v3.purge_dups.eudicots_odb10/run_eudicots_odb10/all_buscos_assembly_sorting.tsv"

all_samples = {"v1":buscoallinfn_v1, "v3":buscoallinfn_v3 }

In [19]:
#count_df = (pd.read_csv(buscoallinfn_v3, sep = '\t')).set_index("Busco_id")
#count_df[count_df["pri_status"].str.contains("Frag") | count_df["alt_status"].str.contains("Frag")]
#count_df.head()

In [20]:
another_heading = new_heading[:3] + ["Whole assembly single-copy BUSCOs"] + new_heading[3:]
pri_dict = {'category': another_heading}
both_dict = {'category': another_heading}

for v in all_samples:
    count_df = (pd.read_csv(all_samples[v], sep = '\t')).set_index("Busco_id")
    
    total = busco_df.loc["Total BUSCO groups searched", v]   
    f = len(count_df[count_df["pri_status"].str.contains("Frag")])  
    c = len(count_df[count_df["primary_count"] > 0])
    s = len(count_df[count_df["primary_count"] == 1 & (count_df["pri_status"].str.contains("Frag") == False)])
    d = len(count_df[count_df['primary_count'] > 1])
    two = len(count_df[count_df['primary_count'] == 2])
    one = len(count_df[count_df['pri_status'].str.contains("Complete")])
    m = total - (c+f) 

    fa = len(count_df[count_df["alt_status"].str.contains("Frag")])  
    ca = len(count_df[count_df["alternate_count"] > 0])
    sa = len(count_df[count_df["alternate_count"] == 1 & (count_df["alt_status"].str.contains("Frag") == False)])
    da = len(count_df[count_df['alternate_count'] > 1])
    twoa = len(count_df[count_df['alternate_count'] == 2])
    onea = len(count_df[count_df['alt_status'].str.contains("Complete")])
    ma = total - (ca+fa) 


    list_pri = [total,c,s,one,d,two,f,m]
    list_alt = [total,ca,sa,onea,da,twoa,fa,ma]
    both_dict[v + "_pri"] = list_pri
    pri_dict[v + "_pri"] = list_pri
    both_dict[v + "_alt"] = list_alt
    

df_both = pd.DataFrame(both_dict)
df_both = df_both.set_index("category")
df_pri = pd.DataFrame(pri_dict)
df_pri = df_pri.set_index("category")

In [21]:
df_pri

Unnamed: 0_level_0,v1_pri,v3_pri
category,Unnamed: 1_level_1,Unnamed: 2_level_1
Total BUSCO groups searched,2326,2326
Complete BUSCOs (C),1958,2077
Complete and single-copy BUSCOs (S),1661,1796
Whole assembly single-copy BUSCOs,193,177
Complete and duplicated BUSCOs (D),291,274
Complete and two copy BUSCOs,283,268
Fragmented BUSCOs (F),6,7
Missing BUSCOs (M),362,242


In [22]:
to_percents(df_pri)

Unnamed: 0_level_0,v1_pri,v3_pri
category,Unnamed: 1_level_1,Unnamed: 2_level_1
Total BUSCO groups searched,2326.0,2326.0
Complete BUSCOs (C)%,84.18,89.29
Complete and single-copy BUSCOs (S)%,71.41,77.21
Whole assembly single-copy BUSCOs%,8.3,7.61
Complete and duplicated BUSCOs (D)%,12.51,11.78
Complete and two copy BUSCOs%,12.17,11.52
Fragmented BUSCOs (F)%,0.26,0.3
Missing BUSCOs (M)%,15.56,10.4


### Primary and Alternate Assembly

In [23]:
df_both

Unnamed: 0_level_0,v1_pri,v1_alt,v3_pri,v3_alt
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Total BUSCO groups searched,2326,2326,2326,2326
Complete BUSCOs (C),1958,1654,2077,1804
Complete and single-copy BUSCOs (S),1661,1556,1796,1703
Whole assembly single-copy BUSCOs,193,129,177,125
Complete and duplicated BUSCOs (D),291,89,274,91
Complete and two copy BUSCOs,283,83,268,83
Fragmented BUSCOs (F),6,9,7,10
Missing BUSCOs (M),362,663,242,512


In [24]:
to_percents(df_both)

Unnamed: 0_level_0,v1_pri,v1_alt,v3_pri,v3_alt
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Total BUSCO groups searched,2326.0,2326.0,2326.0,2326.0
Complete BUSCOs (C)%,84.18,71.11,89.29,77.56
Complete and single-copy BUSCOs (S)%,71.41,66.9,77.21,73.22
Whole assembly single-copy BUSCOs%,8.3,5.55,7.61,5.37
Complete and duplicated BUSCOs (D)%,12.51,3.83,11.78,3.91
Complete and two copy BUSCOs%,12.17,3.57,11.52,3.57
Fragmented BUSCOs (F)%,0.26,0.39,0.3,0.43
Missing BUSCOs (M)%,15.56,28.5,10.4,22.01


## Viridiplantae BUSCOs comparison for v1-4

Eudicots wasn't running for v2 and v4 (freezing???) but viridiplatae worked so lets have a squiz.

In [25]:
buscoinfn_v1_v = "/home/jemimah/analysis/busco/20200615/Walli.v1.purge_dups.viridiplantae_odb10/short_summary.specific.viridiplantae_odb10.Walli.v1.purge_dups.viridiplantae_odb10.txt"
buscoinfn_v2_v = "/home/jemimah/analysis/busco/20200615/Walli.v2.ctg_combined.viridiplantae_odb10/short_summary.specific.viridiplantae_odb10.Walli.v2.ctg_combined.viridiplantae_odb10.txt"
buscoinfn_v3_v = "/home/jemimah/analysis/busco/20200615/Walli.v3.purge_dups.viridiplantae_odb10/short_summary.specific.viridiplantae_odb10.Walli.v3.purge_dups.viridiplantae_odb10.txt"
buscoinfn_v4_v = "/home/jemimah/analysis/busco/20200615/Walli.v4.ctg_combined.viridiplantae_odb10/short_summary.specific.viridiplantae_odb10.Walli.v4.ctg_combined.viridiplantae_odb10.txt"


v_samples = {"v1":buscoinfn_v1_v, "v2":buscoinfn_v2_v, "v3":buscoinfn_v3_v, "v4":buscoinfn_v4_v}

In [26]:
first_header=['v1', 'category']
busco_df_v = pd.read_csv(buscoinfn_v1_v, sep = '\t', header = None, skiprows = 8, usecols = [1,2], names=first_header)
busco_df_v = busco_df_v.set_index("category")

for v in v_samples:
    temp_header=[v, 'category']
    temp_df = pd.read_csv(v_samples[v], sep = '\t', header = None, skiprows = 8, usecols = [1, 2], names=temp_header)
    temp_df = temp_df.set_index("category")
    busco_df_v[v] = temp_df[v]

busco_df_v = busco_df_v[-1:].append(busco_df_v[:-1])

### Whole Assemblies Viridiplantae BUSCO stats

In [27]:
busco_df_v

Unnamed: 0_level_0,v1,v2,v3,v4
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Total BUSCO groups searched,425,425,425,425
Complete BUSCOs (C),412,410,418,414
Complete and single-copy BUSCOs (S),72,59,49,40
Complete and duplicated BUSCOs (D),340,351,369,374
Fragmented BUSCOs (F),2,0,1,1
Missing BUSCOs (M),11,15,6,10


In [28]:
to_percents(busco_df_v)

Unnamed: 0_level_0,v1,v2,v3,v4
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Total BUSCO groups searched,425.0,425.0,425.0,425.0
Complete BUSCOs (C)%,96.94,96.47,98.35,97.41
Complete and single-copy BUSCOs (S)%,16.94,13.88,11.53,9.41
Complete and duplicated BUSCOs (D)%,80.0,82.59,86.82,88.0
Fragmented BUSCOs (F)%,0.47,0.0,0.24,0.24
Missing BUSCOs (M)%,2.59,3.53,1.41,2.35


Decided 425 was too small a sample size to be sure about differences, so also running embryophyta (1614 buscos)

## Embryophata BUSCOs comparison v1-4

In [29]:
buscoinfn_v1_em = "/home/jemimah/analysis/busco/20200615/Walli.v1.purge_dups.embryophyta_odb10/short_summary.specific.embryophyta_odb10.Walli.v1.purge_dups.embryophyta_odb10.txt"
buscoinfn_v2_em = "/home/jemimah/analysis/busco/20200615/Walli.v2.ctg_combined.embryophyta_odb10/short_summary.specific.embryophyta_odb10.Walli.v2.ctg_combined.embryophyta_odb10.txt"
buscoinfn_v3_em = "/home/jemimah/analysis/busco/20200615/Walli.v3.purge_dups.embryophyta_odb10/short_summary.specific.embryophyta_odb10.Walli.v3.purge_dups.embryophyta_odb10.txt"
buscoinfn_v4_em = "/home/jemimah/analysis/busco/20200615/Walli.v4.ctg_combined.embryophyta_odb10/short_summary.specific.embryophyta_odb10.Walli.v4.ctg_combined.embryophyta_odb10.txt"


em_samples = {"v1":buscoinfn_v1_em, "v2":buscoinfn_v2_em, "v3":buscoinfn_v3_em, "v4":buscoinfn_v4_em}

In [30]:
first_header=['v1', 'category']
busco_df_em = pd.read_csv(buscoinfn_v1_em, sep = '\t', header = None, skiprows = 8, usecols = [1,2], names=first_header)
busco_df_em = busco_df_em.set_index("category")

for v in em_samples:
    temp_header=[v, 'category']
    temp_df = pd.read_csv(em_samples[v], sep = '\t', header = None, skiprows = 8, usecols = [1, 2], names=temp_header)
    temp_df = temp_df.set_index("category")
    busco_df_em[v] = temp_df[v]

busco_df_em = busco_df_em[-1:].append(busco_df_em[:-1])

In [2]:
%%bash
cd ~/analysis/busco/20200615/Walli.v1.purge_dups.embryophyta_odb10/run_embryophyta_odb10/busco_sequences/multi_copy_busco_sequences/
echo v1
for file in *.fna; do grep -c ">" $file; done | sort | uniq -c
%%bash
cd ~/analysis/busco/20200615/Walli.v2.ctg_combined.embryophyta_odb10/run_embryophyta_odb10/busco_sequences/multi_copy_busco_sequences/
echo v2
for file in *.fna; do grep -c ">" $file; done | sort | uniq -c
%%bash
cd ~/analysis/busco/20200615/Walli.v3.purge_dups.embryophyta_odb10/run_embryophyta_odb10/busco_sequences/multi_copy_busco_sequences/
echo v3
for file in *.fna; do grep -c ">" $file; done | sort | uniq -c
%%bash
cd ~/analysis/busco/20200615/Walli.v4.ctg_combined.embryophyta_odb10/run_embryophyta_odb10/busco_sequences/multi_copy_busco_sequences/
echo v4
for file in *.fna; do grep -c ">" $file; done | sort | uniq -c

v1
v2
v3
v4


bash: line 1: cd: /home/jemimah/analysis/busco/20200615/Walli.v1.purge_dups.embryophyta_odb10/run_embryophyta_odb10/busco_sequences/multi_copy_busco_sequences/: No such file or directory
grep: *.fna: No such file or directory
bash: line 4: fg: no job control
bash: line 5: cd: /home/jemimah/analysis/busco/20200615/Walli.v2.ctg_combined.embryophyta_odb10/run_embryophyta_odb10/busco_sequences/multi_copy_busco_sequences/: No such file or directory
grep: *.fna: No such file or directory
bash: line 8: fg: no job control
bash: line 9: cd: /home/jemimah/analysis/busco/20200615/Walli.v3.purge_dups.embryophyta_odb10/run_embryophyta_odb10/busco_sequences/multi_copy_busco_sequences/: No such file or directory
grep: *.fna: No such file or directory
bash: line 12: fg: no job control
bash: line 13: cd: /home/jemimah/analysis/busco/20200615/Walli.v4.ctg_combined.embryophyta_odb10/run_embryophyta_odb10/busco_sequences/multi_copy_busco_sequences/: No such file or directory
grep: *.fna: No such file or d

In [32]:
true_duplicates_em = {'category': "Complete and two copy BUSCOs", 'v1':[1208], 'v2':[1221], 'v3':[1308], 'v4':[1328]}
df_true_duplicates_em = pd.DataFrame(true_duplicates_em).set_index("category")
busco_df_em = busco_df_em[:4].append(df_true_duplicates_em).append(busco_df_em[4:])

### Stats on whole assemblies: 

In [33]:
busco_df_em

Unnamed: 0_level_0,v1,v2,v3,v4
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Total BUSCO groups searched,1614,1614,1614,1614
Complete BUSCOs (C),1491,1482,1568,1557
Complete and single-copy BUSCOs (S),205,190,182,158
Complete and duplicated BUSCOs (D),1286,1292,1386,1399
Complete and two copy BUSCOs,1208,1221,1308,1328
Fragmented BUSCOs (F),8,10,5,10
Missing BUSCOs (M),115,122,41,47


In [34]:
to_percents(busco_df_em)

Unnamed: 0_level_0,v1,v2,v3,v4
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Total BUSCO groups searched,1614.0,1614.0,1614.0,1614.0
Complete BUSCOs (C)%,92.38,91.82,97.15,96.47
Complete and single-copy BUSCOs (S)%,12.7,11.77,11.28,9.79
Complete and duplicated BUSCOs (D)%,79.68,80.05,85.87,86.68
Complete and two copy BUSCOs%,74.85,75.65,81.04,82.28
Fragmented BUSCOs (F)%,0.5,0.62,0.31,0.62
Missing BUSCOs (M)%,7.13,7.56,2.54,2.91


### looking at split assemblies 

In [35]:
buscoseqinfn_v1_em = "/home/jemimah/analysis/busco/20200615/Walli.v1.purge_dups.embryophyta_odb10/run_embryophyta_odb10/full_table.tsv"
buscoseqinfn_v2_em = "/home/jemimah/analysis/busco/20200615/Walli.v2.ctg_combined.embryophyta_odb10/run_embryophyta_odb10/full_table.tsv"
buscoseqinfn_v3_em = "/home/jemimah/analysis/busco/20200615/Walli.v3.purge_dups.embryophyta_odb10/run_embryophyta_odb10/full_table.tsv"
buscoseqinfn_v4_em = "/home/jemimah/analysis/busco/20200615/Walli.v4.ctg_combined.embryophyta_odb10/run_embryophyta_odb10/full_table.tsv"

for canu assemblies, alternate contigs start with "hap", for hifiasm they start with "a_ctg" while primary are "p_ctg"

In [36]:
purged_buscos(buscoseqinfn_v1_em, "hap", embryophyta_header)
purged_buscos(buscoseqinfn_v2_em, "a_ctg", embryophyta_header)
purged_buscos(buscoseqinfn_v3_em, "hap", embryophyta_header)
purged_buscos(buscoseqinfn_v4_em, "a_ctg", embryophyta_header)

ADD FULL PATH

In [37]:
purged_buscos_v1 = "/home/jemimah/analysis/busco/20200615/Walli.v1.purge_dups.embryophyta_odb10/run_embryophyta_odb10/all_buscos_assembly_sorting.tsv"
purged_buscos_v2 = "/home/jemimah/analysis/busco/20200615/Walli.v2.ctg_combined.embryophyta_odb10/run_embryophyta_odb10/all_buscos_assembly_sorting.tsv"
purged_buscos_v3 = "/home/jemimah/analysis/busco/20200615/Walli.v3.purge_dups.embryophyta_odb10/run_embryophyta_odb10/all_buscos_assembly_sorting.tsv"
purged_buscos_v4 = "/home/jemimah/analysis/busco/20200615/Walli.v4.ctg_combined.embryophyta_odb10/run_embryophyta_odb10/all_buscos_assembly_sorting.tsv"

In [38]:
all_samples_em = {"v1": purged_buscos_v1, "v2": purged_buscos_v2, "v3": purged_buscos_v3, "v4": purged_buscos_v4}

In [39]:
both_dict_em = {'category': another_heading}
pri_dict_em = {'category': another_heading} 

for v in all_samples_em:
    count_df = (pd.read_csv(all_samples_em[v], sep = '\t')).set_index("Busco_id")
    
    total = busco_df_em.loc["Total BUSCO groups searched", v]   
    f = len(count_df[count_df["pri_status"].str.contains("Frag")])  
    c = len(count_df[count_df["primary_count"] > 0])
    s = len(count_df[count_df["primary_count"] == 1 & (count_df["pri_status"].str.contains("Frag") == False)])
    d = len(count_df[count_df['primary_count'] > 1])
    two = len(count_df[count_df['primary_count'] == 2])
    one = len(count_df[count_df['pri_status'].str.contains("Complete")])
    m = total - (c+f) 

    fa = len(count_df[count_df["alt_status"].str.contains("Frag")])  
    ca = len(count_df[count_df["alternate_count"] > 0])
    sa = len(count_df[count_df["alternate_count"] == 1 & (count_df["alt_status"].str.contains("Frag") == False)])
    da = len(count_df[count_df['alternate_count'] > 1])
    twoa = len(count_df[count_df['alternate_count'] == 2])
    onea = len(count_df[count_df['alt_status'].str.contains("Complete")])
    ma = total - (ca+fa) 


    list_pri = [total,c,s,one,d,two,f,m]
    list_alt = [total,ca,sa,onea,da,twoa,fa,ma]
    both_dict_em[v + "_pri"] = list_pri
    pri_dict_em[v+ "_pri"] = list_pri
    both_dict_em[v + "_alt"] = list_alt
    

df_both_em = pd.DataFrame(both_dict_em)
df_both_em = df_both_em.set_index("category")
df_pri_em = pd.DataFrame(pri_dict_em)
df_pri_em = df_pri_em.set_index("category")

### Embryophata  BUSCOs v1-4, primary assemblies only

in numbers:

In [40]:
df_pri_em

Unnamed: 0_level_0,v1_pri,v2_pri,v3_pri,v4_pri
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Total BUSCO groups searched,1614,1614,1614,1614
Complete BUSCOs (C),1400,1390,1479,1470
Complete and single-copy BUSCOs (S),1189,1312,1288,1422
Whole assembly single-copy BUSCOs,122,111,104,85
Complete and duplicated BUSCOs (D),208,72,189,43
Complete and two copy BUSCOs,200,62,183,42
Fragmented BUSCOs (F),3,6,2,5
Missing BUSCOs (M),211,218,133,139


in percents:

In [41]:
to_percents(df_pri_em)

Unnamed: 0_level_0,v1_pri,v2_pri,v3_pri,v4_pri
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Total BUSCO groups searched,1614.0,1614.0,1614.0,1614.0
Complete BUSCOs (C)%,86.74,86.12,91.64,91.08
Complete and single-copy BUSCOs (S)%,73.67,81.29,79.8,88.1
Whole assembly single-copy BUSCOs%,7.56,6.88,6.44,5.27
Complete and duplicated BUSCOs (D)%,12.89,4.46,11.71,2.66
Complete and two copy BUSCOs%,12.39,3.84,11.34,2.6
Fragmented BUSCOs (F)%,0.19,0.37,0.12,0.31
Missing BUSCOs (M)%,13.07,13.51,8.24,8.61


### Embryophata  BUSCOs v1-4, primary and alternate assemblies

in numbers:

In [42]:
df_both_em

Unnamed: 0_level_0,v1_pri,v1_alt,v2_pri,v2_alt,v3_pri,v3_alt,v4_pri,v4_alt
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Total BUSCO groups searched,1614,1614,1614,1614,1614,1614,1614,1614
Complete BUSCOs (C),1400,1190,1390,1320,1479,1299,1470,1453
Complete and single-copy BUSCOs (S),1189,1121,1312,1252,1288,1226,1422,1380
Whole assembly single-copy BUSCOs,122,83,111,79,104,78,85,73
Complete and duplicated BUSCOs (D),208,64,72,64,189,70,43,68
Complete and two copy BUSCOs,200,59,62,56,183,62,42,57
Fragmented BUSCOs (F),3,5,6,4,2,3,5,5
Missing BUSCOs (M),211,419,218,290,133,312,139,156


in percents:

In [43]:
to_percents(df_both_em)

Unnamed: 0_level_0,v1_pri,v1_alt,v2_pri,v2_alt,v3_pri,v3_alt,v4_pri,v4_alt
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Total BUSCO groups searched,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0
Complete BUSCOs (C)%,86.74,73.73,86.12,81.78,91.64,80.48,91.08,90.02
Complete and single-copy BUSCOs (S)%,73.67,69.45,81.29,77.57,79.8,75.96,88.1,85.5
Whole assembly single-copy BUSCOs%,7.56,5.14,6.88,4.89,6.44,4.83,5.27,4.52
Complete and duplicated BUSCOs (D)%,12.89,3.97,4.46,3.97,11.71,4.34,2.66,4.21
Complete and two copy BUSCOs%,12.39,3.66,3.84,3.47,11.34,3.84,2.6,3.53
Fragmented BUSCOs (F)%,0.19,0.31,0.37,0.25,0.12,0.19,0.31,0.31
Missing BUSCOs (M)%,13.07,25.96,13.51,17.97,8.24,19.33,8.61,9.67


In [44]:
df_paper = busco_df_em.copy()
df_paper["v3_pri"] = df_pri_em["v3_pri"]
df_paper["v3_alt"] = df_both_em["v3_alt"]

In [45]:
df_paper[["v3", "v3_pri", "v3_alt"]]

Unnamed: 0_level_0,v3,v3_pri,v3_alt
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Total BUSCO groups searched,1614,1614,1614
Complete BUSCOs (C),1568,1479,1299
Complete and single-copy BUSCOs (S),182,1288,1226
Complete and duplicated BUSCOs (D),1386,189,70
Complete and two copy BUSCOs,1308,183,62
Fragmented BUSCOs (F),5,2,3
Missing BUSCOs (M),41,133,312


In [46]:
df_paper = busco_df.copy()
df_paper["v3_pri"] = df_pri["v3_pri"]
df_paper["v3_alt"] = df_both["v3_alt"]

In [47]:
df_paper[["v3", "v3_pri", "v3_alt"]]

Unnamed: 0_level_0,v3,v3_pri,v3_alt
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Total BUSCO groups searched,2326,2326,2326
Complete BUSCOs (C),2210,2077,1804
Complete and single-copy BUSCOs (S),302,1796,1703
Complete and duplicated BUSCOs (D),1908,274,91
Complete and two copy BUSCOs,1801,268,83
Fragmented BUSCOs (F),17,7,10
Missing BUSCOs (M),99,242,512


In [49]:
to_percents(df_paper[["v3", "v3_pri", "v3_alt"]])

Unnamed: 0_level_0,v3,v3_pri,v3_alt
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Total BUSCO groups searched,2326.0,2326.0,2326.0
Complete BUSCOs (C)%,95.01,89.29,77.56
Complete and single-copy BUSCOs (S)%,12.98,77.21,73.22
Complete and duplicated BUSCOs (D)%,82.03,11.78,3.91
Complete and two copy BUSCOs%,77.43,11.52,3.57
Fragmented BUSCOs (F)%,0.73,0.3,0.43
Missing BUSCOs (M)%,4.26,10.4,22.01
