In [1]:
import gspread
from oauth2client.service_account import ServiceAccountCredentials

In [2]:
import pprint
# Shared pretty-printer; used further down to display the sheet's header row.
pp = pprint.PrettyPrinter()

In [3]:
# Statistics to copy into the spreadsheet. The order is significant: the
# extracted values are returned in this order and written to consecutive rows.
labels_of_interest = [
    'N50 scaffold length',
    'Longest scaffold',
    'scaffold %N',
    'L50 scaffold count',
    'Number of scaffolds',
    'Number of contigs',
    'Longest contig',
]

# Sample identifiers for which a stats file is available.
I_have_stats_for = (
    'FRAX06 FRAX07 FRAX09 FRAX11 FRAX14 FRAX15 FRAX16 '
    'FRAX19 FRAX20 FRAX21 FRAX23 FRAX25 FRAX26 FRAX27 '
    'FRAX28 FRAX29 FRAX30 FRAX31 FRAX32 FRAX33 FRAX34'
).split()

In [4]:
# Authorise a gspread client from a service-account JSON key file.
# The key file path is relative to the directory the notebook runs in.
scope = ['https://spreadsheets.google.com/feeds']
creds = ServiceAccountCredentials.from_json_keyfile_name('../sheets_backend_secret.json', scope)
# `client` is the handle used below to open the spreadsheet by title.
client = gspread.authorize(creds)

In [5]:
# Open the 'Summary' worksheet of the stats spreadsheet.
sheet = client.open('Summary_of_de_novo_assembly_stats').worksheet('Summary')
# Header cells of row 4 (A4..AH4); matching_col() searches these to find a
# sample's column.
labels = sheet.range('A4:AH4')
# Show only the cell text so the header row can be checked by eye.
pp.pprint([x.value for x in labels])


['',
 'F. angustifolia subsp. angustifolia (FRAX01)',
 'F. apertisquamifera (FRAX02)',
 'F. caroliniana (FRAX03)',
 'F. dipetala (FRAX04)',
 'F. latifolia (FRAX05)',
 'F. mandshurica (FRAX06)',
 'F. ornus (FRAX07)',
 'F. paxiana (FRAX08)',
 'F. pennsylvannica (FRAX09)',
 'F. pennsylvannica (FRAX10)',
 'F. quadrangulata (FRAX11)',
 'F. sieboldiana (FRAX12)',
 'F. velutina (FRAX13)',
 'F. americana (FRAX14)',
 'F. angustifolia subsp. oxycarpa  (FRAX15)',
 'F. angustifolia subsp. syriaca (FRAX16)',
 '',
 '',
 'F. goodingii (FRAX19)',
 'F. greggii (FRAX20)',
 'F. griffithii (FRAX21)',
 'F. nigra (FRAX23)',
 '',
 'F. xanthoxyloides (FRAX25)',
 'F. albicans (FRAX26)',
 'F. anomala (FRAX27)',
 'F. baroniana (FRAX28)',
 'F. bungeana (FRAX29)',
 'F. chinensis [subsp. chinensis?] (FRAX30)',
 'F. cuspidata  (FRAX31)',
 'F. floribunda  (FRAX32)',
 'F. platypoda (FRAX33)',
 'F. uhdei (FRAX34)']


# Assemblathon Conversion

In [9]:
import re
import os
# Folder containing the per-sample '<FRAXnn>_CLC_SSPACE_GAPCLOSER_ASSEMBLETHON.txt'
# stats files. NOTE(review): absolute, machine-specific path — adjust before
# running elsewhere.
BASE_DIR = 'D:\\josiah\\Documents\\Research\\Thesis - Genome Symmetry\\data\\Assembleathon\\'

def format_assemblethon_file(filename):
    """Read a stats report and return its text in tab-separated form.

    Every run of two or more spaces is collapsed into a single tab, after
    which one tab at the very start of any line is removed.

    filename -- path of the text file to read.
    Returns the reformatted contents as a single string.
    """
    with open(filename, 'r') as handle:
        raw_text = handle.read()
    # Column padding (2+ spaces) becomes the field separator.
    tabbed = re.compile(' {2,}', flags=re.M).sub(r'\t', raw_text)
    # Indented lines now begin with a tab; drop it so labels start at column 0.
    return re.compile('^\t', flags=re.M).sub(r'', tabbed)
# Load the FRAX07 report as a worked example and print the reformatted text;
# `frax07_example` is reused by a later cell.
frax07_example = format_assemblethon_file(os.path.join(BASE_DIR, 'FRAX07_CLC_SSPACE_GAPCLOSER_ASSEMBLETHON.txt'))
print(frax07_example)

Number of scaffolds	417940
Total size of scaffolds	855147449
Total scaffold length as percentage of known genome size	760.1%
Longest scaffold	367815
Shortest scaffold	200
Number of scaffolds > 500 nt	168610	40.3%
Number of scaffolds > 1K nt	77211	18.5%
Number of scaffolds > 10K nt	20212	4.8%
Number of scaffolds > 100K nt	251	0.1%
Number of scaffolds > 1M nt	0	0.0%
Mean scaffold size	2046
Median scaffold size	425
N50 scaffold length	21026
L50 scaffold count	10470
NG50 scaffold length	82739
LG50 scaffold count	528
N50 scaffold - NG50 scaffold length difference	61713
scaffold %A	28.32
scaffold %C	14.99
scaffold %G	14.98
scaffold %T	28.32
scaffold %N	13.39
scaffold %non-ACGTN	0.00
Number of scaffold non-ACGTN nt	0

Percentage of assembly in scaffolded contigs	66.8%
Percentage of assembly in unscaffolded contigs	33.2%
Average number of contigs per scaffold	1.1
Average length of break (>25 Ns) between contigs in scaffold	274

Number of contigs	470798
Number of contigs in scaffolds	78758
Numb

In [10]:
def grab_values_from_file(stats_file, labels_of_interest):
    """Extract the requested statistics from tab-separated report text.

    stats_file -- the whole report as one string, one record per line in
        the form "label<TAB>value[<TAB>extra...]".
    labels_of_interest -- labels to extract; the result follows this order.

    Each match is echoed with print(). When a label occurs on several lines
    the last occurrence wins. Only the first field after the label is kept.

    Returns a list of value strings, one per requested label.
    Raises KeyError listing every requested label that was not found.
    """
    wanted = set(labels_of_interest)
    values = {}
    # splitlines() also copes with '\r\n' endings, so no '\r' leaks into values.
    for line in stats_file.splitlines():
        parts = line.split('\t')
        label = parts[0].strip()
        # A label with no tab-separated value has nothing to record; skipping
        # it avoids an IndexError on parts[1].
        if label not in wanted or len(parts) < 2:
            continue
        value = parts[1].strip()
        print(label, value)
        values[label] = value
    missing = [label for label in labels_of_interest if label not in values]
    if missing:
        raise KeyError('labels not found in stats file: %r' % (missing,))
    return [values[label] for label in labels_of_interest]
# Check the extraction on the FRAX07 example: each match is echoed, then the
# values are displayed in labels_of_interest order.
grab_values_from_file(frax07_example, labels_of_interest)

Number of scaffolds 417940
Longest scaffold 367815
N50 scaffold length 21026
L50 scaffold count 10470
scaffold %N 13.39
Number of contigs 470798
Longest contig 342910


['21026', '367815', '13.39', '10470', '417940', '470798', '342910']

In [11]:
def matching_col(frax_number):
    """Return the sheet column number of the header cell naming frax_number.

    Searches the module-level `labels` cells for a value that contains
    frax_number as a substring and returns the `.col` of the first match.
    The list of matching cells is printed for inspection.

    Raises IndexError with a descriptive message when no header cell
    contains frax_number.
    """
    matches = [cell for cell in labels if frax_number in cell.value]
    print(matches)
    if not matches:
        # Same exception type as the bare matches[0] lookup would raise, but
        # with a message that says which sample is missing.
        raise IndexError('no header cell contains %r' % (frax_number,))
    return matches[0].col
# Look up the column for FRAX07 as a quick check.
matching_col('FRAX07')

[<Cell R4C8 'F. ornus (FRAX07)'>]


8

In [12]:
def update_stats_for_FRAX_number(frax_number, update_labels=False):
    """Write one sample's assembly statistics into its column of the sheet.

    Reads '<frax_number>_CLC_SSPACE_GAPCLOSER_ASSEMBLETHON.txt' from BASE_DIR,
    extracts the labels_of_interest values and stores them in consecutive
    rows, starting at row 23, of the column returned by matching_col().

    frax_number -- sample identifier such as 'FRAX06'.
    update_labels -- when true, also write each label into column 1 of the
        same rows.
    """
    print(">>> Updating Stats for ", frax_number)
    first_row = 23  # first spreadsheet row of the statistics section
    stats_path = os.path.join(BASE_DIR, frax_number + '_CLC_SSPACE_GAPCLOSER_ASSEMBLETHON.txt')
    extracted = grab_values_from_file(format_assemblethon_file(stats_path), labels_of_interest)
    column = matching_col(frax_number)

    # Fetch the target cells, fill them locally, then push them in one batch.
    last_row = first_row + len(extracted) - 1
    target_cells = sheet.range(first_row, column, last_row, column)
    for target, new_value in zip(target_cells, extracted):
        target.value = new_value
    sheet.update_cells(target_cells)

    if not update_labels:
        return
    for row, label in enumerate(labels_of_interest, start=first_row):
        sheet.update_cell(row, 1, label)

In [None]:
# Trial run on a single sample; this writes to the live worksheet.
update_stats_for_FRAX_number('FRAX06')

In [None]:
def main():
    """Upload statistics for every sample listed in I_have_stats_for.

    The row labels in column 1 are written together with the first sample,
    so no sample is processed twice and an empty sample list is a no-op.
    """
    for index, frax_number in enumerate(I_have_stats_for):
        update_stats_for_FRAX_number(frax_number, update_labels=(index == 0))
main()

In [13]:
from glob import glob
import subprocess

def call(args):
    """Dry-run a command: print its command line without executing it.

    args -- either a ready-made command string, or a list/tuple of
        arguments that are joined with single spaces.

    Nothing is executed and nothing is returned.
    """
    command = ' '.join(args) if isinstance(args, (list, tuple)) else args
    print(command)
    # Execution is intentionally disabled; to run for real, restore:
#     return subprocess.check_output(command, shell=True)

In [16]:
def plot_in_DDV(frax_number):
    """Assemble the DDV command line for one sample and hand it to call().

    The FASTA path is derived from frax_number; the output name is the text
    of the sample's header cell in row 4 of the sheet.

    frax_number -- sample identifier such as 'FRAX06'.
    """
    ddv_executable = r'D:\josiah\Projects\DDV\build\DDV.exe'
    genome_dir = r'D:\Genomes\Ash Finished Genomes'

    fasta_path = os.path.join(genome_dir, frax_number + '_CLC_SSPACE_GAPCLOSER.fasta')
    header_text = sheet.cell(4, matching_col(frax_number)).value
    arguments = [
        ddv_executable,
        '--fasta="%s"' % fasta_path,
        '--outname="%s"' % header_text,
        '--no_webpage',
    ]
    call(arguments)

In [17]:
# Preview the DDV command for a single sample.
plot_in_DDV('FRAX06')

[<Cell R4C7 'F. mandshurica (FRAX06)'>]
D:\josiah\Projects\DDV\build\DDV.exe --fasta="D:\Genomes\Ash Finished Genomes\FRAX06_CLC_SSPACE_GAPCLOSER.fasta" --outname="F. mandshurica (FRAX06)" --no_webpage


In [18]:
# Build the DDV command for every sample that has stats; each iteration also
# reads that sample's header cell from the sheet.
for frax in I_have_stats_for:
    plot_in_DDV(frax)

[<Cell R4C7 'F. mandshurica (FRAX06)'>]
D:\josiah\Projects\DDV\build\DDV.exe --fasta="D:\Genomes\Ash Finished Genomes\FRAX06_CLC_SSPACE_GAPCLOSER.fasta" --outname="F. mandshurica (FRAX06)" --no_webpage
[<Cell R4C8 'F. ornus (FRAX07)'>]
D:\josiah\Projects\DDV\build\DDV.exe --fasta="D:\Genomes\Ash Finished Genomes\FRAX07_CLC_SSPACE_GAPCLOSER.fasta" --outname="F. ornus (FRAX07)" --no_webpage
[<Cell R4C10 'F. pennsylvannica (FRAX09)'>]
D:\josiah\Projects\DDV\build\DDV.exe --fasta="D:\Genomes\Ash Finished Genomes\FRAX09_CLC_SSPACE_GAPCLOSER.fasta" --outname="F. pennsylvannica (FRAX09)" --no_webpage
[<Cell R4C12 'F. quadrangulata (FRAX11)'>]
D:\josiah\Projects\DDV\build\DDV.exe --fasta="D:\Genomes\Ash Finished Genomes\FRAX11_CLC_SSPACE_GAPCLOSER.fasta" --outname="F. quadrangulata (FRAX11)" --no_webpage
[<Cell R4C15 'F. americana (FRAX14)'>]
D:\josiah\Projects\DDV\build\DDV.exe --fasta="D:\Genomes\Ash Finished Genomes\FRAX14_CLC_SSPACE_GAPCLOSER.fasta" --outname="F. americana (FRAX14)" --no_w

# Scrap Examples

In [None]:
# Scratch: fetch the values of column 2.
sheet.col_values(2)

In [None]:
# Scratch: resize the worksheet to 3 rows x 4 columns.
# NOTE(review): `sheet` is the object bound to the 'Summary' worksheet earlier
# in this notebook — presumably this was meant for a throwaway sheet; confirm
# before running.
sheet.resize(rows=3, cols=4)

In [None]:
# Scratch: show the worksheet dimensions as (columns, rows).
sheet.col_count, sheet.row_count

In [None]:
# Scratch: append a two-cell test row; this modifies the worksheet.
sheet.append_row(['Doin', 'good'])

In [None]:
# Scratch: A1-style address of the cell at (row_count, col_count).
gspread.utils.rowcol_to_a1(sheet.row_count, sheet.col_count )

In [None]:
# Scratch: select every cell from A1 to the (row_count, col_count) corner.
# NOTE(review): the loop below overwrites every selected cell with a
# placeholder — presumably intended for a test sheet only; confirm which
# worksheet `sheet` refers to before running.
cell_list = sheet.range('A1:' + gspread.utils.rowcol_to_a1(sheet.row_count, sheet.col_count ))

for cell in cell_list:
    cell.value = 'O_o'

# Update in batch
sheet.update_cells(cell_list)

In [None]:
# Scratch: write a test string into the cell at row 2, column 2.
sheet.update_cell(2,2, 'failing')

In [None]:
# Scratch: read back the cell at row 2, column 2.
sheet.cell(2,2).value