In [1]:
from pandas import DataFrame, read_csv
from os.path import join, basename
from os.path import splitext, join
from DNASkittleUtils.Annotations import parseGFF
from DNASkittleUtils.Contigs import read_contigs
from DNASkittleUtils.CommandLineUtils import just_the_name
from DNASkittleUtils.DDVUtils import pp
filename = r"D:\josiah\Documents\Research\Arabidopsis\TAIR10_GFF3_genes_only.gff3"

In [2]:
df = read_csv(r"D:\josiah\Documents\Research\Arabidopsis\functional_annotation.csv",sep='\t', error_bad_lines=False)# fields='Gene_Family	Gene_Name	Genomic_Locus(BAC)	Genomic_Locus_Tag	Protein_Function'.split())

In [3]:
df.head()

Unnamed: 0,Gene_Family,Gene_Name,Genomic_Locus(BAC),Genomic_Locus_Tag,Protein_Function
0,IQD Protein Family,IQD1,,At3g09710,
1,IQD Protein Family,IQD2,,At5g03040,
2,IQD Protein Family,IQD3,,At3g52290,
3,IQD Protein Family,IQD4,,At2g26410,
4,IQD Protein Family,IQD5,,At3g22190,


In [4]:
len(df)

8693

In [5]:
sum(df['Gene_Family'].notnull())

8693

In [6]:
sum(df['Protein_Function'].str.contains('putative')==False)

3723

In [7]:
sum(df['Protein_Function'].notnull() & (df['Protein_Function'].str.contains('hypothetical protein')==False))

5804

# Filter out to only genes with mentions in functional_annotation.csv

In [None]:
tag = df['Genomic_Locus_Tag'].notna()
tags = df[tag]['Genomic_Locus_Tag']
sum(tag), len(tag)

In [None]:
upper_case_loci = set([str(n).upper() for n in tags])
len(upper_case_loci)

In [None]:

filtered_path = splitext(filename)[0] + '_with_functions' + splitext(filename)[1]
with open(filename) as file:
    with open(filtered_path, 'w') as outfile:
        for line in file.readlines():
            attributes = line.split('\t')[8].split(';')
            locus = attributes[0][3:]
            if locus.upper() in upper_case_loci or name.upper() in upper_case_loci:
                outfile.write(line)

6652 lines

#### Create Gene function Lookup

In [None]:
gene_function = {}
for index, row in df[tag].iterrows():
    try: 
        gene = row['Genomic_Locus_Tag'].upper()
    except AttributeError:
        print(row['Genomic_Locus_Tag'])
    function = row['Protein_Function'] if str(row['Protein_Function']) != 'nan' else row['Gene_Family']
    forbidden = ['\uffa0', 'basic', 'hypothetical protein', 'unknown protein', 'putative', 'hypothetical  protein']
    for bad in forbidden:
        function = function.replace(bad, '')
    if not function.strip() or str(function) == 'nan':
        function = row['Gene_Family'].replace('basic', '')
    gene_function[gene] = function
#     break
gene_function

** ----------------- TODO: Come back and make the name pretty ---------------------**

### Decorate Annotation with functions

In [None]:
with open(filtered_path, 'r') as genes:
    with open(r"D:\josiah\Documents\Research\Arabidopsis\TAIR10_annotated_genes.gff", 'w') as final_annotation:
        for line in genes.readlines():
            parts = line.split('\t')
            locus = parts[8].split(';')[0][3:]
            function = gene_function[locus]
            if locus in gene_function:
                final_annotation.write('\t'.join(parts[:8] + ['Name=' + function]) +'\n')

In [None]:
df[df['Genomic_Locus_Tag'] =='At1g01570']

In [None]:
'AT1G01020' in upper_case_loci

# Repeat the Process for ATH_GO_SLIM

In [15]:
go = read_csv(r"D:\josiah\Documents\Research\Arabidopsis\ATH_GO_GOSLIM.txt",sep='\t', error_bad_lines=False,
             usecols=['locus name', 'GO term','GOslim term', 'Date annotated'])
go.head()

Unnamed: 0,locus name,GO term,GOslim term,Date annotated
0,AT1G01010,"regulation of transcription, DNA-templated",other cellular processes,2018-04-02
1,AT1G01010,"regulation of transcription, DNA-templated","transcription,DNA-dependent",2018-04-02
2,AT1G01010,DNA-binding transcription factor activity,transcription factor activity,2003-06-06
3,AT1G01010,"regulation of transcription, DNA-templated",other cellular processes,2018-09-12
4,AT1G01010,integral component of membrane,other membranes,2018-09-04


In [16]:
sum(go['GO term'].notnull() & go['locus name'].notnull()) ,len(go)

(349103, 349103)

Terrific!  No missing data

In [17]:
go_upper = {cell.upper() for cell in go['locus name']}
len(go_upper)

30828

30,828 is slightly more genes than TAIR10 annotation...  because there are non-coding regions...

In [18]:
go_function = {}
for index, row in go.iterrows():
    gene = row['locus name']
    function = row['GO term']
    forbidden = ['biological_process', 'cellular_component', 'molecular_function',  'extracellular region', 'nucleus',]
    #            'mitochondrion',  'cytoplasm', 'cytosol', 'chloroplast', 'membrane', 'plasma membrane']

    if function not in forbidden:
        if gene not in go_function or len(go_function[gene]) < len(function):  # use the longest
            go_function[gene] = function
len(go_function)

25302

In [19]:
functional_gff = r"D:\josiah\Documents\Research\Arabidopsis\GO_annotation.gff"
list(go_function.values())[:100]

['protein import into mitochondrial matrix',
 'chloroplast intermembrane space',
 'chloroplast',
 'integral component of membrane',
 'defense response',
 'oxidoreductase activity, acting on paired donors, with incorporation or reduction of molecular oxygen, NAD(P)H as one donor, and incorporation of one atom of oxygen',
 'carbohydrate metabolic process',
 'triplet codon-amino acid adaptor activity',
 'triplet codon-amino acid adaptor activity',
 'peptidyl-diphthamide biosynthetic process from peptidyl-histidine',
 'mitochondrion',
 'tRNA splicing, via endonucleolytic cleavage and ligation',
 'plasma membrane',
 'transmembrane receptor protein tyrosine kinase signaling pathway',
 'copper ion binding',
 'oxidoreductase activity, acting on a sulfur group of donors, disulfide as acceptor',
 'cytidine to uridine editing',
 'positive regulation of transcription, DNA-templated',
 'amino acid transport',
 'positive regulation of transcription, DNA-templated',
 'ATPase activity, coupled to tran


    with open(filename, 'r') as genes:
        with open(functional_gff, 'w') as final_annotation:
            for line in genes.readlines():
                parts = line.split('\t')
                locus = parts[8].split(';')[0][3:]
                if locus in go_function:
                    function = go_function[locus]
                    final_annotation.write('\t'.join(parts[:8] + ['Name=%s;ID=%s'% (function, locus) ]) +'\n')

22,103 genes

In [20]:
from DNASkittleUtils.Contigs import Contig, read_contigs, write_contigs_to_file

In [21]:
chromosomes = read_contigs(r"D:\josiah\Documents\Research\Arabidopsis\TAIR10_chr_all.fas")
for chrm in chromosomes:
    chrm.name = chrm.name.split()[0]
    write_contigs_to_file(join(r"D:\josiah\Documents\Research\Arabidopsis\Chromosomes", chrm.name + '.fa'), [chrm])

Done writing  1 contigs and 30,427,671bp
Done writing  1 contigs and 19,698,289bp
Done writing  1 contigs and 23,459,830bp
Done writing  1 contigs and 18,585,056bp
Done writing  1 contigs and 26,975,502bp
Done writing  1 contigs and 366,924bp
Done writing  1 contigs and 154,478bp


In [22]:
from functools import reduce
reduce(int.__mul__,[3,3,3,3,3,15]) + 200

3845

In [23]:
reduce(int.__mul__, [5,3,3,3,3,]) # coil_height

405

# Dice up chromosomes according to annotation
* Annotations will sometimes overlap. Sort by start position, grab nucleotides 
* and then look for next annotation whose end position is greater than nucleotides already taken
* Start next annotation then and repeat
* Some annotations will be lost completely, but not many if overlap is low, more likely case is truncation
* When no active annotations, mark with the same name: Chromosome X between genes ###
------------


In [24]:
annotation = parseGFF(functional_gff)

In [25]:
annotation.keys()

dict_keys(['4', '2', 'M', '3', 'C', '1', '5'])

In [26]:
# force sorting
for chr_entries in annotation.values():
    chr_entries.sort(key=lambda e: e.start)

In [27]:
all([entry.start >= annotation['2'][i-1].start for i, entry in enumerate(annotation['2'][:]) if i])

True

In [31]:
def write_functional_gene_contigs_from_chromosome(chr_seq_file):
    #assume the order of annotations by start, create sequential contigs
    nucleotides_claimed = set()
    annotated_sequences = []  # in order from start position, end position never overlaps next start
    chr_seq = read_contigs(chr_seq_file)[0].seq
    scaff_name = just_the_name(chr_seq_file)
    current_start = 0
    if scaff_name in annotation.keys():
        for entry in annotation[scaff_name]:
            new_start = entry.start - 1
            end = entry.end - 1
            if new_start + 1 > current_start:
                annotated_sequences.append(Contig('between genes', chr_seq[current_start: new_start]))
                #print('SPacer', new_start-current_start)
            #else:
                #pass
            annotated_sequences.append(Contig(entry.name(), chr_seq[max(current_start, new_start) : end]))
            #print("Gene %i - %i" %(new_start, entry.end))
            current_start = end

    new_file = r"D:\josiah\Documents\Research\Arabidopsis\Chromosomes\chr"+ scaff_name + '_functions.fa'
    write_contigs_to_file(new_file, annotated_sequences)
    return [len(s.seq) for s in annotated_sequences], len(chr_seq) 

for chr_number in range(1,6):
    chr_seq_file = r"D:\josiah\Documents\Research\Arabidopsis\Chromosomes\%i.fa" % chr_number
    lengths, chr_length = write_functional_gene_contigs_from_chromosome(chr_seq_file)
    print(pp(sum(lengths)), 'Difference', pp(chr_length - sum(lengths)))
    

Done writing  12386 contigs and 30,453,674bp
30,453,674 Difference -26,003
Done writing  7337 contigs and 19,731,769bp
19,731,769 Difference -33,480
Done writing  9174 contigs and 23,479,378bp
23,479,378 Difference -19,548
Done writing  7061 contigs and 18,611,740bp
18,611,740 Difference -26,684
Done writing  10787 contigs and 27,009,486bp
27,009,486 Difference -33,984


In [29]:
19731769

19731769

In [30]:
len(lengths)

10787

### Layout
* Ideogram has zero padding between contigs
* Mouseover will iterate contigs til it finds the right one (could technically be overlapping)
* 