# Fauna

This notebook deconstructs parts of fauna to show how its scripts work.

* https://github.com/nextstrain/fauna

Scripts:

**zika_upload.py**

```
python3 vdb/zika_upload.py \
  -db vdb \
  -v zika \
  --source genbank \
  --locus genome \
  --fname GenomicFastaResults.fasta
```

**zika_update.py**

```
python3 vdb/zika_update.py \
  -db vdb \
  -v zika \
  --update_citations
```

*check dependencies listed in requirements.txt*

In [1]:
import Bio            # biopython
import boto
import pandas
import rethinkdb
import requests
import unidecode
import xlrd
print("Packages available")

Packages available


## 1. Load an example dataset

Practice on 10 Zika sequences. Pull them from:

* https://www.viprbrc.org/brc/vipr_genome_search.spg?method=ShowCleanSearch&decorator=flavi_zika

## 2. Process the dataset

In [29]:
# Inputs
# Tab-separated fix tables. The strain table is loaded by define_strain_fixes,
# which reads its 'label' and 'fix' columns. The location and date tables are
# only referenced from commented-out code in the visible part of this notebook.
strain_fix_fname =  "fauna/source-data/zika_strain_name_fix.tsv"
location_fix_fname = "fauna/source-data/zika_location_fix.tsv"
date_fix_fname = "fauna/source-data/zika_date_fix.tsv"

# Map a position in the '|'-separated FASTA header to the document field it
# fills; parse_fasta_file uses these to build the virus and sequence dicts.
virus_fasta_fields = {1:'strain', 3:'collection_date', 4: 'host', 5:'country'}
sequence_fasta_fields = {0:'accession', 1:'strain'}

# Args
# Same values as the zika_upload.py command-line flags shown at the top.
db = "vdb"
v = "zika"
source = "genbank"
locus = "genome"
fname = "GenomicFastaResults.fasta"

In [3]:
# Python Packages
import os, re, time, datetime, csv, sys
from rethinkdb import r
from Bio import SeqIO
#from upload import upload # vdb/upload.py; vdb/parse.py
#from upload import get_parser

**Functions**

In [40]:
# upload.py
def replace_strain_name(original_name, fixes={}):
    '''
    Look up the replacement for a strain name.

    :param original_name: strain name to look up
    :param fixes: mapping of original name -> replacement name
    :return: the replacement when ``original_name`` has an entry in
             ``fixes``, otherwise ``original_name`` unchanged
    '''
    # The default dict is shared between calls but is only ever read here.
    return fixes.get(original_name, original_name)

# ...
def define_strain_fixes(fname):
    '''
    Open strain name fixing files and define corresponding dictionaries

    The file is tab-separated. Lines starting with '#' are dropped before
    parsing; the first remaining line is the header row and must name the
    columns 'label' and 'fix'.

    :param fname: path of the tsv file
    :return: dict mapping each label (with backslash escapes expanded) to
             its fix
    '''
    fix_whole_name = {}
    # Context manager so the handle is closed even when parsing fails.
    with open(fname) as handle:
        data_rows = (row for row in handle if not row.startswith('#'))
        for line in csv.DictReader(data_rows, delimiter='\t'):
            # Expand escapes that are spelled out literally in the file
            # (e.g. the four characters "\xe9" become a single 'e-acute').
            # NOTE(review): encode() + 'unicode-escape' reads the UTF-8 bytes
            # as Latin-1, so a label that already contains non-ASCII
            # characters comes out mangled -- confirm the tsv is ASCII-only.
            label = line['label'].encode().decode('unicode-escape')
            fix_whole_name[label] = line['fix']
    return fix_whole_name             # fixes[original_name]
# ...
# Module-level lookup table; fix_name reads it as a global.
fix_whole_name = define_strain_fixes(strain_fix_fname)  # tsv file in Input
# NOTE(review): a bare expression in the middle of a cell -- its value is
# presumably discarded; wrap it in print() if the type should be shown.
type(fix_whole_name)
# Show the first ten keys, i.e. the original labels that have a fix.
print(list(fix_whole_name)[:10])

# ... do same for locations
# self.fix_location = self.define_location_fixes(self.location_fix_fname) # tsv file in input
# self.fix_date = self.define_date_fixes(self.date_fix_fname)

['PRI/PRVABC59/2015', 'PRVABC_59', 'Ae-aegypti-FL01M', 'Ae-aegypti-FL02M', 'Ae-aegypti-FL03M', 'Ae-aegypti-FL04M', 'Ae-aegypti-FL05M', 'Ae-aegypti-FL06M', 'Ae-aegypti_FL08M', 'FL001Sa']


In [41]:
# zika_upload.py
def fix_name(name): # polymorphism, overwrite fix_names in upload.py
    '''
    Normalise a Zika strain name.

    Steps, in order: apply a whole-name fix from the global ``fix_whole_name``
    table, strip virus/host/lineage/sample labels and punctuation, drop one
    leading '/', '_' or '-', prefix purely numeric names with 'V', then apply
    the whole-name fix table again to the cleaned name.

    Example (when the fix table has no matching entry):
    'ZIKV_SG_072' -> 'SG_072'.

    :param name: strain name as read from the source data
    :return: tuple ``(cleaned_name, original_name)``
    '''
    original_name = name
    name = replace_strain_name(original_name, fix_whole_name)
    # Applied strictly in this order: earlier entries can create or destroy
    # matches for later ones (e.g. 'Zika_virus' must go before 'Zika').
    substitutions = (
        # virus labels
        ('Zika_virus', ''), ('Zikavirus', ''), ('Zika virus', ''),
        ('Zika', ''), ('ZIKV', ''),
        # host labels
        ('Human', ''), ('human', ''), ('H.sapiens_wt', ''),
        ('H.sapiens_tc', ''), ('Hsapiens_tc', ''), ('H.sapiens-tc', ''),
        ('Homo_sapiens', ''), ('Homo sapiens', ''), ('Hsapiens', ''),
        ('H.sapiens', ''),
        ('/Hu/', ''),
        # lineage suffixes
        ('_Asian', ''), ('_Asia', ''), ('_asian', ''), ('_asia', ''),
        # sample-type suffixes
        ('_URI', ''), ('_SER', ''), ('_PLA', ''), ('_MOS', ''), ('_SAL', ''),
        # mosquito host spellings
        ('Aaegypti_wt', 'Aedes_aegypti'), ('Aedessp', 'Aedes_sp'),
        # whitespace and punctuation
        (' ', ''), ("'", ''), ('(', ''), (')', ''),
        ('//', '/'), ('__', '_'), ('.', ''), (',', ''),
    )
    for old, new in substitutions:
        name = name.replace(old, new)
    # Raw string: the previous non-raw literal relied on invalid escape
    # sequences. Only a single leading separator is removed.
    name = re.sub(r'^[/_\-]', '', name)
    try:
        # Purely numeric names get a 'V' prefix; int() also normalises the
        # number (e.g. leading zeros are dropped).
        name = 'V' + str(int(name))
    except ValueError:
        # Not a number: keep the name as it is.
        pass
    # Second pass so fixes keyed on the cleaned-up name also apply.
    name = replace_strain_name(name, fix_whole_name)
    return name, original_name

In [None]:
def fix_casing(self, document): # JC
    '''
    Rewrite the 'host' entry of ``document`` in place by passing it through
    ``self.camelcase_to_snakecase``.

    An absent key or a value of None is left untouched. Returns None.
    '''
    for key in ('host',):
        # Nothing to convert when the field is missing or explicitly None.
        if key not in document or document[key] is None:
            continue
        document[key] = self.camelcase_to_snakecase(document[key])

In [56]:
# Load data
def parse_fasta_file(fasta, virus_fasta_fields, sequence_fasta_fields, **kwargs):
    '''
    Parse FASTA file with default header formatting

    Each record description is split on '|' and the pieces are picked by
    position to build one virus document and one sequence document.

    :param fasta: path of the FASTA file
    :param virus_fasta_fields: dict {position in the split header: field name}
                               for the virus document
    :param sequence_fasta_fields: same mapping for the sequence document
    :param kwargs: optional ``fasta_header_fix`` -- path of a two-column
                   tab-separated file (old header, new header). Absent or
                   falsy means headers are used as they are.
    :return: tuple ``(viruses, sequences)`` of two parallel lists of dicts;
             a position beyond the end of the header yields ""
    :raises Exception: when either file cannot be opened, or when header
                       fixes are loaded and a record's description has no
                       entry in them
    '''
    header_fixes = {}
    # .get(): callers that do not pass the option at all must not crash.
    fix_fname = kwargs.get("fasta_header_fix")
    if fix_fname:
        try:
            # Plain 'r': the old 'rU' mode is rejected by Python 3.11+, and
            # text mode already translates newlines.
            fix_handle = open(fix_fname, 'r')
        except IOError as err:
            raise Exception(fix_fname, "not found") from err
        with fix_handle:
            for line in fix_handle:
                # Skip comments and blank lines (a blank line cannot be
                # unpacked into two columns).
                if line.startswith('#') or not line.strip():
                    continue
                old_header, new_header = line.strip().split("\t")
                header_fixes[old_header] = new_header
    try:
        handle = open(fasta, 'r')
    except IOError as err:
        raise Exception(fasta, "not found") from err
    viruses = []
    sequences = []
    # 'with' closes the FASTA handle even when a record fails below.
    with handle:
        for record in SeqIO.parse(handle, "fasta"):
            # An empty fix table behaves like no fix table at all.
            if header_fixes:
                try:
                    record.description = header_fixes[record.description]
                except KeyError:
                    raise Exception(record.description, "not in header fix file. Fatal.")
            content = [part.strip() for part in record.description.replace(">", "").split('|')]
            virus = {key: content[ii] if ii < len(content) else "" for ii, key in virus_fasta_fields.items()}
            sequence = {key: content[ii] if ii < len(content) else "" for ii, key in sequence_fasta_fields.items()}
            sequence['sequence'] = str(record.seq).lower()
            # NOTE: the commented-out code this replaces also passed the two
            # documents through self.add_virus_fields / self.add_sequence_fields;
            # those steps are intentionally left out here.
            sequences.append(sequence)
            viruses.append(virus)
    return (viruses, sequences)

# Run the parser on a small sample file.
zika_seqs = "data/small.fasta"

# parse_fasta_file returns a (viruses, sequences) tuple; a falsy
# fasta_header_fix means record headers are used unchanged.
zika_fasta = parse_fasta_file(zika_seqs, virus_fasta_fields, sequence_fasta_fields, fasta_header_fix = False)

# Index 0 of the tuple is the list of virus documents.
print(zika_fasta[0])

# Strain of the first virus document, before any name fixing.
print("\n\nstrain: ",zika_fasta[0][0]['strain'])

# fix_name returns a (cleaned name, original name) tuple.
print("fix_name output:", fix_name(zika_fasta[0][0]['strain']))

[{'strain': 'ZIKV_SG_072', 'collection_date': '2016_08_28', 'host': 'Human', 'country': 'Singapore'}, {'strain': 'Mexico_Rus_12TVR_2017', 'collection_date': '2017_01_30', 'host': 'Human', 'country': 'Russia'}, {'strain': 'Dominican_Rep_Rus_7EGR_2016', 'collection_date': '2016_08_25', 'host': 'Human', 'country': 'Russia'}, {'strain': 'Mexico_Rus_10GNN_2016', 'collection_date': '2016_11_09', 'host': 'Human', 'country': 'Russia'}, {'strain': 'Saint_Barthelemi_Rus_6BRN_2016', 'collection_date': '2016_07_25', 'host': 'Human', 'country': 'Russia'}, {'strain': 'Dominican_Rep_Rus_5RMN_2016', 'collection_date': '2016_05_31', 'host': 'Human', 'country': 'Russia'}, {'strain': 'Dominican_Rep_Rus_8ZBR_2016', 'collection_date': '2016_08_25', 'host': 'Human', 'country': 'Russia'}, {'strain': 'SY01_2016', 'collection_date': '2016_11_01', 'host': 'Human', 'country': 'China'}, {'strain': 'SK403/13AS', 'collection_date': '2013_09_21', 'host': 'Human', 'country': 'Thailand'}, {'strain': 'SK364/13AS', 'col

## 3. Upload to fauna

In [26]:
# Placeholder cell: no upload code appears in the visible part of this notebook.
print("hello")

hello
