We flagged 71 samples that could not be downloaded from SRA because they were no longer in the database. When a download is attempted, SRA returns an "Access denied" error while resolving the accession. We identified these samples by attempting the download from SRA at least 3 times and receiving this error each time. We then flagged these samples as `'download_bad'` and ignored them in the remaining analyses.

Example Error:

```
fastq-dump -O $TMPDIR -M 0 --split-files SRR3497498
2017-09-18T09:57:18 fastq-dump.2.8.1 err: query unauthorized while resolving tree within virtual file system module - failed to resolve accession 'SRR3497498' - Access denied - object has not been published ( 403 )
2017-09-18T09:57:18 fastq-dump.2.8.1 err: item not found while constructing within virtual database module - the path 'SRR3497498' cannot be opened as database or table
Error in job fastq_dump while creating output file ../output/pre-prealignment/raw/SRX1756361/SRR3497498/SRR3497498_1.fastq.gz.
RuleException:
CalledProcessError in line 265 of /gpfs/gsfs6/users/MiegNCBI/ncbi_remap/bin/pre-prealignment.snake:
Command 'fastq-dump -O $TMPDIR -M 0 --split-files SRR3497498' returned non-zero exit status 3
  File "/gpfs/gsfs6/users/MiegNCBI/ncbi_remap/bin/pre-prealignment.snake", line 265, in __rule_fastq_dump
  File "/data/fearjm/miniconda3/envs/ncbi_remap/lib/python3.5/concurrent/futures/thread.py", line 55, in run
Exiting because a job execution failed. Look above for error message
```

In [3]:
# %load ../start.py
# Notebook setup: load IPython extensions, import the standard analysis
# stack, and configure plotting and printing defaults.

# Activate the autoreload extension for easy reloading of external packages
%reload_ext autoreload
%autoreload 2

# Turn on the watermark (prints the last-updated date and the current Git hash)
%reload_ext watermark
%watermark -u -d -g

# Load the ipycache extension and point its cache directory at ../cachedir
%reload_ext ipycache
from ipycache import CacheMagics
CacheMagics.cachedir = '../cachedir'

# Put the project library at the front of the import path
import sys
sys.path.insert(0, '../../lib/python')

# The usual suspects
import os
import numpy as np
import pandas as pd

# Plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Render figures inline and use seaborn's 'poster' context
%matplotlib inline
sns.set_context('poster')

# Print NumPy floats with 5 digits of precision and no scientific notation
np.set_printoptions(precision=5, suppress=True)


last updated: 2017-09-18 
Git hash: e3171f9271da826a89324a653b2d0f134c002600


In [5]:
# %load ../../bin/load.py
from pymongo import MongoClient

# The MongoDB host name is read from a file rather than hard-coded here.
with open('/home/fearjm/Projects/ncbi_remap/output/.mongodb_host') as fh:
    host = fh.read().strip()

# Connect, then keep handles to the `sra2` database and its `remap` collection.
client = MongoClient(host, 27022)
db = client.sra2
remap = db.remap


In [11]:
# Run accessions (71 in total) that cannot be downloaded with fastq-dump.
# They are flagged as 'download_bad' in the database further down.
srrs_problem = [
    'ERR034184',
    'ERR034185',
    'ERR649342',
    'ERR649410',
    'ERR979810',
    'ERR979834',
    'ERR982787',
    'ERR982788',
    'ERR982790',
    'ERR982791',
    'ERR982792',
    'ERR982793',
    'SRR000909',
    'SRR032480',
    'SRR1989536',
    'SRR1989537',
    'SRR1989538',
    'SRR1989539',
    'SRR1989540',
    'SRR1989541',
    'SRR1990455',
    'SRR1990456',
    'SRR1990457',
    'SRR1990458',
    'SRR1990459',
    'SRR1991085',
    'SRR1991086',
    'SRR1991087',
    'SRR1991088',
    'SRR1991089',
    'SRR1991564',
    'SRR1991565',
    'SRR1991566',
    'SRR1991893',
    'SRR1991894',
    'SRR1991895',
    'SRR1992218',
    'SRR1992219',
    'SRR1992220',
    'SRR1992495',
    'SRR1992496',
    'SRR1992497',
    'SRR1992722',
    'SRR1992723',
    'SRR1992724',
    'SRR1994869',
    'SRR1994870',
    'SRR2063776',
    'SRR2063778',
    'SRR2063788',
    'SRR2063789',
    'SRR2176673',
    'SRR2176676',
    'SRR2176684',
    'SRR2176685',
    'SRR2176686',
    'SRR2176692',
    'SRR2176695',
    'SRR2176698',
    'SRR2176717',
    'SRR2176904',
    'SRR2176921',
    'SRR2176937',
    'SRR2176945',
    'SRR2984042',
    'SRR3497498',
    'SRR3497499',
    'SRR3497540',
    'SRR3497541',
    'SRR4015206',
    'SRR4015207',
]

In [16]:
# Flag the problem runs in the database. Re-run this cell until it returns 0:
# the positional `$` operator only updates the first matching run in each
# document per call, so documents with several bad runs need several passes.
bad_run_filter = {'runs.srr': {'$in': srrs_problem}}
add_bad_flag = {'$addToSet': {'runs.$.pre_aln_flags': 'download_bad'}}
remap.update_many(bad_run_filter, add_bad_flag).modified_count

0

In [28]:
# Sanity check: count every run carrying the 'download_bad' flag and compare
# with the number of problem accessions. If this cell raises an error
# (AssertionError on a mismatch, or StopIteration when the aggregation
# yields no document), then something is wrong.
# Note: the count covers all runs flagged 'download_bad' in the collection,
# not only those listed in `srrs_problem`.
unwind_runs = {"$unwind": '$runs'}
only_download_bad = {
    '$match': {
        'runs.pre_aln_flags': 'download_bad'
    }
}
keep_srr_and_flags = {
    '$project': {
        '_id': 0,
        'srr': '$runs.srr',
        'flags': '$runs.pre_aln_flags'
    }
}
count_runs = {'$count': 'cnt'}

pipeline = [unwind_runs, only_download_bad, keep_srr_and_flags, count_runs]
count_doc = next(remap.aggregate(pipeline))
check = count_doc['cnt']

assert len(srrs_problem) == check
print(check)

71
