# Counting KOMP2 generated colonies

We retrieved the list of KOMP2-generated colonies from iMits.  Using this list, count how many
* Lines
* Genes
* Datapoints
* Images

are included in the current live data release.

In [87]:
import json
import requests

# Root of the Solr service; each core (experiment, impc_images) is queried
# through its own /select handler below.
BASE_URL = "https://www.ebi.ac.uk/mi/impc/solr"

# Per-colony queries. The "{}" placeholder receives the colony id.
# The data query asks for one row because the caller reads the first returned
# document in addition to numFound; the image query only needs numFound.
MUTANT_DATA_URL = (
    BASE_URL
    + '/experiment/select?q=colony_id:"{}"'
    + "&rows=1&fq=biological_sample_group:experimental"
)
MUTANT_IMAGES_URL = BASE_URL + '/impc_images/select?q=colony_id:"{}"&rows=0'

# Query shared by both control look-ups: control samples from the BaSH, DTCC
# and JAX projects, IMPC datasource, excluding the "MGP Select Pipeline".
# rows=0 because only the hit count (numFound) is used.
_CONTROL_QUERY = (
    "?q=(project_name:BaSH OR project_name:DTCC OR project_name:JAX)"
    " AND biological_sample_group:control"
    " AND datasource_name:IMPC"
    ' AND -pipeline_name:"MGP Select Pipeline"'
    "&rows=0"
)
CONTROL_DATA_URL = BASE_URL + "/experiment/select" + _CONTROL_QUERY
CONTROL_IMAGES_URL = BASE_URL + "/impc_images/select" + _CONTROL_QUERY

# Tab-separated input file listing the colonies to count.
# Alternative input: SOURCE_FILE = "KOMP2_colonies.tsv"
SOURCE_FILE = "DCC_colonies.tsv"


In [88]:
# Load the colony identifiers: second tab-separated column of SOURCE_FILE,
# trimmed and upper-cased. The first line is a header and is skipped before
# parsing, so a header without a tab no longer raises IndexError; the file
# is closed deterministically by the `with` block.
with open(SOURCE_FILE) as source:
    next(source, None)  # discard the header row (no-op on an empty file)
    colonies = [
        row.split("\t")[1].strip().upper()
        for row in source
        if row.strip()  # ignore blank lines instead of failing on them
    ]
print("File has {} colonies".format(len(colonies)))

File has 3890 colonies


In [None]:

# Count, for every colony in `colonies`, how much data the live data release
# holds for it:
#   lines            - colonies with at least one experimental data point
#   genes            - gene accession ids of those colonies
#   data_points      - total experimental data points
#   images           - total images
#   missing_colonies - colonies with no experimental data point
# A colony whose data request keeps failing is reported and ends up in
# neither `lines` nor `missing_colonies`.
from urllib.parse import quote


def _fetch_with_retries(url, label, colony, attempts=5):
    """Return the HTTP 200 response for *url*, or None if all attempts fail.

    Every failed attempt (non-200 status or a connection-level error) is
    reported with the URL that was actually requested. *label* and *colony*
    are only used in those messages.
    """
    for _ in range(attempts):
        try:
            response = requests.get(url)
        except requests.RequestException as error:
            # A transient network failure must not abort the whole run.
            print("Error retreiving {} for colony: {}, URL: {} ({})".format(label, colony, url, error))
            continue
        if response.status_code == 200:
            return response
        print("Error retreiving {} for colony: {}, URL: {}".format(label, colony, url))
    return None


missing_colonies = set()
lines = set()
genes = set()
data_points = 0
images = 0

for colony in colonies:

    # A colony listed more than once must not have its data points and
    # images counted again.
    if colony in lines or colony in missing_colonies:
        continue

    # Percent-encode the whole identifier so that characters such as
    # "&", "#", "+" or "%" cannot alter the structure of the query URL.
    escaped_colony = quote(colony, safe="")

    data = _fetch_with_retries(MUTANT_DATA_URL.format(escaped_colony), "data", colony)
    if data is None:
        continue

    response = data.json()['response']  # parse the payload only once
    num_found = response['numFound']

    if num_found == 0:
        missing_colonies.add(colony)
        if len(missing_colonies) % 50 == 0:
            print("So far, found {} colonies missing from DR".format(len(missing_colonies)))
        continue

    data_points = data_points + num_found
    # The data query returns one document (rows=1); it supplies the gene id.
    genes.add(response['docs'][0]['gene_accession_id'])
    lines.add(colony)

    image_data = _fetch_with_retries(MUTANT_IMAGES_URL.format(escaped_colony), "image data", colony)
    if image_data is not None:
        num_images_found = image_data.json()['response']['numFound']
        images = images + num_images_found

    if len(lines) % 50 == 0:
        print("So far, found {} colonies with data in DR".format(len(lines)))
    




So far, found 50 colonies with data in DR
So far, found 100 colonies with data in DR
So far, found 150 colonies with data in DR
So far, found 200 colonies with data in DR
So far, found 250 colonies with data in DR
So far, found 300 colonies with data in DR
So far, found 350 colonies with data in DR
So far, found 400 colonies with data in DR
So far, found 450 colonies with data in DR
So far, found 500 colonies with data in DR
So far, found 550 colonies with data in DR
So far, found 600 colonies with data in DR
So far, found 650 colonies with data in DR
So far, found 700 colonies with data in DR
So far, found 50 colonies missing from DR
So far, found 100 colonies missing from DR
So far, found 150 colonies missing from DR
So far, found 750 colonies with data in DR
So far, found 800 colonies with data in DR
So far, found 850 colonies with data in DR
So far, found 900 colonies with data in DR
So far, found 950 colonies with data in DR
So far, found 1000 colonies with data in DR
So far, foun

In [None]:

# Add the control-sample counts to the per-colony totals and print the report.
# The control URLs contain no "{}" placeholder, so they are requested as-is;
# this cell therefore no longer depends on the loop variable `colony`, which
# does not exist when the colony list is empty.
data = requests.get(CONTROL_DATA_URL)
data.raise_for_status()  # fail with a clear HTTP error instead of a JSON/KeyError
num_found = data.json()['response']['numFound']
total_data_points = data_points + num_found


image_data = requests.get(CONTROL_IMAGES_URL)
image_data.raise_for_status()
num_images_found = image_data.json()['response']['numFound']
total_images = images + num_images_found


print("* Lines: {}".format(len(lines)))
print("* Genes: {}".format(len(genes)))
print("* Datapoints: {}".format(total_data_points))
print("* Images: {}".format(total_images))

print("*"*80)
# Sets have no defined order; sort so the listings are reproducible between runs.
print("There are {} KOMP2 colonies ({} missing) in the DR. List of colonies in DR: \n{}".format(len(lines), len(missing_colonies), "\n".join(sorted(lines))))
print("List of missing colonies in DR: \n{}".format("\n".join(sorted(missing_colonies))))
