In [None]:
# Scripts to do:
# - Refresher
# - Mapper

In [287]:
###############################################################################################################
##################################### HGVSc Problem ###########################################################
###############################################################################################################

import pandas as pd

df = pd.read_csv("../temp/mapfile.tsv", sep='\t', header=0, low_memory=False)

# Filter rows with variant_type == 'SNP'
dd_filtered = df.loc[df['variant_type'] == 'SNP', ['hgvsc', 'reference_bases', 'alternate_bases_1', 'alternate_bases_2']].copy()

# Drop rows with missing values in HGVSC
dd_filtered.dropna(subset=['hgvsc'], inplace=True)

# Extract the last character from HGVSC and assign it to a new column 'HGVSC_ref_base'
dd_filtered['HGVSC_ref_base'] = dd_filtered['hgvsc'].str.split('>').str[0].str[-1]
dd_filtered['HGVSC_alt_base'] = dd_filtered['hgvsc'].str.split('>').str[1]


# Reset the index if necessary
dd_filtered.reset_index(drop=True, inplace=True)

compare = dd_filtered[['hgvsc', 'HGVSC_ref_base', 'HGVSC_alt_base', 'reference_bases', 'alternate_bases_2']]
issues = compare[compare['reference_bases'] != compare['HGVSC_ref_base']]

Unnamed: 0,hgvsc,HGVSC_ref_base,HGVSC_alt_base,reference_bases,alternate_bases_2
0,c.1760C>G,C,G,G,C
7,c.1160C>G,C,G,G,C
8,c.1339A>G,A,G,T,C
11,c.227A>T,A,T,T,A
14,c.2113C>A,C,A,G,T
...,...,...,...,...,...
2424409,c.292T>C,T,C,A,G
2424410,c.1148G>C,G,C,C,G
2424416,c.254C>A,C,A,G,T
2424418,c.869G>A,G,A,C,T


In [318]:
print(
    'Different alt base =\t', sum(issues['HGVSC_ref_base'] != issues['alternate_bases_2']),
    '\nSame alt base =\t\t', sum(issues['HGVSC_ref_base'] == issues['alternate_bases_2']),
    '\nTotal =\t\t\t', len(issues)
)

Different alt base =	 1076223 
Same alt base =		 138916 
Different ref base =	 1215139 
Same ref base =		 0 
Total =			 1215139


In [None]:
sum(issues['HGVSC_alt_base'] == issues['alternate_bases_2']) # alt base is never alt base

In [327]:
sum(issues['HGVSC_alt_base'] != issues['reference_bases']) # same numbers as above

1076223

In [329]:
issues[issues['HGVSC_ref_base'] != issues['alternate_bases_2']]

Unnamed: 0,hgvsc,HGVSC_ref_base,HGVSC_alt_base,reference_bases,alternate_bases_2
8,c.1339A>G,A,G,T,C
14,c.2113C>A,C,A,G,T
16,c.1393C>A,C,A,G,T
18,c.954T>C,T,C,A,G
19,c.2112C>T,C,T,G,A
...,...,...,...,...,...
2424408,c.2785G>A,G,A,C,T
2424409,c.292T>C,T,C,A,G
2424416,c.254C>A,C,A,G,T
2424418,c.869G>A,G,A,C,T


In [328]:
# Alt base is sometimes hgvsc ref base, but never hgvsc alt base
# Ref base is never hgvsc ref base, but sometimes hgvsc alt base

In [270]:
###############################################################################################################
##################################### HGVSc Problem ###########################################################
###############################################################################################################

import pandas as pd

df = pd.read_csv("../temp/mapfile.tsv", sep='\t', header=0, low_memory = False)


ls = df.loc[df['variant_type'] == 'SNP', 'hgvsc']
ll = df.loc[df['variant_type'] == 'SNP', 'reference_bases']

dd = pd.DataFrame(ls, ll)


ref_base_check = []
ls = [x for x in ls if str(x) != 'nan'] # Problem: Takes out 
for i in ls:
    i = i.split('>')
    ref_base_check.append(i[0][-1])

Unnamed: 0_level_0,hgvsc
reference_bases,Unnamed: 1_level_1
G,
G,
A,
C,
C,
...,...
G,
A,
T,
G,


In [266]:
check[0:10]

'C c.1760C>'

In [None]:


problem_base = ll[ref_base_check != ll].reset_index(drop = True)
problem_hgvs = ls[ref_base_check != ll].reset_index(drop = True)

print('Reference\thgvsc')
for i in range(len(problem_base)):
    print(problem_base[i], '\t', problem_hgvs[i])

In [None]:
###############################################################################################################
##################################### Initialisation ##########################################################
###############################################################################################################

In [235]:
import os
from pymongo import MongoClient
import bycon
import pandas as pd
import time, base36

# Prepare mongodb tools
client = MongoClient()
db = client.progenetix
bs = db.biosamples

In [10]:
# mapTCGAids.py
#!/usr/local/bin/python3

from pymongo import MongoClient
import argparse
import json
import re
from progress.bar import Bar

################################################################################
################################################################################
################################################################################

def _get_args():

    parser = argparse.ArgumentParser()
    parser.add_argument("-t", "--test", help="test setting")
    parser.add_argument('-i', '--inputfile', help='a custom file to specify input data')
    parser.add_argument('-s', '--survivalfile', help='...')

    return parser.parse_args()

################################################################################

def main():

    args = _get_args()

    ds_id = 'progenetix'
    
    mapTCGAids(args, ds_id)
    addTCGAsurvival(args, ds_id)

################################################################################

def mapTCGAids(args, ds_id): #re-construct the external descriptions, and add submitter and case ids  of TCGA sampels

    if not args.inputfile:
        print( "Not updating ids - you have to provide an inputfile -i")
        return

    data_client = MongoClient('localhost', 27017)
    data_db = data_client[ ds_id ]
    bios_coll = data_db[ "biosamples" ]
    ind_coll = data_db[ "individuals" ]

    ind_ids = {}

    fh=open(args.inputfile, 'r')
    idlist = [line.rstrip() for line in fh.readlines()[1:]]
    fh.close()

    no = len(idlist)
    if not args.test:
        bar = Bar("{} samples".format(no), max = no, suffix='%(percent)d%%'+" of "+str(no) )
    else:
        idlist = idlist[0:20]

    for line in idlist:
        tcga_s_id, tcga_c_id, tcga_submitter_id  = line.rstrip().split('\t')

        q = {'info.legacy_ids': tcga_s_id}
        b = bios_coll.find_one(q)

        i_e_r = [
            {"id": "pgx:TCGA-"+tcga_c_id, "label": "TCGA case_id"},
            {"id": "pgx:TCGA-"+tcga_submitter_id, "label": "TCGA submitter_id"}
        ]

        if tcga_c_id in ind_ids:
            ind_ids[tcga_c_id]["biosample_ids"].append(b["id"])
        else:
            ind_ids.update({
                tcga_c_id: {
                    "external_references": i_e_r,
                    "biosample_ids": [b["id"]]
                }
            })

        b_e_r = i_e_r.copy()
        b_e_r.append({"id": "pgx:TCGA-"+tcga_s_id, "label": "TCGA sample_id"})

        for e in b["external_references"]:
            if not "TCGA" in e["id"]: # <= important to keep other ers
                b_e_r.append(e)
            if re.match(r'^(?:pgx:)?TCGA\-\w+?$', e["id"]):
                project = re.sub('TCGA-', '', e["id"])
                project = re.sub('pgx:', '', project)
                b_e_r.append({"id": "pgx:TCGA-"+project, "label": "TCGA "+project+" project"})

        if args.test:
            print(tcga_c_id, ind_ids[tcga_c_id]["biosample_ids"])
        else:
            bar.next()
            bios_coll.update_one({"_id":b["_id"]}, {"$set": {"external_references": b_e_r}})


    if not args.test:
        bar.finish()

    ############################################################################

    no = len(ind_ids.keys())
    if not args.test:
        bar = Bar("{} individuals".format(no), max = no, suffix='%(percent)d%%'+" of "+str(no) )

    for i_id, i_data in ind_ids.items():

        i_e_r = i_data["external_references"]
        i_e_r_ids = set()

        ind_ids = bios_coll.distinct("individual_id", {"id":{"$in":i_data["biosample_ids"]}})

        i_s = ind_coll.find({"id":{"$in":ind_ids}})
        for i in i_s:
            for e in i["external_references"]:
                if not "TCGA" in e["id"]: # <= important to keep other ers
                    if not e["id"] in i_e_r_ids:
                        i_e_r_ids.add(e["id"])
                        i_e_r.append(e)

        ind_coll.update_many({"id":{"$in":ind_ids}}, {"$set":{"external_references":i_e_r}})

        if len(ind_ids) > 1:
            ind_ids = ind_ids[1:]

            print("deleting {} of {} individuals".format(len(ind_ids), len(i_data["biosample_ids"])))

            ind_coll.delete_many({"id":{"$in":ind_ids}})


# def mapTCGAsurvival(clinical,output):# map survival information to TCGA samples
#     with open(output + '.csv', 'w') as f:
#         f.write('case_id')
#         f.write('\t')
#         f.write('vital_status')
#         f.write('\n')
#     client = MongoClient('localhost', 27017)
#     #db = client[database]
#     file = open(clinical + '.json', )
#     data = json.load(file)
#     for i in data:
#         with open(output + '.csv', 'a+') as f:
#             f.write(i['case_id'])
#             f.write('\t')
#             f.write(i['demographic']['vital_status'])
#             f.write('\n')

def addTCGAsurvival(args, ds_id):# add survival information for cancer TCGA samples

    if not args.survivalfile:
        print( "You have to provide an survivalfile -s")
        return

    data_client = MongoClient('localhost', 27017)
    data_db = data_client[ ds_id ]
    bios_coll = data_db[ "biosamples" ]
    ind_coll = data_db[ "individuals" ]

    fh=open(args.survivalfile, 'r')
    survlist = [line.rstrip() for line in fh.readlines()[1:]]
    fh.close()

    no = len(survlist)
    if not args.test:
        bar = Bar("{} samples".format(no), max = no, suffix='%(percent)d%%'+" of "+str(no) )
    else:
        survlist = survlist[0:20]

    for line in survlist:
        temp = line.split('\t')
        e_r_s_id = "pgx:TCGA-"+temp[0]
        surv = temp[1].lower()
        bs_q = {'external_references.id': e_r_s_id,'cohorts.id':'pgx:cohort-TCGAcancers'}
        if args.test:
            print(e_r_s_id, surv)
        else:
            bar.next()
            bios_coll.update_many(bs_q, {'$set': {'info.survival_status': surv}})

    if not args.test:
        bar.finish()
        

################################################################################
################################################################################
################################################################################

if __name__ == '__main__':
    main()

#mapTCGAids('TCGA_ids.txt','progenetix')
#addTCGAsurvival('TCGAcase_survival','progenetixCopy')

usage: ipykernel_launcher.py [-h] [-t TEST] [-i INPUTFILE] [-s SURVIVALFILE]
ipykernel_launcher.py: error: unrecognized arguments: -f /Users/kayvongrunigen/Library/Jupyter/runtime/kernel-ce3c82a0-b671-426e-b557-5e0b2a949c95.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
#tcga_mapper.py
#!/usr/local/bin/python3

from pymongo import MongoClient
import json
import re
import json
import argparse
from os import path

################################################################################

def _get_args():

    parser = argparse.ArgumentParser()

    parser.add_argument("-d", "--datasetid", help="dataset, usually progenetix")
    parser.add_argument('-t', '--tcgafile', help='TCGA json data file')
    parser.add_argument('-i', '--idfile', help='idfile')
    parser.add_argument('-u', '--update', help='update modus')

    return parser.parse_args()

################################################################################

def main():

	args = _get_args()

	update_TCGA_data(args)

################################################################################

def update_TCGA_data(args):

	if not args.datasetid:
		print("no datasetid was specified with -d")
		exit()

	if not args.idfile:
		print("no id file was specified with -i")
		exit()

	if not args.tcgafile:
		print("no tcga file was specified with -t")
		exit()

	if not args.update:
		print('no update will be performed - activate with "-u 1"')

	mongo_client = MongoClient()
	data_db = mongo_client[args.datasetid]
	bios_coll = data_db["biosamples"]
	ind_coll = data_db["individuals"]

	f = open(args.idfile, 'r+')
	id_lines = [line for line in f.readlines()]
	f.close()

	tcga_ids = {}

	for id_l in id_lines:
		id_l = id_l.rstrip()
		sample_id, case_id, submitter_id = re.split("\t", id_l)
		tcga_ids.update({sample_id:{"sample_id": sample_id, "case_id": case_id, "submitter_id": submitter_id }})

	with open(args.tcgafile, 'r') as c_f:
		c = c_f.read()
		c_l = json.loads(c)

	tcga_clin = {}

	for tcga_c in c_l:
		tcga_clin.update({ tcga_c["case_id"]: tcga_c })

	for s_id, s_id_o in tcga_ids.items():

		s = bios_coll.find_one({"info.legacy_id": s_id })
		if not s:
			print('!!! no biosample for "{}"'.format(s_id))
			continue

		c_id = s_id_o["case_id"]

		if not c_id in tcga_clin:
			print('!!! no tcga data for case_id "{}"'.format(c_id))
			continue

		clin = tcga_clin[c_id]

		bios_update = {
			"external_references": [],
			"info": s.get("info", {})     # <=== important!
		}

		# references
		# first collecting all "non-TCGA ones" for keeping, then adding the new TCGA ones
		for pgx_e_r in s["external_references"]:
			if "TCGA" in pgx_e_r["label"]:
				if not "project" in pgx_e_r["label"]:
					# skipping previous TCGA labels _except_ the TCGA-XXX for projects
					continue
			pgx_e_r.pop("description", None)
			bios_update["external_references"].append(pgx_e_r)

			for id_t in s_id_o.keys():
				n_id = s_id_o[id_t]
				if not "TCGA-" in n_id:
					n_id = "TCGA-"+s_id_o[id_t]
				id_obj = { "id": n_id, "label": "TCGA "+id_t }
				bios_update["external_references"].append(id_obj)

		# clinical
		f_u_s = clin["demographic"].get("vital_status", "").lower()
		if "dead" in f_u_s:
			bios_update.update({"followup_state":{"id":"EFO:0030049", "label":"dead (follow-up status)"}})
			bios_update["info"].update({"survival_status":"dead"})
		elif "alive" in f_u_s:
			bios_update.update({"followup_state":{"id":"EFO:0030041", "label":"alive (follow-up status)"}})
			bios_update["info"].update({"survival_status":"alive"})
        #Ziying update => since ["demographic"]["days_to_death"] should be correct for dead samples
        if clin["demographic"]["days_to_death"]:
            f_u_d = clin["demographic"]["days_to_death"]
        else:
            f_u_d = clin["diagnoses"][0]["days_to_last_follow_up"]
		#f_u_d = clin["diagnoses"][0]["days_to_last_follow_up"]
		if isinstance(f_u_d, int):
			f_u_m = round(f_u_d/30.5)
			bios_update["info"].update({"followup_months":f_u_m})

		a_d = clin["diagnoses"][0]["age_at_diagnosis"]
		if isinstance(a_d, int):
			a = _d2iso(a_d)
			bios_update.update({"individual_age_at_collection":"{}".format(a)})

		if args.update:
			bios_coll.update_one({"_id": s["_id"] }, { '$set': bios_update } )
		else:
			print(bios_update)

		########################################################################
		# individuals
		########################################################################

		i = ind_coll.find_one({"id": s["individual_id"] })
		if not s:
			print('!!! no individual for "{}"'.format(s["individual_id"]))
			continue

		ind_update = {
			"external_references": [],
			"info": i.get("info", {}),
		}

		# references
		for pgx_e_r in s["external_references"]:
			if "TCGA" in pgx_e_r["label"]:
				if "biosample" in pgx_e_r["label"]:
					continue
				if "TCGA collection" in pgx_e_r["label"]:
					continue
			pgx_e_r.pop("description", None)
			bios_update["external_references"].append(pgx_e_r)
			ind_update["external_references"].append(pgx_e_r)

			for id_t in s_id_o.keys():
				id_obj = { "id": "TCGA-"+s_id_o[id_t], "label": "TCGA "+id_t }
				bios_update["external_references"].append(id_obj)
				ind_update["external_references"].append(id_obj)

		sex = clin["demographic"].get("gender", "")
		if "female" in sex:
			ind_update.update({"sex":{"id":"PATO:0020002", "label":"female genotypic sex"}})
		elif "male" in sex:
			ind_update.update({"sex":{"id":"PATO:0020001", "label":"male genotypic sex"}})

		if args.update:
			ind_coll.update_one({"_id": i["_id"] }, { '$set': ind_update } )
		else:
			print(ind_update)

################################################################################

def _d2iso(d):

	y = int(d / 365.25)
	m = int((d % 365.25) / 30.5)
	d = int(d - (y * 365.25 + m * 30.5))

	return "P{}Y{}M{}D".format(y,m,d)

################################################################################
################################################################################

if __name__ == '__main__':
    main(  )

    
################################################################################
################################################################################
################################################################################
################################################################################
    
# tcga_mapperZY.py    
    
    
    
    
    
    
    
    #!/usr/local/bin/python3

from pymongo import MongoClient
import json
import re
import json
import argparse
from os import path

################################################################################

def _get_args():

    parser = argparse.ArgumentParser()

    parser.add_argument("-d", "--datasetid", help="dataset, usually progenetix")
    parser.add_argument('-t', '--tcgafile', help='TCGA json data file')
    parser.add_argument('-i', '--idfile', help='idfile')
    parser.add_argument('-u', '--update', help='update modus')

    return parser.parse_args()

################################################################################

def main():

	args = _get_args()

	update_TCGA_data(args)

################################################################################

def update_TCGA_data(args):

	if not args.datasetid:
		print("no datasetid was specified with -d")
		exit()

	if not args.idfile:
		print("no id file was specified with -i")
		exit()

	if not args.tcgafile:
		print("no tcga file was specified with -t")
		exit()

	if not args.update:
		print('no update will be performed - activate with "-u 1"')

	mongo_client = MongoClient()
	data_db = mongo_client[args.datasetid]
	bios_coll = data_db["biosamples"]
	ind_coll = data_db["individuals"]

	f = open(args.idfile, 'r+')
	id_lines = [line for line in f.readlines()]
	f.close()

	tcga_ids = {}

	for id_l in id_lines:
		id_l = id_l.rstrip()
		sample_id, case_id, submitter_id = re.split("\t", id_l)
		tcga_ids.update({sample_id:{"sample_id": sample_id, "case_id": case_id, "submitter_id": submitter_id }})

	with open(args.tcgafile, 'r') as c_f:
		c = c_f.read()
		c_l = json.loads(c)

	tcga_clin = {}

	for tcga_c in c_l:
		tcga_clin.update({ tcga_c["case_id"]: tcga_c })

	for s_id, s_id_o in tcga_ids.items():

		s = bios_coll.find_one({"info.legacy_id": s_id })
		if not s:
			print('!!! no biosample for "{}"'.format(s_id))
			continue

		c_id = s_id_o["case_id"]

		if not c_id in tcga_clin:
			print('!!! no tcga data for case_id "{}"'.format(c_id))
			continue

		clin = tcga_clin[c_id]

		bios_update = {
			#"external_references": [],
			"info": s.get("info", {})     # <=== important!
		}

		# references
		#first collecting all "non-TCGA ones" for keeping, then adding the new TCGA ones
		# for pgx_e_r in s["external_references"]:
		# 	if "TCGA" in pgx_e_r["label"]:
		# 		if not "project" in pgx_e_r["label"]:
		# 			# skipping previous TCGA labels _except_ the TCGA-XXX for projects
		# 			continue
		# 	pgx_e_r.pop("description", None)
		# 	bios_update["external_references"].append(pgx_e_r)
		#
		# 	for id_t in s_id_o.keys():
		# 		n_id = s_id_o[id_t]
		# 		if not "TCGA-" in n_id:
		# 			n_id = "TCGA-"+s_id_o[id_t]
		# 		id_obj = { "id": n_id, "label": "TCGA "+id_t }
		# 		bios_update["external_references"].append(id_obj)

		# clinical
		f_u_s = clin["demographic"].get("vital_status", "").lower()
		if "dead" in f_u_s:
			bios_update.update({"followup_state":{"id":"EFO:0030049", "label":"dead (follow-up status)"}})
			bios_update["info"].update({"survival_status":"dead"})
		elif "alive" in f_u_s:
			bios_update.update({"followup_state":{"id":"EFO:0030041", "label":"alive (follow-up status)"}})
			bios_update["info"].update({"survival_status":"alive"})
		if "days_to_death" in clin["demographic"]:
			f_u_d = clin["demographic"]["days_to_death"]
		else:
			f_u_d = clin["diagnoses"][0]["days_to_last_follow_up"]

		#update info clinical
		if "ajcc_clinical_stage" in clin["diagnoses"][0]:
			bios_update["info"].update({"tumor_stage":clin["diagnoses"][0]["ajcc_clinical_stage"].split(' ')[1]})
		if "tumor_grade" in clin["diagnoses"][0]:
			bios_update["info"].update({"tumor_grade": clin["diagnoses"][0]["tumor_grade"]})
		t=n=m=''
		if "ajcc_pathologic_t" in clin["diagnoses"][0]:
			t = clin["diagnoses"][0]["ajcc_pathologic_t"]
		if "ajcc_pathologic_n" in clin["diagnoses"][0]:
			t = clin["diagnoses"][0]["ajcc_pathologic_n"]
		if "ajcc_pathologic_m" in clin["diagnoses"][0]:
			t = clin["diagnoses"][0]["ajcc_pathologic_m"]
		tnm = t+n+m
		if tnm:
			bios_update["info"].update({"tnm": tnm})




		if isinstance(f_u_d, int):
			f_u_m = round(f_u_d/30.5)
			bios_update["info"].update({"followup_months":f_u_m})

		a_d = clin["diagnoses"][0]["age_at_diagnosis"]
		if isinstance(a_d, int):
			a = _d2iso(a_d)
			bios_update.update({"individual_age_at_collection":"{}".format(a)})

		if args.update:
			bios_coll.update_one({"_id": s["_id"] }, { '$set': bios_update } )
		else:
			print(bios_update)

		########################################################################
		# individuals
		########################################################################

		i = ind_coll.find_one({"id": s["individual_id"] })
		if not s:
			print('!!! no individual for "{}"'.format(s["individual_id"]))
			continue

		ind_update = {
			"external_references": [],
			"info": i.get("info", {}),
		}

		# references
		for pgx_e_r in s["external_references"]:
			if "TCGA" in pgx_e_r["label"]:
				if "biosample" in pgx_e_r["label"]:
					continue
				if "TCGA collection" in pgx_e_r["label"]:
					continue
			pgx_e_r.pop("description", None)
			#bios_update["external_references"].append(pgx_e_r)
			ind_update["external_references"].append(pgx_e_r)

			for id_t in s_id_o.keys():
				id_obj = { "id": "TCGA-"+s_id_o[id_t], "label": "TCGA "+id_t }
				#bios_update["external_references"].append(id_obj)
				ind_update["external_references"].append(id_obj)

		sex = clin["demographic"].get("gender", "")
		if "female" in sex:
			ind_update.update({"sex":{"id":"PATO:0020002", "label":"female genotypic sex"}})
		elif "male" in sex:
			ind_update.update({"sex":{"id":"PATO:0020001", "label":"male genotypic sex"}})

		#update diseases in individuals


		if args.update:
			ind_coll.update_one({"_id": i["_id"] }, { '$set': ind_update } )
		else:
			print(ind_update)

################################################################################

def _d2iso(d):

	y = int(d / 365.25)
	m = int((d % 365.25) / 30.5)
	d = int(d - (y * 365.25 + m * 30.5))

	return "P{}Y{}M{}D".format(y,m,d)

################################################################################
################################################################################

if __name__ == '__main__':
    main(  )