<h1 align="center">upload Karina's real data</h1> 

This Jupyter notebook demonstrates how to upload data to [ToxDataCommons](fairtox.com).

__Created by: Shuangyu Zhao, Michigan State University__

-------

In [38]:
import pandas as pd
import numpy as np
import subprocess
import sys
import gen3
import json
from gen3.submission import Gen3Submission
from gen3.auth import Gen3Auth
from gen3.index import Gen3Index
from gen3.query import Gen3Query
from gen3.metadata import Gen3Metadata
from gen3.file import Gen3File
import os

# download and import some custom Python scripts from https://github.com/cgmeyer/gen3sdk-python
# os.system("wget https://raw.githubusercontent.com/cgmeyer/gen3sdk-python/master/expansion/expansion.py")
from expansion import Gen3Expansion

In [54]:
api = 'https://fairtox.com/'
cred = '/Users/apple/Desktop/test_gen3/credentials22.json'
auth = Gen3Auth(api, refresh_file=cred)
sub = Gen3Submission(api, auth)
query = Gen3Query(auth)
index = Gen3Index(auth)
file = Gen3File(auth)
metadata = Gen3Metadata(auth)
exp = Gen3Expansion(api,auth,sub)

## create program

In [40]:
prog = 'MyFirstProgram'

prog_txt = """{
    "dbgap_accession_number": "%s",
    "type": "program",
    "name": "%s"
}""" % (prog,prog)

prog_json = json.loads(prog_txt)
data = sub.create_program(json=prog_json)

## create project

In [41]:
proj_txt = """{
    "availability_type": "Open",
    "code": "MyFirstProject",
    "dbgap_accession_number": "MyFirstProject",
    "type": "project",
    "contact_name": "test",
    "institution": "MSU",
    "description": "test",
    "email_address": "xxxxx@fdas.sdfs",
    "telephone_number": "ssd-asdf-asdf"
    }"""
proj_json = json.loads(proj_txt)
data = sub.create_project(program="MyFirstProgram",json=proj_json) 

## upload other nodes

In [44]:
df = pd.read_table("/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/study.tsv")
df.head()

Unnamed: 0,type,submitter_id,projects.code,study_title,study_description,study_design,study_type,experimental_setting,organism,provenance
0,study,Prj171,MyFirstProject,Prj171_Mm_TCDD_RDDR-28D_Male,Male mice were treated with TCDD every 4 days ...,dose response design,Toxicogenomics,in vivo,Mus musculus,


In [45]:
# study
df = pd.read_table('/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/study.tsv')
df["projects.code"].fillna("MyFirstProject", inplace=True)
df.to_csv('/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/study.tsv', sep='\t', index=False)
data = sub.submit_file(filename="/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/study.tsv", project_id="MyFirstProgram-MyFirstProject")


Submitting /Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/study.tsv with 1 records.
Chunk 1 (chunk size: 30, submitted: 0 of 1)
	 Reducing Chunk Size: { "error": "service failure - try again later"}
Retrying Chunk with reduced chunk_size: 15
Chunk 2 (chunk size: 15, submitted: 0 of 1)
	 Reducing Chunk Size: { "error": "service failure - try again later"}
Retrying Chunk with reduced chunk_size: 7
Chunk 3 (chunk size: 7, submitted: 0 of 1)
	 Succeeded: 1 entities.
Finished data submission.
Successful records: 1
Failed invalid records: 0


In [46]:
df = pd.read_table('/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/contact.tsv')
df.head()

Unnamed: 0,type,submitter_id,studies.submitter_id,last_name,first_name,middle_name,contact_orcid,contact_email,contact_telephone,contact_department,contact_institution,location,provenance
0,contact,Prj171:Zacharewski:Timothy,Prj171,Zacharewski,Timothy,,0000-0002-3662-7919,tzachare@msu.edu,517-884-2054,Biochemistry and Molecular Biology,Michigan State University,48824:East Lansing,


In [47]:
# contact
data = sub.submit_file(filename="/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/contact.tsv", project_id="MyFirstProgram-MyFirstProject")


Submitting /Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/contact.tsv with 1 records.
Chunk 1 (chunk size: 30, submitted: 0 of 1)
	 Succeeded: 1 entities.
Finished data submission.
Successful records: 1
Failed invalid records: 0


In [48]:
df = pd.read_table('/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/funding.tsv')
df.head()

Unnamed: 0,type,submitter_id,studies.submitter_id,support_id,support_source,provenance
0,funding,Prj171:R01ES029541:P42ES004911,Prj171,R01ES029541:P42ES004911,NIEHS; Superfund Basic Research Program,


In [49]:
# funding
data = sub.submit_file(filename="/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/funding.tsv", project_id="MyFirstProgram-MyFirstProject")


Submitting /Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/funding.tsv with 1 records.
Chunk 1 (chunk size: 30, submitted: 0 of 1)
	 Reducing Chunk Size: { "error": "service failure - try again later"}
Retrying Chunk with reduced chunk_size: 15
Chunk 2 (chunk size: 15, submitted: 0 of 1)
	 Succeeded: 1 entities.
Finished data submission.
Successful records: 1
Failed invalid records: 0


In [50]:
df = pd.read_table('/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/publication.tsv')
df.head()

Unnamed: 0,type,submitter_id,studies.submitter_id,PMC_id,DOI,PMID,provenance
0,publication,Prj171,Prj171,,,,


In [51]:
# publication
data = sub.submit_file(filename="/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/publication.tsv", project_id="MyFirstProgram-MyFirstProject")


Submitting /Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/publication.tsv with 1 records.
Chunk 1 (chunk size: 30, submitted: 0 of 1)
	 Succeeded: 1 entities.
Finished data submission.
Successful records: 1
Failed invalid records: 0


In [52]:
df = pd.read_table("/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/subject.tsv")
df.head()

Unnamed: 0,type,submitter_id,studies.submitter_id,start_date,start_date_age,experiment_start_date,experiment_start_zt,sex,strain,strain_source,euthanasia_date,euthanasia_zt,euthanasia_method,provenance
0,subject,Prj171:M97,Prj171,2022-03-09,23,2022-03-14,0,male,C57BL/6NCrl,Charles Rivers Laboratories,2022-04-11,0,Carbon dioxide asphyxiation,
1,subject,Prj171:M94,Prj171,2022-03-09,23,2022-03-14,0,male,C57BL/6NCrl,Charles Rivers Laboratories,2022-04-11,0,Carbon dioxide asphyxiation,
2,subject,Prj171:M92,Prj171,2022-03-09,23,2022-03-14,0,male,C57BL/6NCrl,Charles Rivers Laboratories,2022-04-11,0,Carbon dioxide asphyxiation,
3,subject,Prj171:M91,Prj171,2022-03-09,23,2022-03-14,0,male,C57BL/6NCrl,Charles Rivers Laboratories,2022-04-11,0,Carbon dioxide asphyxiation,
4,subject,Prj171:M89,Prj171,2022-03-09,23,2022-03-14,0,male,C57BL/6NCrl,Charles Rivers Laboratories,2022-04-11,0,Carbon dioxide asphyxiation,


In [57]:
# subject
data = sub.submit_file(filename="/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/subject.tsv", project_id="MyFirstProgram-MyFirstProject")


Submitting /Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/subject.tsv with 144 records.
Chunk 1 (chunk size: 30, submitted: 0 of 144)
	 Succeeded: 30 entities.
Chunk 2 (chunk size: 30, submitted: 30 of 144)
	 Succeeded: 30 entities.
Chunk 3 (chunk size: 30, submitted: 60 of 144)
	 Succeeded: 30 entities.
Chunk 4 (chunk size: 30, submitted: 90 of 144)
	 Succeeded: 30 entities.
Chunk 5 (chunk size: 30, submitted: 120 of 144)
	 Succeeded: 24 entities.
Finished data submission.
Successful records: 144
Failed invalid records: 0


In [58]:
df = pd.read_table("/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/treatment.tsv")
df.head()

Unnamed: 0,type,submitter_id,subjects.submitter_id,date,administration_volume_ml,dose_amount,dose_amount_unit,route,test_article_administration_zt,test_article_administration_duration,test_article_name,test_article_dtxsid,vehicle_name,vehicle_dtxsid,treatment_protocol,provenance
0,treatment,Prj171:DTXSID4051378:2022-03-09:0,"Prj171:M97,Prj171:M75,Prj171:M123,Prj171:M122,...",2022-03-09,0.1,0.0,microgram per kilogram,Oral Gavage Route of Administration,0,,TCDD,DTXSID4051378,Sesame Oil,DTXSID9033971,treatment protocol,
1,treatment,Prj171:DTXSID4051378:2022-03-09:30,"Prj171:M94,Prj171:M143,Prj171:M142,Prj171:M119...",2022-03-09,0.1,30.0,microgram per kilogram,Oral Gavage Route of Administration,0,,TCDD,DTXSID4051378,Sesame Oil,DTXSID9033971,treatment protocol,
2,treatment,Prj171:DTXSID4051378:2022-03-09:10,"Prj171:M92,Prj171:M91,Prj171:M139,Prj171:M116,...",2022-03-09,0.1,10.0,microgram per kilogram,Oral Gavage Route of Administration,0,,TCDD,DTXSID4051378,Sesame Oil,DTXSID9033971,treatment protocol,
3,treatment,Prj171:DTXSID4051378:2022-03-09:3,"Prj171:M89,Prj171:M88,Prj171:M137,Prj171:M113,...",2022-03-09,0.1,3.0,microgram per kilogram,Oral Gavage Route of Administration,0,,TCDD,DTXSID4051378,Sesame Oil,DTXSID9033971,treatment protocol,
4,treatment,Prj171:DTXSID4051378:2022-03-09:1,"Prj171:M86,Prj171:M85,Prj171:M134,Prj171:M110,...",2022-03-09,0.1,1.0,microgram per kilogram,Oral Gavage Route of Administration,0,,TCDD,DTXSID4051378,Sesame Oil,DTXSID9033971,treatment protocol,


In [60]:


# treatment 
data = sub.submit_file(filename="/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/treatment.tsv", project_id="MyFirstProgram-MyFirstProject")


Submitting /Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/treatment.tsv with 8 records.
Chunk 1 (chunk size: 30, submitted: 0 of 8)
	 Succeeded: 8 entities.
Finished data submission.
Successful records: 8
Failed invalid records: 0


In [61]:
df = pd.read_table("/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/housing.tsv")
df.head()

Unnamed: 0,type,submitter_id,subjects.submitter_id,cageID,housing_change_date,bedding_type,cage_type,vivarium_temperature_C,vivarium_humidity_percentage,vivarium_light_cycle,provenance
0,housing,Prj171:Cage033:2022-03-09,"Prj171:M97,Prj171:M99,Prj171:M98",Cage033,2022-03-09,ALPHA-dri,Innovive Innocage,23,35,12:12,
1,housing,Prj171:Cage032:2022-03-09,"Prj171:M94,Prj171:M96,Prj171:M95",Cage032,2022-03-09,ALPHA-dri,Innovive Innocage,23,35,12:12,
2,housing,Prj171:Cage031:2022-03-09,"Prj171:M92,Prj171:M91,Prj171:M93",Cage031,2022-03-09,ALPHA-dri,Innovive Innocage,23,35,12:12,
3,housing,Prj171:Cage030:2022-03-09,"Prj171:M89,Prj171:M88,Prj171:M90",Cage030,2022-03-09,ALPHA-dri,Innovive Innocage,23,35,12:12,
4,housing,Prj171:Cage029:2022-03-09,"Prj171:M86,Prj171:M85,Prj171:M87",Cage029,2022-03-09,ALPHA-dri,Innovive Innocage,23,35,12:12,


In [64]:
# housing
data = sub.submit_file(filename="/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/housing.tsv", project_id="MyFirstProgram-MyFirstProject")


Submitting /Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/housing.tsv with 48 records.
Chunk 1 (chunk size: 30, submitted: 0 of 48)
	 Succeeded: 30 entities.
Chunk 2 (chunk size: 30, submitted: 30 of 48)
	 Succeeded: 18 entities.
Finished data submission.
Successful records: 48
Failed invalid records: 0


In [119]:
df = pd.read_table("/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/diet.tsv")
df = df.rename(columns={"feeding_paradigm": "feed_paradigm"})
df = df.drop("date", axis=1)
df.to_csv('/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/diet_new.tsv', sep='\t', index=False)

In [120]:
# diet
data = sub.submit_file(filename="/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/diet_new.tsv", project_id="MyFirstProgram-MyFirstProject")


Submitting /Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/diet_new.tsv with 1 records.
Chunk 1 (chunk size: 30, submitted: 0 of 1)
	 Succeeded: 1 entities.
Finished data submission.
Successful records: 1
Failed invalid records: 0


In [66]:
df = pd.read_table("/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/sample.tsv")
df.head()

Unnamed: 0,type,submitter_id,subjects.submitter_id,date,biospecimen_anatomic_site,method_of_sample_procurement,preservation_method,weight,volume,storage_vessel,collection_protocol,provenance
0,sample,Prj171:L1,Prj171:M1,2022-04-11,Liver,,Snap Frozen,,,,,"DataHarmonizer v1.4.10, Sample v4.3.1"
1,sample,Prj171:L2,Prj171:M2,2022-04-11,Liver,,Snap Frozen,,,,,"DataHarmonizer v1.4.10, Sample v4.3.1"
2,sample,Prj171:L3,Prj171:M3,2022-04-11,Liver,,Snap Frozen,,,,,"DataHarmonizer v1.4.10, Sample v4.3.1"
3,sample,Prj171:L4,Prj171:M4,2022-04-11,Liver,,Snap Frozen,,,,,"DataHarmonizer v1.4.10, Sample v4.3.1"
4,sample,Prj171:L5,Prj171:M5,2022-04-11,Liver,,Snap Frozen,,,,,"DataHarmonizer v1.4.10, Sample v4.3.1"


In [76]:
api = 'https://fairtox.com/'
cred = '/Users/apple/Desktop/test_gen3/credentials22.json'
auth = Gen3Auth(api, refresh_file=cred)
sub = Gen3Submission(api, auth)
query = Gen3Query(auth)
index = Gen3Index(auth)
file = Gen3File(auth)
metadata = Gen3Metadata(auth)
exp = Gen3Expansion(api,auth,sub)

In [78]:
data = sub.submit_file(filename="/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/sample.tsv", project_id="MyFirstProgram-MyFirstProject")


Submitting /Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/sample.tsv with 268 records.
Chunk 1 (chunk size: 30, submitted: 0 of 268)
	 Succeeded: 30 entities.
Chunk 2 (chunk size: 30, submitted: 30 of 268)
	 Succeeded: 30 entities.
Chunk 3 (chunk size: 30, submitted: 60 of 268)
	 Succeeded: 30 entities.
Chunk 4 (chunk size: 30, submitted: 90 of 268)
	 Succeeded: 30 entities.
Chunk 5 (chunk size: 30, submitted: 120 of 268)
	 Succeeded: 30 entities.
Chunk 6 (chunk size: 30, submitted: 150 of 268)
	 Reducing Chunk Size: { "error": "service failure - try again later"}
Retrying Chunk with reduced chunk_size: 15
Chunk 7 (chunk size: 15, submitted: 150 of 268)
	 Succeeded: 15 entities.
Chunk 8 (chunk size: 15, submitted: 165 of 268)
	 Succeeded: 15 entities.
Chunk 9 (chunk size: 15, submitted: 180 of 268)
	 Succeeded: 15 entities.
Chunk 10 (chunk size: 15, submitted: 195 of 268)
	 Succeeded: 15 entities.
Chunk 11 (chunk size: 15, submitted: 210 of 268)
	 Succeeded: 15 entiti

In [83]:
df = pd.read_table("/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/aliquot.tsv")
df.head()

Unnamed: 0,type,submitter_id,samples.submitter_id,analyte_type,derivitization,extract_preservation_method,analyte_protocol,provenance
0,aliquot,Prj171:metabolite_extract_L75,Prj171:L75,Aqueous,,-80C,,
1,aliquot,Prj171:metabolite_extract_L121,Prj171:L121,Aqueous,,-80C,,
2,aliquot,Prj171:metabolite_extract_L97,Prj171:L97,Aqueous,,-80C,,
3,aliquot,Prj171:metabolite_extract_L122,Prj171:L122,Aqueous,,-80C,,
4,aliquot,Prj171:metabolite_extract_L123,Prj171:L123,Aqueous,,-80C,,


In [81]:
# aliquot
data = sub.submit_file(filename="/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/aliquot.tsv", project_id="MyFirstProgram-MyFirstProject")


Submitting /Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/aliquot.tsv with 30 records.
Chunk 1 (chunk size: 30, submitted: 0 of 30)
	Chunk Failed (status code 400): 30 entities.
	Invalid records in this chunk: 1
Retrying submission of valid entities from failed chunk: 29 valid entities.
Chunk 2 (chunk size: 30, submitted: 1 of 30)
	 Succeeded: 29 entities.
Chunk 3 (chunk size: 30, submitted: 30 of 30)
	Chunk Failed (status code 400): 0 entities.
	Invalid records in this chunk: 0
Finished data submission.
Successful records: 29
Failed invalid records: 1


In [87]:
df = pd.read_table("/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/mass_spec_assay.tsv")
df.head()

Unnamed: 0,type,submitter_id,aliquots.submitter_id,sample_dilution,calibration_standard,chromatography_type,chromatography_column,chromatography_protocol,chromatography_instrument,chromatography_method_filename,...,solventA,solventB,solventC,carrier_gas,oven_temperature_program,ms_type,ms_protocol,ms_method_filename,ion_mode,provenance
0,mass_spec_assay,Prj171:TQSm_042122_010,Prj171:metabolite_extract_L75,,"13C,15N-methionine",Reversed phase,"Waters ACQUITY UPLC HSS T3 (100 x 2.1mm,1.8um)",The mobile phases were 10mM PFHA in water (mob...,Waters TQS,,...,100% water; ; 10mM perfluoroheptanoic acid,100 acetonitrile; ;,,,,ESI,Multiple reaction monitoring in positive ion mode,,POSITIVE,"DataHarmonizer v1.4.10, mass_spec_assay v1.0.0"
1,mass_spec_assay,Prj171:TQSm_042122_011,Prj171:metabolite_extract_L121,,"13C,15N-methionine",Reversed phase,"Waters ACQUITY UPLC HSS T3 (100 x 2.1mm,1.8um)",The mobile phases were 10mM PFHA in water (mob...,Waters TQS,,...,100% water; ; 10mM perfluoroheptanoic acid,100 acetonitrile; ;,,,,ESI,Multiple reaction monitoring in positive ion mode,,POSITIVE,"DataHarmonizer v1.4.10, mass_spec_assay v1.0.0"
2,mass_spec_assay,Prj171:TQSm_042122_012,Prj171:metabolite_extract_L97,,"13C,15N-methionine",Reversed phase,"Waters ACQUITY UPLC HSS T3 (100 x 2.1mm,1.8um)",The mobile phases were 10mM PFHA in water (mob...,Waters TQS,,...,100% water; ; 10mM perfluoroheptanoic acid,100 acetonitrile; ;,,,,ESI,Multiple reaction monitoring in positive ion mode,,POSITIVE,"DataHarmonizer v1.4.10, mass_spec_assay v1.0.0"
3,mass_spec_assay,Prj171:TQSm_042122_013,Prj171:metabolite_extract_L122,,"13C,15N-methionine",Reversed phase,"Waters ACQUITY UPLC HSS T3 (100 x 2.1mm,1.8um)",The mobile phases were 10mM PFHA in water (mob...,Waters TQS,,...,100% water; ; 10mM perfluoroheptanoic acid,100 acetonitrile; ;,,,,ESI,Multiple reaction monitoring in positive ion mode,,POSITIVE,"DataHarmonizer v1.4.10, mass_spec_assay v1.0.0"
4,mass_spec_assay,Prj171:TQSm_042122_014,Prj171:metabolite_extract_L123,,"13C,15N-methionine",Reversed phase,"Waters ACQUITY UPLC HSS T3 (100 x 2.1mm,1.8um)",The mobile phases were 10mM PFHA in water (mob...,Waters TQS,,...,100% water; ; 10mM perfluoroheptanoic acid,100 acetonitrile; ;,,,,ESI,Multiple reaction monitoring in positive ion mode,,POSITIVE,"DataHarmonizer v1.4.10, mass_spec_assay v1.0.0"


In [88]:
# mass_spec_assay
import requests
COMMONS = "https://fairtox.com/"
API_KEY_FILEPATH = '/Users/apple/Desktop/test_gen3/credentials22.json'

projectname = 'MyFirstProject'
programname = 'MyFirstProgram'
api_url = "{}/api/v0/submission/{}/{}".format(COMMONS,programname,projectname)
df = pd.read_table('/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/mass_spec_assay.tsv')
col_name = df.columns.tolist()
# this is external link
col_name.remove("aliquots.submitter_id")

for _, row in df.iterrows():
    jsondata = []
    # this is for the external link
    dic = {
        "aliquots": [
            {
                "submitter_id": row["aliquots.submitter_id"]
            }
        ]
    }

    for i in col_name:
        value = row[i]
        if isinstance(value, float) and (value == float('inf') or value == float('-inf') or pd.isna(value)):
            dic[i] = str(value)
        else:
            dic[i] = value

    jsondata.append(dic)
    print(jsondata)
    authn = Gen3Auth(COMMONS, refresh_file=API_KEY_FILEPATH)
    output = requests.put(api_url, auth=authn, json=jsondata)
    output.json()

[{'aliquots': [{'submitter_id': 'Prj171:metabolite_extract_L75'}], 'type': 'mass_spec_assay', 'submitter_id': 'Prj171:TQSm_042122_010', 'sample_dilution': 'nan', 'calibration_standard': '13C,15N-methionine', 'chromatography_type': 'Reversed phase', 'chromatography_column': 'Waters ACQUITY UPLC HSS T3 (100 x 2.1mm,1.8um)', 'chromatography_protocol': 'The mobile phases were 10mM PFHA in water (mobile phase A) and acetonitrile (mobile phase B) using the following gradient: 0 min – 100% A, 1.0 min – 100% A, 6.0 min – 35% A, 6.01 min – 10% A, 7.0 min – 10% A, 7.01 min – 100% A, 9.0 min – 100% A)', 'chromatography_instrument': 'Waters TQS', 'chromatography_method_filename': 'nan', 'elution_program': '0 min – 100% A, 1.0 min – 100% A, 6.0 min – 35% A, 6.01 min – 10% A, 7.0 min – 10% A, 7.01 min – 100% A, 9.0 min – 100% A', 'flow_rate': '0.3 ml/minute', 'solventA': '100% water; ; 10mM perfluoroheptanoic acid', 'solventB': '100 acetonitrile; ;', 'solventC': 'nan', 'carrier_gas': 'nan', 'oven_te

In [89]:
df = pd.read_table("/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/ms_raw_data.tsv")
df.head()

Unnamed: 0,type,submitter_id,mass_spec_assays.submitter_id,file_name,file_size,md5sum,object_id,data_category,data_format,data_type,file_source_repository,repository_accession_id,repository_download_ftp,provenance
0,ms_raw_data,Prj171:TQSm_042122_010.raw,Prj171:TQSm_042122_010,TQSm_042122_010.raw,,,,targeted metabolomics,.RAW,chromatograms,,,,
1,ms_raw_data,Prj171:TQSm_042122_011.raw,Prj171:TQSm_042122_011,TQSm_042122_011.raw,,,,targeted metabolomics,.RAW,chromatograms,,,,
2,ms_raw_data,Prj171:TQSm_042122_012.raw,Prj171:TQSm_042122_012,TQSm_042122_012.raw,,,,targeted metabolomics,.RAW,chromatograms,,,,
3,ms_raw_data,Prj171:TQSm_042122_013.raw,Prj171:TQSm_042122_013,TQSm_042122_013.raw,,,,targeted metabolomics,.RAW,chromatograms,,,,
4,ms_raw_data,Prj171:TQSm_042122_014.raw,Prj171:TQSm_042122_014,TQSm_042122_014.raw,,,,targeted metabolomics,.RAW,chromatograms,,,,


In [90]:
# create a folder of files
df = pd.read_table("/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/ms_raw_data.tsv")
file_names = df["file_name"].to_list()
for file_name in file_names:
    path = '/Users/apple/Desktop/test_gen3/Karina_realData/ms_Raw_Data/' + file_name
    with open(path, 'w') as file:
        file.write(path)
        

In [92]:
# automatically insert md5sum and file_size
import pandas as pd
import hashlib
import os
def calculate_md5(file_path):
    """Calculate the MD5 checksum for a file."""
    md5_hash = hashlib.md5()
    with open(file_path, 'rb') as file:
        for chunk in iter(lambda: file.read(4096), b''):
            md5_hash.update(chunk)
    return md5_hash.hexdigest()

# Provide the file name list and DataFrame
df = pd.read_table("/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/ms_raw_data.tsv")
file_names = df["file_name"].to_list()

# Provide the path to the folder containing the files
folder_path = '/Users/apple/Desktop/test_gen3/Karina_realData/ms_Raw_Data/'

# Update the MD5 checksum for each file name in the DataFrame
for file_name in file_names:
    file_path = folder_path + file_name
    md5_sum = calculate_md5(file_path)
    file_size = os.path.getsize(file_path)
    df.loc[df['file_name'] == file_name, 'md5sum'] = md5_sum
    df.loc[df['file_name'] == file_name, 'file_size'] = file_size
df['file_size'] = df['file_size'].astype(int)
df.to_csv("/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/ms_raw_data_revised.tsv", sep='\t', index=False)
df.head()

Unnamed: 0,type,submitter_id,mass_spec_assays.submitter_id,file_name,file_size,md5sum,object_id,data_category,data_format,data_type,file_source_repository,repository_accession_id,repository_download_ftp,provenance
0,ms_raw_data,Prj171:TQSm_042122_010.raw,Prj171:TQSm_042122_010,TQSm_042122_010.raw,78,edf9bd0cb51cc2c5ad21aa1b96f5fc1d,,targeted metabolomics,.RAW,chromatograms,,,,
1,ms_raw_data,Prj171:TQSm_042122_011.raw,Prj171:TQSm_042122_011,TQSm_042122_011.raw,78,eb49e4445a8a4f7ae783e3ff49d839bd,,targeted metabolomics,.RAW,chromatograms,,,,
2,ms_raw_data,Prj171:TQSm_042122_012.raw,Prj171:TQSm_042122_012,TQSm_042122_012.raw,78,d0472e55f1d68713118da5424b584717,,targeted metabolomics,.RAW,chromatograms,,,,
3,ms_raw_data,Prj171:TQSm_042122_013.raw,Prj171:TQSm_042122_013,TQSm_042122_013.raw,78,03249e0e9d0ef7863153915af1c68460,,targeted metabolomics,.RAW,chromatograms,,,,
4,ms_raw_data,Prj171:TQSm_042122_014.raw,Prj171:TQSm_042122_014,TQSm_042122_014.raw,78,54362b27a442815d208adf6e51e071f5,,targeted metabolomics,.RAW,chromatograms,,,,


In [93]:
# ms_raw_Data
data = sub.submit_file(filename="/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/ms_raw_data_revised.tsv", project_id="MyFirstProgram-MyFirstProject")


Submitting /Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/ms_raw_data_revised.tsv with 30 records.
Chunk 1 (chunk size: 30, submitted: 0 of 30)
	Chunk Failed (status code 400): 30 entities.
	Invalid records in this chunk: 1
Retrying submission of valid entities from failed chunk: 29 valid entities.
Chunk 2 (chunk size: 30, submitted: 1 of 30)
	 Succeeded: 29 entities.
Chunk 3 (chunk size: 30, submitted: 30 of 30)
	Chunk Failed (status code 400): 0 entities.
	Invalid records in this chunk: 0
Finished data submission.
Successful records: 29
Failed invalid records: 1


In [94]:
df = pd.read_table("/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/ms_analysis.tsv")

df.head()

Unnamed: 0,type,submitter_id,ms_raw_datas.submitter_id,normalization,transformation,transformation_purpose,transformation_description,unit,analysis_protocols,provenance
0,ms_analysis,Prj171:TQSm_042122_010,Prj171:TQSm_042122_010.raw,normalized to protein,,,,normalized peak area,,"DataHarmonizer v1.4.10, mass_spec_assay v1.0.0"
1,ms_analysis,Prj171:TQSm_042122_011,Prj171:TQSm_042122_011.raw,normalized to protein,,,,normalized peak area,,"DataHarmonizer v1.4.10, mass_spec_assay v1.0.0"
2,ms_analysis,Prj171:TQSm_042122_012,Prj171:TQSm_042122_012.raw,normalized to protein,,,,normalized peak area,,"DataHarmonizer v1.4.10, mass_spec_assay v1.0.0"
3,ms_analysis,Prj171:TQSm_042122_013,Prj171:TQSm_042122_013.raw,normalized to protein,,,,normalized peak area,,"DataHarmonizer v1.4.10, mass_spec_assay v1.0.0"
4,ms_analysis,Prj171:TQSm_042122_014,Prj171:TQSm_042122_014.raw,normalized to protein,,,,normalized peak area,,"DataHarmonizer v1.4.10, mass_spec_assay v1.0.0"


In [95]:
# ms_analysis
data = sub.submit_file(filename="/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/ms_analysis.tsv", project_id="MyFirstProgram-MyFirstProject")


Submitting /Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/ms_analysis.tsv with 30 records.
Chunk 1 (chunk size: 30, submitted: 0 of 30)
	Chunk Failed (status code 400): 30 entities.
	Invalid records in this chunk: 1
Retrying submission of valid entities from failed chunk: 29 valid entities.
Chunk 2 (chunk size: 30, submitted: 1 of 30)
	 Succeeded: 29 entities.
Chunk 3 (chunk size: 30, submitted: 30 of 30)
	Chunk Failed (status code 400): 0 entities.
	Invalid records in this chunk: 0
Finished data submission.
Successful records: 29
Failed invalid records: 1


In [97]:
# create a folder of files
df = pd.read_table("/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/ms_analysed_data.tsv")
file_names = df["file_name"].to_list()
for file_name in file_names:
    path = '/Users/apple/Desktop/test_gen3/Karina_realData/ms_processed_Data_file/' + file_name
    with open(path, 'w') as file:
        file.write(path)

In [98]:
import pandas as pd
import hashlib
import os
def calculate_md5(file_path):
    """Calculate the MD5 checksum for a file."""
    md5_hash = hashlib.md5()
    with open(file_path, 'rb') as file:
        for chunk in iter(lambda: file.read(4096), b''):
            md5_hash.update(chunk)
    return md5_hash.hexdigest()

# Provide the file name list and DataFrame
df = pd.read_table("/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/ms_analysed_data.tsv")
file_names = df["file_name"].to_list()

folder_path = '/Users/apple/Desktop/test_gen3/Karina_realData/ms_processed_Data_file/'

# Update the MD5 checksum for each file name in the DataFrame
for file_name in file_names:
    file_path = folder_path + file_name
    md5_sum = calculate_md5(file_path)
    file_size = os.path.getsize(file_path)
    df.loc[df['file_name'] == file_name, 'md5sum'] = md5_sum
    df.loc[df['file_name'] == file_name, 'file_size'] = file_size
df['file_size'] = df['file_size'].astype(int)
df.to_csv("/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/ms_analysed_data_updated.tsv", sep='\t', index=False)
df.head()

Unnamed: 0,type,submitter_id,ms_analyses.submitter_id,file_name,file_size,md5sum,object_id,data_category,data_format,data_type,file_source_repository,repository_accession_id,repository_download_ftp,provenance
0,ms_analysed_data,Prj171:normalized_data.txt,"Prj171:TQSm_042122_010,Prj171:TQSm_042122_011,...",normalized_data.txt,89,6dcf435b67007cc409b989d62288b664,,targeted metabolomics,.txt,integrated peaks,,,,"DataHarmonizer v1.4.10, mass_spec_assay v1.0.0"


In [102]:
# ms_analysed_data
data = sub.submit_file(filename="/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/ms_analysed_data_updated.tsv", project_id="MyFirstProgram-MyFirstProject")



Submitting /Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/ms_analysed_data_updated.tsv with 1 records.
Chunk 1 (chunk size: 30, submitted: 0 of 1)
	 Succeeded: 1 entities.
Finished data submission.
Successful records: 1
Failed invalid records: 0


Prj171:TQSm_042122_012 is not updated successfully

In [113]:
df = pd.read_table("/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/metabolite_id.tsv")
df.head()

Unnamed: 0,type,submitter_id,ms_analysed_datas.submitter_id,metabolite_name,refmet_name,dtxsid,inchikey,mass,elution_time,provenance
0,metabolite_id,Prj171:S-(2-carboxyethyl)-L-cysteine,Prj171:normalized_data.txt,S-(2-carboxyethyl)-L-cysteine,,DTXSID40193329,"1S/C6H11NO4S/c7-4(6(10)11)3-12-2-1-5(8)9/h4H,1...",193.22,2.6,
1,metabolite_id,"Prj171:13C,15N-methionine",Prj171:normalized_data.txt,"13C,15N-methionine",methionine,,"1S/C5H11NO2S/c1-9-3-2-4(6)5(7)8/h4H,2-3,6H2,1H...",149.0511,5.12,


In [114]:
# metabolite_id
data = sub.submit_file(filename="/Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/metabolite_id.tsv", project_id="MyFirstProgram-MyFirstProject")


Submitting /Users/apple/Desktop/test_gen3/Karina_realData/gen3_test_update/metabolite_id.tsv with 2 records.
Chunk 1 (chunk size: 30, submitted: 0 of 2)
	 Succeeded: 2 entities.
Finished data submission.
Successful records: 2
Failed invalid records: 0
