# MS SQL - MySQL bridge
This notebook queries the cBioPortal database (version 2.7.3) using SQL against the CasJobs context cbioportal_mysql,
and compares the results with the equivalent commands of the cgdsr R library, documented
<a href="https://cran.r-project.org/web/packages/cgdsr/cgdsr.pdf" target="_blank">here</a>.

The examples are taken from the demo script for this R library. TODO: add URL

## NB: CasJobs group
For now you also need to be a member of the Polybase group in CasJobs to be allowed to query the cbioportal_mysql database context. Soon that group should be merged with the SciServer Polybase group.

In [None]:
# Notebook display settings: allow up to 6000 rows and 200 columns when a
# matrix/data frame is rendered.
#R.version
options(
  repr.matrix.max.rows = 6000,
  repr.matrix.max.cols = 200
)

In [None]:
# Must run this first. Only packages that are not yet installed are fetched,
# so re-running the cell does not reinstall them.
for (pkg in c('cgdsr', 'reshape')) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

In [None]:
# Load the packages used below. library() stops with an error when a package
# is missing, whereas require() would only warn and return FALSE, letting
# later cells fail with a less obvious message.
library(cgdsr)      # CGDS client: getCancerStudies, getCaseLists, ...
library(SciServer)  # CasJobs.executeQuery
library(reshape)    # cast
# define the CasJobs database context that is being queried
DATABASE <- "cbioportal_mysql"

In [None]:
# Build the CGDS connection object for the public cBioPortal server.
mycgds <- CGDS("http://www.cbioportal.org/")
# Optional check of the connection object:
#test(mycgds)

# The database
The data model is available as PNG and PDF files in this folder.

<a href="https://github.com/cBioPortal/cbioportal/raw/master/db-scripts/src/main/resources/cbioportal-er-diagram_v2.7.0.png" target="_blank">Link to data model diagram</a>

## getCancerStudies

In [None]:
# Get list of cancer studies at server
# (the stray trailing comma, which passed an empty argument, has been removed)
allCS <- getCancerStudies(mycgds)
head(allCS, 4)

In [None]:
# Take the first row / first column of the study list; this value is used
# below as the cancer_study_identifier in the SQL queries.
mycancerstudy <- allCS[1, 1]

In [None]:
# SQL counterpart of getCancerStudies: identifier, name and description of
# every row in cancer_study, ordered by name.
sql <- "
select CANCER_STUDY_IDENTIFIER as cancer_study_id,name,description from cancer_study
order by name
"
# Execute in the CasJobs context and preview the first four rows.
df <- CasJobs.executeQuery(sql, DATABASE)
head(df, n = 4)

## getCaseLists

In [None]:
# Fetch the case lists of the selected study and keep the first row / first
# column value; it is used below as the sample_list stable_id.
mycaselist <- getCaseLists(mycgds, mycancerstudy)[1, 1]
mycaselist

Note the case_ids column, which is a concatenation of the stable_id values from the sample table.
In Polybase 2019 this can already be done with the STRING_AGG function. Note that the aggregation is most likely performed on the MS SQL side, i.e. it is not expected to be translated into a MySQL GROUP_CONCAT call.


In [None]:
# SQL counterpart of getCaseLists for the selected study.
# The sample stable_ids of each list are concatenated with STRING_AGG.
# Start the timer before building and running the query.
ptm <- proc.time()
sql <- sprintf("
select sl.stable_id as case_list_id, max(sl.name) as case_list_name
,      max(sl.description) as case_list_description
,      sl.cancer_study_id
,      STRING_AGG(s.stable_id ,' ') as case_ids
  from cancer_study cs
    inner join sample_list sl
      on sl.CANCER_STUDY_ID=cs.CANCER_STUDY_ID
  join sample_list_list sll
     on sl.list_id=sll.list_id
  join sample s
    on s.internal_id=sll.sample_id
where cs.cancer_study_identifier='%s'
group by sl.stable_id,sl.cancer_study_id
order by 1,2
", mycancerstudy)
df <- CasJobs.executeQuery(sql, DATABASE)
# Report the elapsed time, then preview the result.
print(proc.time() - ptm)
head(df, n = 5)

Note: the string aggregation above is done in MS SQL. It could instead be done in MySQL using OPENQUERY and GROUP_CONCAT.

In [None]:
# Same case-list query, but written in MySQL syntax and sent through
# OPENQUERY; the concatenation uses GROUP_CONCAT.
# Note: inside the OPENQUERY string literal every single quote of the inner
# query must be written as two single quotes.
ptm <- proc.time()
sql <- sprintf("select * from OPENQUERY([172.23.250.16],'
select sl.stable_id as case_list_id, max(sl.name) as case_list_name
,      max(sl.description) as case_list_description
,      sl.cancer_study_id
,      GROUP_CONCAT(s.stable_id separator '' '') as case_ids
  from cancer_study cs
    inner join sample_list sl
      on sl.CANCER_STUDY_ID=cs.CANCER_STUDY_ID
  join sample_list_list sll
     on sl.list_id=sll.list_id
  join sample s
    on s.internal_id=sll.sample_id
where cs.cancer_study_identifier=''%s''
group by sl.stable_id,sl.cancer_study_id
order by 1,2') a
", mycancerstudy)
df <- CasJobs.executeQuery(sql, DATABASE)
# Report the elapsed time, then preview the result.
print(proc.time() - ptm)
head(df, n = 4)

## getGeneticProfiles

In [None]:
# List the genetic profiles of the selected study and remember the first
# row / first column value, used below as the genetic_profile stable_id.
mygeneticprofiles <- getGeneticProfiles(mycgds, mycancerstudy)
mygeneticprofile <- mygeneticprofiles[1, 1]
mygeneticprofiles


In [None]:
# SQL counterpart of getGeneticProfiles: all genetic_profile rows that belong
# to the selected cancer study.
sql <- sprintf("
select gp.stable_id as genetic_profile_id
,      gp.name as genetic_profile_name
,      gp.description as genetic_profile_description
,      cs.cancer_study_id
,      gp.genetic_alteration_type
,      gp.show_profile_in_analysis_tab
  from cancer_study cs
  join genetic_profile gp
    on gp.cancer_study_id=cs.cancer_study_id
 where cs.cancer_study_identifier='%s'
", mycancerstudy)
gps <- CasJobs.executeQuery(sql, DATABASE)
gps

## getClinicalData

In [None]:
# Clinical data for the selected case list, via the cgdsr client.
cd <- getClinicalData(mycgds, mycaselist)
head(cd, n = 4)

In [None]:
# 1. Using OPENQUERY to the linked MySQL server.
# The inner query returns one row per (stable_id, attr_id, attr_value):
# the first SELECT reads clinical_patient (patient_attribute=1), the second
# reads clinical_sample (patient_attribute=0); the two are combined with
# UNION ALL. The study identifier is substituted into both halves.
ptm <- proc.time()
sql <- sprintf("
select * from 
 openquery([172.23.250.16],'select p.stable_id
  ,    cam.attr_id,cp.attr_value
  from cancer_study cs
   inner join clinical_attribute_meta cam
     on cam.cancer_study_id=cs.cancer_study_id
    and cam.patient_attribute=1
   inner join patient p
     on p.cancer_study_id=cs.cancer_study_id
   inner join clinical_patient cp
     on cp.internal_id=p.internal_id
    and cp.attr_id=cam.attr_id
 where cs.cancer_study_identifier=''%s''
union all
select p.stable_id
  ,    cam.attr_id, cls.attr_value
  from cancer_study cs
   inner join clinical_attribute_meta cam
     on cam.cancer_study_id=cs.cancer_study_id
    and cam.patient_attribute=0
   inner join patient p
     on p.cancer_study_id=cs.cancer_study_id
   inner join sample s
     on s.patient_id=p.internal_id
   inner join clinical_sample cls
     on s.internal_id=cls.internal_id
    and cls.attr_id=cam.attr_id
 where cs.cancer_study_identifier=''%s''') a
", mycancerstudy, mycancerstudy)
df_v <- CasJobs.executeQuery(sql, DATABASE)
print(proc.time() - ptm)
# Pivot from long to wide: one row per stable_id, one column per attr_id.
r <- cast(df_v, stable_id ~ attr_id, value = 'attr_value')
head(r, n = 4)

In [None]:
# 2. Using Polybase for the same query.
# This fails when using 'UNION' or 'UNION ALL', but works for each part
# individually: the Polybase translation is not accepted by MariaDB.
# TODO ask MS Polybase team
# NOTE(review): this version uses LEFT OUTER JOINs where the OPENQUERY
# version of the query uses INNER JOINs - confirm whether the two are meant
# to be equivalent.

sql <- sprintf("
select p.stable_id
  ,    cam.attr_id,cp.attr_value
  from cancer_study cs
   inner join clinical_attribute_meta cam
     on cam.cancer_study_id=cs.cancer_study_id
    and cam.patient_attribute=1
   inner join patient p
     on p.cancer_study_id=cs.cancer_study_id
   left outer join clinical_patient cp
     on cp.internal_id=p.internal_id
    and cp.attr_id=cam.attr_id
 where cs.cancer_study_identifier='%s'
union all
select p.stable_id
  ,    cam.attr_id, cls.attr_value
  from cancer_study cs
   inner join clinical_attribute_meta cam
     on cam.cancer_study_id=cs.cancer_study_id
    and cam.patient_attribute=0
   inner join patient p
     on p.cancer_study_id=cs.cancer_study_id
   left outer join sample s
     on s.patient_id=p.internal_id
   left outer join clinical_sample cls
     on s.internal_id=cls.internal_id
    and cls.attr_id=cam.attr_id
 where cs.cancer_study_identifier='%s'
", mycancerstudy, mycancerstudy)

In [None]:
# Run the SQL currently held in `sql` against the CasJobs context and time it.
ptm <- proc.time()
df_v <- CasJobs.executeQuery(sql, DATABASE)
# Use print() like the other timing cells: it keeps the user/system/elapsed
# labels, whereas cat() emitted bare numbers without a trailing newline.
print(proc.time() - ptm)
# Pivot from long to wide: one row per stable_id, one column per attr_id.
r <- cast(df_v, stable_id ~ attr_id, value = 'attr_value')
head(r, 4)

In [None]:
# Print the generated SQL text so it can be copied and run elsewhere.
cat(sql)
# Try this query out directly in CasJobs, and also investigate the query plan:
# it seems that not much of the query is being pushed down to MySQL.

## getMutationData


In [None]:
# Re-create the CGDS connection object (same URL as earlier in the notebook)
# and fetch the mutation data for EGFR and PTEN, for the selected case list
# and genetic profile.
mycgds <- CGDS("http://www.cbioportal.org/")
getMutationData(mycgds, mycaselist, mygeneticprofile, c('EGFR', 'PTEN'))


In [None]:
# SQL counterpart of getMutationData, sent to MySQL through OPENQUERY.
# input: mycaselist, mygeneticprofile, geneslist
genes <- c('PTEN','EGFR')
# Build the IN (...) list with explicit SQL quoting. shQuote() is shell
# quoting: its default style is platform-dependent (double quotes on Windows)
# and it escapes embedded quotes the shell way, not the SQL way.
# Step 1: quote each symbol as a MySQL string literal ('' escapes a quote).
quoted_genes <- paste0("'", gsub("'", "''", genes, fixed = TRUE), "'")
# Step 2: the list sits inside the OPENQUERY string literal, so every single
# quote has to be doubled once more.
geneslist <- gsub("'", "''", paste(quoted_genes, collapse = ","), fixed = TRUE)

sql <- sprintf("
select * from openquery([172.23.250.16],'
SELECT g.entrez_gene_id, g.hugo_gene_symbol as gene_symbol
,      s.stable_id as case_id
,      m.center as sequencing_center, m.mutation_status
,      me.mutation_type,m.validation_status
,      me.functional_impact_score,me.link_xvar as xvar_link, link_pdb as xvar_link_pdb, link_msa as xvar_link_msa
,      me.chr, me.start_position, me.end_position, me.reference_allele, me.tumor_seq_allele as variant_allele
,      me.keyword
,      gp.genetic_profile_id
  FROM sample_list sl
  ,    sample_list_list sll
  ,    sample s
  ,    genetic_profile gp
  ,    gene g
  ,    mutation m
  ,    mutation_event me
 where sl.stable_id=''%s''
  and sl.list_id=sll.list_id
  and s.internal_id=sll.sample_id
  and gp.stable_id=''%s''
  and g.hugo_gene_symbol in (%s)
  and m.sample_id=s.internal_id
  and m.genetic_profile_id=gp.genetic_profile_id
  and m.entrez_gene_id = g.entrez_gene_id
  and me.mutation_event_id=m.mutation_event_id
') a
", mycaselist, mygeneticprofile, geneslist)
df <- CasJobs.executeQuery(sql, DATABASE)
head(df, 4)