# Fetch Mutation Status

The code below can be used to determine which patients carry a given mutation.

## Fetch by Gene

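The function below fetches all mutations of the given gene within a project and writes, for each affected case, the predicted impact on the selected transcripts to a CSV file.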

In [None]:
import json
import requests
import pandas as pd

def fetch_mut_by_gene(gene_id, transcript_list, project, out_file, session=None):
    """
        Writes the impact annotations of every mutation of a gene to a CSV file

        Queries the GDC ``ssms`` endpoint for the simple somatic mutations
        (SSMs) that hit ``gene_id`` and occur in ``project``. The resulting
        table is indexed by case ID and holds the SSM ID plus the VEP,
        PolyPhen and SIFT impact for each transcript in ``transcript_list``.
        A case appears once; if it carries several matching SSMs, the last
        one returned by the API is the one kept.

        gene_id: ENSEMBL gene ID for gene of interest
        transcript_list: list of ENSEMBL transcript ids of the given gene for which
                         the effect of the given mutation is of interest
        project: TCGA project (ie TCGA-STAD, TCGA-CESC)
        out_file: path or file-like object passed to ``DataFrame.to_csv``
        session: optional object exposing a ``requests``-compatible ``get``
                 (e.g. a ``requests.Session``); the ``requests`` module is
                 used when omitted

        Returns the DataFrame that was written to ``out_file``.
        Raises the error from ``raise_for_status`` when the HTTP request fails.
    """
    ssms_endpt = 'https://api.gdc.cancer.gov/ssms'
    filters = {"op":"and",
               "content":[
                   {"op":"in",
                    "content":{
                        "field":"consequence.transcript.gene.gene_id",
                        "value":[gene_id]
                    }
                   },{"op":"in",
                      "content":{
                          "field":"occurrence.case.project.project_id",
                          "value":[project]
                        }
                     }
               ]
              }
    fields = ["occurrence.case.case_id", "occurrence.case.project.project_id", "ssm_id",
              "consequence.transcript.transcript_id",
              "consequence.transcript.annotation.polyphen_impact",
              "consequence.transcript.annotation.sift_impact", "consequence.transcript.annotation.vep_impact"]
    # NOTE: only the first 1500 hits are requested; anything beyond is not fetched.
    params = {"filters": json.dumps(filters), "fields": ",".join(fields), "pretty": True, "size": 1500}

    http = requests if session is None else session
    response = http.get(ssms_endpt, params=params, timeout=60)
    # Fail on HTTP errors here instead of with a KeyError on the payload below.
    response.raise_for_status()
    payload = response.json()

    columns = ["SSM_ID"]
    columns += ["{}_{}".format(x, y) for x in transcript_list for y in ["VEP", "PolyPhen", "SIFT"]]
    wanted = set(transcript_list)

    # Keyed by case ID: keeps first-seen order, later SSMs overwrite earlier ones.
    rows = {}
    for ssm_data in payload["data"]["hits"]:
        ssm_id = ssm_data["ssm_id"]
        for occ in ssm_data["occurrence"]:
            case = occ["case"]
            # An SSM lists its occurrences in every project; keep only ours.
            if case["project"]["project_id"] != project:
                continue
            case_dict = {"SSM_ID": ssm_id}
            for trans in ssm_data["consequence"]:
                transcript = trans["transcript"]
                trans_id = transcript["transcript_id"]
                if trans_id not in wanted:
                    # Report transcripts that were returned but not requested.
                    print(trans_id)
                    continue
                # Annotation fields may be absent; leave those cells empty.
                annotation = transcript.get("annotation") or {}
                case_dict["{}_VEP".format(trans_id)] = annotation.get("vep_impact")
                case_dict["{}_PolyPhen".format(trans_id)] = annotation.get("polyphen_impact")
                case_dict["{}_SIFT".format(trans_id)] = annotation.get("sift_impact")
            rows[case["case_id"]] = case_dict

    # Build the frame once rather than growing it row by row.
    df = pd.DataFrame(list(rows.values()), index=list(rows.keys()), columns=columns)
    df.to_csv(out_file)
    return df

## Fetch by Mutation

Patients with a specific mutation can also be fetched.

In [None]:
import json
import requests
import pandas as pd

def fetch_mut_by_ssmid(ssm_id, transcript_list, project, out_file, session=None):
    """
        Writes the impact annotations of one specific mutation to a CSV file

        Queries the GDC ``ssms`` endpoint for the simple somatic mutation
        ``ssm_id`` restricted to ``project``. The resulting table is indexed
        by case ID and holds the SSM ID plus the VEP, PolyPhen and SIFT
        impact for each transcript in ``transcript_list``.

        ssm_id: TCGA mutation id
        transcript_list: list of ENSEMBL transcript ids of the given gene for which
                         the effect of the given mutation is of interest
        project: TCGA project (ie TCGA-STAD, TCGA-CESC)
        out_file: path or file-like object passed to ``DataFrame.to_csv``
        session: optional object exposing a ``requests``-compatible ``get``
                 (e.g. a ``requests.Session``); the ``requests`` module is
                 used when omitted

        Returns the DataFrame that was written to ``out_file``.
        Raises the error from ``raise_for_status`` when the HTTP request fails.
    """
    # Fetches the requested mutation, limited to occurrences in the project
    ssms_endpt = 'https://api.gdc.cancer.gov/ssms'
    filters = {"op":"and",
               "content":[
                   {"op":"in",
                    "content":{
                        "field":"ssm_id",
                        "value":[ssm_id]
                    }
                   },{"op":"in",
                      "content":{
                          "field":"occurrence.case.project.project_id",
                          "value":[project]
                        }
                     }
               ]
              }
    fields = ["occurrence.case.case_id", "occurrence.case.project.project_id", "ssm_id",
              "consequence.transcript.transcript_id",
              "consequence.transcript.annotation.polyphen_impact",
              "consequence.transcript.annotation.sift_impact", "consequence.transcript.annotation.vep_impact"]
    # NOTE: only the first 1500 hits are requested; anything beyond is not fetched.
    params = {"filters": json.dumps(filters), "fields": ",".join(fields), "pretty": True, "size": 1500}

    http = requests if session is None else session
    response = http.get(ssms_endpt, params=params, timeout=60)
    # Fail on HTTP errors here instead of with a KeyError on the payload below.
    response.raise_for_status()
    payload = response.json()

    columns = ["SSM_ID"]
    columns += ["{}_{}".format(x, y) for x in transcript_list for y in ["VEP", "PolyPhen", "SIFT"]]
    wanted = set(transcript_list)

    # Keyed by case ID: keeps first-seen order, one row per case.
    rows = {}
    for ssm_data in payload["data"]["hits"]:
        # Separate name so the ``ssm_id`` argument is not shadowed.
        hit_ssm_id = ssm_data["ssm_id"]
        for occ in ssm_data["occurrence"]:
            case = occ["case"]
            # An SSM lists its occurrences in every project; keep only ours.
            if case["project"]["project_id"] != project:
                continue
            case_dict = {"SSM_ID": hit_ssm_id}
            for trans in ssm_data["consequence"]:
                transcript = trans["transcript"]
                trans_id = transcript["transcript_id"]
                if trans_id not in wanted:
                    # Report transcripts that were returned but not requested.
                    print(trans_id)
                    continue
                # Annotation fields may be absent; leave those cells empty.
                annotation = transcript.get("annotation") or {}
                case_dict["{}_VEP".format(trans_id)] = annotation.get("vep_impact")
                case_dict["{}_PolyPhen".format(trans_id)] = annotation.get("polyphen_impact")
                case_dict["{}_SIFT".format(trans_id)] = annotation.get("sift_impact")
            rows[case["case_id"]] = case_dict

    # Build the frame once rather than growing it row by row.
    df = pd.DataFrame(list(rows.values()), index=list(rows.keys()), columns=columns)
    df.to_csv(out_file)
    return df