In [1]:
import pandas as pd
from google.cloud import storage
client = storage.Client()
bucket=client.get_bucket('somatic_germline_mutations')
blob = storage.Blob('mim2gene.txt',bucket)
with open('mim2gene.txt', 'wb') as file_obj:
    blob.download_to_file(file_obj)
df_=pd.read_csv('mim2gene.txt',sep='\t',skiprows=5, header=None, \
                names=['MIMNumber','MIMEntryType','EntrezGeneID_NCBI','GeneName','EnsemblGeneID'])

In [2]:
df_omim=df_[df_.GeneName.notnull()] #Removing entries from omim file without a gene name.
df_omim.head(2)

Unnamed: 0,MIMNumber,MIMEntryType,EntrezGeneID_NCBI,GeneName,EnsemblGeneID
7,100640,gene,216.0,ALDH1A1,ENSG00000165092
8,100650,gene/phenotype,217.0,ALDH2,ENSG00000111275


In [3]:
blob = storage.Blob('uniprot-organismHomosapiens9606.tab',bucket)
with open('uniprot-organismHomosapiens9606.tab', 'wb') as file_obj:
    blob.download_to_file(file_obj)

df_=pd.read_csv('uniprot-organismHomosapiens9606.tab',sep='\t', header=0, \
               names=['Entry','ProteinName','GeneName','Organism','Entryname','EnsemblGeneID'])

In [4]:
df_uniprot=df_[df_.GeneName.notnull()] #Removing entries from Uniprot file without a gene name.
df_uniprot.head(5)

Unnamed: 0,Entry,ProteinName,GeneName,Organism,Entryname,EnsemblGeneID
0,P04637,Cellular tumor antigen p53 (Antigen NY-CO-13) ...,TP53 P53,Homo sapiens (Human),P53_HUMAN,ENST00000269305 [P04637-1];ENST00000420246 [P0...
1,P05067,Amyloid-beta A4 protein (ABPP) (APPI) (APP) (A...,APP A4 AD1,Homo sapiens (Human),A4_HUMAN,ENST00000346798 [P05067-1];ENST00000348990 [P0...
2,Q14524,Sodium channel protein type 5 subunit alpha (H...,SCN5A,Homo sapiens (Human),SCN5A_HUMAN,ENST00000333535 [Q14524-1];ENST00000423572 [Q1...
3,P35555,Fibrillin-1 [Cleaved into: Asprosin],FBN1 FBN,Homo sapiens (Human),FBN1_HUMAN,ENST00000316623;
4,P00533,Epidermal growth factor receptor (EC 2.7.10.1)...,EGFR ERBB ERBB1 HER1,Homo sapiens (Human),EGFR_HUMAN,ENST00000275493 [P00533-1];ENST00000342916 [P0...


In [5]:
df_merged_Mim2Uni=pd.merge(df_uniprot,df_omim,how='left', on='GeneName')
df_merged_Mim2Uni.head(3)

Unnamed: 0,Entry,ProteinName,GeneName,Organism,Entryname,EnsemblGeneID_x,MIMNumber,MIMEntryType,EntrezGeneID_NCBI,EnsemblGeneID_y
0,P04637,Cellular tumor antigen p53 (Antigen NY-CO-13) ...,TP53 P53,Homo sapiens (Human),P53_HUMAN,ENST00000269305 [P04637-1];ENST00000420246 [P0...,,,,
1,P05067,Amyloid-beta A4 protein (ABPP) (APPI) (APP) (A...,APP A4 AD1,Homo sapiens (Human),A4_HUMAN,ENST00000346798 [P05067-1];ENST00000348990 [P0...,,,,
2,Q14524,Sodium channel protein type 5 subunit alpha (H...,SCN5A,Homo sapiens (Human),SCN5A_HUMAN,ENST00000333535 [Q14524-1];ENST00000423572 [Q1...,600163.0,gene,6331.0,ENSG00000183873


In [6]:
blob = storage.Blob('CosmicCompleteTargetedScreensMutantExport.tsv',bucket)
with open('CosmicCompleteTargetedScreensMutantExport.tsv', 'wb') as file_obj:
    blob.download_to_file(file_obj)

In [7]:
import pandas as pd
chunksize = 10 ** 6
merged=pd.DataFrame()
cols=[0,1,16,19,23]
colnames=["GeneName","AccessionNumber","GeneCDSlength","HGNCid","SampleName","SampleId","IdTumour", \
          "PrimarySite","SiteSubtype1","SiteSubtype2","SiteSubtype3","PrimaryHistology","HistologySubtype1", \
          "HistologySubtype2","HistologySubtype3","GenomeWideScreen","MutationId","MutationCDS","MutationAA", \
          "MutationDescription","MutationZygosity","LOH","GRCh","MutationGenomePosition","MutationStrand", \
          "SNP","ResistanceMutation","FATHMMPrediction","FATHMMScore","MutationSomaticStatus","Pubmed_PMID", \
          "IdStudy","SampleSource","TumourOrigin","Age"]

for chunk in pd.read_csv('CosmicCompleteTargetedScreensMutantExport.tsv',sep='\t',header=0,names=colnames,low_memory=False, \
                 dtype={"GeneName":object,"AccessionNumber":object,"GeneCDSlength":object, \
                        "HGNCid":object,"SampleName":object,"SampleId":object,"IdTumour":object, \
                        "PrimarySite":object,"SiteSubtype1":object,"SiteSubtype2":object,"SiteSubtype3":object, \
                        "PrimaryHistology":object,"HistologySubtype1":object,"HistologySubtype2":object, \
                        "HistologySubtype3":object,"GenomeWideScreen":object,"MutationId":object, \
                        "MutationCDS":object,"MutationAA":object,"MutationDescription":object, \
                        "MutationZygosity":object,"LOH":object,"GRCh":object,"MutationGenomePosition":object, \
                        "MutationStrand":object,"SNP":object,"ResistanceMutation":object,"FATHMMPrediction":object, \
                        "FATHMMScore":object,"MutationSomaticStatus":object,"Pubmed_PMID":object,"IdStudy":object, \
                        "SampleSource":object,"TumourOrigin":object,"Age":object}, \
                 chunksize=chunksize
                ):
    merged=chunk[['GeneName','AccessionNumber','MutationId','MutationDescription','MutationGenomePosition']] \
    .loc[chunk.MutationId.notnull()] #selecting only with mutation id from Cosmic file

In [12]:
df_merged_Cosmic2Mim2Uni=pd.merge(df_merged_Mim2Uni,merged,how='left', on='GeneName')

In [13]:
df_merged_Cosmic2Mim2Uni.to_csv('somaticmaster.csv')
df_merged_Cosmic2Mim2Uni

Unnamed: 0,Entry,ProteinName,GeneName,Organism,Entryname,EnsemblGeneID_x,MIMNumber,MIMEntryType,EntrezGeneID_NCBI,EnsemblGeneID_y,AccessionNumber,MutationId,MutationDescription,MutationGenomePosition
0,P04637,Cellular tumor antigen p53 (Antigen NY-CO-13) ...,TP53 P53,Homo sapiens (Human),P53_HUMAN,ENST00000269305 [P04637-1];ENST00000420246 [P0...,,,,,,,,
1,P05067,Amyloid-beta A4 protein (ABPP) (APPI) (APP) (A...,APP A4 AD1,Homo sapiens (Human),A4_HUMAN,ENST00000346798 [P05067-1];ENST00000348990 [P0...,,,,,,,,
2,Q14524,Sodium channel protein type 5 subunit alpha (H...,SCN5A,Homo sapiens (Human),SCN5A_HUMAN,ENST00000333535 [Q14524-1];ENST00000423572 [Q1...,600163.0,gene,6331.0,ENSG00000183873,,,,
3,P35555,Fibrillin-1 [Cleaved into: Asprosin],FBN1 FBN,Homo sapiens (Human),FBN1_HUMAN,ENST00000316623;,,,,,,,,
4,P00533,Epidermal growth factor receptor (EC 2.7.10.1)...,EGFR ERBB ERBB1 HER1,Homo sapiens (Human),EGFR_HUMAN,ENST00000275493 [P00533-1];ENST00000342916 [P0...,,,,,,,,
5,P35222,Catenin beta-1 (Beta-catenin),CTNNB1 CTNNB OK/SW-cl.35 PRO2286,Homo sapiens (Human),CTNB1_HUMAN,,,,,,,,,
6,P10275,Androgen receptor (Dihydrotestosterone recepto...,AR DHTR NR3C4,Homo sapiens (Human),ANDR_HUMAN,ENST00000374690 [P10275-1];ENST00000396043 [P1...,,,,,,,,
7,P00451,Coagulation factor VIII (Antihemophilic factor...,F8 F8C,Homo sapiens (Human),FA8_HUMAN,ENST00000330287 [P00451-2];ENST00000360256 [P0...,,,,,,,,
8,Q5S007,Leucine-rich repeat serine/threonine-protein k...,LRRK2 PARK8,Homo sapiens (Human),LRRK2_HUMAN,ENST00000298910;,,,,,,,,
9,Q8WZ42,Titin (EC 2.7.11.1) (Connectin) (Rhabdomyosarc...,TTN,Homo sapiens (Human),TITIN_HUMAN,ENST00000342992 [Q8WZ42-11];ENST00000359218 [Q...,188840.0,gene,7273.0,ENSG00000155657,,,,
