In [1]:
# IMPORT 

import csv
from biomart import BiomartServer
import pandas as pd
import openpyxl

In [2]:
# DATASET

## biomart_server
server = BiomartServer('http://www.ensembl.org/biomart')
dataset = server.datasets['hsapiens_gene_ensembl']

In [3]:
# FUNCTIONS

def gene_dictionary(index,
                    gene_name,
                    gene_list_number,
                    gene_list_id,
                    source,
                    ensembl_gene_id,
                    ensembl_transcript_id,
                    refseq_mrna,
                    hgnc_symbol,
                    organism,
                    alias,
                    direction_string):
    
    dictionary = {
        'Index' : index,
        'Gene_name' : gene_name,
        'Gene_list_number' : gene_list_number,
        'Gene_list_index' : gene_list_id,
        'Source' : source,
        'Ensembl_gene_id' : ensembl_gene_id,
        'Ensembl_transcript_id' : ensembl_transcript_id,
        'RefSeq_mRNA_id' : refseq_mrna,
        'Organism' : organism,
        'HGNC_symbol' : hgnc_symbol,
        'Alias' : alias,
        'Info' : direction_string
    }
    
    return dictionary


  
def biomartParameters(mgi_symbol):

    attributes = ['ensembl_gene_id',
                  'ensembl_transcript_id',
                  'refseq_mrna']

    filters = {'external_gene_name':[mgi_symbol]}               # gene_name = mgi_symbol
    response = dataset.search({'attributes':attributes,'filters':filters})

    # response_convertion
    values = [line.split("\t") for line in response.text.split("\n") if line.strip()]
        
    return values 



def alias_and_official(ls_notResponse,ls_row_10,ls_row_2):

    ls_response = []
    ls_response2 = []
    
    for notResponse in ls_notResponse:
        word = str(notResponse).lower()

        for i in range(len(ls_row_10)):
            if ls_row_10[i]:
                temp = str(ls_row_10[i]).lower()
                t_strings = temp.split("|")
                if word in t_strings:
                    ls_response.append([word, i, ls_row_1[i]])

    ls_response.insert(0, ['gene_name','alias_index','mgi_id'])         # print(len(ls_response))


    for notResponse in ls_notResponse:
        word = str(notResponse).lower()

        for i in range(len(ls_row_10)):
            if ls_row_10[i]:
                temp = str(ls_row_10[i]).lower()
                if word == temp:
                    ls_response2.append([word, i, ls_row_1[i]])
                    

    ls_response.insert(0, ['gene_name','official_index','mgi_id'])      # print(len(ls_response2))

    ls_response_3 = ls_response
    
    for response in ls_response2:
        if response not in ls_response_3:
            ls_response_3.append(response)
        else: print(response)

    ls_response_3.pop(0)
    ls_response_3.pop(0)
    ls_response.insert(0, ['gene_name','official/alias_index','mgi_id'])
    
    return ls_response_3

def biomartParameters_mgi(gene_name):

    attributes = ['ensembl_gene_id',
                  'ensembl_transcript_id',
                  'refseq_mrna']
    filters = {'uniprot_gn_symbol':[gene_name]}                    
    response = dataset.search({'attributes':attributes,'filters':filters})
    
    # response_convertion
    values = [line.split("\t") for line in response.text.split("\n") if line.strip()]
        
    return values 


def updateCellswithAlias(mgi_file_path, dictionary_file_path, file_path):
    
    # Load MGI file and open workbook
    wb_mgi = openpyxl.load_workbook(mgi_file_path)
    ws_mgi = wb_mgi.active

    ls_row_alias = [ws_mgi.cell(row=i,column=10).value for i in range(2,ws_mgi.max_row+1)]
    ls_row_g_name = [ws_mgi.cell(row=i,column=2).value for i in range(2,ws_mgi.max_row+1)] 

    # Load dictionary file and open workbook
    workbook = openpyxl.load_workbook(dictionary_file_path)
    sheet = workbook.active

    ls_row_2 = [sheet.cell(row=i,column=2).value for i in range(2,sheet.max_row+1)]
    ls_row_9 = [sheet.cell(row=i,column=11).value for i in range(2,sheet.max_row+1)]

    
    length = len(ls_row_2) + 2

    # Modify the desired cell
    for i in range(length):
        K_cel = 'K' + str(i + 2)

        for k in range(len(ls_row_g_name)):
            try:
                if str(ls_row_g_name[k]).lower() == str(ls_row_2[i]).lower():
                    if ls_row_alias[k] is not None:
                        sheet[K_cel] = ls_row_alias[k]
                    else:
                        continue
            except:
                continue

    # Save the file
    workbook.save(file_path)


In [4]:
# LOAD START FILES

## PUBLICATION SOURCE
wb = openpyxl.load_workbook('../ifpan-GR-database-papers.xlsx')
ws = wb['lung']

publication_geneName = [ws.cell(row=i,column=3).value for i in range(2,ws.max_row+1)]
direction = [ws.cell(row=i,column=6).value for i in range(2,ws.max_row+1)]

## MGI
wb_1 = openpyxl.load_workbook(r'../MGI_EntrezGene.xlsx')
ws_1 = wb_1.active

ls_row_10 = [ws_1.cell(row=i,column=10).value for i in range(2,ws_1.max_row+1)]
ls_row_2 = [ws_1.cell(row=i,column=2).value for i in range(2,ws_1.max_row+1)]
ls_row_1 = [ws_1.cell(row=i,column=1).value for i in range(2,ws_1.max_row+1)]  

In [5]:
# LISTS

dictionary = {}
ls_geneDictionaries = []
ls_notResponse = []
ls_notResponse_after = []


In [6]:
# DICTIONARY

for i in range(len(publication_geneName)):
    gene_name = publication_geneName[i]
    direction_string = direction[i]   # => info
    hgnc_symbol = gene_name
    
    if gene_name != '':

        # VARIABLES
        index = i + 1
        gene_list_number = 24
        gene_list_id =  'all_sig_genes_19801529'      # => cluster
        source = 'PMID: 19801529'
        organism = 'human'
        print(direction_string)
        alias = ''


        ensembl_gene_id_temp = []
        ensembl_transcript_id_temp = []
        refseq_mrna_temp = []

        ls_biomartParameters = biomartParameters(gene_name)
        if not ls_biomartParameters:
            ls_notResponse.append(gene_name)
            print(ls_notResponse)


        for ls in ls_biomartParameters:
            for j in range(3):
                if len(ls) < (j+1):
                    ls.append('')
            ensembl_gene_id_temp.append(ls[0])
            ensembl_transcript_id_temp.append(ls[1])
            refseq_mrna_temp.append(ls[2])  

        ensembl_gene_id = '|'.join(list(set(filter(None, ensembl_gene_id_temp))))
        ensembl_transcript_id = '|'.join(list(set(filter(None, ensembl_transcript_id_temp))))
        refseq_mrna = '|'.join(list(set(filter(None, refseq_mrna_temp))))
        
      
        
        temp_gene_dictionary = gene_dictionary(index,
                                               gene_name,
                                               gene_list_number,
                                               gene_list_id,
                                               source,
                                               ensembl_gene_id,
                                               ensembl_transcript_id, 
                                               refseq_mrna,
                                               hgnc_symbol,
                                               organism,
                                               alias,
                                               direction_string)

        ls_geneDictionaries.append(temp_gene_dictionary)
        print(temp_gene_dictionary)





up
{'Index': 1, 'Gene_name': 'PTGER4', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000171522', 'Ensembl_transcript_id': 'ENST00000513635|ENST00000512578|ENST00000509543|ENST00000514343|ENST00000302472', 'RefSeq_mRNA_id': 'NM_000958', 'Organism': 'human', 'HGNC_symbol': 'PTGER4', 'Alias': '', 'Info': 'up'}
up
{'Index': 2, 'Gene_name': 'SOCS1', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000185338', 'Ensembl_transcript_id': 'ENST00000332029|ENST00000644787', 'RefSeq_mRNA_id': 'NM_003745', 'Organism': 'human', 'HGNC_symbol': 'SOCS1', 'Alias': '', 'Info': 'up'}
up
{'Index': 3, 'Gene_name': 'TSC22D3', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000157514', 'Ensembl_transcript_id': 'ENST00000505965|ENST00000372384|ENST00000506081|ENST00000514426|ENST000003723

{'Index': 18, 'Gene_name': 'BIRC3', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000023445', 'Ensembl_transcript_id': 'ENST00000527309|ENST00000532808|ENST00000527336|ENST00000615299|ENST00000528940|ENST00000263464|ENST00000526421|ENST00000673846', 'RefSeq_mRNA_id': 'NM_182962|NM_001165', 'Organism': 'human', 'HGNC_symbol': 'BIRC3', 'Alias': '', 'Info': 'up'}
up
{'Index': 19, 'Gene_name': 'KLF4', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000136826', 'Ensembl_transcript_id': 'ENST00000374672|ENST00000493306|ENST00000497048|ENST00000420475|ENST00000610832|ENST00000411706', 'RefSeq_mRNA_id': 'NM_004235|NM_001314052', 'Organism': 'human', 'HGNC_symbol': 'KLF4', 'Alias': '', 'Info': 'up'}
up
{'Index': 20, 'Gene_name': 'ENTPD2', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_ge

['HRASLS2', 'GOLSYN', 'RGC32', 'SDPR', 'PAMCI', 'C9orf150']
{'Index': 38, 'Gene_name': 'C9orf150', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': '', 'Ensembl_transcript_id': '', 'RefSeq_mRNA_id': '', 'Organism': 'human', 'HGNC_symbol': 'C9orf150', 'Alias': '', 'Info': 'up'}
up
{'Index': 39, 'Gene_name': 'KLF9', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000119138', 'Ensembl_transcript_id': 'ENST00000377126', 'RefSeq_mRNA_id': 'NM_001206', 'Organism': 'human', 'HGNC_symbol': 'KLF9', 'Alias': '', 'Info': 'up'}
up
{'Index': 40, 'Gene_name': 'FAM43A', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000185112', 'Ensembl_transcript_id': 'ENST00000329759', 'RefSeq_mRNA_id': 'NM_153690', 'Organism': 'human', 'HGNC_symbol': 'FAM43A', 'Alias': '', 'Info': 'up'}
up
{'Index': 

{'Index': 55, 'Gene_name': 'ELL2', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000118985', 'Ensembl_transcript_id': 'ENST00000508757|ENST00000515020|ENST00000508694|ENST00000506628|ENST00000513343|ENST00000237853|ENST00000505584|ENST00000635633', 'RefSeq_mRNA_id': 'NM_012081', 'Organism': 'human', 'HGNC_symbol': 'ELL2', 'Alias': '', 'Info': 'up'}
up
{'Index': 56, 'Gene_name': 'CPEB4', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000113742', 'Ensembl_transcript_id': 'ENST00000519152|ENST00000522336|ENST00000659882|ENST00000517880|ENST00000518141|ENST00000522344|ENST00000520867|ENST00000519835|ENST00000265085|ENST00000656232|ENST00000519467|ENST00000334035|ENST00000657000', 'RefSeq_mRNA_id': 'NM_001308193|NM_001308189|NM_001308191|NM_001308192|NM_030627', 'Organism': 'human', 'HGNC_symbol': 'CPEB4', 'Alias': '', 'Info': 'up'}
up
{'In

{'Index': 74, 'Gene_name': 'RHOU', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000116574|ENSG00000284984', 'Ensembl_transcript_id': 'ENST00000366691|ENST00000646945', 'RefSeq_mRNA_id': 'NM_021205', 'Organism': 'human', 'HGNC_symbol': 'RHOU', 'Alias': '', 'Info': 'up'}
up
{'Index': 75, 'Gene_name': 'MT2A', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000125148', 'Ensembl_transcript_id': 'ENST00000567300|ENST00000561491|ENST00000245185|ENST00000563985|ENST00000562017', 'RefSeq_mRNA_id': 'NM_005953', 'Organism': 'human', 'HGNC_symbol': 'MT2A', 'Alias': '', 'Info': 'up'}
up
{'Index': 76, 'Gene_name': 'ITPKC', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000086544', 'Ensembl_transcript_id': 'ENST00000699488|ENST00000263370|ENST00000597003|ENST00000699490|ENST0

{'Index': 94, 'Gene_name': 'SLC26A2', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000155850', 'Ensembl_transcript_id': 'ENST00000690410|ENST00000433184|ENST00000286298|ENST00000503336', 'RefSeq_mRNA_id': 'NM_000112', 'Organism': 'human', 'HGNC_symbol': 'SLC26A2', 'Alias': '', 'Info': 'up'}
up
{'Index': 95, 'Gene_name': 'PYGB', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000100994', 'Ensembl_transcript_id': 'ENST00000471359|ENST00000216962|ENST00000428458', 'RefSeq_mRNA_id': 'NM_002862', 'Organism': 'human', 'HGNC_symbol': 'PYGB', 'Alias': '', 'Info': 'up'}
up
{'Index': 96, 'Gene_name': 'FOXN2', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000170802', 'Ensembl_transcript_id': 'ENST00000413569|ENST00000340553', 'RefSeq_mRNA_id': 'NM_001375449|NM_001375445|

{'Index': 113, 'Gene_name': 'ANKRD1', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000148677', 'Ensembl_transcript_id': 'ENST00000371697', 'RefSeq_mRNA_id': 'NM_014391', 'Organism': 'human', 'HGNC_symbol': 'ANKRD1', 'Alias': '', 'Info': 'up'}
up
{'Index': 114, 'Gene_name': 'LRRC8A', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000136802', 'Ensembl_transcript_id': 'ENST00000259324|ENST00000372599|ENST00000492784|ENST00000372600|ENST00000483638', 'RefSeq_mRNA_id': 'NM_001127245|NM_019594|NM_001127244', 'Organism': 'human', 'HGNC_symbol': 'LRRC8A', 'Alias': '', 'Info': 'up'}
up
{'Index': 115, 'Gene_name': 'RHOB', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000143878', 'Ensembl_transcript_id': 'ENST00000272233', 'RefSeq_mRNA_id': 'NM_004040', 'Organism': 'hum

{'Index': 133, 'Gene_name': 'MID1IP1', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000165175', 'Ensembl_transcript_id': 'ENST00000336949|ENST00000378474|ENST00000614558|ENST00000457894', 'RefSeq_mRNA_id': 'NM_021242|NM_001098790|NM_001098791', 'Organism': 'human', 'HGNC_symbol': 'MID1IP1', 'Alias': '', 'Info': 'down'}
down
{'Index': 134, 'Gene_name': 'ID2', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000115738', 'Ensembl_transcript_id': 'ENST00000472142|ENST00000331129|ENST00000396290|ENST00000234091', 'RefSeq_mRNA_id': 'NM_002166', 'Organism': 'human', 'HGNC_symbol': 'ID2', 'Alias': '', 'Info': 'down'}
down
{'Index': 135, 'Gene_name': 'OSGIN1', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000140961', 'Ensembl_transcript_id': 'ENST00000393306|ENST0000036

{'Index': 153, 'Gene_name': 'FZD8', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000177283', 'Ensembl_transcript_id': 'ENST00000374694', 'RefSeq_mRNA_id': 'NM_031866', 'Organism': 'human', 'HGNC_symbol': 'FZD8', 'Alias': '', 'Info': 'down'}
down
{'Index': 154, 'Gene_name': 'C6orf141', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000197261', 'Ensembl_transcript_id': 'ENST00000526429|ENST00000371194|ENST00000415078|ENST00000530382|ENST00000529246|ENST00000414696|ENST00000424426', 'RefSeq_mRNA_id': 'NM_001145652', 'Organism': 'human', 'HGNC_symbol': 'C6orf141', 'Alias': '', 'Info': 'down'}
down
{'Index': 155, 'Gene_name': 'EREG', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000124882', 'Ensembl_transcript_id': 'ENST00000244869|ENST00000503689|ENST00000507603'

{'Index': 171, 'Gene_name': 'SERTAD2', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000179833', 'Ensembl_transcript_id': 'ENST00000608423|ENST00000476805|ENST00000313349', 'RefSeq_mRNA_id': 'NM_014755', 'Organism': 'human', 'HGNC_symbol': 'SERTAD2', 'Alias': '', 'Info': 'down'}
down
['HRASLS2', 'GOLSYN', 'RGC32', 'SDPR', 'PAMCI', 'C9orf150', 'CTGF', 'GENX-3414', 'HIG2', 'SGK', 'KIAA1754', 'FAM105A', 'DSCR1', 'CXCR7', 'HIST1H2BJ', 'HIST3H2A', 'FLJ45248']
{'Index': 172, 'Gene_name': 'FLJ45248', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': '', 'Ensembl_transcript_id': '', 'RefSeq_mRNA_id': '', 'Organism': 'human', 'HGNC_symbol': 'FLJ45248', 'Alias': '', 'Info': 'down'}
down
{'Index': 173, 'Gene_name': 'FGFR3', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000068078',

{'Index': 191, 'Gene_name': 'FOSL1', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000175592', 'Ensembl_transcript_id': 'ENST00000312562|ENST00000532401|ENST00000448083|ENST00000531493|ENST00000534222', 'RefSeq_mRNA_id': 'NM_001300856|NM_005438|NM_001300844|NM_001300857|NM_001300855', 'Organism': 'human', 'HGNC_symbol': 'FOSL1', 'Alias': '', 'Info': 'down'}
down
{'Index': 192, 'Gene_name': 'PLK2', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000145632', 'Ensembl_transcript_id': 'ENST00000617412|ENST00000511326|ENST00000508300|ENST00000509422|ENST00000502671|ENST00000509555|ENST00000274289|ENST00000505244|ENST00000503713|ENST00000504196|ENST00000503378|ENST00000514306|ENST00000503115|ENST00000510629|ENST00000515415', 'RefSeq_mRNA_id': 'NM_006622|NM_001252226', 'Organism': 'human', 'HGNC_symbol': 'PLK2', 'Alias': '', 'Info': 'down'}
do

{'Index': 211, 'Gene_name': 'MRPS21P6', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000214298', 'Ensembl_transcript_id': 'ENST00000415184', 'RefSeq_mRNA_id': '', 'Organism': 'human', 'HGNC_symbol': 'MRPS21P6', 'Alias': '', 'Info': 'up'}
up
{'Index': 212, 'Gene_name': 'MT1P3', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000229230', 'Ensembl_transcript_id': 'ENST00000421424', 'RefSeq_mRNA_id': '', 'Organism': 'human', 'HGNC_symbol': 'MT1P3', 'Alias': '', 'Info': 'up'}
up
{'Index': 213, 'Gene_name': 'MT2P1', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000162840|ENSG00000278061', 'Ensembl_transcript_id': 'ENST00000294916|ENST00000262499', 'RefSeq_mRNA_id': '', 'Organism': 'human', 'HGNC_symbol': 'MT2P1', 'Alias': '', 'Info': 'up'}
up
['HRASLS2', 'GOLSYN', '

{'Index': 229, 'Gene_name': 'RPS3P1', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000230935', 'Ensembl_transcript_id': 'ENST00000456781', 'RefSeq_mRNA_id': '', 'Organism': 'human', 'HGNC_symbol': 'RPS3P1', 'Alias': '', 'Info': 'down'}
down
{'Index': 230, 'Gene_name': 'RPSAP14', 'Gene_list_number': 24, 'Gene_list_index': 'all_sig_genes_19801529', 'Source': 'PMID: 19801529', 'Ensembl_gene_id': 'ENSG00000233984', 'Ensembl_transcript_id': 'ENST00000429143', 'RefSeq_mRNA_id': '', 'Organism': 'human', 'HGNC_symbol': 'RPSAP14', 'Alias': '', 'Info': 'down'}
down
['HRASLS2', 'GOLSYN', 'RGC32', 'SDPR', 'PAMCI', 'C9orf150', 'CTGF', 'GENX-3414', 'HIG2', 'SGK', 'KIAA1754', 'FAM105A', 'DSCR1', 'CXCR7', 'HIST1H2BJ', 'HIST3H2A', 'FLJ45248', 'JMJD3', 'SUV420H2', 'FKHL18', 'BHLHB2', 'POU5F1P1', 'hCG_1790474', 'COX5BL1', 'FTLL1', 'TMSL4', 'RPS6P1', 'hCG_1749005', 'COX6A1P', 'hCG_1642354', 'COX5BL7', 'hCG_2041321', 'hCG_1818237

In [7]:
print(ls_notResponse)

['HRASLS2', 'GOLSYN', 'RGC32', 'SDPR', 'PAMCI', 'C9orf150', 'CTGF', 'GENX-3414', 'HIG2', 'SGK', 'KIAA1754', 'FAM105A', 'DSCR1', 'CXCR7', 'HIST1H2BJ', 'HIST3H2A', 'FLJ45248', 'JMJD3', 'SUV420H2', 'FKHL18', 'BHLHB2', 'POU5F1P1', 'hCG_1790474', 'COX5BL1', 'FTLL1', 'TMSL4', 'RPS6P1', 'hCG_1749005', 'COX6A1P', 'hCG_1642354', 'COX5BL7', 'hCG_2041321', 'hCG_1818237', 'FTLP']


In [8]:
# SCORES 

### from DICTIONARY to file
data = ls_geneDictionaries
df = pd.DataFrame.from_dict(data)
df.to_excel(r'.\Dictionary.xlsx', index=False)

### from ls_notResponse to file
data1 = ls_notResponse
df1 = pd.DataFrame.from_dict(data1)
df1.to_excel(r'.\notResponse.xlsx', index=False)

# LOAD 
# notRESPONSE
wb_2 = openpyxl.load_workbook(r'.\notResponse.xlsx')
ws_2 = wb_2.active
ls_notResponse_v2 = [ws_2.cell(row=i,column=1).value for i in range(2,ws_2.max_row+1)]


### from alias_and_official to file
data2 = alias_and_official(ls_notResponse, ls_row_10, ls_row_2)
df2 = pd.DataFrame.from_dict(data2)
df2.to_excel(r'.\responsewithMGI.xlsx', index=False)


# LOAD responsewithMGI
wb_3 = openpyxl.load_workbook(r'.\responsewithMGI.xlsx')
ws_3 = wb_3.active
gene_name_v2 = [ws_3.cell(row=i,column=1).value for i in range(3,ws_3.max_row+1)]

['pamci', 73414, 'MGI:2384307']
['sgk', 36747, 'MGI:1340062']
['fam105a', 83077, 'MGI:2687281']
['hist1h2bj', 78779, 'MGI:2448388']
['jmjd3', 79607, 'MGI:2448492']


In [9]:
print (ls_notResponse)

['HRASLS2', 'GOLSYN', 'RGC32', 'SDPR', 'PAMCI', 'C9orf150', 'CTGF', 'GENX-3414', 'HIG2', 'SGK', 'KIAA1754', 'FAM105A', 'DSCR1', 'CXCR7', 'HIST1H2BJ', 'HIST3H2A', 'FLJ45248', 'JMJD3', 'SUV420H2', 'FKHL18', 'BHLHB2', 'POU5F1P1', 'hCG_1790474', 'COX5BL1', 'FTLL1', 'TMSL4', 'RPS6P1', 'hCG_1749005', 'COX6A1P', 'hCG_1642354', 'COX5BL7', 'hCG_2041321', 'hCG_1818237', 'FTLP']


In [14]:
print (gene_name_v2)


['sdpr', 'pamci', 'ctgf', 'sgk', 'fam105a', 'dscr1', 'cxcr7', 'hist1h2bj', 'hist3h2a', 'jmjd3', 'suv420h2', 'fkhl18', 'bhlhb2']


In [15]:
# SECOND DICTIONARY

for i in range(len(gene_name_v2)):
    gene_name = gene_name_v2[i]
    print(gene_name)
    hgnc_symbol = gene_name
    
    if gene_name != '':

        # variables
        alias = ''

        ensembl_gene_id_temp = []
        ensembl_transcript_id_temp = []
        refseq_mrna_temp = []
        

        ls_biomartParameters_mgi = biomartParameters_mgi(gene_name)
        
        print(ls_biomartParameters_mgi)
        
        if not ls_biomartParameters_mgi:
            ls_notResponse_after.append(gene_name)


        for ls in ls_biomartParameters_mgi:
            for j in range(3):
                if len(ls) < (j+1):
                    ls.append('')
            ensembl_gene_id_temp.append(ls[0])
            ensembl_transcript_id_temp.append(ls[1])
            refseq_mrna_temp.append(ls[2])  

        ensembl_gene_id = '|'.join(list(set(filter(None, ensembl_gene_id_temp))))
        ensembl_transcript_id = '|'.join(list(set(filter(None, ensembl_transcript_id_temp))))
        refseq_mrna = '|'.join(list(set(filter(None, refseq_mrna_temp))))
        
        
        
        for k in range(len(ls_geneDictionaries)):
            temp = str(ls_geneDictionaries[k]['Gene_name']).lower()

            if temp == gene_name:
                print('response')
                ls_geneDictionaries[k]['Ensembl_gene_id'] = ensembl_gene_id
                ls_geneDictionaries[k]['Ensembl_transcript_id'] = ensembl_transcript_id
                ls_geneDictionaries[k]['RefSeq_mRNA_id'] = refseq_mrna
                ls_geneDictionaries[k]['HGNC_symbol'] = hgnc_symbol
                


sdpr
[]
response
pamci
[]
response
ctgf
[]
response
sgk
[]
response
fam105a
[]
response
dscr1
[['ENSG00000159200', 'ENST00000487434', ''], ['ENSG00000159200', 'ENST00000313806', 'NM_004414'], ['ENSG00000159200', 'ENST00000620920', 'NM_001285393'], ['ENSG00000159200', 'ENST00000381135', 'NM_001285391'], ['ENSG00000159200', 'ENST00000443408', 'NM_203417'], ['ENSG00000159200', 'ENST00000443408', 'NM_001285392'], ['ENSG00000159200', 'ENST00000482533', ''], ['ENSG00000159200', 'ENST00000481448', ''], ['ENSG00000159200', 'ENST00000381132', 'NM_203418'], ['ENSG00000159200', 'ENST00000487990', ''], ['ENSG00000159200', 'ENST00000399272', 'NM_001285389'], ['ENSG00000159200', 'ENST00000489903', ''], ['ENSG00000159200', 'ENST00000492600', 'NM_001331016'], ['ENSG00000159200', 'ENST00000609325', ''], ['ENSG00000159200', 'ENST00000463276', '']]
response
cxcr7
[]
response
hist1h2bj
[['ENSG00000124635', 'ENST00000607124', ''], ['ENSG00000124635', 'ENST00000606923', ''], ['ENSG00000124635', 'ENST0000033

In [16]:
# SCORES 

### from SECOND DICTIONARY to file

data3 = ls_geneDictionaries
df3 = pd.DataFrame.from_dict(data3)
df3.to_excel(r'.\secondDictionary.xlsx', index=False)
df3.to_csv(r'.\secondDictionary.tsv', sep="\t")

data4 = ls_notResponse_after
df4 = pd.DataFrame.from_dict(data4)
df4.to_excel(r'.\ls_notResponse_after.xlsx', index=False)



In [17]:
# Add ALIAS

mgi_file_path = r'..\MGI_EntrezGene.xlsx'
dictionary_file_path = r'.\secondDictionary.xlsx'
save_file_path = r'.\withAlias.xlsx'


updateCellswithAlias(mgi_file_path, dictionary_file_path, save_file_path)


In [18]:
# Add INFO

def updateCellswithINFO(source_file_path, source_sheet, alias_file_path, info_file_path):
    
    wb_info = openpyxl.load_workbook(source_file_path)
    ws_info = wb_info[source_sheet]
    ls_row_g_name = [ws_info.cell(row=i, column=3).value for i in range(2, ws_info.max_row + 1)]


    wb_alias = openpyxl.load_workbook(alias_file_path)
    ws_alias = wb_alias.active
    ls_row_2 = [ws_alias.cell(row=i, column=2).value for i in range(2, ws_alias.max_row + 1)]


    matched_indices = set()               # to keep track of matched id

    for j, (g_name, row_2) in enumerate(zip(ls_row_g_name, ls_row_2), start=2):
        if j not in matched_indices and str(g_name).lower() == str(row_2).lower():
            print(str(g_name).lower(), str(row_2).lower())
            L_cel = 'L' + str(j)
            tempInfo = str(ws_alias[L_cel].value)
            matched_indices.add(j)        # add matched index to set - avoid duplicate

### the number and type of items in the INFO column depends on the source publication


            ws_alias[L_cel] = ''
            ws_alias[L_cel] = ('Regulation:' + str(tempInfo).upper() + '|'
                            + 'Log2Ratio:' + str(ws_info['E' + str(j)].value) + '|'
                            + 'tissue:' + str(ws_info['H' + str(j)].value) + '|'
                            + 'cell:' + str(ws_info['I' + str(j)].value) + '|'
                            + 'enviroroment:' + str(ws_info['J' + str(j)].value) + '|'
                            + 'treatment:' + str(ws_info['K' + str(j)].value) + '|'
                            + 'dose:' + str(ws_info['L' + str(j)].value) + '|'
                            + 'time:' + str(ws_info['M' + str(j)].value) + '|'
                            + 'FDR_threshold:' + str(ws_info['N' + str(j)].value) + '|'
                            + 'method:' + str(ws_info['O' + str(j)].value) + '|'
                            + 'genome_element' + str(ws_info['P' + str(j)].value))

        
        


    # Save the file
    wb_alias.save(info_file_path)



source_file_path = r'..\ifpan-GR-database-papers.xlsx'
source_sheet = 'lung'
alias_file_path = '.\withAlias.xlsx'
info_file_path = r'.\withINFO.xlsx'

updateCellswithINFO(source_file_path, source_sheet, alias_file_path, info_file_path)


ptger4 ptger4
socs1 socs1
tsc22d3 tsc22d3
spry1 spry1
rasd1 rasd1
psors1c2 psors1c2
per1 per1
rgs2 rgs2
zfp36 zfp36
psors1c1 psors1c1
hrasls2 hrasls2
pou5f1 pou5f1
tfcp2l1 tfcp2l1
igfbp1 igfbp1
f3 f3
angptl4 angptl4
golsyn golsyn
birc3 birc3
klf4 klf4
entpd2 entpd2
rgc32 rgc32
pdk4 pdk4
cidec cidec
tiparp tiparp
bcl6 bcl6
snai2 snai2
cited2 cited2
klf6 klf6
slc19a2 slc19a2
nfkbia nfkbia
cebpd cebpd
errfi1 errfi1
sdpr sdpr
prdm1 prdm1
pamci pamci
galnt4 galnt4
gadd45a gadd45a
c9orf150 c9orf150
klf9 klf9
fam43a fam43a
cchcr1 cchcr1
rrad rrad
dusp1 dusp1
cdc42ep3 cdc42ep3
zc3h12a zc3h12a
tnfaip3 tnfaip3
stra8 stra8
pard6b pard6b
per2 per2
ern1 ern1
stom stom
thbd thbd
ctgf ctgf
alox5ap alox5ap
ell2 ell2
cpeb4 cpeb4
cebpb cebpb
cxcl2 cxcl2
edn2 edn2
arl4d arl4d
znf57 znf57
cry2 cry2
ripk4 ripk4
tspyl2 tspyl2
arl4a arl4a
stk17b stk17b
pxn pxn
fgd4 fgd4
genx-3414 genx-3414
fosl2 fosl2
mcl1 mcl1
pim3 pim3
hig2 hig2
rhou rhou
mt2a mt2a
itpkc itpkc
mt1x mt1x
plekhf2 plekhf2
tsc22d2 tsc22d2
tbx3

In [None]:
                                ''' + '|'
                                 + '|'
                                +'TTEST.NEURO.ALD:' + str(ws_mgi[S_cel].value) + '|'
                                +'TTEST.NEURO.DHT:' + str(ws_mgi[T_cel].value) + '|'
                                +'TTEST.NEURO.PRG:' + str(ws_mgi[U_cel].value) + '|'
                                +'TTEST.NEURO.ESR:' + str(ws_mgi[V_cel].value) + '|'
                                +'FDR_TTEST.ASTRO.DEX:' + str(ws_mgi[W_cel].value) + '|'
                                +'FDR_TTEST.ASTRO.ALD:' + str(ws_mgi[X_cel].value) + '|'
                                +'FDR_TTEST.ASTRO.DHT:' + str(ws_mgi[Y_cel].value) + '|'
                                +'FDR_TTEST.ASTRO.PRG:' + str(ws_mgi[Z_cel].value) + '|'
                                +'FDR_TTEST.ASTRO.ESR:' + str(ws_mgi[AA_cel].value) + '|'
                                +'FDR_TTEST.NEURO.DEX:' + str(ws_mgi[AB_cel].value) + '|'
                                +'FDR_TTEST.NEURO.ALD:' + str(ws_mgi[AC_cel].value) + '|'
                                +'FDR_TTEST.NEURO.DHT:' + str(ws_mgi[AD_cel].value) + '|'
                                +'FDR_TTEST.NEURO.PRG:' + str(ws_mgi[AE_cel].value) + '|'
                                +'FDR_TTEST.NEURO.ESR:' + str(ws_mgi[AF_cel].value) + '|'
                                +'FOLD.ASTRO.DEX:' + str(ws_mgi[AG_cel].value) + '|'
                                +'FOLD.ASTRO.ALD:' + str(ws_mgi[AH_cel].value) + '|'
                                +'FOLD.ASTRO.DHT:' + str(ws_mgi[AI_cel].value) + '|'
                                +'FOLD.ASTRO.PRG:' + str(ws_mgi[AJ_cel].value) + '|'
                                +'FOLD.ASTRO.ESR:' + str(ws_mgi[AK_cel].value) + '|'
                                +'FOLD.NEURO.DEX:' + str(ws_mgi[AL_cel].value) + '|'
                                +'FOLD.NEURO.ALD:' + str(ws_mgi[AM_cel].value) + '|'
                                +'FOLD.NEURO.DHT:' + str(ws_mgi[AN_cel].value) + '|'
                                +'FOLD.NEURO.PRG:' + str(ws_mgi[AO_cel].value) + '|'
                                +'FOLD.NEURO.ESR:' + str(ws_mgi[AP_cel].value))'''