In [1]:
# IMPORT 

import csv
from biomart import BiomartServer
import pandas as pd
import openpyxl

In [2]:
# DATASET

## biomart_server
server = BiomartServer('http://www.ensembl.org/biomart')
dataset = server.datasets['hsapiens_gene_ensembl']

In [3]:
# FUNCTIONS

def gene_dictionary(index,
                    gene_name,
                    gene_list_number,
                    gene_list_id,
                    source,
                    ensembl_gene_id,
                    ensembl_transcript_id,
                    refseq_mrna,
                    hgnc_symbol,
                    organism,
                    alias,
                    direction_string):
    
    dictionary = {
        'Index' : index,
        'Gene_name' : gene_name,
        'Gene_list_number' : gene_list_number,
        'Gene_list_index' : gene_list_id,
        'Source' : source,
        'Ensembl_gene_id' : ensembl_gene_id,
        'Ensembl_transcript_id' : ensembl_transcript_id,
        'RefSeq_mRNA_id' : refseq_mrna,
        'Organism' : organism,
        'HGNC_symbol' : hgnc_symbol,
        'Alias' : alias,
        'Info' : direction_string
    }
    
    return dictionary


  
def biomartParameters(mgi_symbol):

    attributes = ['ensembl_gene_id',
                  'ensembl_transcript_id',
                  'refseq_mrna']

    filters = {'external_gene_name':[mgi_symbol]}               # gene_name = mgi_symbol
    response = dataset.search({'attributes':attributes,'filters':filters})
    

    # response_convertion
    lines = response.text.split("\n")
    lines = list(filter(lambda l: l != "",lines))
    values = []
    for l in lines:
        temp = []
        raw_values = l.split("\t")
        values.append(raw_values)
        
    return values  



def alias_and_official(ls_notResponse,ls_row_10,ls_row_2):

    ls_response = []
    ls_response2 = []
    
    for notResponse in ls_notResponse:
        word = str(notResponse).lower()

        for i in range(len(ls_row_10)):
            if ls_row_10[i]:
                temp = str(ls_row_10[i]).lower()
                t_strings = temp.split("|")
                if word in t_strings:
                    ls_response.append([word, i, ls_row_1[i]])

    ls_response.insert(0, ['gene_name','alias_index','mgi_id'])         # print(len(ls_response))


    for notResponse in ls_notResponse:
        word = str(notResponse).lower()

        for i in range(len(ls_row_10)):
            if ls_row_10[i]:
                temp = str(ls_row_10[i]).lower()
                if word == temp:
                    ls_response2.append([word, i, ls_row_1[i]])
                    

    ls_response.insert(0, ['gene_name','official_index','mgi_id'])      # print(len(ls_response2))

    ls_response_3 = ls_response
    
    for response in ls_response2:
        if response not in ls_response_3:
            ls_response_3.append(response)
        else: print(response)

    ls_response_3.pop(0)
    ls_response_3.pop(0)
    ls_response.insert(0, ['gene_name','official/alias_index','mgi_id'])
    
    return ls_response_3

def biomartParameters_mgi(gene_name):

    attributes = ['ensembl_gene_id',
                  'ensembl_transcript_id',
                  'refseq_mrna']
    filters = {'uniprot_gn_symbol':[gene_name]}                     # gene_name = mgi_id
    response = dataset.search({'attributes':attributes,'filters':filters})
    
    # response_convertion
    lines = response.text.split("\n")
    lines = list(filter(lambda l: l != "",lines))
    values = []
    for l in lines:
        temp = []
        raw_values = l.split("\t")
        values.append(raw_values)

    return values  


def updateCellswithAlias(mgi_file_path, dictionary_file_path, file_path):
    
    # Load MGI file and open workbook
    wb_mgi = openpyxl.load_workbook(mgi_file_path)
    ws_mgi = wb_mgi.active

    ls_row_alias = [ws_mgi.cell(row=i,column=10).value for i in range(2,ws_mgi.max_row+1)]
    ls_row_g_name = [ws_mgi.cell(row=i,column=2).value for i in range(2,ws_mgi.max_row+1)] 

    # Load dictionary file and open workbook
    workbook = openpyxl.load_workbook(dictionary_file_path)
    sheet = workbook.active

    ls_row_2 = [sheet.cell(row=i,column=2).value for i in range(2,sheet.max_row+1)]
    ls_row_9 = [sheet.cell(row=i,column=11).value for i in range(2,sheet.max_row+1)]

    
    length = len(ls_row_2) + 2

    # Modify the desired cell
    for i in range(length):
        K_cel = 'K' + str(i + 2)

        for k in range(len(ls_row_g_name)):
            try:
                if str(ls_row_g_name[k]).lower() == str(ls_row_2[i]).lower():
                    if ls_row_alias[k] is not None:
                        sheet[K_cel] = ls_row_alias[k]
                    else:
                        continue
            except:
                continue

    # Save the file
    workbook.save(file_path)


In [4]:
# LOAD START FILES

## PUBLICATION SOURCE
wb = openpyxl.load_workbook('../ifpan-GR-database-papers.xlsx')
ws = wb['placenta-PNSS']

publication_geneName = [ws.cell(row=i,column=3).value for i in range(2,ws.max_row+1)]
direction = [ws.cell(row=i,column=6).value for i in range(2,ws.max_row+1)]

## MGI
wb_1 = openpyxl.load_workbook(r'../MGI_EntrezGene.xlsx')
ws_1 = wb_1.active

ls_row_10 = [ws_1.cell(row=i,column=10).value for i in range(2,ws_1.max_row+1)]
ls_row_2 = [ws_1.cell(row=i,column=2).value for i in range(2,ws_1.max_row+1)]
ls_row_1 = [ws_1.cell(row=i,column=1).value for i in range(2,ws_1.max_row+1)]  

In [5]:
# LISTS

dictionary = {}
ls_geneDictionaries = []
ls_notResponse = []
ls_notResponse_after = []


In [6]:
# DICTIONARY

for i in range(len(publication_geneName)):
    gene_name = publication_geneName[i]
    direction_string = direction[i]   # => info
    hgnc_symbol = gene_name
    
    if gene_name != '':

        # VARIABLES
        index = i + 1
        gene_list_number = 28
        gene_list_id =  'all_sig_genes_33981007'      # => cluster
        source = 'PMID: 33981007'
        organism = 'human'
        print(direction_string)
        alias = ''


        ensembl_gene_id_temp = []
        ensembl_transcript_id_temp = []
        refseq_mrna_temp = []

        ls_biomartParameters = biomartParameters(gene_name)
        if not ls_biomartParameters:
            ls_notResponse.append(gene_name)
            print(ls_notResponse)


        for ls in ls_biomartParameters:
            for j in range(3):
                if len(ls) < (j+1):
                    ls.append('')
            ensembl_gene_id_temp.append(ls[0])
            ensembl_transcript_id_temp.append(ls[1])
            refseq_mrna_temp.append(ls[2])  

        ensembl_gene_id = '|'.join(list(set(filter(None, ensembl_gene_id_temp))))
        ensembl_transcript_id = '|'.join(list(set(filter(None, ensembl_transcript_id_temp))))
        refseq_mrna = '|'.join(list(set(filter(None, refseq_mrna_temp))))
        
      
        
        temp_gene_dictionary = gene_dictionary(index,
                                               gene_name,
                                               gene_list_number,
                                               gene_list_id,
                                               source,
                                               ensembl_gene_id,
                                               ensembl_transcript_id, 
                                               refseq_mrna,
                                               hgnc_symbol,
                                               organism,
                                               alias,
                                               direction_string)

        ls_geneDictionaries.append(temp_gene_dictionary)
        print(temp_gene_dictionary)





down
{'Index': 1, 'Gene_name': 'RN7SL4P', 'Gene_list_number': 28, 'Gene_list_index': 'all_sig_genes_33981007', 'Source': 'PMID: 33981007', 'Ensembl_gene_id': 'ENSG00000263740', 'Ensembl_transcript_id': 'ENST00000584058', 'RefSeq_mRNA_id': '', 'Organism': 'human', 'HGNC_symbol': 'RN7SL4P', 'Alias': '', 'Info': 'down'}
down
{'Index': 2, 'Gene_name': 'LINC01816', 'Gene_list_number': 28, 'Gene_list_index': 'all_sig_genes_33981007', 'Source': 'PMID: 33981007', 'Ensembl_gene_id': 'ENSG00000231327', 'Ensembl_transcript_id': 'ENST00000414141|ENST00000691543', 'RefSeq_mRNA_id': '', 'Organism': 'human', 'HGNC_symbol': 'LINC01816', 'Alias': '', 'Info': 'down'}
down
{'Index': 3, 'Gene_name': 'FBLN1', 'Gene_list_number': 28, 'Gene_list_index': 'all_sig_genes_33981007', 'Source': 'PMID: 33981007', 'Ensembl_gene_id': 'ENSG00000077942', 'Ensembl_transcript_id': 'ENST00000454279|ENST00000476366|ENST00000327858|ENST00000455233|ENST00000442170|ENST00000262722|ENST00000340923|ENST00000411478|ENST000004398

{'Index': 21, 'Gene_name': 'ALPP', 'Gene_list_number': 28, 'Gene_list_index': 'all_sig_genes_33981007', 'Source': 'PMID: 33981007', 'Ensembl_gene_id': 'ENSG00000163283', 'Ensembl_transcript_id': 'ENST00000392027|ENST00000485563|ENST00000474529', 'RefSeq_mRNA_id': 'NM_001632', 'Organism': 'human', 'HGNC_symbol': 'ALPP', 'Alias': '', 'Info': 'down'}
down
{'Index': 22, 'Gene_name': 'CITED4', 'Gene_list_number': 28, 'Gene_list_index': 'all_sig_genes_33981007', 'Source': 'PMID: 33981007', 'Ensembl_gene_id': 'ENSG00000179862', 'Ensembl_transcript_id': 'ENST00000372638', 'RefSeq_mRNA_id': 'NM_133467', 'Organism': 'human', 'HGNC_symbol': 'CITED4', 'Alias': '', 'Info': 'down'}
down
{'Index': 23, 'Gene_name': 'PSG8', 'Gene_list_number': 28, 'Gene_list_index': 'all_sig_genes_33981007', 'Source': 'PMID: 33981007', 'Ensembl_gene_id': 'ENSG00000124467', 'Ensembl_transcript_id': 'ENST00000478387|ENST00000306511|ENST00000593692|ENST00000469260|ENST00000406636|ENST00000600709|ENST00000404209|ENST000004

{'Index': 39, 'Gene_name': 'SYT12', 'Gene_list_number': 28, 'Gene_list_index': 'all_sig_genes_33981007', 'Source': 'PMID: 33981007', 'Ensembl_gene_id': 'ENSG00000173227', 'Ensembl_transcript_id': 'ENST00000533427|ENST00000393946|ENST00000527043|ENST00000531392|ENST00000525457|ENST00000526281|ENST00000525149', 'RefSeq_mRNA_id': 'NM_001318775|NM_001318773|NM_177963|NM_001177880', 'Organism': 'human', 'HGNC_symbol': 'SYT12', 'Alias': '', 'Info': 'down'}
down
{'Index': 40, 'Gene_name': 'SLC7A2', 'Gene_list_number': 28, 'Gene_list_index': 'all_sig_genes_33981007', 'Source': 'PMID: 33981007', 'Ensembl_gene_id': 'ENSG00000003989', 'Ensembl_transcript_id': 'ENST00000522656|ENST00000494857|ENST00000640220|ENST00000004531|ENST00000470360|ENST00000398090', 'RefSeq_mRNA_id': 'NM_003046|NM_001370338|NM_001008539|NM_001370337|NM_001164771', 'Organism': 'human', 'HGNC_symbol': 'SLC7A2', 'Alias': '', 'Info': 'down'}
down
{'Index': 41, 'Gene_name': 'KISS1', 'Gene_list_number': 28, 'Gene_list_index': 'a

{'Index': 52, 'Gene_name': 'SH3TC2', 'Gene_list_number': 28, 'Gene_list_index': 'all_sig_genes_33981007', 'Source': 'PMID: 33981007', 'Ensembl_gene_id': 'ENSG00000169247', 'Ensembl_transcript_id': 'ENST00000511949|ENST00000513340|ENST00000676367|ENST00000504091|ENST00000515229|ENST00000515425|ENST00000643113|ENST00000504517|ENST00000323829|ENST00000675793|ENST00000511307|ENST00000502274|ENST00000676056|ENST00000674983|ENST00000503071|ENST00000504690|ENST00000510779|ENST00000512049|ENST00000674655|ENST00000510350|ENST00000513604', 'RefSeq_mRNA_id': 'NM_024577', 'Organism': 'human', 'HGNC_symbol': 'SH3TC2', 'Alias': '', 'Info': 'down'}
down
{'Index': 53, 'Gene_name': 'NCF4-AS1', 'Gene_list_number': 28, 'Gene_list_index': 'all_sig_genes_33981007', 'Source': 'PMID: 33981007', 'Ensembl_gene_id': 'ENSG00000183822|ENSG00000281093', 'Ensembl_transcript_id': 'ENST00000619915|ENST00000625405|ENST00000629623|ENST00000431290', 'RefSeq_mRNA_id': '', 'Organism': 'human', 'HGNC_symbol': 'NCF4-AS1', '

{'Index': 70, 'Gene_name': 'SLAMF7', 'Gene_list_number': 28, 'Gene_list_index': 'all_sig_genes_33981007', 'Source': 'PMID: 33981007', 'Ensembl_gene_id': 'ENSG00000026751', 'Ensembl_transcript_id': 'ENST00000368042|ENST00000359331|ENST00000444090|ENST00000484221|ENST00000458104|ENST00000441662|ENST00000495334|ENST00000621377|ENST00000488819|ENST00000368043|ENST00000458602', 'RefSeq_mRNA_id': 'NM_001282592|NM_001282594|NM_021181|NM_001282595|NM_001282590|NM_001282593|NM_001282588|NM_001282589|NM_001282591|NM_001282596', 'Organism': 'human', 'HGNC_symbol': 'SLAMF7', 'Alias': '', 'Info': 'up'}
down
{'Index': 71, 'Gene_name': 'CCDC33', 'Gene_list_number': 28, 'Gene_list_index': 'all_sig_genes_33981007', 'Source': 'PMID: 33981007', 'Ensembl_gene_id': 'ENSG00000140481|ENSG00000288407', 'Ensembl_transcript_id': 'ENST00000635913|ENST00000558659|ENST00000268082|ENST00000673367|ENST00000560148|ENST00000672359|ENST00000560565|ENST00000672175|ENST00000558821|ENST00000671751|ENST00000673517|ENST0000

{'Index': 88, 'Gene_name': 'LYPD5', 'Gene_list_number': 28, 'Gene_list_index': 'all_sig_genes_33981007', 'Source': 'PMID: 33981007', 'Ensembl_gene_id': 'ENSG00000159871', 'Ensembl_transcript_id': 'ENST00000601224|ENST00000599397|ENST00000651184|ENST00000594049|ENST00000595666|ENST00000414615|ENST00000594013|ENST00000602179|ENST00000377950', 'RefSeq_mRNA_id': 'NM_182573|NM_001288763|NM_001031749', 'Organism': 'human', 'HGNC_symbol': 'LYPD5', 'Alias': '', 'Info': 'down'}
down
{'Index': 89, 'Gene_name': 'STYK1', 'Gene_list_number': 28, 'Gene_list_index': 'all_sig_genes_33981007', 'Source': 'PMID: 33981007', 'Ensembl_gene_id': 'ENSG00000060140', 'Ensembl_transcript_id': 'ENST00000542562|ENST00000542924|ENST00000075503|ENST00000538867|ENST00000541561|ENST00000535345', 'RefSeq_mRNA_id': 'NM_018423', 'Organism': 'human', 'HGNC_symbol': 'STYK1', 'Alias': '', 'Info': 'down'}
down
{'Index': 90, 'Gene_name': 'HLF', 'Gene_list_number': 28, 'Gene_list_index': 'all_sig_genes_33981007', 'Source': 'PM

{'Index': 106, 'Gene_name': 'MFSD2A', 'Gene_list_number': 28, 'Gene_list_index': 'all_sig_genes_33981007', 'Source': 'PMID: 33981007', 'Ensembl_gene_id': 'ENSG00000168389', 'Ensembl_transcript_id': 'ENST00000483824|ENST00000372809|ENST00000434861|ENST00000481612|ENST00000459917|ENST00000491515|ENST00000372811|ENST00000420632|ENST00000480630|ENST00000469745', 'RefSeq_mRNA_id': 'NM_001349823|NM_001349821|NM_032793|NM_001349822|NM_001287809|NM_001136493|NM_001287808', 'Organism': 'human', 'HGNC_symbol': 'MFSD2A', 'Alias': '', 'Info': 'down'}
down
{'Index': 107, 'Gene_name': 'LINC02365', 'Gene_list_number': 28, 'Gene_list_index': 'all_sig_genes_33981007', 'Source': 'PMID: 33981007', 'Ensembl_gene_id': 'ENSG00000254233', 'Ensembl_transcript_id': 'ENST00000520280|ENST00000519173|ENST00000522554', 'RefSeq_mRNA_id': '', 'Organism': 'human', 'HGNC_symbol': 'LINC02365', 'Alias': '', 'Info': 'down'}
down
{'Index': 108, 'Gene_name': 'LINC02518', 'Gene_list_number': 28, 'Gene_list_index': 'all_sig_

{'Index': 126, 'Gene_name': 'GAPDHP32', 'Gene_list_number': 28, 'Gene_list_index': 'all_sig_genes_33981007', 'Source': 'PMID: 33981007', 'Ensembl_gene_id': 'ENSG00000226443', 'Ensembl_transcript_id': 'ENST00000433816', 'RefSeq_mRNA_id': '', 'Organism': 'human', 'HGNC_symbol': 'GAPDHP32', 'Alias': '', 'Info': 'down'}
down
{'Index': 127, 'Gene_name': 'CGB5', 'Gene_list_number': 28, 'Gene_list_index': 'all_sig_genes_33981007', 'Source': 'PMID: 33981007', 'Ensembl_gene_id': 'ENSG00000189052', 'Ensembl_transcript_id': 'ENST00000301408', 'RefSeq_mRNA_id': 'NM_033043', 'Organism': 'human', 'HGNC_symbol': 'CGB5', 'Alias': '', 'Info': 'down'}
down
{'Index': 128, 'Gene_name': 'SERPINB2', 'Gene_list_number': 28, 'Gene_list_index': 'all_sig_genes_33981007', 'Source': 'PMID: 33981007', 'Ensembl_gene_id': 'ENSG00000197632', 'Ensembl_transcript_id': 'ENST00000413956|ENST00000299502|ENST00000482254|ENST00000457692|ENST00000404622|ENST00000443281', 'RefSeq_mRNA_id': 'NM_001143818|NM_002575', 'Organism'

{'Index': 147, 'Gene_name': 'CSH2', 'Gene_list_number': 28, 'Gene_list_index': 'all_sig_genes_33981007', 'Source': 'PMID: 33981007', 'Ensembl_gene_id': 'ENSG00000213218', 'Ensembl_transcript_id': 'ENST00000613718|ENST00000392886|ENST00000560142|ENST00000558516|ENST00000336844|ENST00000345366|ENST00000559928', 'RefSeq_mRNA_id': 'NM_020991|NM_022645|NM_022644', 'Organism': 'human', 'HGNC_symbol': 'CSH2', 'Alias': '', 'Info': 'down'}
down
{'Index': 148, 'Gene_name': 'EBI3', 'Gene_list_number': 28, 'Gene_list_index': 'all_sig_genes_33981007', 'Source': 'PMID: 33981007', 'Ensembl_gene_id': 'ENSG00000105246', 'Ensembl_transcript_id': 'ENST00000599339|ENST00000221847', 'RefSeq_mRNA_id': 'NM_005755', 'Organism': 'human', 'HGNC_symbol': 'EBI3', 'Alias': '', 'Info': 'down'}
down
{'Index': 149, 'Gene_name': 'ERVE-1', 'Gene_list_number': 28, 'Gene_list_index': 'all_sig_genes_33981007', 'Source': 'PMID: 33981007', 'Ensembl_gene_id': 'ENSG00000267259', 'Ensembl_transcript_id': 'ENST00000592016', 'Re

{'Index': 168, 'Gene_name': 'FGFBP1', 'Gene_list_number': 28, 'Gene_list_index': 'all_sig_genes_33981007', 'Source': 'PMID: 33981007', 'Ensembl_gene_id': 'ENSG00000137440', 'Ensembl_transcript_id': 'ENST00000382333', 'RefSeq_mRNA_id': 'NM_005130', 'Organism': 'human', 'HGNC_symbol': 'FGFBP1', 'Alias': '', 'Info': 'down'}
up
{'Index': 169, 'Gene_name': 'HBD', 'Gene_list_number': 28, 'Gene_list_index': 'all_sig_genes_33981007', 'Source': 'PMID: 33981007', 'Ensembl_gene_id': 'ENSG00000223609', 'Ensembl_transcript_id': 'ENST00000417377|ENST00000429817|ENST00000292901|ENST00000643122|ENST00000650601', 'RefSeq_mRNA_id': 'NM_000519', 'Organism': 'human', 'HGNC_symbol': 'HBD', 'Alias': '', 'Info': 'up'}
down
{'Index': 170, 'Gene_name': 'NIPAL1', 'Gene_list_number': 28, 'Gene_list_index': 'all_sig_genes_33981007', 'Source': 'PMID: 33981007', 'Ensembl_gene_id': 'ENSG00000163293', 'Ensembl_transcript_id': 'ENST00000511123|ENST00000508180|ENST00000295461|ENST00000513724|ENST00000500571|ENST0000051

{'Index': 189, 'Gene_name': 'PDCD1', 'Gene_list_number': 28, 'Gene_list_index': 'all_sig_genes_33981007', 'Source': 'PMID: 33981007', 'Ensembl_gene_id': 'ENSG00000188389|ENSG00000276977', 'Ensembl_transcript_id': 'ENST00000334409|ENST00000630230|ENST00000343705|ENST00000618185|ENST00000630560|ENST00000418831', 'RefSeq_mRNA_id': 'NM_005018', 'Organism': 'human', 'HGNC_symbol': 'PDCD1', 'Alias': '', 'Info': 'up'}
down
{'Index': 190, 'Gene_name': 'PLA2G2F', 'Gene_list_number': 28, 'Gene_list_index': 'all_sig_genes_33981007', 'Source': 'PMID: 33981007', 'Ensembl_gene_id': 'ENSG00000158786', 'Ensembl_transcript_id': 'ENST00000465062|ENST00000375102', 'RefSeq_mRNA_id': 'NM_022819|NM_001360869', 'Organism': 'human', 'HGNC_symbol': 'PLA2G2F', 'Alias': '', 'Info': 'down'}
down
['LINC00514', 'SPERT', 'HIST3H2BB']
{'Index': 191, 'Gene_name': 'HIST3H2BB', 'Gene_list_number': 28, 'Gene_list_index': 'all_sig_genes_33981007', 'Source': 'PMID: 33981007', 'Ensembl_gene_id': '', 'Ensembl_transcript_id':

{'Index': 209, 'Gene_name': 'OR5H5P', 'Gene_list_number': 28, 'Gene_list_index': 'all_sig_genes_33981007', 'Source': 'PMID: 33981007', 'Ensembl_gene_id': 'ENSG00000249321', 'Ensembl_transcript_id': 'ENST00000503164', 'RefSeq_mRNA_id': '', 'Organism': 'human', 'HGNC_symbol': 'OR5H5P', 'Alias': '', 'Info': 'down'}


In [7]:
print(ls_notResponse)

['LINC00514', 'SPERT', 'HIST3H2BB']


In [8]:
# SCORES 

### from DICTIONARY to file
data = ls_geneDictionaries
df = pd.DataFrame.from_dict(data)
df.to_excel(r'.\Dictionary.xlsx', index=False)

### from ls_notResponse to file
data1 = ls_notResponse
df1 = pd.DataFrame.from_dict(data1)
df1.to_excel(r'.\notResponse.xlsx', index=False)

# LOAD 
# notRESPONSE
wb_2 = openpyxl.load_workbook(r'.\notResponse.xlsx')
ws_2 = wb_2.active
ls_notResponse_v2 = [ws_2.cell(row=i,column=1).value for i in range(2,ws_2.max_row+1)]


### from alias_and_official to file
data2 = alias_and_official(ls_notResponse, ls_row_10, ls_row_2)
df2 = pd.DataFrame.from_dict(data2)
df2.to_excel(r'.\responsewithMGI.xlsx', index=False)


# LOAD responsewithMGI
wb_3 = openpyxl.load_workbook(r'.\responsewithMGI.xlsx')
ws_3 = wb_3.active
gene_name_v2 = [ws_3.cell(row=i,column=1).value for i in range(3,ws_3.max_row+1)]

In [9]:
print (ls_notResponse)

['LINC00514', 'SPERT', 'HIST3H2BB']


In [10]:
print (gene_name_v2)


['spert']


In [11]:
# SECOND DICTIONARY

for i in range(len(gene_name_v2)):
    gene_name = gene_name_v2[i]
    print(gene_name)
    hgnc_symbol = gene_name
    
    if gene_name != '':

        # variables
        alias = ''

        ensembl_gene_id_temp = []
        ensembl_transcript_id_temp = []
        refseq_mrna_temp = []
        

        ls_biomartParameters_mgi = biomartParameters_mgi(gene_name)
        
        print(ls_biomartParameters_mgi)
        
        if not ls_biomartParameters_mgi:
            ls_notResponse_after.append(gene_name)


        for ls in ls_biomartParameters_mgi:
            for j in range(3):
                if len(ls) < (j+1):
                    ls.append('')
            ensembl_gene_id_temp.append(ls[0])
            ensembl_transcript_id_temp.append(ls[1])
            refseq_mrna_temp.append(ls[2])  

        ensembl_gene_id = '|'.join(list(set(filter(None, ensembl_gene_id_temp))))
        ensembl_transcript_id = '|'.join(list(set(filter(None, ensembl_transcript_id_temp))))
        refseq_mrna = '|'.join(list(set(filter(None, refseq_mrna_temp))))
        
        
        
        for k in range(len(ls_geneDictionaries)):
            temp = str(ls_geneDictionaries[k]['Gene_name']).lower()

            if temp == gene_name:
                print('response')
                ls_geneDictionaries[k]['Ensembl_gene_id'] = ensembl_gene_id
                ls_geneDictionaries[k]['Ensembl_transcript_id'] = ensembl_transcript_id
                ls_geneDictionaries[k]['RefSeq_mRNA_id'] = refseq_mrna
                ls_geneDictionaries[k]['HGNC_symbol'] = hgnc_symbol
                


spert
[]
response


In [12]:
# SCORES 

### from SECOND DICTIONARY to file

data3 = ls_geneDictionaries
df3 = pd.DataFrame.from_dict(data3)
df3.to_excel(r'.\secondDictionary.xlsx', index=False)
df3.to_csv(r'.\secondDictionary.tsv', sep="\t")

data4 = ls_notResponse_after
df4 = pd.DataFrame.from_dict(data4)
df4.to_excel(r'.\ls_notResponse_after.xlsx', index=False)



In [13]:
# Add ALIAS

mgi_file_path = r'..\MGI_EntrezGene.xlsx'
dictionary_file_path = r'.\secondDictionary.xlsx'
save_file_path = r'.\withAlias.xlsx'


updateCellswithAlias(mgi_file_path, dictionary_file_path, save_file_path)


In [14]:
# Add INFO

def updateCellswithINFO(source_file_path, source_sheet, alias_file_path, info_file_path):
    
    wb_info = openpyxl.load_workbook(source_file_path)
    ws_info = wb_info[source_sheet]
    ls_row_g_name = [ws_info.cell(row=i, column=3).value for i in range(2, ws_info.max_row + 1)]


    wb_alias = openpyxl.load_workbook(alias_file_path)
    ws_alias = wb_alias.active
    ls_row_2 = [ws_alias.cell(row=i, column=2).value for i in range(2, ws_alias.max_row + 1)]


    matched_indices = set()               # to keep track of matched id

    for j, (g_name, row_2) in enumerate(zip(ls_row_g_name, ls_row_2), start=2):
        if j not in matched_indices and str(g_name).lower() == str(row_2).lower():
            print(str(g_name).lower(), str(row_2).lower())
            L_cel = 'L' + str(j)
            tempInfo = str(ws_alias[L_cel].value)
            matched_indices.add(j)        # add matched index to set - avoid duplicate

### the number and type of items in the INFO column depends on the source publication


            ws_alias[L_cel] = ''
            ws_alias[L_cel] = ('Regulation:' + str(tempInfo).upper() + '|'
                            + 'Log2Ratio:' + str(ws_info['E' + str(j)].value) + '|'
                            + 'tissue:' + str(ws_info['H' + str(j)].value) + '|'
                            + 'cell:' + str(ws_info['I' + str(j)].value) + '|'
                            + 'enviroroment:' + str(ws_info['J' + str(j)].value) + '|'
                            + 'treatment:' + str(ws_info['K' + str(j)].value) + '|'
                            + 'dose:' + str(ws_info['L' + str(j)].value) + '|'
                            + 'time:' + str(ws_info['M' + str(j)].value) + '|'
                            + 'FDR_threshold:' + str(ws_info['N' + str(j)].value) + '|'
                            + 'method:' + str(ws_info['O' + str(j)].value) + '|'
                            + 'genome_element' + str(ws_info['P' + str(j)].value) + '|'
                            + 'FDR:' + str(ws_info['Q' + str(j)].value) + '|'
                            + 'comparison:' + str(ws_info['R' + str(j)].value) + '|'                             
                            + 'statistics:' + str(ws_info['S' + str(j)].value))

        
        


    # Save the file
    wb_alias.save(info_file_path)



source_file_path = r'..\ifpan-GR-database-papers.xlsx'
source_sheet = 'placenta-PNSS'
alias_file_path = '.\withAlias.xlsx'
info_file_path = r'.\withINFO.xlsx'

updateCellswithINFO(source_file_path, source_sheet, alias_file_path, info_file_path)


rn7sl4p rn7sl4p
linc01816 linc01816
fbln1 fbln1
ankrd33 ankrd33
cgb3 cgb3
mrgpre mrgpre
muc15 muc15
psca psca
ca1 ca1
ppp1r14c ppp1r14c
rn7sl3 rn7sl3
adhfe1 adhfe1
krt34 krt34
cpxm2 cpxm2
sik1 sik1
cgb7 cgb7
rab6b rab6b
plac1 plac1
muc5b muc5b
smkr1 smkr1
alpp alpp
cited4 cited4
psg8 psg8
ephb6 ephb6
lrrn1 lrrn1
arms2 arms2
hsd17b1 hsd17b1
acoxl acoxl
znf554 znf554
aldh3b2 aldh3b2
pkd1p5 pkd1p5
stx1b stx1b
abhd17c abhd17c
duox2 duox2
psg1 psg1
gdf15 gdf15
ank3 ank3
nectin3 nectin3
syt12 syt12
slc7a2 slc7a2
kiss1 kiss1
lcal1 lcal1
linc01554 linc01554
misp3 misp3
pgf pgf
tprxl tprxl
gda gda
tc2n tc2n
cab39p1 cab39p1
muc4 muc4
lad1 lad1
sh3tc2 sh3tc2
ncf4-as1 ncf4-as1
crybg2 crybg2
ftcd ftcd
phyhipl phyhipl
nav2-as4 nav2-as4
csh1 csh1
gh2 gh2
hsd11b2 hsd11b2
cpne7 cpne7
cyp19a1 cyp19a1
sema3b sema3b
p2rx2 p2rx2
optc optc
psg6 psg6
hsd3b1 hsd3b1
erv3-1 erv3-1
hs3st3b1 hs3st3b1
slamf7 slamf7
ccdc33 ccdc33
dhrs2 dhrs2
akr1b15 akr1b15
treml2 treml2
gabrb1 gabrb1
psg4 psg4
epha1-as1 epha1-as1


In [None]:
                                ''' + '|'
                                 + '|'
                                +'TTEST.NEURO.ALD:' + str(ws_mgi[S_cel].value) + '|'
                                +'TTEST.NEURO.DHT:' + str(ws_mgi[T_cel].value) + '|'
                                +'TTEST.NEURO.PRG:' + str(ws_mgi[U_cel].value) + '|'
                                +'TTEST.NEURO.ESR:' + str(ws_mgi[V_cel].value) + '|'
                                +'FDR_TTEST.ASTRO.DEX:' + str(ws_mgi[W_cel].value) + '|'
                                +'FDR_TTEST.ASTRO.ALD:' + str(ws_mgi[X_cel].value) + '|'
                                +'FDR_TTEST.ASTRO.DHT:' + str(ws_mgi[Y_cel].value) + '|'
                                +'FDR_TTEST.ASTRO.PRG:' + str(ws_mgi[Z_cel].value) + '|'
                                +'FDR_TTEST.ASTRO.ESR:' + str(ws_mgi[AA_cel].value) + '|'
                                +'FDR_TTEST.NEURO.DEX:' + str(ws_mgi[AB_cel].value) + '|'
                                +'FDR_TTEST.NEURO.ALD:' + str(ws_mgi[AC_cel].value) + '|'
                                +'FDR_TTEST.NEURO.DHT:' + str(ws_mgi[AD_cel].value) + '|'
                                +'FDR_TTEST.NEURO.PRG:' + str(ws_mgi[AE_cel].value) + '|'
                                +'FDR_TTEST.NEURO.ESR:' + str(ws_mgi[AF_cel].value) + '|'
                                +'FOLD.ASTRO.DEX:' + str(ws_mgi[AG_cel].value) + '|'
                                +'FOLD.ASTRO.ALD:' + str(ws_mgi[AH_cel].value) + '|'
                                +'FOLD.ASTRO.DHT:' + str(ws_mgi[AI_cel].value) + '|'
                                +'FOLD.ASTRO.PRG:' + str(ws_mgi[AJ_cel].value) + '|'
                                +'FOLD.ASTRO.ESR:' + str(ws_mgi[AK_cel].value) + '|'
                                +'FOLD.NEURO.DEX:' + str(ws_mgi[AL_cel].value) + '|'
                                +'FOLD.NEURO.ALD:' + str(ws_mgi[AM_cel].value) + '|'
                                +'FOLD.NEURO.DHT:' + str(ws_mgi[AN_cel].value) + '|'
                                +'FOLD.NEURO.PRG:' + str(ws_mgi[AO_cel].value) + '|'
                                +'FOLD.NEURO.ESR:' + str(ws_mgi[AP_cel].value))'''