In [1]:
# IMPORT 

import os
import csv
from biomart import BiomartServer
import pandas as pd
import openpyxl

In [2]:
# DATASET

## biomart_server
server = BiomartServer('http://www.ensembl.org/biomart')
dataset = server.datasets['hsapiens_gene_ensembl']

In [3]:
# FUNCTIONS

def gene_dictionary(index,
                    gene_name,
                    gene_list_number,
                    gene_list_id,
                    source,
                    ensembl_gene_id,
                    ensembl_transcript_id,
                    refseq_mrna,
                    hgnc_symbol,
                    organism,
                    alias,
                    info):
    
    dictionary = {
        'Index' : index,
        'Gene_name' : gene_name,
        'Gene_list_number' : gene_list_number,
        'Gene_list_index' : gene_list_id,
        'Source' : source,
        'Ensembl_gene_id' : ensembl_gene_id,
        'Ensembl_transcript_id' : ensembl_transcript_id,
        'RefSeq_mRNA_id' : refseq_mrna,
        'Organism' : organism,
        'HGNC_symbol' : hgnc_symbol,
        'Alias' : alias,
        'Info' : info
    }
    
    return dictionary


  
def biomartParameters(mgi_symbol):

    attributes = ['ensembl_gene_id',
                  'ensembl_transcript_id',
                  'refseq_mrna']

    filters = {'external_gene_name':[mgi_symbol]}               # gene_name = mgi_symbol
    response = dataset.search({'attributes':attributes,'filters':filters})
    

    # response_convertion
    lines = response.text.split("\n")
    lines = list(filter(lambda l: l != "",lines))
    values = []
    for l in lines:
        temp = []
        raw_values = l.split("\t")
        values.append(raw_values)
        
    return values

def biomartParametersbyEnsembl(ensembl_id):

    attributes = ['external_gene_name',
                  'ensembl_transcript_id',
                  'refseq_mrna']

    filters = {'ensembl_gene_id':[ensembl_gene_id]}               # gene_name = mgi_symbol
    response = dataset.search({'attributes':attributes,'filters':filters})
    

    # response_convertion
    values = [line.split("\t") for line in response.text.split("\n") if line.strip()]
        
    return values  


def alias_and_official(ls_notResponse,ls_row_10,ls_row_2):

    ls_response = []
    ls_response2 = []
    
    for notResponse in ls_notResponse:
        word = str(notResponse).lower()

        for i in range(len(ls_row_10)):
            if ls_row_10[i]:
                temp = str(ls_row_10[i]).lower()
                t_strings = temp.split("|")
                if word in t_strings:
                    ls_response.append([word, i, ls_row_1[i]])

    ls_response.insert(0, ['gene_name','alias_index','mgi_id'])         # print(len(ls_response))


    for notResponse in ls_notResponse:
        word = str(notResponse).lower()

        for i in range(len(ls_row_10)):
            if ls_row_10[i]:
                temp = str(ls_row_10[i]).lower()
                if word == temp:
                    ls_response2.append([word, i, ls_row_1[i]])
                    

    ls_response.insert(0, ['gene_name','official_index','mgi_id'])      # print(len(ls_response2))

    ls_response_3 = ls_response
    
    for response in ls_response2:
        if response not in ls_response_3:
            ls_response_3.append(response)
        else: print(response)

    ls_response_3.pop(0)
    ls_response_3.pop(0)
    ls_response.insert(0, ['gene_name','official/alias_index','mgi_id'])
    
    return ls_response_3


def biomartParameters_mgi(gene_name):

    attributes = ['ensembl_gene_id',
                  'ensembl_transcript_id',
                  'refseq_mrna']
    filters = {'uniprot_gn_symbol':[gene_name]}                     # gene_name = mgi_id
    response = dataset.search({'attributes':attributes,'filters':filters})
    
    # response_convertion
    lines = response.text.split("\n")
    lines = list(filter(lambda l: l != "",lines))
    values = []
    for l in lines:
        temp = []
        raw_values = l.split("\t")
        values.append(raw_values)

    return values  


def updateCellswithAlias(mgi_file_path, dictionary_file_path, file_path):
    
    # Load MGI file and open workbook
    wb_mgi = openpyxl.load_workbook(mgi_file_path)
    ws_mgi = wb_mgi.active

    ls_row_alias = [ws_mgi.cell(row=i,column=10).value for i in range(2,ws_mgi.max_row+1)]
    ls_row_g_name = [ws_mgi.cell(row=i,column=2).value for i in range(2,ws_mgi.max_row+1)] 

    # Load dictionary file and open workbook
    workbook = openpyxl.load_workbook(dictionary_file_path)
    sheet = workbook.active

    ls_row_2 = [sheet.cell(row=i,column=2).value for i in range(2,sheet.max_row+1)]
    ls_row_9 = [sheet.cell(row=i,column=11).value for i in range(2,sheet.max_row+1)]

    
    length = len(ls_row_2) + 2

    # Modify the desired cell
    for i in range(length):
        K_cel = 'K' + str(i + 2)

        for k in range(len(ls_row_g_name)):
            try:
                if str(ls_row_g_name[k]).lower() == str(ls_row_2[i]).lower():
                    if ls_row_alias[k] is not None:
                        sheet[K_cel] = ls_row_alias[k]
                    else:
                        continue
            except:
                continue

    # Save the file
    workbook.save(file_path)


In [4]:
# LOAD START FILES

##TSV
input_directory = r'./Nightingale 11'
dir_len = len([entry for entry in os.listdir(input_directory) if os.path.isfile(os.path.join(input_directory, entry))])
tsv_files = [file for file in os.listdir(input_directory) if file.endswith(".tsv")]


## MGI
wb_1 = openpyxl.load_workbook(r'../MGI_EntrezGene.xlsx')
ws_1 = wb_1.active

ls_row_10 = [ws_1.cell(row=i,column=10).value for i in range(2,ws_1.max_row+1)]
ls_row_2 = [ws_1.cell(row=i,column=2).value for i in range(2,ws_1.max_row+1)]
ls_row_1 = [ws_1.cell(row=i,column=1).value for i in range(2,ws_1.max_row+1)]  

In [5]:
# LISTS


In [None]:

dictionary = {}
ls_geneDictionaries = []
ls_notResponse = []
ls_notResponse_after = []

count = 759

for file_name in tsv_files:
    
    dictionary = {}
    ls_geneDictionaries = []
    ls_notResponse = []

    input_file_path = os.path.join(input_directory, file_name)
    output_file_path = os.path.join(input_directory, 'Dictionaries/dict_' + file_name)
    output_file_path_notResponse = os.path.join(input_directory, 'Dictionaries/ls_notResponse_' + file_name)
    
    # Read the TSV file => df
    df_pub = pd.read_csv(input_file_path, delimiter="\t")

    # Extract the desired columns as lists
    publication_geneName = df_pub['gene_name'].tolist()
    publication_info = df_pub['info'].tolist()
    
    print(file_name)
    
    count += 1
    
    # VARIABLES
    gene_list_number = count
    gene_list_id =  str('Nithingale_'+ file_name[:-4]) 
    source = str('omicspred_nithingale_'+ file_name[:-4])
    organism = 'human'
    alias = ''


    for i in range(len(publication_geneName)):
        index = i + 1
        gene_name = publication_geneName[i]
        hgnc_symbol = gene_name
        info = publication_info[i]
        print(gene_name)

        if gene_name != '':
            ensembl_gene_id_temp = []
            ensembl_transcript_id_temp = []
            refseq_mrna_temp = []
            external_gene_name_temp = []
            

            ls_biomartParameters = biomartParameters(gene_name)
            for ls in ls_biomartParameters:
                for j in range(3):
                    if len(ls) < (j+1):
                        ls.append('')
                ensembl_gene_id_temp.append(ls[0])
                ensembl_transcript_id_temp.append(ls[1])
                refseq_mrna_temp.append(ls[2])  

            ensembl_gene_id = '|'.join(list(set(filter(None, ensembl_gene_id_temp))))
            ensembl_transcript_id = '|'.join(list(set(filter(None, ensembl_transcript_id_temp))))
            refseq_mrna = '|'.join(list(set(filter(None, refseq_mrna_temp))))
            
            # if ls_biomartParameter - normal row adding      
            temp_gene_dictionary = gene_dictionary(index,
                                               gene_name,
                                               gene_list_number,
                                               gene_list_id,
                                               source,
                                               ensembl_gene_id,
                                               ensembl_transcript_id, 
                                               refseq_mrna,
                                               hgnc_symbol,
                                               organism,
                                               alias,
                                               info)

            ls_geneDictionaries.append(temp_gene_dictionary) 
            

            if not ls_biomartParameters:
                ensembl_id = str(gene_name)
                print(ensembl_id)
                ls_biomartParametersbyEnsembl = biomartParametersbyEnsembl(ensembl_id)

                for ls in ls_biomartParametersbyEnsembl:
                    for j in range(3):
                        if len(ls) < (j+1):
                            ls.append('')
                    external_gene_name_temp.append(ls[0])
                    ensembl_transcript_id_temp.append(ls[1])
                    refseq_mrna_temp.append(ls[2])

                gene_name = '|'.join(list(set(filter(None, external_gene_name_temp))))
                ensembl_transcript_id = '|'.join(list(set(filter(None, ensembl_transcript_id_temp))))
                refseq_mrna = '|'.join(list(set(filter(None, refseq_mrna_temp))))

                ensembl_gene_id = ensembl_id

                found_existing_row = False

                for k in range(len(ls_geneDictionaries)):
                    temp = ls_geneDictionaries[k]['Gene_name']

                    if temp == gene_name:
                        ls_geneDictionaries[k]['Gene_name'] = gene_name
                        ls_geneDictionaries[k]['Ensembl_transcript_id'] = ls_geneDictionaries[k]['Ensembl_transcript_id'] + 'auuu' + ensembl_transcript_id
                        ls_geneDictionaries[k]['RefSeq_mRNA_id'] = ls_geneDictionaries[k]['RefSeq_mRNA_id'] + 'auuuuuu'
                        found_existing_row = True
                        break

                if not found_existing_row:
                    ls_notResponse.append(gene_name)
                    print(ls_notResponse)
                    continue  

    ### from DICTIONARY to file

    data = ls_geneDictionaries
    df = pd.DataFrame.from_dict(data)
    df.to_csv(output_file_path, sep='\t', index=False)

    ### from ls_notResponse to file
    data1 = ls_notResponse
    df1 = pd.DataFrame.from_dict(data1)
    df1.to_csv(output_file_path_notResponse, sep='\t', index=False)

    data = ''
    df = ''
    data1 = ''
    df1 = ''

In [None]:
# DICTIONARY

dictionary = {}
ls_geneDictionaries = []
ls_notResponse = []
ls_notResponse_after = []

count = 759

for file_name in tsv_files:
    
    dictionary = {}
    ls_geneDictionaries = []
    ls_notResponse = []

    input_file_path = os.path.join(input_directory, file_name)
    output_file_path = os.path.join(input_directory, 'Dictionaries/dict_' + file_name)
    output_file_path_notResponse = os.path.join(input_directory, 'Dictionaries/ls_notResponse_' + file_name)
    
    # Read the TSV file => df
    df_pub = pd.read_csv(input_file_path, delimiter="\t")

    # Extract the desired columns as lists
    publication_geneName = df_pub['gene_name'].tolist()
    publication_info = df_pub['info'].tolist()
    
    print(file_name)
    
    count += 1
    
    # VARIABLES
    gene_list_number = count
    gene_list_id =  str('Nithingale_'+ file_name[:-4]) 
    source = str('omicspred_nithingale_'+ file_name[:-4])
    organism = 'human'
    alias = ''


    for i in range(len(publication_geneName)):
        index = i + 1
        gene_name = publication_geneName[i]
        hgnc_symbol = gene_name
        info = publication_info[i]
        print(gene_name)

        if gene_name != '':
            ensembl_gene_id_temp = []
            ensembl_transcript_id_temp = []
            refseq_mrna_temp = []
            external_gene_name_temp = []            

            ls_biomartParameters = biomartParameters(gene_name)
            
            if ls_biomartParameters:
                for ls in ls_biomartParameters:
                    for j in range(3):
                        if len(ls) < (j+1):
                            ls.append('')
                    ensembl_gene_id_temp.append(ls[0])
                    ensembl_transcript_id_temp.append(ls[1])
                    refseq_mrna_temp.append(ls[2])  

                ensembl_gene_id = '|'.join(list(set(filter(None, ensembl_gene_id_temp))))
                ensembl_transcript_id = '|'.join(list(set(filter(None, ensembl_transcript_id_temp))))
                refseq_mrna = '|'.join(list(set(filter(None, refseq_mrna_temp))))

               # if ls_biomartParameter - normal row adding      
                temp_gene_dictionary = gene_dictionary(index,
                                                   gene_name,
                                                   gene_list_number,
                                                   gene_list_id,
                                                   source,
                                                   ensembl_gene_id,
                                                   ensembl_transcript_id, 
                                                   refseq_mrna,
                                                   hgnc_symbol,
                                                   organism,
                                                   alias,
                                                   info)

                ls_geneDictionaries.append(temp_gene_dictionary) 
            
            
            elif ls_biomartParameters == []:

                ensembl_id = gene_name                # only in that case!!!
                ls_biomartParametersbyEnsembl = biomartParametersbyEnsembl(ensembl_id)

                for ls in ls_biomartParameters:
                    for j in range(3):
                        if len(ls) < (j+1):
                            ls.append('')
                    external_gene_name_temp.append(ls[0])
                    ensembl_transcript_id_temp.append(ls[1])
                    refseq_mrna_temp.append(ls[2])  

                external_gene_name = '|'.join(list(set(filter(None, external_gene_name_temp))))
                ensembl_transcript_id = '|'.join(list(set(filter(None, ensembl_transcript_id_temp))))
                refseq_mrna = '|'.join(list(set(filter(None, refseq_mrna_temp))))

                ensembl_gene_id = ensembl_id
                print(external_gene_name)
                print(ensembl_id)

                for k in range(len(ls_geneDictionaries)):           
                    temp = ls_geneDictionaries[k]['Gene_name']                   # searching existing gene 
                    temp_gln = ls_geneDictionaries[k]['Gene_list_number']        # searching existing list number

                    if temp == gene_name and temp_gln == count:
                        print(auuu)
                        ls_geneDictionaries[k]['Gene_name'] = external_gene_name
                        ls_geneDictionaries[k]['Ensembl_transcript_id'] = ls_geneDictionaries[k]['Ensembl_transcript_id'], 'auuu'
                        ls_geneDictionaries[k]['RefSeq_mRNA_id'] = ls_geneDictionaries[k]['RefSeq_mRNA_id'], 'auuuuuu'

                        continue

            else:
                ls_notResponse.append(gene_name)




    ### from DICTIONARY to file
    
    data = ls_geneDictionaries
    df = pd.DataFrame.from_dict(data)
    df.to_csv(output_file_path, sep='\t', index=False)


    ### from ls_notResponse to file
    data1 = ls_notResponse
    df1 = pd.DataFrame.from_dict(data1)
    df1.to_csv(output_file_path_notResponse, sep='\t', index=False)
    
    data = ''
    df = ''
    data1 = ''
    df1 = ''





OPGS003443_model.tsv
ABCA5
ABCA6
ABCG5
ABCG8
ABO
AMIGO1
ANGPTL3
ANKRD31
APOB
APOC1
APOC4
APOE
ATG4C
BCAM
BCL3
BSND
CARM1
CBLC
CDKN2D
CEACAM16
CEACAM19
CEACAM20
CELSR2
CERT1
CLASRP
CLPTM1
DNAH11
DNM2
ENSG00000267022

ENSG00000267022
FADS1
FADS2
FADS3
FEN1
GCNT4
GEMIN7
GMIP
GPC6
HMGCR
IGSF23
KANK2
LDAH
LDLR
LIPC
LPAR2
MAP2K6
MARK4
MAU2
MYBPHL
NCAN
NECTIN2
NPC1L1
NR2C2AP
PCSK9
POC5
POLK
POLR1G
PPP1R13L
PSRC1
PVR
RELB
SARS1
SLC25A42
SMARCA4
SNX5
SP4
SPC24
SUGP1
SUGP2
SYPL2
TDRD15
TM6SF2
TMED1
TMEM258
TOMM40
TRAPPC6A
TRIB1
TSSK6
USP1
USP24
VASP
ZNF112
ZNF180
ZNF224
ZNF234
ZNF285
ZNF296
OPGS003444_model.tsv
ABCA1
ABCA5
ABCA6
ABCG5
ABCG8
ABO
AMIGO1
ANGPTL3
ANGPTL4
ANKRD31
APOA1
APOA5
APOB
APOC1
APOC3
APOC4
APOE
ATG4C
BAZ1B
BCAM
BCL3
BORCS8-MEF2B
BSND
BUD13
CARM1
CBLC
CCDC121
CDKN2D
CEACAM16
CEACAM19
CELSR2
CERT1
CETP
CILP2
CLASRP
CLPTM1
DHODH
DNAH11
FADS3
GATAD2A
GCKR
GCNT4
GMIP
HAVCR1
HERPUD1
HMGCR
HOMER3
IFT172
IGSF23
KANK2
LDAH
LDLR
LIPC
LPA
LPAR2
LPL
MAP2K6
MARK4
MAS1
MAU2
MLXIPL
MYBPHL
N

CFAP53
CLPTM1
CXXC1
DOCK6
DYM
FADS1
FADS2
FADS3
FEN1
GALNT2
GRB14
HERPUD1
HLA-DQA1
INTS10
LIPC
LIPG
LPL
MEOX1
MMP9
MT1A
MT1X
MYO5B
NECTIN2
NLRC5
NUP93
PAFAH1B2
PCIF1
PLTP
RAB11B
RAB3IL1
RELB
SKA1
SLC12A3
SLC12A5
SLC18A1
SOST
SPC24
ST3GAL4
TMEM258
TOMM40
TRIB1
UBE2C
ZNF335
ZNF664
ZPR1
OPGS003459_model.tsv
ACAA2
ALDH1A2
ANGPTL4
APOA5
APOB
APOC1
APOC2
APOC3
APOC4
APOC4-APOC2
APOE
BCAM
BCL3
BUD13
C18orf32
CD300LG
CEACAM16
CETP
CFAP53
CLPTM1
CXXC1
DOCK6
DYM
FADS1
FADS2
FADS3
FEN1
G6PC3
GALNT2
GRB14
HERPUD1
HLA-DQA1
INTS10
LIPC
LIPG
LPL
MEOX1
MLXIPL
MMP9
MT1A
MT1F
MT1X
MYO5B
NECTIN2
NLRC5
NUP93
PAFAH1B2
PCIF1
PLTP
RAB11B
RAB3IL1
RELB
SKA1
SLC12A3
SLC12A5
SLC18A1
SOST
TMEM258
TOMM40
TRIB1
UBE2C
ZNF335
ZNF664
ZPR1
OPGS003460_model.tsv
ABCA1
ALDH1A2
ANGPTL3
ANGPTL4
APOA1
APOA5
APOB
APOC1
APOC3
APOC4
APOE
ATG4C
BCAM
BCL3
BUD13
CATSPER2
CBLC
CCDC121
CEACAM16
CLASRP
CLPTM1
CYP2W1
EIF2B4
FNDC4
FRMD5
GCKR
GPR146
GTF3C2
HLA-B
HLA-C
IFT172
KANK2
KLHL8
LDAH
LDLR
LIPC
LPL
MAU2
MLXIPL
NCAN
NECTIN2
NRBP1


In [8]:
##TSV

input_directory = r'./Nightingale 11'
dir_len = len([entry for entry in os.listdir(input_directory) if os.path.isfile(os.path.join(input_directory, entry))])
tsv_files = [file for file in os.listdir(input_directory) if file.endswith(".tsv")]


#ALIAS
mgi_file_path = r'..\MGI_EntrezGene.xlsx'
save_file_path = r'.\withAlias.xlsx'


for file_name in tsv_files:
    
    input_file_path = os.path.join(input_directory, file_name)
    output_file_path = os.path.join(input_directory, 'Dictionaries/dict-alias' + file_name)
    
    # Read the TSV file => df
    df_pub = pd.read_csv(input_file_path, delimiter="\t")
    publication_geneName_alias = df_pub['gene_name'].tolist()
    
    ### from alias_and_official to file
    data2 = alias_and_official(ls_notResponse, ls_row_10, ls_row_2)


    updateCellswithAlias(mgi_file_path, dictionary_file_path, save_file_path



['pamci', 73414, 'MGI:2384307']
['sgk', 36747, 'MGI:1340062']
['fam105a', 83077, 'MGI:2687281']
['hist1h2bj', 78779, 'MGI:2448388']
['jmjd3', 79607, 'MGI:2448492']
