In [1]:
# IMPORT 

import os
import csv
from biomart import BiomartServer
import pandas as pd
import openpyxl

In [2]:
# DATASET

## biomart_server
server = BiomartServer('http://www.ensembl.org/biomart')
dataset = server.datasets['hsapiens_gene_ensembl']

In [9]:
# FUNCTIONS

def gene_dictionary(index,
                    gene_name,
                    gene_list_number,
                    gene_list_id,
                    source,
                    ensembl_gene_id,
                    ensembl_transcript_id,
                    refseq_mrna,
                    hgnc_symbol,
                    organism,
                    alias,
                    info):
    
    dictionary = {
        'Index' : index,
        'Gene_name' : gene_name,
        'Gene_list_number' : gene_list_number,
        'Gene_list_index' : gene_list_id,
        'Source' : source,
        'Ensembl_gene_id' : ensembl_gene_id,
        'Ensembl_transcript_id' : ensembl_transcript_id,
        'RefSeq_mRNA_id' : refseq_mrna,
        'Organism' : organism,
        'HGNC_symbol' : hgnc_symbol,
        'Alias' : alias,
        'Info' : info
    }
    
    return dictionary


  
def biomartParameters(mgi_symbol):

    attributes = ['ensembl_gene_id',
                  'ensembl_transcript_id',
                  'refseq_mrna']

    filters = {'external_gene_name':[mgi_symbol]}               # gene_name = mgi_symbol
    response = dataset.search({'attributes':attributes,'filters':filters})
    

    # response_convertion
    lines = response.text.split("\n")
    lines = list(filter(lambda l: l != "",lines))
    values = []
    for l in lines:
        temp = []
        raw_values = l.split("\t")
        values.append(raw_values)
        
    return values

def biomartParametersbyEnsembl(ensembl_id):

    attributes = ['external_gene_name',
                  'ensembl_transcript_id',
                  'refseq_mrna']

    filters = {'ensembl_gene_id':[ensembl_gene_id]}               # gene_name = mgi_symbol
    response = dataset.search({'attributes':attributes,'filters':filters})
    

    # response_convertion
    values = [line.split("\t") for line in response.text.split("\n") if line.strip()]
        
    return values  


def alias_and_official(ls_notResponse,ls_row_10,ls_row_2):

    ls_response = []
    ls_response2 = []
    
    for notResponse in ls_notResponse:
        word = str(notResponse).lower()

        for i in range(len(ls_row_10)):
            if ls_row_10[i]:
                temp = str(ls_row_10[i]).lower()
                t_strings = temp.split("|")
                if word in t_strings:
                    ls_response.append([word, i, ls_row_1[i]])

    ls_response.insert(0, ['gene_name','alias_index','mgi_id'])         # print(len(ls_response))


    for notResponse in ls_notResponse:
        word = str(notResponse).lower()

        for i in range(len(ls_row_10)):
            if ls_row_10[i]:
                temp = str(ls_row_10[i]).lower()
                if word == temp:
                    ls_response2.append([word, i, ls_row_1[i]])
                    

    ls_response.insert(0, ['gene_name','official_index','mgi_id'])      # print(len(ls_response2))

    ls_response_3 = ls_response
    
    for response in ls_response2:
        if response not in ls_response_3:
            ls_response_3.append(response)
        else: print(response)

    ls_response_3.pop(0)
    ls_response_3.pop(0)
    ls_response.insert(0, ['gene_name','official/alias_index','mgi_id'])
    
    return ls_response_3


def biomartParameters_mgi(gene_name):

    attributes = ['ensembl_gene_id',
                  'ensembl_transcript_id',
                  'refseq_mrna']
    filters = {'uniprot_gn_symbol':[gene_name]}                     # gene_name = mgi_id
    response = dataset.search({'attributes':attributes,'filters':filters})
    
    # response_convertion
    lines = response.text.split("\n")
    lines = list(filter(lambda l: l != "",lines))
    values = []
    for l in lines:
        temp = []
        raw_values = l.split("\t")
        values.append(raw_values)

    return values  


def updateCellswithAlias(mgi_file_path, dictionary_file_path, file_path):
    
    # Load MGI file and open workbook
    wb_mgi = openpyxl.load_workbook(mgi_file_path)
    ws_mgi = wb_mgi.active

    ls_row_alias = [ws_mgi.cell(row=i,column=10).value for i in range(2,ws_mgi.max_row+1)]
    ls_row_g_name = [ws_mgi.cell(row=i,column=2).value for i in range(2,ws_mgi.max_row+1)] 

    # Load dictionary file and open workbook
    workbook = openpyxl.load_workbook(dictionary_file_path)
    sheet = workbook.active

    ls_row_2 = [sheet.cell(row=i,column=2).value for i in range(2,sheet.max_row+1)]
    ls_row_9 = [sheet.cell(row=i,column=11).value for i in range(2,sheet.max_row+1)]

    
    length = len(ls_row_2) + 2

    # Modify the desired cell
    for i in range(length):
        K_cel = 'K' + str(i + 2)

        for k in range(len(ls_row_g_name)):
            try:
                if str(ls_row_g_name[k]).lower() == str(ls_row_2[i]).lower():
                    if ls_row_alias[k] is not None:
                        sheet[K_cel] = ls_row_alias[k]
                    else:
                        continue
            except:
                continue

    # Save the file
    workbook.save(file_path)


In [4]:
# LOAD START FILES

##TSV
input_directory = r'./Metabolon 100'
dir_len = len([entry for entry in os.listdir(input_directory) if os.path.isfile(os.path.join(input_directory, entry))])
tsv_files = [file for file in os.listdir(input_directory) if file.endswith(".tsv")]


## MGI
wb_1 = openpyxl.load_workbook(r'../MGI_EntrezGene.xlsx')
ws_1 = wb_1.active

ls_row_10 = [ws_1.cell(row=i,column=10).value for i in range(2,ws_1.max_row+1)]
ls_row_2 = [ws_1.cell(row=i,column=2).value for i in range(2,ws_1.max_row+1)]
ls_row_1 = [ws_1.cell(row=i,column=1).value for i in range(2,ws_1.max_row+1)]  

In [5]:
# LISTS


In [10]:
# DICTIONARY

dictionary = {}
ls_geneDictionaries = []
ls_notResponse = []
ls_notResponse_after = []

count = 36

for file_name in tsv_files:
    
    dictionary = {}
    ls_geneDictionaries = []
    ls_notResponse = []

    input_file_path = os.path.join(input_directory, file_name)
    output_file_path = os.path.join(input_directory, 'Dictionaries/dict_' + file_name)
    output_file_path_notResponse = os.path.join(input_directory, 'Dictionaries/ls_notResponse_' + file_name)
    
    # Read the TSV file => df
    df_pub = pd.read_csv(input_file_path, delimiter="\t")

    # Extract the desired columns as lists
    publication_geneName = df_pub['gene_name'].tolist()
    publication_info = df_pub['info'].tolist()
    
    print(file_name)
    
    count += 1
    
    # VARIABLES
    gene_list_number = count
    gene_list_id =  str('Metabolon_'+ file_name[:-4]) 
    source = str('omicspred_metabolon'+ file_name[:-4])
    organism = 'human'
    alias = ''


    for i in range(len(publication_geneName)):
        index = i + 1
        gene_name = publication_geneName[i]
        hgnc_symbol = gene_name
        info = publication_info[i]
        print(gene_name)

        if gene_name != '':
            ensembl_gene_id_temp = []
            ensembl_transcript_id_temp = []
            refseq_mrna_temp = []

            ls_biomartParameters = biomartParameters(gene_name)
            
            if not ls_biomartParameters:
                ensembl_id = gene_name                # only in that case!!!
                ls_biomartParameters = biomartParametersbyEnsembl(ensembl_id)
                
                    
                if not ls_biomartParameters:
                    ls_notResponse.append(gene_name)
                    print(ls_notResponse)


            for ls in ls_biomartParameters:
                for j in range(3):
                    if len(ls) < (j+1):
                        ls.append('')
                ensembl_gene_id_temp.append(ls[0])
                ensembl_transcript_id_temp.append(ls[1])
                refseq_mrna_temp.append(ls[2])  

            ensembl_gene_id = '|'.join(list(set(filter(None, ensembl_gene_id_temp))))
            ensembl_transcript_id = '|'.join(list(set(filter(None, ensembl_transcript_id_temp))))
            refseq_mrna = '|'.join(list(set(filter(None, refseq_mrna_temp))))



            temp_gene_dictionary = gene_dictionary(index,
                                                   gene_name,
                                                   gene_list_number,
                                                   gene_list_id,
                                                   source,
                                                   ensembl_gene_id,
                                                   ensembl_transcript_id, 
                                                   refseq_mrna,
                                                   hgnc_symbol,
                                                   organism,
                                                   alias,
                                                   info)

            ls_geneDictionaries.append(temp_gene_dictionary)

            
    ### from DICTIONARY to file
    
    data = ls_geneDictionaries
    df = pd.DataFrame.from_dict(data)
    df.to_csv(output_file_path, sep='\t', index=False)


    ### from ls_notResponse to file
    data1 = ls_notResponse
    df1 = pd.DataFrame.from_dict(data1)
    df1.to_csv(output_file_path_notResponse, sep='\t', index=False)
    
    data = ''
    df = ''
    data1 = ''
    df1 = ''





OPGS002693_model.tsv
ACADS
ACSF3
ANAPC5
BICDL1
C12orf43
CABP1
CAMKK2
CBFA2T3
CDH15
CDT1
CIT
COQ5
COX6A1
CPNE7
DBNDD1
DPEP1
DYNLL1
ENSG00000288623
ENSG00000288715
['ENSG00000288715']
GAS8
GATC
GCN1
HNF1A
LYPLAL1
MC1R
MLEC
MSI1
OASL
P2RX4
P2RX7
PIEZO1
PLA2G1B
POP5
PRDM7
PRKAB1
PXN
RAB35
RNF10
RPLP0
SIRT4
SLC22A31
SPG7
SPIRE2
SPPL3
SRSF9
TCF25
TMEM233
UNC119B
ZNF276
ZNF778
OPGS002694_model.tsv
ACTG2
ALMS1
C2orf78
CCT7
DGUOK
DUSP11
EGR4
EMX1
EXOC6B
FBXO41
NAT8
NAT8B
NOTO
PRADC1
RAB11FIP5
SFXN5
SPR
STAMBP
TET3
TPRKB
OPGS002695_model.tsv
ENSG00000257062
IAPP
PYROXD1
SLCO1A2
SLCO1B1
SLCO1B3
SLCO1B3-SLCO1B7
SLCO1C1
TMPRSS11A
TMPRSS11B
TMPRSS11D
TMPRSS11E
TMPRSS11F
UGT2A1
UGT2A2
UGT2A3
UGT2B10
UGT2B11
UGT2B15
UGT2B17
UGT2B28
UGT2B4
UGT2B7
YTHDC1
OPGS002696_model.tsv
ACTG2
ALMS1
C2orf78
CCT7
DGUOK
DUSP11
EGR4
EMX1
EXOC6B
FBXO41
HNF1A
MAPK7
NAT8
NAT8B
NOTO
PRADC1
RAB11FIP5
RNF112
SFXN5
SLC16A9
SLC22A1
SLC47A1
SMYD5
STAMBP
TET3
TPRKB
OPGS002697_model.tsv
ACTL6B
AP4M1
ARPC1A
ARPC1B
AZGP1
BIRC6
BSPH

SAG
TRPM8
UGT1A1
UGT1A10
UGT1A3
UGT1A4
UGT1A5
UGT1A6
UGT1A7
UGT1A8
UGT1A9
USP40
OPGS002724_model.tsv
CCDC57
CCDC77
CNOT1
GATM
GBP1
GBP2
GBP3
GBP4
GBP5
GOT2
GTF2B
GTF2H1
HAL
KDM5A
KYAT3
NDRG4
PKN2
RBMXL1
SETD6
SLC25A20
SLC38A7
SLC6A13
SLC7A5
OPGS002725_model.tsv
ATG16L1
DGKD
MROH2A
SAG
SLCO1B3-SLCO1B7
TRPM8
UGT1A1
UGT1A10
UGT1A3
UGT1A4
UGT1A5
UGT1A6
UGT1A7
UGT1A8
UGT1A9
USP40
OPGS002726_model.tsv
CYP4V2
DBN1
ENSG00000290316
F11
F12
FAM149A
FGFR4
GRK6
HK3
KLKB1
KNG1
LMAN2
MXD3
NSD1
PFN3
RGS14
SLC34A1
TLR3
OPGS002727_model.tsv
ATG16L1
DGKD
MROH2A
SAG
TRPM8
UGT1A1
UGT1A10
UGT1A3
UGT1A4
UGT1A5
UGT1A6
UGT1A7
UGT1A8
UGT1A9
USP40
OPGS002728_model.tsv
CHORDC1
GRM5
NAALAD2
NOX4
RAB38
TRIM49
TRIM49C
TRIM49D1
TRIM49D2
TRIM64
TRIM64B
TRIM77
TYR
UBTFL1
OPGS002729_model.tsv
ABT1
ACSM6
ARPC1A
ARPC1B
AZGP1
BTN3A2
BUD31
CNPY4
CYP2C18
CYP2C19
CYP2C8
CYP2C9
CYP3A4
CYP3A43
CYP3A5
CYP3A7
ENSG00000272647
ENSG00000284292
['ENSG00000284292']
ENSG00000289690
ENSG00000290217
['ENSG00000284292', 'ENSG00000290217'

TSC22D4
ZCWPW1
ZKSCAN1
ZKSCAN5
ZNF394
ZNF655
ZSCAN21
OPGS002766_model.tsv
ACTG2
ALMS1
C2orf78
CCT7
DUSP11
EGR4
EMX1
FBXO41
NAT8
NAT8B
NOTO
PRADC1
RAB11FIP5
SFXN5
STAMBP
TPRKB
OPGS002767_model.tsv
ANKRD50
CYP4V2
F11
F12
KLKB1
KNG1
PFN3
OPGS002768_model.tsv
CPS1
CPT2
GLDC
IGF2R
MAGOH
MAS1
MREG
PECR
PNLDC1
SLC22A1
SUCLG2
OPGS002769_model.tsv
CYP4F11
CYP4F12
CYP4F2
CYP4F3
CYP4F8
ENSG00000257062
GOLT1B
IAPP
OR10H1
OR10H4
OR10H5
PYROXD1
SLCO1A2
SLCO1B1
SLCO1B3
SLCO1B3-SLCO1B7
SLCO1C1
ST8SIA1
OPGS002770_model.tsv
ALDH1A2
APOA1
APOA5
APOC3
FADS1
FADS2
FADS3
FEN1
LIPC
RAB3IL1
TMEM258
ZPR1
OPGS002771_model.tsv
ABCC1
ACD
BEST2
C4orf46
CALR
CBFB
CPT2
CTCF
CZIB
DAND5
DDX28
DNASE2
DPEP2NB
DPEP3
DUS2
ELMO3
ENSG00000269693
ETFA
ETFDH
FARSA
FNIP2
GADD45GIP1
GCDH
GFOD2
ISL2
KLF1
LRP8
LRRC36
LYL1
MAGOH
MAST1
NFATC3
NFIX
PPDPFL
PPID
PRDX2
PRMT7
PSKH1
RAD23A
RANBP10
RNASEH2A
RRAD
RTBDN
SLC12A4
SLC7A6
SLC7A6OS
SMPD3
SNAI2
SYCE2
THSD8
TMEM266
TNPO2
TPPP3
TSNAXIP1
OPGS002772_model.tsv
ABCC4
CLDN10
GABRR1
GABR

S100A10
S100A11
SLC44A5
ST6GALNAC3
TCHH
TCHHL1
TDRKH
THEM4
THEM5
TYW3
OPGS002818_model.tsv
CCDC6
GSTM2
MRLN
SLC16A9
OPGS002819_model.tsv
APOC3
BEST1
DAGLA
ELOVL2
ERVFRD-1
FADS1
FADS2
FADS3
FEN1
MYRF
RAB3IL1
SYCP2L
TMEM258
TRIB1
OPGS002820_model.tsv
HNRNPM
SLCO1B3-SLCO1B7
SPAST
SRD5A2
TMPRSS11E
UGT2B15
UGT2B17
OPGS002821_model.tsv
DENND6B
HDAC10
MOV10L1
NAT1
NAT2
PANX2
PLXNB2
PSD3
TTLL8
OPGS002822_model.tsv
TBX3
ZZEF1
OPGS002823_model.tsv
MYADM
NLRP12
ZFPM2
OPGS002824_model.tsv
ASAH1
FDX1
HDAC10
MOV10L1
NAT1
NAT2
PANX2
PLXNB2
PSD3
TTLL8
OPGS002825_model.tsv
CYP4V2
F11
KLKB1
KNG1
OPGS002826_model.tsv
BIRC6
CAPN14
DPY30
EHD3
LTBP1
MEMO1
NLRC4
SLC30A6
SPAST
SRD5A2
TMPRSS11E
TRIB1
TTC27
UGT2B15
XDH
YIPF4
OPGS002827_model.tsv
CYP1A1
ENSG00000283321
NAT1
NAT2
PSD3
OPGS002828_model.tsv
ALPL
ECE1
EIF4G3
ENPP6
ENSG00000289715
MYADM
NBPF3
NLRP12
RAP1GAP
OPGS002829_model.tsv
BEST1
DAGLA
FADS1
FADS2
FADS3
FEN1
FTH1
MYRF
RAB3IL1
SYT7
TMEM258
OPGS002830_model.tsv
ACSL6
ADAMTS19
AFF4
ALDH3A2
CHSY3
CSF

CYC1
EPPK1
EXOSC4
GRINA
HGH1
HSF1
LMNB1
MEGF10
MROH1
OPLAH
PLEC
SHARPIN
SPATC1
TEX43
WDR97
OPGS002898_model.tsv
BSPH1
CRX
ENSG00000257062
IAPP
SLCO1A2
SLCO1B1
SLCO1B3
SLCO1B3-SLCO1B7
SULT2A1
TPRX2
OPGS002899_model.tsv
BEST1
DAGLA
ELOVL2
ERVFRD-1
FADS1
FADS2
FADS3
FEN1
MYRF
RAB3IL1
SYCP2L
TMEM258
OPGS002900_model.tsv
AKR1D1
ALKBH3
CYP3A5
HSD17B12
MFSD9
NAALADL2
OR2AE1
TTC17
ZNF799
OPGS002901_model.tsv
DDTL
ENDOG
ENSG00000278401
ENSG00000281310
['ENSG00000281310']
ENSG00000286112
GSTT2
GSTT2B
GSTT4
LRRC8A
MIF
PHYHD1
PKN3
SLC2A11
SPOUT1
TBC1D13
ZDHHC12
ZER1
OPGS002902_model.tsv
BCO1
ISX
PKD1L2
SCARB1
OPGS002903_model.tsv
GBA3
OPGS002904_model.tsv
CNOT3
ENSG00000257062
['ENSG00000257062']
FADS1
FADS2
FADS3
FEN1
LENG1
MBOAT7
SLCO1A2
SLCO1B1
SLCO1B3
SLCO1B3-SLCO1B7
SLCO1C1
TMC4
TMEM258
OPGS002905_model.tsv
ABCG2
CLNK
DRD5
SLC2A9
SPP1
WDR1
ZNF518B
OPGS002906_model.tsv
CPS1
FAAH
NSUN4
TBX10
OPGS002907_model.tsv
ACSM2B
CYP4A11
CYP4B1
CYP4X1
HMGCS2
REG4
SLCO1A2
SLCO1B1
SLCO1B3-SLCO1B7
OPGS002908

CPS1
NCAPD3
OPGS002987_model.tsv
ABCC2
CMAS
SLC10A2
SLCO1A2
SLCO1B1
OPGS002988_model.tsv
LIPC
NRBP1
PDE1C
OPGS002989_model.tsv
DGCR6
ENSG00000283809
FAM246C
PRODH
OPGS002990_model.tsv
CYP4A11
CYP4A22
CYP4B1
CYP4X1
CYP4Z1
PDZK1IP1
OPGS002991_model.tsv
ASH1L
DAP3
DCST2
EFNA4-EFNA3
GBA1
GON4L
LMNA
MSTO1
MUC1
PMVK
RXFP4
SEMA4A
SSR2
SYT11
UGT2B15
YY1AP1
OPGS002992_model.tsv
FAM135B
KBTBD8
TAS2R41
OPGS002993_model.tsv
DBH
TH
UGT2B10
UGT2B15
OPGS002994_model.tsv
BLMH
CYP3A4
CYP3A5
GOSR1
OR2AE1
SLC10A2
SLC6A4
TMEM225B
TMIGD1
ZFAND4
ZKSCAN1
OPGS002995_model.tsv
BSPH1
SLCO1A2
SLCO1B1
SULT2A1
TPRX2
OPGS002996_model.tsv
ANKRD13C
CTH
LRRC7
SRSF11
OPGS002997_model.tsv
FADS1
FADS2
FADS3
FEN1
RAB3IL1
TMEM258
ZPR1
OPGS002998_model.tsv
GOT2
SLC38A7
SORCS3
OPGS002999_model.tsv
NOS1AP
OPGS003000_model.tsv
APOA5
LIPC
PAQR9
ZPR1
OPGS003001_model.tsv
APOC1
APOE
CBLC
FADS1
FADS2
FADS3
FEN1
LDLR
PCSK9
PSRC1
TMEM258
TOMM40
OPGS003002_model.tsv
CCT6A
CPS1
HMGCS2
MRPS17
NIPSNAP2
PHGDH
PHKG1
PSPH
OPGS003003_model.

OPGS003109_model.tsv
FADS1
FADS2
FADS3
FEN1
RAB3IL1
TMEM258
OPGS003110_model.tsv
ANKRD27
CEP89
FAAP24
NUDT19
RGS9BP
SLC7A9
OPGS003111_model.tsv
CEP89
ENSG00000290217
FAAP24
H2AC1
RGS9BP
SLC17A1
SLC17A3
SLC17A4
SLC7A9
OPGS003112_model.tsv
DBH
SARDH
OPGS003113_model.tsv
ELOVL2
FADS1
FADS2
FEN1
GCKR
NRBP1
PPM1G
TMEM258
OPGS003114_model.tsv
FADS1
FADS2
FADS3
FEN1
GCKR
PPM1G
TMEM258
OPGS003115_model.tsv
ARHGEF3
KLHL29
MFSD2B
MYL10
PLPP3
UBXN2A
OPGS003116_model.tsv
CETP
CLMN
HERPUD1
LPL
TMEM229B
OPGS003117_model.tsv
RPTN
S100A10
S100A11
TCHHL1
THEM4
THEM5
OPGS003118_model.tsv
FADS1
FADS2
FADS3
FEN1
TMEM229B
TMEM258
OPGS003119_model.tsv
EIF2B4
FNDC4
GCKR
GTF3C2
NRBP1
PPM1G
ZBTB17
ZPR1
OPGS003120_model.tsv
ELOVL2
EVA1A
FADS1
FADS2
FADS3
FEN1
TMEM258
OPGS003121_model.tsv
FADS1
FADS2
FADS3
FEN1
PPT2
TMEM258
OPGS003122_model.tsv
ACADSB
P4HA2
SLC22A1
SLC22A4
SLC22A5
OPGS003123_model.tsv
ADA
AKAIN1
CCN5
OPGS003124_model.tsv
APOE
BUD13
FADS1
FADS2
FADS3
FEN1
TMEM258
ZPR1
OPGS003125_model.tsv
ELOVL2


OPGS003289_model.tsv
CDK10
CHMP1A
DPEP1
SPATA2L
SPATA33
VPS9D1
OPGS003290_model.tsv
ARHGAP39
C8orf82
ENSG00000291316
FOXH1
GPT
RECQL4
ZFTRAF1
OPGS003291_model.tsv
OPGS003292_model.tsv
SELENOT
OPGS003293_model.tsv
ACOX1
TEN1
OPGS003294_model.tsv
CYP4A11
CYP4B1
OPGS003295_model.tsv
CYP2A6
LMOD3
OPGS003296_model.tsv
ENSG00000284686
OPGS003297_model.tsv
COMT
OPGS003298_model.tsv
PAH
OPGS003299_model.tsv
SLC13A1
TAS2R16
OPGS003300_model.tsv
FLVCR1
SPATA45
OPGS003301_model.tsv
SLC28A1
OPGS003302_model.tsv
DEPDC1B
OPGS003303_model.tsv
PRMT7
OPGS003304_model.tsv
CYP2C8
CYP2C9
HELLS
OPGS003305_model.tsv
PRKAG3
OPGS003306_model.tsv
PNLIPRP2
OPGS003307_model.tsv
EIF2B4
FNDC4
GCKR
GTF3C2
NRBP1
PPM1G
OPGS003308_model.tsv
MTRR
OPGS003309_model.tsv
FADS1
FADS3
FEN1
TMEM258
OPGS003310_model.tsv
UGT1A1
UGT1A10
UGT1A3
UGT1A5
UGT1A7
UGT1A8
USP40
OPGS003311_model.tsv
SULT1A1
SULT1A2
OPGS003312_model.tsv
SLC17A1
SLC17A3
SLC17A4
OPGS003313_model.tsv
PRMT7
OPGS003314_model.tsv
ACSM2A
ACSM5
OPGS003315_model.t

In [8]:
##TSV

input_directory = r'./Metabolon 108'
dir_len = len([entry for entry in os.listdir(input_directory) if os.path.isfile(os.path.join(input_directory, entry))])
tsv_files = [file for file in os.listdir(input_directory) if file.endswith(".tsv")]


#ALIAS
mgi_file_path = r'..\MGI_EntrezGene.xlsx'
save_file_path = r'.\withAlias.xlsx'


for file_name in tsv_files:
    
    input_file_path = os.path.join(input_directory, file_name)
    output_file_path = os.path.join(input_directory, 'Dictionaries/dict-alias' + file_name)
    
    # Read the TSV file => df
    df_pub = pd.read_csv(input_file_path, delimiter="\t")
    publication_geneName_alias = df_pub['gene_name'].tolist()
    
    ### from alias_and_official to file
    data2 = alias_and_official(ls_notResponse, ls_row_10, ls_row_2)


    updateCellswithAlias(mgi_file_path, dictionary_file_path, save_file_path



['pamci', 73414, 'MGI:2384307']
['sgk', 36747, 'MGI:1340062']
['fam105a', 83077, 'MGI:2687281']
['hist1h2bj', 78779, 'MGI:2448388']
['jmjd3', 79607, 'MGI:2448492']
