# This program was written by Angelica Sharma

## Goal 1: Create a table with normalized weight scores for gene pairs available in our data set, taking into consideration 7 factors:

1. Co-Expression
2. Biological Pathway
3. Protein-Protein Interactions
4. Cellular Co-localizations
5. Genetic Interaction 
6. Shared Protein Domain
7. Gene Regularatory Interaction Network

We will then use these factors to a create one, final normalized score for each gene pair.

In [38]:
import requests
from bs4 import BeautifulSoup

def getUrlsFromGeneMania(name):
    url = 'http://genemania.org/data/current/Homo_sapiens/'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    link_tags = soup.find_all('a')
    urls = []
    for link in link_tags:
        href = link.get('href')
        if href.startswith(name):
            url = "{}{}".format('http://genemania.org/data/current/Homo_sapiens/', href)
            urls.append(url)
    return urls

In [39]:
import datatable as dt
import urllib.request
from collections import defaultdict

def readData(urls):
    # Create a defaultdict to store gene pairs and weights
    gene_pairs = defaultdict(float)

    # Iterate through the files containing gene pairs and weights
    for file in urls:
        # Read the contents of the file into a list of lines
        webUrl = urllib.request.urlopen(file)
    
        # Skip the first line
        data = webUrl.readlines()[1:]
    
        # Iterate through the lines and update the defaultdict
        for line in data:
            values = line.decode().strip().split("\t")
            gene1, gene2 = values[0], values[1]
            weight = float(values[2])
        
            # Normalize the gene pair representation
            gene_pair = f"{min(gene1, gene2)},{max(gene1, gene2)}"
        
            # Add the weight to the existing value in the defaultdict
            gene_pairs[gene_pair] += weight
            
        # To keep track of progress
        print('Finished analyzing ' + file)
        
    # Create a datatable from the defaultdict - more efficient!
    table = dt.Frame(GenePair=list(gene_pairs.keys()), Weight=list(gene_pairs.values()))
    return table

In [40]:
from datatable import update, f
def normalizeWeights(tab, name):
    mean = tab[:, dt.mean(dt.f[1])][0, 0]
    print("Mean:", mean)
    std = tab[:, dt.sd(dt.f[1])][0, 0]
    print("Standard Deviation:", std)

    # Normalize the values in column 2 using mean and standard deviation
    tab[:, name] = (f[name] - mean) / std
    return tab

In [16]:
def analyzeFactor(urlName, tableName):
    
    dt.options.display.max_nrows = None
    #Get list of files from genemania
    urls = getUrlsFromGeneMania(urlName)
    
    #Read files and create a table with weights
    tab = readData(urls)
    print(tab)
    
    #Rename second column for readability
    tab.names = {'Weight': tableName}
    
    #Normalize the weights in the table
    tab = normalizeWeights(tab, tableName)
    print(tab)
    
    return tab

In [43]:
import requests
def analyzeTxtFiles(fileList, tableName):
    
    # Create a defaultdict to store gene pairs and weights
    gene_pairs = defaultdict(float)
    for file in fileList:
        with open(file, "r") as data:
            # Skip the first line
            next(data)  
            for line in data:
    
                # Iterate through the lines and update the defaultdict
                values = line.strip().split("\t")
                gene1, gene2 = str(values[0]), str(values[1])
                
                # Skip over gene pairs that don't have any weight data
                if(values[4] != ''):
                    weight = float(values[4])
        
                # Normalize the gene pair representation, and make sure they are all uppercase letters
                gene_pair = f"{min(gene1, gene2).upper()},{max(gene1, gene2).upper()}"
        
                # Add the weight to the existing value in the defaultdict
                gene_pairs[gene_pair] += weight
            
            # To keep track of progress
            print('Finished analyzing ' + file)
        
    # Create a datatable from the defaultdict - more efficient!
    tab = dt.Frame(GenePair=list(gene_pairs.keys()), Weight=list(gene_pairs.values()))
    
    #Rename second column for readability
    tab.names = {'Weight': tableName}
    
    #Normalize the weights in the table
    tab = normalizeWeights(tab, tableName)
    
    return tab
    

Creating tables for each factor

In [None]:
#Co-expression - 1

co_exp_table = analyzeFactor('Co-expression.', 'Co-expression Weight')
%store co_exp_table
print(co_exp_table)

In [None]:
#Pathway - 2

pathway_table = analyzeFactor('Pathway.', 'Pathway Weight')
%store pathway_table
print(pathway_table)

In [None]:
#Protein-Protein Interactions - 3

protein_table = analyzeFactor('Physical_Interactions.', 'Protein-Protein Weight')
%store protein_table
print(protein_table)

In [None]:
#Cellular Localizations - 4

localization_table = analyzeFactor('Co-localization.', 'Co-localization Weight')
%store localization_table
print(localization_table)

In [None]:
#Genetic Interactions - 5

genetic_interaction_table = analyzeFactor('Genetic_Interactions.', 'Genetic Interaction Weight')
%store genetic_interaction_table
print(genetic_interaction_table)

In [None]:
#Shared Protein Domains - 6

shared_protein_table = analyzeFactor('Shared_protein_domains.', 'Shared Protein Domain Weight')
%store shared_protein_table
print(shared_protein_table)

In [None]:
#Gene Regularatory Interaction Network - has standard gene symbol names - 7

fileList = ['Adult-Heart-regulons.txt', 'Heart_GTEx-regulons.txt', 'Fetal-Heart-regulons.txt', 'whole_NeonatalHeart-regulons.txt']
gene_reg_table = analyzeTxtFiles(fileList, 'Gene Regularatory Weight')
%store gene_reg_table
print(gene_reg_table)

Combining tables 1-6 to form one, large table. Excluding table 7 for now, since table 7 is in gene symbols while the rest are in Uniprot IDs.

In [11]:
import datatable as dt
from collections import defaultdict

# Function to add/update weights in the big table
def update_weights(gene_pair, weight, big_table_dict):
    big_table_dict[gene_pair].append(weight)

def merge_and_update(table, colName, big_table_dict):
     for i in range(table.nrows):
        gene_pair = table[i, "GenePair"].upper()
        weight = table[i, colName]
        update_weights(gene_pair, weight, big_table_dict)
        if(i % 1_000_000 == 0):
            print("Finished " + str(i) + "/" + str(table.nrows))
     print("Finished adding table with " + colName)


In [None]:
# Load tables 1-6
%store -r co_exp_table
%store -r pathway_table
%store -r protein_table
%store -r localization_table
%store -r genetic_interaction_table
%store -r shared_protein_table

# Create the initial big table as a dictionary
big_table_dict = defaultdict(list)

# Merge and update weights for each table
merge_and_update(shared_protein_table, "Shared Protein Domain Weight", big_table_dict)
merge_and_update(genetic_interaction_table, "Genetic Interaction Weight", big_table_dict)
merge_and_update(localization_table, "Co-localization Weight", big_table_dict)
merge_and_update(protein_table, "Protein-Protein Weight", big_table_dict)
merge_and_update(pathway_table, "Pathway Weight", big_table_dict)
merge_and_update(co_exp_table, "Co-expression Weight", big_table_dict)

print("Finished adding all tables. Converting dictionary to dataframe...")

# Create empty lists for GenePair and WeightList columns
gene_pairs = []
weight_lists = []

# Iterate over the dictionary and populate the lists
for gene_pair, weight_list in big_table_dict.items():
    gene_pairs.append(gene_pair)
    weight_lists.append(weight_list)

# Create the big table using the populated lists
big_table = dt.Frame(GenePair=gene_pairs, WeightList=weight_lists)

print(big_table)
%store big_table

In [26]:
%store -r big_table
print(big_table)

         | GenePair       WeightList                                      
         | str32          arr32(float64)                                  
-------- + -------------  ------------------------------------------------
       0 | Q16850,Q9NR63  [0.280541, -0.138305]                           
       1 | O15519,Q92851  [2.90248, 70.4736, 2.58423, -0.0948521, 2.32987]
       2 | Q13275,Q9Y4D7  [0.225917, 0.249665]                            
       3 | P52756,P78332  [0.690218, 0.107048, 33.7433]                   
       4 | P13569,Q2M3G0  [0.376132]                                      
       5 | P30260,Q02790  [0.103014, 0.249635]                            
       6 | P05141,Q9UJS0  [0.198605, 0.0245982]                           
       7 | P46063,Q9H6R0  [-0.156449]                                     
       8 | P31270,P50221  [-0.00623345, -0.0246768]                       
       9 | Q02790,Q9H6T3  [0.089358, -0.533407]                           
      10 | P30260,Q9H6T3 

Extracting the unique genes in this large table & creating a dictionary to convert them to gene symbols.

In [None]:
import requests
import xml.etree.ElementTree as ET

def convert_uniprot_to_gene(uniprot_id):
    url = f"https://www.uniprot.org/uniprot/{uniprot_id}.xml"
    response = requests.get(url)
    if response.ok:
        xml_data = response.text
        root = ET.fromstring(xml_data)
        gene_symbol_element = root.find(".//{http://uniprot.org/uniprot}gene/{http://uniprot.org/uniprot}name[@type='primary']")
        if gene_symbol_element is not None:
            gene_symbol = gene_symbol_element.text
            return gene_symbol

    return None

In [None]:
import requests
import json

def convert_ensembl_to_gene_symbol(ensembl_id):
    url = f"https://rest.ensembl.org/lookup/id/{ensembl_id}"
    headers = {"Content-Type": "application/json"}
    response = requests.get(url, headers=headers)
    if response.ok:
        data = response.json()
        if "display_name" in data:
            gene_symbol = data["display_name"]
            return gene_symbol

    return None

In [None]:
#Gathering unique genes in the combined table

%store -r big_table

unique_genes = set()

gene_pairs_list = big_table['GenePair'].to_list()[0]

# Iterate over the gene pairs list
for gene_pair in gene_pairs_list:
    
    # Split the gene pair by the delimiter 
    genes = gene_pair.split(',') 
    
    # Add each gene to set
    for gene in genes:
        unique_genes.add(gene.strip())
        
print("We have " + str(len(unique_genes)) + " distinct genes.")
print(unique_genes)
%store unique_genes

In [None]:
#Creating a dictionary that maps each UniprotID/Ensembl ID to its gene symbol, if one exists. 
#Using multiprocessing to make this faster using Dask.

import dask
import dask.distributed
import psutil
import time

#Creating a dictionary that maps each UniprotID to its gene symbol. 
#This allows multiple processes to access and update the dictionary concurrently.
gene_to_symbol = dict()
%store -r unique_genes
gene_list = list(unique_genes)

for gene in gene_list:
    gene_to_symbol[gene] = None

#This ensures that the API call for each gene is executed lazily and in a parallelizable manner.
@dask.delayed
def process_gene(gene):
    original_gene = gene.strip()

    # check if uniprot
    output = convert_uniprot_to_gene(str(original_gene))

    # otherwise check if ensembl
    if output is None:
        output = convert_ensembl_to_gene_symbol(str(original_gene))
        
    #if we still couldn't find a gene symbol, we will keep it the same
    if output is None:
        output = original_gene
    gene_symbol = output

    return original_gene, gene_symbol

i = 0
length = len(gene_list)
genes_per_process = 547  # Number of genes to process per process

# Split the gene pairs into chunks for parallel processing
chunks = [gene_list[i:i + genes_per_process] for i in range(0, length, genes_per_process)]

# Create a Dask client
client = dask.distributed.Client()

# Scatter the gene_to_symbol dictionary to all workers
gene_to_symbol_future = client.scatter(gene_to_symbol)

results = []
# Iterate over the chunks and process them in parallel
for chunk in chunks:
    start_time = time.time()
    futures = []
    for gene in chunk:
        future = process_gene(gene)
        futures.append(future)

   # Compute the results of the current chunk
    chunk_results = dask.compute(futures)
    results.extend(chunk_results)
    
    i += len(chunk)
    print("Progress: {}/{}".format(i, length))
    
    # Print CPU and memory utilization
    cpu_percent = psutil.cpu_percent()
    memory_info = psutil.virtual_memory()
    memory_percent = memory_info.percent
    print("CPU utilization: {}%".format(cpu_percent))
    print("Memory utilization: {}%".format(memory_percent))
    elapsed_time = time.time() - start_time
    print("Elapsed Time: {:.2f} seconds".format(elapsed_time))
    
# Update the gene_to_symbol dictionary with the results
for this_chunk in results:
    for gene in this_chunk:
        gene_to_symbol[gene[0]] = gene[1]

# Close the Dask client
client.close()

print(gene_to_symbol)
%store gene_to_symbol

Converting tables 1 - 6 to gene symbols using the created dictionary

In [40]:
%store -r shared_protein_table
shared_protein_table = convert_to_gene_symbols(gene_to_symbol, shared_protein_table)
shared_protein_table.names = {'C0': 'GenePair'}
print("Converting to csv file...")
shared_protein_table.to_csv("shared_protein_table.csv")

Converting datatable to a list of lists...
Done. Splitting data list into chunks...
Done. Scattering chunks to Dask workers...
Done. Applying update function to each chunk in parallel...
Elapsed Time: 4.44 seconds
Done. Combining the updated chunks into a single datatable...
Converting to csv file...


In [42]:
%store -r genetic_interaction_table
genetic_interaction_table = convert_to_gene_symbols(gene_to_symbol, genetic_interaction_table)
genetic_interaction_table.names = {'C0': 'GenePair'}
print("Converting to csv file...")
genetic_interaction_table.to_csv("genetic_interaction_table.csv")

Converting datatable to a list of lists...
Done. Splitting data list into chunks...
Done. Scattering chunks to Dask workers...
Done. Applying update function to each chunk in parallel...
Elapsed Time: 29.00 seconds
Done. Combining the updated chunks into a single datatable...
Converting to csv file...


In [45]:
%store -r localization_table
localization_table = convert_to_gene_symbols(gene_to_symbol, localization_table)
localization_table.names = {'C0': 'GenePair'}
print("Converting to csv file...")
localization_table.to_csv("localization_table.csv")

Converting datatable to a list of lists...
Done. Splitting data list into chunks...
Done. Scattering chunks to Dask workers...
Done. Applying update function to each chunk in parallel...
Elapsed Time: 2.60 seconds
Done. Combining the updated chunks into a single datatable...
Converting to csv file...


In [None]:
%store -r protein_table
print(protein_table)
protein_table = convert_to_gene_symbols(gene_to_symbol, protein_table)
protein_table.names = {'C0': 'GenePair'}
print("Converting to csv file...")
protein_table.to_csv("protein_table.csv")

In [48]:
%store -r pathway_table
pathway_table = convert_to_gene_symbols(gene_to_symbol, pathway_table)
pathway_table.names = {'C0': 'GenePair'}
print("Converting to csv file...")
pathway_table.to_csv("pathway_table.csv")

Converting datatable to a list of lists...
Done. Splitting data list into chunks...
Done. Scattering chunks to Dask workers...
Done. Applying update function to each chunk in parallel...
Elapsed Time: 0.50 seconds
Done. Combining the updated chunks into a single datatable...
Converting to csv file...


In [None]:
%store -r co_exp_table
print(co_exp_table)
co_exp_table = convert_to_gene_symbols(gene_to_symbol, co_exp_table)
co_exp_table.names = {'C0': 'GenePair'}
print(co_exp_table)
print("Converting to csv file...")
co_exp_table.to_csv("co_exp_table.csv")

In [None]:
#Also storing table 7, which is already in gene symbols

%store -r gene_reg_table
print("Storing gene_reg_table in csv...")
gene_reg_table.to_csv('gene_reg_table.csv')

Combining all gene pairs from all 7 tables into one column

In [52]:
import datatable as dt

def merge_to_final_table(table, combined_table):
    gene_pairs = table[:, 0]
    combined_table.rbind(gene_pairs)
    return combined_table

In [53]:
import datatable as dt

combined_table = dt.Frame()
print("Merging shared_protein_table...")
combined_table = merge_to_final_table(shared_protein_table, combined_table)
print("Done. Merging genetic_interaction_table...")
combined_table = merge_to_final_table(genetic_interaction_table, combined_table)
print("Done. Merging localization_table...")
combined_table = merge_to_final_table(localization_table, combined_table)
print("Done. Merging protein_table...")
combined_table = merge_to_final_table(protein_table, combined_table)
print("Done. Merging pathway_table...")
combined_table = merge_to_final_table(pathway_table, combined_table)
print("Done. Merging co_exp_table...")
combined_table = merge_to_final_table(co_exp_table, combined_table)
print("Done. Merging gene_reg_table...")
combined_table = merge_to_final_table(gene_reg_table, combined_table)
print(combined_table)
print("Done. Deleting duplicate pairs...")
combined_table = dt.unique(combined_table['GenePair'])
print("Done. Converting to csv file...")
combined_table.to_csv("combined_table_gene_pair_symbols.csv")
print(combined_table)

Merging shared_protein_table...
Done. Merging genetic_interaction_table...
Done. Merging localization_table...
Done. Merging protein_table...
Done. Merging pathway_table...
Done. Merging co_exp_table...
Done. Merging gene_reg_table...
         | GenePair        
         | str32           
-------- + ----------------
       0 | CYP51A1,CYP26B1 
       1 | CFLAR,CASP10    
       2 | SEMA3F,PLXND1   
       3 | RBM5,RBM6       
       4 | CFTR,ABCB5      
       5 | CDC27,FKBP4     
       6 | SLC25A5,SLC25A13
       7 | RECQL,DHX33     
       8 | HOXA11,MEOX1    
       9 | FKBP4,RPAP3     
      10 | CDC27,RPAP3     
      11 | CIAPIN1,NDUFAF7 
      12 | CFTR,ABCB4      
      13 | ABCB4,ABCB5     
      14 | ALS2,IBTK       
       … | …               
50538058 | PLA2G12A,ZMIZ1  
50538059 | PRKRA,ZMIZ1     
50538060 | STAM,ZMIZ1      
50538061 | TRF,ZMIZ1       
50538062 | WDR41,ZMIZ1     
[50538063 rows x 1 column]

Done. Deleting duplicate pairs...
Done. Converting to csv file...

Creating new & final large table with all factors and all entries filled.

In [2]:
import dask.dataframe as dd
import pandas as pd

def mergeTable(small_table, large_table, columnName):
    print("Merging large dataframe with this table..")
    merged_df = large_table.merge(small_table, on='GenePair', how='left')
    print("Calculating the average weight from the small dataframe...")
    avg = small_table[columnName].mean().compute()
    print("Done. The average is: " + str(avg))
    print("Filling in any empty weights in the merged dataframe with the average weight...")
    merged_df[columnName] = merged_df[columnName].fillna(avg)
    print('Done.\n')
    return merged_df

print("Loading genepairs from large table into a dask dataframe...")
large_df = dd.read_csv('combined_table_gene_pair_symbols.csv', header=0, usecols=['GenePair'])

#Table 7
print("Loading table 7 as dask dataframe...")
gene_reg_df = dd.read_csv('gene_reg_table.csv')
large_df = mergeTable(gene_reg_df, large_df, 'Gene Regularatory Weight')

#Table 6
print("Loading table 6 as dask dataframe...")
shared_protein_df = dd.read_csv('shared_protein_table.csv')
large_df = mergeTable(shared_protein_df, large_df, 'Shared Protein Domain Weight')

#Table 5
print("Loading table 5 as dask dataframe...")
genetic_interaction_df = dd.read_csv('genetic_interaction_table.csv')
large_df = mergeTable(genetic_interaction_df, large_df, 'Genetic Interaction Weight')

#Table 4
print("Loading table 4 as dask dataframe...")
localization_df = dd.read_csv('localization_table.csv')
large_df = mergeTable(localization_df, large_df, 'Co-localization Weight')

#Table 3
print("Loading table 3 as dask dataframe...")
protein_df = dd.read_csv('protein_table.csv')
large_df = mergeTable(protein_df, large_df, 'Protein-Protein Weight')

#Table 2
print("Loading table 2 as dask dataframe...")
pathway_df = dd.read_csv('pathway_table.csv')
large_df = mergeTable(pathway_df, large_df, 'Pathway Weight')

#Table 1
print("Loading table 1 as dask dataframe...")
co_exp_df = dd.read_csv('co_exp_table.csv')
large_df = mergeTable(co_exp_df, large_df, 'Co-expression Weight')

print(large_df.head(n=10000, npartitions=1, compute=True))
print("Saving the merged dataframe as a csv file...")
large_df.to_csv('final_table_gene_data.csv')

print("Done.")


Loading genepairs from large table into a dask dataframe...
Loading table 7 as dask dataframe...
Merging large dataframe with this table..
Calculating the average weight from the small dataframe...
Done. The average is: 1.9726663895399094e-14
Filling in any empty weights in the merged dataframe with the average weight...
Done.

Loading table 6 as dask dataframe...
Merging large dataframe with this table..
Calculating the average weight from the small dataframe...
Done. The average is: -4.583351362155133e-13
Filling in any empty weights in the merged dataframe with the average weight...
Done.

Loading table 5 as dask dataframe...
Merging large dataframe with this table..
Calculating the average weight from the small dataframe...
Done. The average is: -1.178984198571654e-13
Filling in any empty weights in the merged dataframe with the average weight...
Done.

Loading table 4 as dask dataframe...
Merging large dataframe with this table..
Calculating the average weight from the small dataf

In [None]:
TRASH

In [None]:
import dask
from datatable import dt, update
import dask.array as da
import time
%store -r big_table
%store -r gene_to_symbol

print("Converting datatable to a list of lists...")
# Convert the datatable to a list of lists
data_list = big_table[:, 0].to_list()

print("Done. Splitting data list into chunks...")
# Split the data list into chunks
chunks = dask.delayed(data_list)


# Define the function to update the gene symbols
def update_gene_symbol(gene):
    gene1, gene2 = gene.split(',')[0], gene.split(',')[1]
    output = gene_to_symbol.get(gene1, gene1) + "," + gene_to_symbol.get(gene2, gene2)
    return output

# Define the function to update a chunk
def update_chunk(chunk):
    updated_chunk = []
    for gene in chunk:
        updated_gene = update_gene_symbol(gene)
        updated_chunk.append(updated_gene)
    return updated_chunk

print("Done. Scattering chunks to Dask workers...")
# Scatter the chunks to Dask workers
scattered_chunks = dask.compute(chunks)[0]

print("Done. Applying update function to each chunk in parallel...")
start_time = time.time()
# Apply the update function to each chunk in parallel using Dask
results = dask.compute(*[dask.delayed(update_chunk)(chunk) for chunk in scattered_chunks])
elapsed_time = time.time() - start_time
print("Elapsed Time: {:.2f} seconds".format(elapsed_time))

print("Done. Combining the updated chunks into a single datatable...")
# Combine the updated chunks into a single datatable
updated_data_list = sum(results, [])
updated_table = dt.Frame(updated_data_list)

# Add the second column of the original datatable to the updated datatable
updated_table = dt.cbind(updated_table, big_table[:, 1])

%store updated_table
print(big_table)
print(updated_table)

Data table has been converted to gene symbols. Adding table 7 to table.

In [None]:
import datatable as dt

# Load updated_table and gene_reg_table
%store -r updated_table
%store -r gene_reg_table
#updated_table.names = {'C0': 'GenePair'}

# Create the initial big table as a dictionary
big_table_dict_2 = defaultdict(list)

# Merge and update weights for each table
print("Adding updated_table...")
merge_and_update(updated_table, "WeightList", big_table_dict_2)

In [None]:
%store -r gene_reg_table

merge_and_update(gene_reg_table, "Gene Regularatory Weight", big_table_dict_2)

print("Finished adding all tables. Converting dictionary to dataframe...")

In [None]:
# Create empty lists for GenePair and WeightList columns
gene_pairs = []
weight_lists = []

# Iterate over the dictionary and populate the lists
for gene_pair, weight_list in big_table_dict_2.items():
    gene_pairs.append(gene_pair)
    weight_lists.append(weight_list)

In [None]:
# Create the big table using the populated lists

print("Converting weights to list if it isn't already one...")
weight_lists = [list(w) if isinstance(w, (tuple, list)) else [w] for w in weight_lists]

print("Done. Merging tables...")

In [23]:
import csv

# Define the CSV file path
csv_file = "gene_data.csv"

# Write the data to the CSV file
with open(csv_file, mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["GenePair", "WeightList"])  # Write header

    # Write the data rows
    for gene_pair, weight_list in zip(gene_pairs, weight_lists):
        writer.writerow([gene_pair, weight_list])

print("Gene pairs and weight lists have been stored in the CSV file:", csv_file)


Gene pairs and weight lists have been stored in the CSV file: gene_data.csv


In [25]:
# Define the txt file path
txt_file = "gene_data_text.txt"

# Write the data to the txt file
with open(txt_file, mode="w") as file:
    file.write("GenePair\tWeightList\n")  # Write header

    # Write the data rows
    for gene_pair, weight_list in zip(gene_pairs, weight_lists):
        file.write(f"{gene_pair}\t{weight_list}\n")

print("Gene pairs and weight lists have been stored in the txt file:", txt_file)

Gene pairs and weight lists have been stored in the txt file: gene_data_text.txt


In [8]:
import dask
from datatable import dt, update
import dask.array as da
import time

def convert_to_gene_symbols(dictionary, table):

    print("Converting datatable to a list of lists...")
    # Convert the datatable to a list of lists
    data_list = table[:, 0].to_list()

    print("Done. Splitting data list into chunks...")
    # Split the data list into chunks
    chunks = dask.delayed(data_list)


    # Define the function to update the gene symbols
    def update_gene_symbol(gene):
        gene1, gene2 = gene.split(',')[0], gene.split(',')[1]
        output = gene_to_symbol.get(gene1, gene1) + "," + gene_to_symbol.get(gene2, gene2)
        return output

    # Define the function to update a chunk
    def update_chunk(chunk):
        updated_chunk = []
        for gene in chunk:
            updated_gene = update_gene_symbol(gene)
            updated_chunk.append(updated_gene)
        return updated_chunk

    print("Done. Scattering chunks to Dask workers...")
    # Scatter the chunks to Dask workers
    scattered_chunks = dask.compute(chunks)[0]

    print("Done. Applying update function to each chunk in parallel...")
    start_time = time.time()
    # Apply the update function to each chunk in parallel using Dask
    results = dask.compute(*[dask.delayed(update_chunk)(chunk) for chunk in scattered_chunks])
    elapsed_time = time.time() - start_time
    print("Elapsed Time: {:.2f} seconds".format(elapsed_time))

    print("Done. Combining the updated chunks into a single datatable...")
    # Combine the updated chunks into a single datatable
    updated_data_list = sum(results, [])
    updated_table = dt.Frame(updated_data_list)

    # Add the second column of the original datatable to the updated datatable
    updated_table = dt.cbind(updated_table, table[:, 1])
    return updated_table
