# Prepare the instance files for model building
1. Kinase family gene pairs and Melissa's alpha whole genome duplication gene pairs as instances  
2. All possible gene pair combinations as instances from the A. thaliana TAIR10 genome  
3. Single genes as instances from TAIR10  

Conda env: /home/seguraab/miniconda3/envs/py310

### Prepare instance file no. 1

In [1]:
import pandas as pd

# Load the two gene-pair sources
ara_m = pd.read_csv(
    '../data/20240725_melissa_ara_data/interactions_fitness.txt',
    sep='\t',
)  # Melissa's gene pairs
ara_m = ara_m.rename(columns={'MA': 'gene1', 'MB': 'gene2'})
kinases = pd.read_csv('../data/2021_cusack_data/Dataset_4.txt', sep='\t')
kinases = kinases[kinases['Class'] == 'test']  # the kinase family gene pairs

# pair_ID holds both genes joined by '_'; give each gene its own column
instances = kinases['pair_ID'].str.split('_', expand=True)
instances = instances.set_axis(['gene1', 'gene2'], axis=1)

# Stack the kinase pairs on top of Melissa's pairs and upper-case the IDs
instances = pd.concat(
    [instances, ara_m[['gene1', 'gene2']]],
    axis=0,
    ignore_index=True,
)
for col in ('gene1', 'gene2'):
    instances[col] = instances[col].str.upper()
print(instances.shape)  # (10265, 2)

# Order the two genes inside every pair so that A-B and B-A compare equal,
# then keep a single copy of each pair
ordered_pairs = [
    sorted(pair) for pair in zip(instances['gene1'], instances['gene2'])
]
instances = pd.DataFrame(ordered_pairs, columns=['gene1', 'gene2'])
instances = instances.drop_duplicates()
print(instances.shape)  # (10250, 2); 15 duplicated pairs removed (noted as overlap with ara_m)

# Writing to disk is disabled so re-running the cell does not overwrite the file
# instances.to_csv('../data/instances_dataset_1.txt', sep='\t', index=False)
instances

(10265, 2)
(10250, 2)


Unnamed: 0,gene1,gene2
0,AT3G46420,AT4G20450
1,AT5G01820,AT5G57630
2,AT2G37050,AT5G59660
3,AT3G17840,AT3G51740
4,AT1G11410,AT4G23190
...,...,...
10259,AT1G23380,AT1G70510
10260,AT1G26790,AT1G69570
10261,AT1G16060,AT1G79700
10262,AT1G21410,AT1G77000


### Prepare instance file no. 3 (run before no. 2, which reuses the `genes` list built here)

In [2]:
import gffutils

gff = '../data/TAIR10/Athaliana_167_TAIR10.gene.gff3'

# Build the annotation database file from the GFF3
# (force=True: presumably replaces an existing TAIR10.db -- see gffutils docs)
db = gffutils.create_db(
    gff,
    dbfn='TAIR10.db',
    force=True,
    keep_order=True,
    merge_strategy='merge',
    sort_attribute_values=True,
)

# Reopen the database file and collect the first 'Name' value of every
# feature of type 'gene'
db = gffutils.FeatureDB('TAIR10.db')
genes = [gene['Name'][0] for gene in db.features_of_type('gene')]

print(len(genes))  # 27416

# Writing to disk is disabled; uncomment to save one gene ID per line
# with open('../data/instances_dataset_singles.txt', 'w') as f:
#     f.write('gene\n')
#     f.writelines('%s\n' % gene for gene in genes)

27416


### Prepare instance file no. 2 (uses the `genes` list from the instance file no. 3 cell)

In [3]:
from itertools import combinations

# Every unordered pair of distinct genes, in the order the genes are listed.
# NOTE(review): this keeps all 375,804,820 pairs in memory at once; if only
# the count or a single pass over the pairs is needed, consider iterating
# combinations(genes, 2) lazily instead.
gene_pairs = [*combinations(genes, 2)]

# Writing to disk is disabled; uncomment to save one tab-separated pair per line
# with open('../data/instances_dataset_pairs.txt', 'w') as f:
#     f.write('gene1\tgene2\n')
#     f.writelines('%s\t%s\n' % gene_pair for gene_pair in gene_pairs)

len(gene_pairs)  # 375804820

375804820