# Prepare the instance files for model building
For classification models:  
1. Train/Val/Test: Melissa's alpha whole genome duplication gene pairs as instances. The label is 0 for no interaction and 1 for interaction.  
2. Predict: Cusack 2021 kinase family gene pairs
3. Predict: Kinase gene pairs from Araport11 (Won't do this)
4. Predict: Kinase gene pairs from TAIR10

For regression models:  
1. Train/Val/Test: Melissa's gene pairs with the corrected total seed count (TSC) as the label
2. Predict: All possible kinase family gene pairs
3. Predict: Kinase gene pairs from Araport11 (Won't do this)
4. Predict: Kinase gene pairs from TAIR10

Additional instance files:  
- All possible gene pair combinations as instances from the A. thaliana TAIR10 genome  
- Single genes as instances from TAIR10  

**Conda env:** /home/seguraab/miniconda3/envs/py310

**Update 9/17/2024:** Melissa gave me an updated dataset with additional instances.
ara-kinase-prediction/data/20240917_melissa_ara_data/**fitness_data_for_Kenia_09172024.xlsx**

## Using 20240725_melissa_ara_data/interactions_fitness.txt
### Prepare instance file no. 1

In [1]:
import pandas as pd

# Instance file no. 1: the Cusack 2021 kinase-family test pairs combined with
# Melissa's gene pairs, one row per unique (gene1, gene2) pair.

# Melissa's gene pairs; the MA/MB columns are renamed to gene1/gene2
ara_m = pd.read_csv(
    '../data/20240725_melissa_ara_data/interactions_fitness.txt', sep='\t'
).rename(columns={'MA': 'gene1', 'MB': 'gene2'})

# Cusack 2021 pairs; rows with Class == 'test' are the kinase family gene pairs
kinases = pd.read_csv('../data/2021_cusack_data/Dataset_4.txt', sep='\t')
kinases = kinases[kinases['Class'] == 'test']

# pair_ID has the form "<gene1>_<gene2>"; split it into two columns
kinase_pairs = kinases['pair_ID'].str.split('_', expand=True)
kinase_pairs.columns = ['gene1', 'gene2']

# Stack both sources and normalise the gene IDs to uppercase
instances = pd.concat([kinase_pairs, ara_m[['gene1', 'gene2']]], axis=0, ignore_index=True)
for col in ('gene1', 'gene2'):
    instances[col] = instances[col].str.upper()
print(instances.shape) # (10265, 2)

# Order the two IDs within every pair so that A_B and B_A collapse to the
# same row, then remove the repeated pairs
instances = pd.DataFrame(
    [sorted(pair) for pair in instances.itertuples(index=False, name=None)],
    columns=['gene1', 'gene2'])
instances = instances.drop_duplicates()
print(instances.shape) # (10250, 2); 15 gene pairs overlapped with ara_m

# Save the instances
# instances.to_csv('../data/instances_dataset_1.txt', sep='\t', index=False)
instances

(10265, 2)
(10250, 2)


Unnamed: 0,gene1,gene2
0,AT3G46420,AT4G20450
1,AT5G01820,AT5G57630
2,AT2G37050,AT5G59660
3,AT3G17840,AT3G51740
4,AT1G11410,AT4G23190
...,...,...
10259,AT1G23380,AT1G70510
10260,AT1G26790,AT1G69570
10261,AT1G16060,AT1G79700
10262,AT1G21410,AT1G77000


In [3]:
import os

# Sanity check: are the Cusack 2021 pair identifiers already written with their
# two gene IDs in sorted order?
# NOTE(review): this chdir changes the working directory for the whole session,
# so the '../data/...' paths used in the other cells will resolve relative to
# this directory once it has run — confirm the intended cell order.
os.chdir('/home/seguraab/ara-kinase-prediction')
kinases = pd.read_csv('data/2021_cusack_data/Dataset_4.txt', sep='\t')

# Rebuild every identifier from its sorted parts
split_ids = kinases['pair_ID'].str.split('_')
sorted_IDs = split_ids.apply(sorted).str.join('_')

# Count the identifiers that are unchanged by sorting
already_sorted = sorted_IDs == kinases['pair_ID'].values
sum(already_sorted) # they're exactly the same, good!

# Note: I am going to generate features for Dataset_4.txt (saved to ara-kinase-prediction/data/2021_cusack_data/Dataset_4_Features/)

10300

##### Create test set files
These test sets will be used to evaluate model performance in a "round robin" style: the instances are split into 10 folds, and each fold is held out once as the test set.
For the cross-validation during model training, I will use:
1. Leave-One-Out cross-validation with the remaining 9 folds
2. Normal 9-fold cross-validation, excluding the test set for each round robin iteration

In [None]:
# Build the round-robin test sets: split Melissa's gene pairs into 10 folds
# stratified on the Interaction label and write the pair IDs of each fold to
# ../data/test_sets_clf/test_ara_m_fold_<i>.txt (one "<gene1>_<gene2>" per line).
import numpy as np  # np.sort is used below; imported here so the cell runs on its own

ara_m = pd.read_csv('../data/20240725_melissa_ara_data/interactions_fitness.txt', sep='\t') # Melissa's gene pairs

# Uppercase the IDs BEFORE sorting each pair, the same order of operations as
# instance file no. 1. Sorting mixed-case IDs first and uppercasing afterwards
# can leave a pair unsorted, because 'T' sorts before 't'.
upper_ids = ara_m[['MA', 'MB']].apply(lambda col: col.str.upper())
sorted_ids = np.sort(upper_ids.to_numpy(dtype=object), axis=1)

# Index the rows by their (gene1, gene2) pair. The MultiIndex is built
# explicitly rather than by assigning the 2-D array as the index.
ara_m.index = pd.MultiIndex.from_arrays([sorted_ids[:, 0], sorted_ids[:, 1]])
ara_m.index[ara_m.index.duplicated()] # At1g18620 At1g74160 is duplicated (set 703)

# Drop the duplicate instance before assigning instances to CV folds.
# The known duplicate is removed through its Set 703 row (as before). Any other
# repeated pair is reduced to its first occurrence with a boolean mask;
# selecting by label with .loc[index.drop_duplicates()] would return every copy
# of a repeated label and therefore remove nothing.
ara_m_no_dups = ara_m.loc[ara_m['Set'] != 703]
ara_m_no_dups = ara_m_no_dups.loc[~ara_m_no_dups.index.duplicated(keep='first')]

# Assign instances to CV folds
from sklearn.model_selection import StratifiedKFold
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=20240909)
for i, (_, test_idx) in enumerate(folds.split(ara_m_no_dups, ara_m_no_dups['Interaction'])):
    # test_idx holds row positions, so take the pairs positionally from the index
    fold_pairs = ara_m_no_dups.index[test_idx]

    with open(f'../data/test_sets_clf/test_ara_m_fold_{i}.txt', 'w') as out:
        out.writelines(f'{gene1}_{gene2}\n' for gene1, gene2 in fold_pairs)


### Prepare instance file of TAIR10 Kinase genes

In [None]:
import itertools
import datatable as dt
import numpy as np

# Kinase genes from NCBI
genes = pd.read_csv("../data/Kinase_genes/kinase-athaliana-ncbi.tsv", sep="\t")

# Tally the genes per genomic accession (inspection only; the result is not kept)
# NOTE(review): the original note calls NC_003070.9 the most up-to-date
# reference genome, but it looks like the RefSeq accession of A. thaliana
# chromosome 1 only. That would explain why this cell finds 606 kinase genes
# while the unfiltered Araport11 cell finds 2235 — confirm that restricting
# the TAIR10 instances to this accession is intended.
genes[['genomic_nucleotide_accession.version']].value_counts()

on_accession = genes["genomic_nucleotide_accession.version"] == "NC_003070.9"

# Kinase gene list
for _, record in genes.loc[on_accession].iterrows():
    print(record['Aliases'])
# note: some aliases are "nan" or don't follow "AT..." format, perhaps they're from the plastid or mitochondria
# to simplify, I will only keep the first alias (AT... format) for each gene


def _first_alias(record):
    """Return the first comma-separated alias of a row, stripped of whitespace.

    Rows whose alias is not a string are returned unchanged.
    """
    alias = record["Aliases"]
    if isinstance(alias, str):
        return alias.split(",")[0].strip()
    return record


at_prefixed = genes["Aliases"].str.startswith("AT")
k_id = genes.loc[on_accession & at_prefixed].apply(_first_alias, axis=1)
print(k_id.nunique()) # 606 unique kinase genes

# TAIR10 GFF genes
# NOTE(review): the single-gene cell reads 'Athaliana_167_TAIR10.gene.gff3'
# from the same folder — confirm both file names exist.
gff = dt.fread("../data/TAIR10/Athaliana_167_gene.gff3", skip_to_line=4).to_pandas()


def _gene_id(feature):
    """Return the text before the first ';' in column C8 with 'ID=' removed.

    Rows whose C8 value is not a string are returned unchanged.
    """
    attributes = feature["C8"]
    if isinstance(attributes, str):
        return attributes.replace("ID=", "").split(";")[0].strip()
    return feature


# Keep only the rows whose C2 field is "gene"
at_id = gff.loc[gff["C2"] == "gene"].apply(_gene_id, axis=1)

# TAIR10 Kinase genes; generate all possible pairs
tair10_kinases = np.intersect1d(k_id, at_id) # 597 kinase genes
pairs = list(itertools.combinations(tair10_kinases, 2)) # 177906 pairs

# Save the pairs to a file
pair_table = pd.DataFrame(pairs, columns=["gene1", "gene2"])
pair_table.to_csv(
    "../data/Kinase_genes/instances_tair10_kinases_NC_003070.9.txt", index=False, sep="\t")


### Prepare instance file of Araport11 Kinase genes

In [None]:
# Kinase genes from NCBI (no accession filter in this cell)
genes = pd.read_csv("../data/Kinase_genes/kinase-athaliana-ncbi.tsv", sep="\t")


def _leading_alias(record):
    """Return the first comma-separated alias of a row, stripped of whitespace.

    Rows whose alias is not a string are returned unchanged, so the result
    can hold whole rows next to strings.
    """
    alias = record["Aliases"]
    if not isinstance(alias, str):
        return record
    return alias.split(",")[0].strip()


k_id = genes.apply(_leading_alias, axis=1)

print(k_id.apply(type).unique()) # type: ignore
# [<class 'str'> <class 'pandas.core.series.Series'>]

# Look at the rows that came back as whole rows instead of an alias string
is_row = k_id.apply(lambda value: isinstance(value, pd.core.series.Series))
genes.iloc[k_id.loc[is_row].index,:]["Aliases"] # these Aliases are NaN values

# Keep the string aliases only and collapse repeats
is_text = k_id.apply(lambda value: isinstance(value, str))
k_id = k_id.loc[is_text].unique() # kinase genes (2235)

# Araport11 GFF genes
gff = dt.fread("../data/Araport11/Athaliana_447_Araport11.gene.gff3", skip_to_line=4).to_pandas()


def _araport_gene_id(feature):
    """Return the text before the first '.' in column C8 with 'ID=' removed.

    Rows whose C8 value is not a string are returned unchanged.
    """
    attributes = feature["C8"]
    if not isinstance(attributes, str):
        return feature
    return attributes.replace("ID=", "").split(".")[0].strip()


# Keep only the rows whose C2 field is "gene"
at_id = gff.loc[gff["C2"] == "gene"].apply(_araport_gene_id, axis=1)

# Araport11 Kinase genes; generate all possible pairs
araport11_kinases = np.intersect1d(k_id, at_id) # 2202 kinase genes
pairs = list(itertools.combinations(araport11_kinases, 2)) # 2423301 pairs

# Save the pairs to a file
pair_table = pd.DataFrame(pairs, columns=["gene1", "gene2"])
pair_table.to_csv(
    "../data/Kinase_genes/instances_araport11_kinases.txt", index=False, sep="\t")


### Prepare instance file for single genes

In [2]:
import gffutils

# Single-gene instances: collect the Name attribute of every 'gene' feature in
# the TAIR10 annotation.
gff = '../data/TAIR10/Athaliana_167_TAIR10.gene.gff3'

# One path for both building and opening the database. Previously the database
# was written to 'TAIR10.db' in the working directory but read back from
# '../data/TAIR10/TAIR10.db', so the freshly built file was never the one queried.
db_path = '../data/TAIR10/TAIR10.db'

# Create the database (force=True overwrites an existing file at db_path)
db = gffutils.create_db(gff, dbfn=db_path, force=True, keep_order=True,
                        merge_strategy='merge', sort_attribute_values=True)

# Extract the gene information
db = gffutils.FeatureDB(db_path) # access the database
genes = [gene['Name'][0] for gene in db.features_of_type('gene')]

print(len(genes)) # 27416

# Write the gene IDs to a file
# with open('../data/instances_dataset_singles.txt', 'w') as f:
#     f.write('gene\n')
#     for gene in genes:
#         f.write('%s\n' % gene)

27416


### Prepare instance file no. 2

In [3]:
from itertools import combinations

# Enumerate every unordered pair of distinct entries in `genes`; the pairs
# follow the order of `genes` and no pair appears twice.
# NOTE(review): this keeps all ~376 million tuples in memory at once. If only
# the count or the output file is needed, math.comb(len(genes), 2) and writing
# straight from the combinations iterator would avoid that — confirm no later
# cell needs the full list.
gene_pairs = [*combinations(genes, 2)]
len(gene_pairs) # 375804820

# Write the gene pairs to a file
# with open('../data/instances_dataset_pairs.txt', 'w') as f:
#     f.write('gene1\tgene2\n')
#     for gene_pair in gene_pairs:
#         f.write('%s\t%s\n' % gene_pair)

375804820

## Using 20240917_melissa_ara_data/fitness_data_for_Kenia_09172024_corrected.tsv