### Select Well-Predicted Genes

We used R2 values from [ElasticNet training](https://sabrina-mi.github.io/PTRS-Rat-Analysis/Br_EN_Validation)

![Well-Predicted Genes](WellPredGenes.png)

### Collect Gene Intervals

In [26]:
import pandas as pd
gene_annot = pd.read_csv('/home/s1mi/enformer_rat_data/rn7.gene.txt', sep ='\t',  index_col='geneId')
gene_list = ['ENSRNOG00000001342', 'ENSRNOG00000020624', 'ENSRNOG00000058006', 'ENSRNOG00000060045']

def write_intervals(gene_list, file):
    with open(file, 'a') as f:
        for gene in gene_list:
            print(gene)
            row = gene_annot.loc[gene]
            interval = f"chr{row['chromosome']}_{row['start']}_{row['end']}"
            f.write(interval + '\n')

write_intervals(gene_list, "metadata/intervals.txt")

    

ENSRNOG00000001342
ENSRNOG00000020624
ENSRNOG00000058006
ENSRNOG00000060045


  gene_annot = pd.read_csv('/home/s1mi/enformer_rat_data/rn7.gene.txt', sep ='\t',  index_col='geneId')


### Concatenate h5 outputs

In [11]:
predictions_dir="/home/s1mi/enformer_rat_pipeline/predictions_folder/personalized_enformer_minimal_some_regions/predictions_2023-07-26/enformer_predictions"
import os
import h5py


In [90]:
def concatenate_predictions(predictions_dir, output_path):
    with h5py.File(output_path, "w") as output_file:
        individuals = os.listdir(predictions_dir)
        for individual in individuals:
            group = output_file.create_group(individual)

            dir = os.path.join(predictions_dir, individual, "haplotype1")
            filenames = os.listdir(dir)

            for file in filenames:
                interval = file.replace("_predictions.h5", "")
                with h5py.File(os.path.join(dir, file), "r") as input_file:
                    group.create_dataset(interval, data=input_file[interval][()], shape=(896,5313))
            


    

In [91]:
concatenate_predictions(predictions_dir, "/home/s1mi/enformer_rat_pipeline/Br_enformer_predictions.h5")

In [94]:
with h5py.File("/home/s1mi/enformer_rat_pipeline/Br_enformer_predictions.h5", "r") as f:
    for individual in f.keys():
        data = f[individual]
        for interval in data.keys():
            #print(data[interval][()])
        
        

[[0.5645112  0.3369323  0.35497162 ... 0.08674482 0.9707706  0.76295894]
 [1.6409073  1.041316   1.166337   ... 0.18861614 1.1439003  0.8881453 ]
 [1.5544218  1.0320587  0.87225413 ... 0.07785353 0.1688161  0.14973015]
 ...
 [0.2423646  0.29521942 0.40190938 ... 0.01414104 0.10851587 0.10147726]
 [0.253065   0.29997772 0.40514356 ... 0.04219715 0.36896732 0.43622243]
 [0.21092205 0.24369633 0.29999018 ... 0.03879149 0.14974359 0.12561812]]
[[0.27489612 0.25525594 0.40794072 ... 0.01854003 0.09383103 0.06213763]
 [0.29296684 0.28227487 0.42407656 ... 0.02425781 0.11390349 0.06618448]
 [0.26659775 0.24751456 0.26308283 ... 0.0108161  0.06195768 0.03871614]
 ...
 [0.15637168 0.29116946 0.12917393 ... 0.02218419 0.10299934 0.08917878]
 [0.17481245 0.2683783  0.16950147 ... 0.00800467 0.04096119 0.04038178]
 [0.16851926 0.2367249  0.20260964 ... 0.00727123 0.03415444 0.03612157]]
[[0.07318618 0.07125932 0.04390321 ... 0.0135207  0.09588904 0.0576106 ]
 [0.13737778 0.13913476 0.09102745 ... 

KeyboardInterrupt: 

### Calculate predicted gene expression

In [13]:
    #   temp_predictions = [prediction_1[:, 5110], prediction_2[:, 5110]] # CAGE predictions we are interested in
    #   individual_prediction[individual] = temp_predictions

    #   # Calculate TSS CAGE expression which correspond to column 5110 of the predictions above
    #   temp_list = list()

    #   pred_prepared_1 = prepare_for_quantify_prediction_per_TSS(predictions=prediction_1, gene=gene, tss_df=tss_dataframe)
    #   tss_predictions_1 = quantify_prediction_per_TSS(low_range = window_coords.start, TSS=pred_prepared_1['gene_TSS'], cage_predictions=pred_prepared_1['cage_predictions'])

    #   pred_prepared_2 = prepare_for_quantify_prediction_per_TSS(predictions=prediction_2, gene=gene, tss_df=tss_dataframe)
    #   tss_predictions_2 = quantify_prediction_per_TSS(low_range = window_coords.start, TSS=pred_prepared_2['gene_TSS'], cage_predictions=pred_prepared_2['cage_predictions'])

    #   temp_list.append(tss_predictions_1)
    #   temp_list.append(tss_predictions_2) # results here are a dictionary for each TSS for each haplotype

    #   individual_results[individual] = temp_list # save for the individual

    # gene_output[gene] = individual_results
    # gene_predictions[gene] = individual_prediction

def calculate_gene_expression(matrix, target_interval, tss):
    window_coords = target_interval.resize(SEQUENCE_LENGTH)
    low_range = window_coords.start
    CAGE_predictions = matrix[:, 4980]
    
    bin_start = low_range + ((768 + 320) * 128)
    count = -1
    while bin_start < tss:
      bin_start = bin_start + 128
      count += 1
    if count >= len(cage_predictions)-1:
      continue
    cage_preds = cage_predictions[count - 1] + cage_predictions[count] + cage_predictions[count + 1]
