# Generate SRP Training Data
The goal is to create a machine learning model based on publicly available SRP data.
This model will then be applied to Treehouse public compendia to determine how well it performs.

In [2]:
import pandas as pd
import numpy as np

import os

# NOTE(review): this changes the working directory to the directory the process
# is already in, so it has no effect on the relative paths used below.
# Presumably left over from pinning the notebook's directory -- confirm before removing.
os.chdir(os.getcwd())

In [9]:
## Load in Treehouse SRP data
TREEHOUSE_DATADIR = "../data/treehouse_SRP_data/"


def _read_srp_expression(accession):
    """Read the tab-separated expression table for one SRP accession.

    The file is looked up under ``TREEHOUSE_DATADIR`` as
    ``<accession>_log2TPM_plus1_HUGO.tsv``; its first column is used as
    the index of the returned DataFrame.
    """
    table_path = TREEHOUSE_DATADIR + accession + '_log2TPM_plus1_HUGO.tsv'
    return pd.read_csv(table_path, sep='\t', index_col=0)


# polyA samples
SRP026013_expression_data = _read_srp_expression('SRP026013')
SRP064410_expression_data = _read_srp_expression('SRP064410')
SRP132968_expression_data = _read_srp_expression('SRP132968')

# riboD samples
SRP055411_expression_data = _read_srp_expression('SRP055411')
SRP058841_expression_data = _read_srp_expression('SRP058841')
SRP109549_expression_data = _read_srp_expression('SRP109549')
SRP130971_expression_data = _read_srp_expression('SRP130971')
SRP183700_expression_data = _read_srp_expression('SRP183700')


In [13]:
# Merge dataframes
# Group the per-study tables by library preparation.
polyA_data = [
    SRP026013_expression_data,
    SRP064410_expression_data,
    SRP132968_expression_data,
]
riboD_data = [
    SRP055411_expression_data,
    SRP058841_expression_data,
    SRP109549_expression_data,
    SRP130971_expression_data,
    SRP183700_expression_data,
]

# pd.concat's default axis stacks the tables row-wise, one combined table per group.
combined_polyA_expression_data = pd.concat(polyA_data)
combined_riboD_expression_data = pd.concat(riboD_data)

# Report each combined table's shape as "<rows>,<columns>".
print("Shape of polyA SRP expression data " + ','.join(map(str, combined_polyA_expression_data.shape)))
print("Shape of riboD SRP expression data " + ','.join(map(str, combined_riboD_expression_data.shape)))

In [34]:
# Subset to top 5000 highest variance genes
# Stack polyA rows first, then riboD rows, into a single table.
merged_SRP_data = pd.concat(
    [combined_polyA_expression_data, combined_riboD_expression_data],
    axis=0,
)

# DataFrame.var() yields one variance per column; sorting ascending puts the
# largest values last, so the final 5000 labels are the highest-variance columns.
column_variances = merged_SRP_data.var()
highest_variance_columns = column_variances.sort_values().index[-5000:]

# Select via the transposed frame (columns become rows), then transpose back.
# The kept columns end up ordered by increasing variance.
transposed = merged_SRP_data.T
merged_SRP_data = transposed.loc[highest_variance_columns].T

merged_SRP_data.shape

(220, 5000)

In [36]:
# The columns that survived the variance filter are the classifier's gene set.
SRP_classifier_genes = merged_SRP_data.columns

# Save the column labels as plain text, one per line, next to the input data.
classifier_genes_path = TREEHOUSE_DATADIR + 'SRPClassifierGenes.txt'
np.savetxt(classifier_genes_path, SRP_classifier_genes, fmt='%s')

In [37]:
# Get labels
# One label per row: 0 for every polyA row, 1 for every riboD row.
polyA_labels = [0] * len(combined_polyA_expression_data)
riboD_labels = [1] * len(combined_riboD_expression_data)

# polyA labels come first, then riboD labels.
all_labels = polyA_labels + riboD_labels

# Attach the labels to the merged table's row index under the 'Ribo' column.
merged_SRP_labels = pd.DataFrame(
    all_labels,
    index=merged_SRP_data.index,
    columns=['Ribo'],
)

In [38]:
# Output data
# Both tables are written tab-separated into the input data directory.
merged_data_path = TREEHOUSE_DATADIR + 'merged_SRP_data.tsv'
merged_labels_path = TREEHOUSE_DATADIR + 'merged_SRP_labels.tsv'

# Write datasets
merged_SRP_data.to_csv(merged_data_path, sep='\t')
# Write labels
merged_SRP_labels.to_csv(merged_labels_path, sep='\t')