### These are the parameters of the classifier I have made
- pathToInput: file name (and location, if necessary) of the sample to be classified
    - the first row should hold the sample ID; the rows after it hold the log2(TPM+1) values
- if there is a HUGO gene ID column (it must be named "Gene"):
    - noGene = False
- otherwise:
    - noGene = True

In [38]:
# Path of the tab-separated input file holding the sample(s) to classify.
pathToInput = "bigSampleOneColumn.tsv"
# Path of the text file the classification result is written to.
nameOfOutput = "outputBigSampleOneColumn.txt"
# True: the input has no "Gene" column. False: a "Gene" column is present and
# is dropped before classification.
noGene = True

# Run script to classify input file!

In [39]:
#!/usr/bin/env python3
# By Liam McKay (ltmckay) and Paola Angulo (pangulo)
# polyAriboDclassifier-v2.py

####################################################################################
# MAIN PROGRAM
# 
# Finds the method of preparation for the transcriptome from a sequence partner 
# in the Treehouse Initiative 
# Usage: $ python polyAriboDclassifier-v2.py [-nG [--noGene]] inputFile.tsv outputFile
####################################################################################


import matplotlib.pyplot as plt
import csv
import numpy as np
import pandas as pd
import sys

def main(inCL=None):
	'''
	Classify every sample column as PolyA-selected or Ribo-depleted.

	For each sample column the 95th percentile, variance, and mean of its values
	are compared with fixed reference values for the two preparation methods.
	Each of the three statistics votes for the method whose reference value is
	closer (a tie counts for RiboMinus); the method with more votes wins. One
	line per column is written to the output file: the column name, a tab, and
	either "RiboMinus" or "PolyA".

	inCL: None (default) reads the settings from the module-level variables
		pathToInput, nameOfOutput and noGene. Otherwise a list of command-line
		style strings: [-nG | --noGene] inputFile outputFile
	-nG --noGene: use if your new sample does not have a "Gene" column, otherwise
		the program assumes that there is a Gene column and removes it.

	Raises KeyError (from pandas) if a "Gene" column is expected but missing.
	'''

	if inCL is None:
		# notebook usage: settings come from the cell that defines these names
		inputFile = pathToInput
		outputFile = nameOfOutput
		hasGeneColumn = not noGene
	else:
		# local import: only needed when arguments are passed explicitly
		import argparse
		parser = argparse.ArgumentParser(
			description="Classify samples as PolyA-selected or Ribo-depleted.")
		parser.add_argument("-nG", "--noGene", action="store_true",
			help='the input file has no "Gene" column')
		parser.add_argument("inputFile")
		parser.add_argument("outputFile")
		args = parser.parse_args(inCL)
		inputFile = args.inputFile
		outputFile = args.outputFile
		hasGeneColumn = not args.noGene

	# Read the input before touching the output file, so that a bad input path
	# does not leave an empty/truncated output file behind.
	newSampdf = pd.read_csv(inputFile, sep="\t")

	# Drop the gene column BEFORE computing statistics: it holds strings, and
	# recent pandas versions raise a TypeError when asked for the variance,
	# mean, or quantile of a non-numeric column.
	if hasGeneColumn:
		newSampdf = newSampdf.drop("Gene", axis=1)

	# ------------------------------------------------
	# classification calculations:

	# Load these in from values returned by avgGeneExpr_ExpressedGene.py
	riboDp95 = 3.6812559832605625
	riboDVar = 1.881014066540907
	riboDMean = 0.7170628642305512
	polyAp95 = 5.340720200699123
	polyAVar = 3.649240487654772
	polyAMean = 1.0491995456139986

	# reference statistics, in the order: p95, variance, mean
	threeValsRiboD = np.array([riboDp95, riboDVar, riboDMean])
	threeValsPolyA = np.array([polyAp95, polyAVar, polyAMean])

	# per-column statistics of the new sample(s)
	newSampPctl = newSampdf.quantile(0.95)
	newSampVariance = newSampdf.var(axis=0)
	newSampMean = newSampdf.mean(axis=0)

	# the context manager guarantees the results are flushed and the file closed
	with open(outputFile, 'w') as outputFileObject:
		for col in newSampdf.columns:
			threeValsNewSamp = np.array(
				[newSampPctl[col], newSampVariance[col], newSampMean[col]])

			# one vote per statistic; a statistic votes PolyA only when it is
			# strictly closer to the PolyA reference than to the RiboD reference
			polyACloser = (np.abs(threeValsNewSamp - threeValsPolyA)
				< np.abs(threeValsNewSamp - threeValsRiboD))
			polyAScore = int(np.count_nonzero(polyACloser))
			riboDScore = len(threeValsNewSamp) - polyAScore

			label = "RiboMinus" if riboDScore > polyAScore else "PolyA"
			outputFileObject.write("{0}\t{1}\n".format(col, label))

	
# Run the classifier when this code is executed directly (script or notebook
# cell). No arguments are passed, so main() uses the module-level settings.
if __name__ == "__main__":
	main() 



# Run to print results

In [44]:
!head outputBigSampleOneColumn.txt

TH01_0051_S01	PolyA
