In [3]:
import subprocess
import os
import loompy
import pandas as pd

import sys
sys.path.insert(1, '../scripts/') # comment out in python script
from load_environmental_variables import *

In [73]:
def velocyto_run10x(sample_folder, gtf_file, output_file, *args, **kwargs):
    '''

    Python wrapper function for velocyto run10x

    Builds the `velocyto run10x` shell command and prints it. The line that
    executes the command is commented out below, so as written this function
    is a dry run: uncomment that line to actually run velocyto.

    1) sample_folder is the full path to the 10x sample folder.
    2) gtf_file is the full path to the genome annotation file
    3) output_file is the full path to the text file that the terminal output is redirected to
    4) *args are value-less flags. A flag made of one repeated character
       (e.g. 'v' or 'vvv') gets a single dash, anything else gets a double dash.
    5) **kwargs are flag/value pairs. Single-character flags get a single dash,
       longer flags get a double dash. Flags containing a hyphen must be passed
       by unpacking a dict, e.g. **{'samtools-threads': 20}.

    Paths, flags and option values are converted with str(), so integers and
    pathlib.Path objects are accepted as well as strings.

    Returns None.

    '''
    # https://velocyto.org/velocyto.py/tutorial/cli.html#run10x-run-on-10x-chromium-samples

    pieces = ['velocyto run10x']

    # add args
    for arg in args:
        flag = str(arg)
        # 'vvv' has a single distinct character, so it is emitted as '-vvv'
        dashes = '--' if len(set(flag)) > 1 else '-'
        pieces.append(dashes + flag)

    # add kwargs
    for flag, option in kwargs.items():
        dashes = '--' if len(flag) > 1 else '-'
        # str() so that numeric options (e.g. a thread count) do not raise TypeError
        pieces.append(dashes + flag + ' ' + str(option))

    # positional arguments come last; stdout is redirected to output_file
    pieces.append(str(sample_folder) + ' ' + str(gtf_file) + ' > ' + str(output_file))
    cmd = ' '.join(pieces)

    try:
        print(cmd)
        # output = subprocess.check_output(cmd, stderr=subprocess.STDOUT, universal_newlines=True, shell=True) # this line will execute the command
    except subprocess.CalledProcessError as exc:
        print("Status : FAIL", exc.returncode, exc.output)

In [40]:
# format metadata file
# Keep only the first two columns (by position) of the cell-type table,
# rename them, and write the result to a new CSV (the index is written too).
celltypes_csv_in = local_data_path + 'processed/5k_pbmc_celltypes.csv'
md = pd.read_csv(celltypes_csv_in)
md = md.iloc[:, [0, 1]]
md.columns = ['SampleID', 'Cell_Type']

celltypes_csv_out = local_data_path + 'processed/5k_pbmc_celltypes_velocytoformatted.csv'
md.to_csv(celltypes_csv_out)

In [70]:
# set up params
sample_folder_ = '/data2/hratch/STAT4/interim/5k_pbmc_v3_count'
gtf_file_ = '/data/hratch/Software/refdata-cellranger-GRCh38-3.1.0/GRCh38/genes/genes.gtf'
output_file_ = local_data_path + 'interim/velocyto_run10x_output.txt'
n_threads = 20

# Keyword options for velocyto, inserted in the order they appear on the command line.
key_words = {}
key_words['metadatatable'] = local_data_path + 'processed/5k_pbmc_celltypes_velocytoformatted.csv'
key_words['samtools-threads'] = str(n_threads)
# 10% of 183777 split across the threads
# NOTE(review): 183777 is presumably the machine's total memory in MB - confirm
key_words['samtools-memory'] = str(round((183777*0.1)/n_threads))
key_words['dtype'] = 'uint32'

# value-less flags
arguments = ['vvv']

# run
velocyto_run10x(sample_folder_, gtf_file_, output_file_, *arguments, **key_words)

# move output loom file
cmd = ' '.join(['mv', sample_folder_ + '/velocyto', local_data_path + 'interim/.'])
os.system(cmd)

In [None]:
# map cell types onto loom file

# Work on a copy of the loom file so the original is left unmodified.
cmd = 'scp ' + local_data_path + 'interim/velocyto/5k_pbmc_v3_count.loom '
cmd += local_data_path + 'interim/velocyto/5k_pbmc_v3_count_celltypes.loom'
copy_status = os.system(cmd)
if copy_status != 0:
    # stop here instead of annotating a missing or stale copy
    raise RuntimeError('copying the loom file failed with status ' + str(copy_status))

# Build the CellID -> Cell_Type lookup before opening the loom file.
cell_type_df = pd.read_csv(local_data_path + 'processed/5k_pbmc_celltypes_velocytoformatted.csv', index_col = 0)
# rewrite each SampleID as '5k_pbmc_v3_count:<id>x' so it can be looked up by the loom file's CellID values
cell_type_df['SampleID'] = cell_type_df['SampleID'].apply(lambda x: '5k_pbmc_v3_count:' + x + 'x')
cell_type_map = dict(zip(cell_type_df.SampleID.tolist(), cell_type_df.Cell_Type.tolist()))

# The context manager closes the file (saving the changes) even if a CellID
# is missing from cell_type_map and the lookup raises KeyError.
with loompy.connect(local_data_path + 'interim/velocyto/5k_pbmc_v3_count_celltypes.loom') as ds:
    cell_type_attribute = [cell_type_map[i] for i in list(ds.ca['CellID'])]
    ds.ca['Cell_Type'] = cell_type_attribute