# 1.2 Load CRV3 GEX Data

MTX -> Sparse in-memory Format -> DataFrame (dense)

In [3]:
import pandas as pd
import helper_functions_10x as hf

from typing import Dict

# `df` is the notebook-wide registry of DataFrames, keyed by a string label;
# the annotation spells out that mapping for readers and type checkers.
df: Dict[str, pd.DataFrame] = dict()

In [4]:
import gzip
from scipy import io
from scipy.sparse import csc_matrix
from ast import literal_eval as make_tuple
import pandas as pd
import numpy as np
from copy import deepcopy
import os
import matplotlib.pyplot as plt

In [5]:
def _read_gzip_lines(filename):
    """Return all lines (newlines kept) of a gzip-compressed text file."""
    # `with` guarantees the handle is closed even if reading raises.
    with gzip.open(filename, 'rt') as f:
        return f.readlines()


def load_crv3_feature_matrix(inst_path):
    """Load a gzipped feature-barcode matrix and split it by feature type.

    Reads ``barcodes.tsv.gz``, ``features.tsv.gz`` and ``matrix.mtx.gz``
    located under ``inst_path``.

    Parameters
    ----------
    inst_path : str
        Directory holding the three files. A trailing path separator is
        optional. A string that is not an existing directory is used
        unchanged as a raw filename prefix.

    Returns
    -------
    dict
        Maps each feature type found in the third column of
        ``features.tsv.gz`` ('Gene Expression' -> 'gex',
        'Antibody Capture' -> 'adt', 'Custom' -> 'custom', anything else
        kept as written) to a dict with the keys:

        * ``'barcodes'`` - list of barcodes with everything from the first
          '-' onwards removed (the same list object for every feature type).
        * ``'mat'`` - CSC sparse matrix holding only the rows that belong to
          this feature type.
        * ``'features'`` - feature names taken from the second column; a name
          that occurs more than once within the feature type gets
          ``'_' + <first column>`` appended, and any ``'_TotalSeqB'``
          substring is removed.

    Notes
    -----
    Prints the list of duplicated names once per feature type.
    """
    # Accept a directory with or without its trailing separator; joining with
    # '' appends the separator only when it is missing.
    if os.path.isdir(inst_path):
        inst_path = os.path.join(inst_path, '')

    # Read Barcodes
    ###########################
    # keep the first column only and drop the dash suffix (e.g. '-1')
    barcodes = [
        line.strip().split('\t')[0].split('-')[0]
        for line in _read_gzip_lines(inst_path + 'barcodes.tsv.gz')
    ]

    # Load Matrix
    #################
    # CSR makes the per-feature-type row selection below cheap
    mat_csr = io.mmread(inst_path + 'matrix.mtx.gz').tocsr()

    # Get Indexes of Feature Types
    ##################################
    lines = _read_gzip_lines(inst_path + 'features.tsv.gz')

    feature_indexes = {}
    for index, line in enumerate(lines):
        inst_feat = (
            line.strip().split('\t')[2]
            .replace('Gene Expression', 'gex')
            .replace('Antibody Capture', 'adt')
            .replace('Custom', 'custom')
        )
        feature_indexes.setdefault(inst_feat, []).append(index)

    feature_data = {}
    for inst_feat, inst_indexes in feature_indexes.items():
        # Plain list indexing replaces the pandas round trip through
        # Series.get_values(), which no longer exists in pandas >= 1.0.
        feat_lines = [lines[i].strip().split('\t') for i in inst_indexes]

        # Make unique feature names
        # find non-unique initial feature names (the id is appended to those)
        ini_name_count = pd.Series([x[1] for x in feat_lines]).value_counts()
        duplicate_names = ini_name_count[ini_name_count > 1].index.tolist()

        print(duplicate_names)

        # set membership keeps the renaming linear in the number of features
        duplicate_lookup = set(duplicate_names)
        new_names = [
            x[1] + '_' + x[0] if x[1] in duplicate_lookup else x[1]
            for x in feat_lines
        ]

        # quick hack to clean up names
        new_names = [x.replace('_TotalSeqB', '') for x in new_names]

        feature_data[inst_feat] = {
            'barcodes': barcodes,
            # save as compressed sparse column matrix (for barcode filtering)
            'mat': mat_csr[inst_indexes, :].tocsc(),
            'features': new_names,
        }

    return feature_data

### Load GEX Data into DataFrame

In [6]:
# Load every feature type from the filtered feature-barcode matrix directory;
# the loader prints the duplicated feature names it finds per feature type.
feature_data = load_crv3_feature_matrix(
    '../data/10k_pbmc_feature_v3-chem_v3.0.0-cr/filtered_feature_bc_matrix/'
)

['ATXN7', 'LINC01505', 'TBCE', 'PDE11A', 'CYB561D2', 'TXNRD3NB', 'CCDC39', 'IGF2', 'SOD2', 'TMSB15B', 'MATR3', 'EMG1', 'SCO2', 'PRSS50', 'ABCF2', 'POLR2J3', 'COG8', 'HSPA14', 'DIABLO', 'RGS5', 'LINC02203', 'LINC01238', 'H2BFS', 'PINX1']
[]


In [8]:
# Sanity check through the project helper. In the saved output it reports, for
# each feature type, two counts followed by the sparse-matrix shape.
# NOTE(review): the helper's implementation is not visible here — confirm.
hf.check_feature_data_size(feature_data)

gex
33538 7865
(33538, 7865) 

adt
17 7865
(17, 7865) 



In [10]:
# Take the GEX feature names straight from the loader output (duplicated names
# already carry their id suffix there).
# Earlier alternative, kept for reference: df['gex-ini'].index.tolist()
rows = feature_data['gex']['features']

In [11]:
# Keep every feature name containing the substring 'ATXN7'; this is a substring
# match, so related names such as 'ATXN7L1' are returned as well.
found_rows = list(filter(lambda name: 'ATXN7' in name, rows))
print(found_rows)

['ATXN7L2', 'ATXN7_ENSG00000285258', 'ATXN7_ENSG00000163635', 'ATXN7L1', 'ATXN7L3B', 'ATXN7L3']


In [None]:
# NOTE(review): `df` is created as an empty dict and no cell visible here
# assigns df['gex-ini'], so on a fresh kernel this raises KeyError — confirm
# where the GEX DataFrame is meant to be built.
df['gex-ini'].shape

In [None]:
# Preview the first rows of the GEX DataFrame.
# NOTE(review): requires df['gex-ini'], which no cell visible here creates —
# confirm it is populated before this cell runs.
df['gex-ini'].head()

### Checking Ensembl Gene IDs

In [None]:
# Read the (uncompressed) gene table; each line is one tab-separated record.
filename = '../data/pbmc3k_filtered_gene_bc_matrices_v2-cr/genes.tsv'
# `with` closes the handle as soon as the lines are read; the original left
# the file open for the lifetime of the kernel.
with open(filename, 'r') as f:
    lines = f.readlines()
# Show the first raw line to check the column layout.
lines[0]

In [None]:
# Split each tab-separated record of `lines` into its first column (collected
# in ens_list) and its second column (collected in gene_list).
ens_list = [record.strip().split('\t')[0] for record in lines]
gene_list = [record.strip().split('\t')[1] for record in lines]

In [None]:
# Total number of ids, then the number of distinct ids; equal values mean the
# first column contains no duplicates.
print(len(ens_list), len(set(ens_list)), sep='\n')

In [None]:
# Total number of gene names, then the number of distinct names; a smaller
# second value means some names are shared by several records.
print(len(gene_list), len(set(gene_list)), sep='\n')