In [1]:
# sample reader to read samples from /data/runs/samples
# and build each sample into a matrix

# Isaac Berez
# 14.01.23

from scipy.io import mmread
import os
import glob
import pandas as pd
import numpy as np
from pandas_ods_reader import read_ods
from copy import deepcopy
import pprint
import json
import re
from datetime import datetime
import logging


import sample_reader as sr

# This notebook processes sample data related to sexual dimorphism research
## for reference, see 'Dimorph_samples.ods' file in /home/isaac/analysis/scRNA-seq

### 1. get sample ids and construct meta data from ods file

In [2]:
samples,meta_data_dict = sr.process_meta_data(analysis_dir = '/home/isaac/analysis/scRNA-seq/',
                                dimorph_sample_file = 'Dimorph_samples.ods')
pprint.pprint(meta_data_dict)

{'10X35_1': {'Age': '8w',
             'Avesizebp_cDNAlib': nan,
             'Cell_Conc': 1000.0,
             'ChipID': '10X35',
             'Comments': 'Naive 8wo females, pooled, left hemisphere. 5%opti.',
             'Date': None,
             'Date_Captured': '2019-11-20',
             'DonorID': 'DI1,DI2',
             'Group': 'Naïve-F',
             'LIbConstructionComment': None,
             'Num_Pooled_Animals': 2.0,
             'PCR_Cycles': 13.0,
             'Project': 'Dimorph',
             'SampleID': '10X35_1',
             'Sample_Index': 'D9',
             'Serial_Number': 63.0,
             'Sex': 'F',
             'Species': 'Mm',
             'Strain': None,
             'Target_Num_Cells': 5000.0,
             'Tissue': 'MeA_L',
             'Transcriptome': 'Mm10',
             'cDNA_Lib_Ok': 'Y',
             'cDNAul': nan,
             'lengthbp_seqlib': 416.0,
             'ngperul_cDNA': 23.3,
             'ngperul_seqlib': 13.6},
 '10X35_2': {'Age': '8

### 2. Validatation find latest file function

In [15]:
#testing find latest sample file on all dimorph sample files
sample_dir = '/data/runs/samples'
count = 0
for i in range(len(samples)):
    file = sr.find_latest_sample_file(sample_dir,samples,i)
    print ('Passed!')
    count+=1
print (f'expected {len(samples)} sample files, succesfully found {count} sample files')

path:  /data/runs/samples/10X54_1
sample ind:  10X54_1
files ['commands_10X54_1_200707', 'out10X54_1_200707']
['200707']
datetime dates: [datetime.datetime(2020, 7, 7, 0, 0)]
latest date:  2020-07-07 00:00:00
latest date key:  200707
latest file:  out10X54_1_200707
Passed!
path:  /data/runs/samples/10X54_2
sample ind:  10X54_2
files ['commands_10X54_2_210220', 'out10X54_2_210220', 'commands_10X54_2_200707', 'out10X54_2_200707']
['210220', '200707']
datetime dates: [datetime.datetime(2021, 2, 20, 0, 0), datetime.datetime(2020, 7, 7, 0, 0)]
latest date:  2021-02-20 00:00:00
latest date key:  210220
latest file:  out10X54_2_210220
Passed!
path:  /data/runs/samples/10X98_2
sample ind:  10X98_2
files ['commands_10X98_2_221010', 'out10X98_2_221010']
['221010']
datetime dates: [datetime.datetime(2022, 10, 10, 0, 0)]
latest date:  2022-10-10 00:00:00
latest date key:  221010
latest file:  out10X98_2_221010
Passed!
path:  /data/runs/samples/10X98_3
sample ind:  10X98_3
files ['commands_10X98_3_

### 2. Reconstruct data matrices from relevant sample files found in '/data/runs/samples/'

In [3]:
def construct_sample_matrix(sample_dir, samples, sample_ind):
    '''construcs a single combined sample matrix from matrix, gene, feature files'''
    
    os.chdir(sample_dir)
    cwd = os.getcwd()
    print ('switched to sample dir: ', cwd)
    
    path = os.path.join(cwd,samples[sample_ind])
    
    #check for latest file
    file = sr.find_latest_sample_file(sample_dir,samples,sample_ind)
    
    #for loop would go here to loop over samples
    dir = str((glob.glob(path+
                     '/'+file+
                     '/'+'outs'+
                     '/'+ 'filtered_feature_bc_matrix'))).replace('[','').replace(']','')[1:-1]

    #read matrix
    print ('reading matrix file from: ', dir)
    m = mmread(dir + '/' + 'matrix.mtx')
    m_arr = m.toarray()

    #read barcodes file and store as cells
    barcodes = pd.read_csv(dir+'/'+'barcodes.tsv', sep='\t',header = None)
    cells = barcodes.iloc[:,0]

    #append the sample id to each cell
    cells = barcodes.iloc[:,0] + samples[sample_ind]
    cells = cells.values.reshape(cells.shape[0],1)

    #read features file and store as gene labels
    features = pd.read_csv(dir+'/'+'features.tsv', sep='\t', header=None)
    gene_labels = features.iloc[:,1]
    gene_labels = gene_labels.values.reshape(gene_labels.shape[0],1)

    #check for duplicate genes
    print('unique values in gene labels: ', len(np.unique(gene_labels)))
    print('total length gene labels: ', len(gene_labels))
    print('# duplicates: ', len(gene_labels)-len(np.unique(gene_labels)))

    #combine matrix, cells, genes into single dataframe
    sample_df = pd.DataFrame(data = m_arr,
                             index = gene_labels,
                             columns = cells)
    #add duplicate rows together
    print('sample df size before add duplicate gene rows together: ', sample_df.shape)
    print('adding duplicate rows...')
    sample_df_summed = sample_df.groupby(level=0).sum()
    print('sample df after adding/removing duplicate rows: ', sample_df_summed.shape)
    return sample_df_summed

In [12]:
df1 = construct_sample_matrix(sample_dir= '/data/runs/samples/',samples=samples,sample_ind=0)
df1

switched to sample dir:  /data/runs/samples
path:  /data/runs/samples/10X54_1
sample ind:  10X54_1
files ['commands_10X54_1_200707', 'out10X54_1_200707']
['200707']
datetime dates: [datetime.datetime(2020, 7, 7, 0, 0)]
latest date:  2020-07-07 00:00:00
latest date key:  200707
latest file:  out10X54_1_200707
reading matrix file from:  /data/runs/samples/10X54_1/out10X54_1_200707/outs/filtered_feature_bc_matrix
unique values in gene labels:  27933
total length gene labels:  27998
# duplicates:  65
sample df size before add duplicate gene rows together:  (27998, 6229)
adding duplicate rows...
sample df after adding/removing duplicate rows:  (27933, 6229)


Unnamed: 0,"(AAACCCACAACAGTGG-110X54_1,)","(AAACCCACATGGCCCA-110X54_1,)","(AAACCCAGTCCCTGAG-110X54_1,)","(AAACGAACACTACAGT-110X54_1,)","(AAACGAATCCCAGCGA-110X54_1,)","(AAACGAATCTGCTTAT-110X54_1,)","(AAACGCTAGCAGATAT-110X54_1,)","(AAACGCTAGTGCTCAT-110X54_1,)","(AAACGCTGTCCGTACG-110X54_1,)","(AAACGCTGTGGCTACC-110X54_1,)",...,"(TTTGGTTAGAGTCACG-110X54_1,)","(TTTGGTTCACAGCCTG-110X54_1,)","(TTTGGTTCAGCCTTCT-110X54_1,)","(TTTGGTTGTCACTAGT-110X54_1,)","(TTTGGTTGTTCCTAAG-110X54_1,)","(TTTGGTTTCATCTGTT-110X54_1,)","(TTTGGTTTCTGTGCTC-110X54_1,)","(TTTGTTGAGATGTTCC-110X54_1,)","(TTTGTTGCATAGCACT-110X54_1,)","(TTTGTTGTCCGCGGAT-110X54_1,)"
"(0610007P14Rik,)",0,1,5,0,0,0,1,0,2,1,...,1,2,0,0,1,0,0,0,0,0
"(0610009B22Rik,)",0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
"(0610009L18Rik,)",0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"(0610009O20Rik,)",0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
"(0610010F05Rik,)",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"(mt-Nd3,)",1,9,17,21,9,6,14,23,10,10,...,16,10,4,1,3,4,4,8,14,1
"(mt-Nd4,)",10,60,117,66,30,50,87,144,80,53,...,82,76,44,12,9,38,10,57,73,24
"(mt-Nd4l,)",0,7,8,3,4,3,1,5,10,2,...,8,8,2,0,0,2,2,2,6,1
"(mt-Nd5,)",3,7,14,8,5,5,7,19,9,5,...,14,10,0,0,1,7,0,6,11,3


### check out a few other samples

In [13]:
df2 = construct_sample_matrix(sample_dir= '/data/runs/samples/',samples=samples,sample_ind=1)
df2

switched to sample dir:  /data/runs/samples
path:  /data/runs/samples/10X54_2
sample ind:  10X54_2
files ['commands_10X54_2_210220', 'out10X54_2_210220', 'commands_10X54_2_200707', 'out10X54_2_200707']
['210220', '200707']
datetime dates: [datetime.datetime(2021, 2, 20, 0, 0), datetime.datetime(2020, 7, 7, 0, 0)]
latest date:  2021-02-20 00:00:00
latest date key:  210220
latest file:  out10X54_2_210220
reading matrix file from:  /data/runs/samples/10X54_2/out10X54_2_210220/outs/filtered_feature_bc_matrix
unique values in gene labels:  27933
total length gene labels:  27998
# duplicates:  65
sample df size before add duplicate gene rows together:  (27998, 4343)
adding duplicate rows...
sample df after adding/removing duplicate rows:  (27933, 4343)


Unnamed: 0,"(AAACCCACATGTTACG-110X54_2,)","(AAACGAAAGACGAAGA-110X54_2,)","(AAACGAAGTTCTCAGA-110X54_2,)","(AAACGAATCAGCTAGT-110X54_2,)","(AAACGAATCTCTAAGG-110X54_2,)","(AAACGAATCTCTGAGA-110X54_2,)","(AAACGCTAGAATCTAG-110X54_2,)","(AAACGCTAGTCTGCGC-110X54_2,)","(AAACGCTTCTCTGACC-110X54_2,)","(AAACGCTTCTGGTTGA-110X54_2,)",...,"(TTTGGAGAGTGGTTCT-110X54_2,)","(TTTGGAGCATGGCCCA-110X54_2,)","(TTTGGAGTCACAAGAA-110X54_2,)","(TTTGGAGTCCATAGGT-110X54_2,)","(TTTGGAGTCTCTCGAC-110X54_2,)","(TTTGGTTAGAGTGGCT-110X54_2,)","(TTTGGTTAGCTACTAC-110X54_2,)","(TTTGGTTCAGCGCTTG-110X54_2,)","(TTTGGTTCATGACAGG-110X54_2,)","(TTTGTTGTCTACTGCC-110X54_2,)"
"(0610007P14Rik,)",0,0,0,1,0,0,0,2,0,0,...,0,0,2,0,2,2,1,0,2,1
"(0610009B22Rik,)",1,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
"(0610009L18Rik,)",1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,1,0,0,0
"(0610009O20Rik,)",1,0,0,1,0,0,0,0,0,1,...,0,0,1,0,0,0,1,0,0,0
"(0610010F05Rik,)",1,0,0,0,1,1,0,0,0,1,...,0,0,1,0,1,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"(mt-Nd3,)",28,10,8,18,2,19,2,19,32,3,...,5,0,9,6,3,7,26,22,0,16
"(mt-Nd4,)",69,57,56,81,4,68,5,53,179,26,...,22,13,58,27,42,24,132,119,58,48
"(mt-Nd4l,)",10,6,5,9,2,3,2,3,10,2,...,0,0,5,1,2,1,12,9,5,2
"(mt-Nd5,)",13,8,5,21,1,8,2,2,15,3,...,1,1,7,5,5,1,12,11,6,11


In [25]:
df3 = construct_sample_matrix(sample_dir= '/data/runs/samples/',samples=samples,sample_ind=2)
df3

switched to sample dir:  /data/runs/samples
path:  /data/runs/samples/10X98_2
sample ind:  10X98_2
files ['commands_10X98_2_221010', 'out10X98_2_221010']
['221010']
datetime dates: [datetime.datetime(2022, 10, 10, 0, 0)]
latest date:  2022-10-10 00:00:00
latest date key:  221010
latest file:  out10X98_2_221010
reading matrix file from:  /data/runs/samples/10X98_2/out10X98_2_221010/outs/filtered_feature_bc_matrix
unique values in gene labels:  27933
total length gene labels:  27998
# duplicates:  65
sample df size before add duplicate gene rows together:  (27998, 4725)
adding duplicate rows...
sample df after adding/removing duplicate rows:  (27933, 4725)


Unnamed: 0,"(AAACCCACAATAGTCC-110X98_2,)","(AAACGAAAGGAGATAG-110X98_2,)","(AAACGAAAGGTACATA-110X98_2,)","(AAACGAACAAACAGGC-110X98_2,)","(AAACGAACAAATCGTC-110X98_2,)","(AAACGAACAACACGAG-110X98_2,)","(AAACGAAGTTGGAGAC-110X98_2,)","(AAACGAATCCCGAACG-110X98_2,)","(AAACGAATCCGATGCG-110X98_2,)","(AAACGAATCTTGTTAC-110X98_2,)",...,"(TTTGATCGTCCGAAAG-110X98_2,)","(TTTGGAGAGGTGCTGA-110X98_2,)","(TTTGGAGCAGTCTTCC-110X98_2,)","(TTTGGAGCATCGAACT-110X98_2,)","(TTTGGAGGTCGCATTA-110X98_2,)","(TTTGGTTCATTAGGCT-110X98_2,)","(TTTGGTTGTTCCCACT-110X98_2,)","(TTTGGTTTCCGATAAC-110X98_2,)","(TTTGTTGTCTCTAAGG-110X98_2,)","(TTTGTTGTCTGAGAGG-110X98_2,)"
"(0610007P14Rik,)",1,0,2,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
"(0610009B22Rik,)",0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
"(0610009L18Rik,)",0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
"(0610009O20Rik,)",0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
"(0610010F05Rik,)",0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"(mt-Nd3,)",4,1,3,8,8,4,2,3,6,6,...,0,0,3,5,2,0,2,11,3,2
"(mt-Nd4,)",9,3,24,54,30,25,5,10,30,13,...,4,0,33,39,8,5,8,32,27,18
"(mt-Nd4l,)",0,0,1,6,1,1,0,1,3,1,...,0,0,1,7,0,0,2,1,2,0
"(mt-Nd5,)",1,1,2,9,4,5,0,0,9,2,...,1,0,3,5,0,2,3,7,3,3


In [27]:
(np.array(df1.index)==np.array(df2.index)).all()

True

In [38]:
print (f'df1 shape {df1.shape}')
print (f'df2 shape {df2.shape}')
combined_col_length = df1.shape[1]+df2.shape[1]
print (f'expected shape of combined df1 and df2 {df1.shape[0]},{combined_col_length}')

df1 shape (27933, 6229)
df2 shape (27933, 4343)
expected shape of combined df1 and df2 27933,10572
