# Development notebook for processing sexual dimorphism data
## This notebook takes as input the csv produced by 'sample_reader_nb', which contains the loaded and rearranged sexual dimorphism data matrix

In [1]:
# Isaac Berez
# 17.01.23

from scipy.io import mmread
import os
import glob
import pandas as pd
import numpy as np
from pandas_ods_reader import read_ods
from copy import deepcopy
import pprint
import json
import re
from datetime import datetime
import logging


import sample_reader as sr

### 1. Read in meta data and big data file

In [2]:
# Load the per-sample meta data table (rows: meta data features, columns: sample IDs)
meta_data = pd.read_json(path_or_buf='/bigdata/isaac/meta_data_dict.json')
# Preview the top-left 5x5 corner only
meta_data.iloc[0:5, 0:5]

Unnamed: 0,10X54_1,10X54_2,10x98_2,10x98_3,10X51_2
Serial_Number,106.0,107.0,212.0,213.0,98.0
Date_Captured,43993.0,43993.0,2021-03-22,2021-03-22,05/31/20
Species,Mm,Mm,Mm,Mm,Mm
Transcriptome,Mm10,Mm10,Mm10,Mm10,Mm10
Strain,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,C57Bl/6


In [60]:
# Single-entry lookup: first meta data feature of the first sample column
meta_data.iloc[0, 0]

106.0

In [3]:
# The full data set load is disabled; the small test matrix is used instead while developing
#dimorph_df = pd.read_csv('/bigdata/isaac/dimorph_df.csv')
test_df = pd.read_csv(filepath_or_buffer='/bigdata/isaac/test_df_50.csv')
# Preview the first five rows
test_df.head(5)

Unnamed: 0,AAACCCACAACAGTGG-1_10X54_1,AAACCCACATGGCCCA-1_10X54_1,AAACCCAGTCCCTGAG-1_10X54_1,AAACGAACACTACAGT-1_10X54_1,AAACGAATCCCAGCGA-1_10X54_1,AAACGAATCTGCTTAT-1_10X54_1,AAACGCTAGCAGATAT-1_10X54_1,AAACGCTAGTGCTCAT-1_10X54_1,AAACGCTGTCCGTACG-1_10X54_1,AAACGCTGTGGCTACC-1_10X54_1,...,AAAGTGAGTTTGAACC-1_10X54_1,AAATGGAAGGAGTATT-1_10X54_1,AAATGGACAAGAGCTG-1_10X54_1,AAATGGAGTGACACGA-1_10X54_1,AAATGGAGTTGCACGC-1_10X54_1,AAATGGATCGTTTACT-1_10X54_1,AAATGGATCTGAACGT-1_10X54_1,AACAAAGAGAAACCCG-1_10X54_1,AACAAAGAGCACCTGC-1_10X54_1,AACAAAGCAATTGAGA-1_10X54_1
0610007P14Rik,0,1,5,0,0,0,1,0,2,1,...,0,0,0,0,2,0,0,1,0,0
0610009B22Rik,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,1
0610009L18Rik,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
0610009O20Rik,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,1,0,0
0610010F05Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [43]:
# Row labels of meta_data are the meta data feature names; show the first one
meta_data.index[0]

'Serial_Number'

In [53]:
# Column labels of meta_data are the sample IDs; show the first one
meta_data.columns[0]

'10X54_1'

In [38]:
# NOTE: meta_data_df is only created further down in this notebook (the cell that builds it
# from meta_data.index and test_df.columns), so this preview raises NameError on a fresh
# top-to-bottom run.
meta_data_df.head()

Unnamed: 0,AAACCCACAACAGTGG-1_10X54_1,AAACCCACATGGCCCA-1_10X54_1,AAACCCAGTCCCTGAG-1_10X54_1,AAACGAACACTACAGT-1_10X54_1,AAACGAATCCCAGCGA-1_10X54_1,AAACGAATCTGCTTAT-1_10X54_1,AAACGCTAGCAGATAT-1_10X54_1,AAACGCTAGTGCTCAT-1_10X54_1,AAACGCTGTCCGTACG-1_10X54_1,AAACGCTGTGGCTACC-1_10X54_1,...,AAAGTGAGTTTGAACC-1_10X54_1,AAATGGAAGGAGTATT-1_10X54_1,AAATGGACAAGAGCTG-1_10X54_1,AAATGGAGTGACACGA-1_10X54_1,AAATGGAGTTGCACGC-1_10X54_1,AAATGGATCGTTTACT-1_10X54_1,AAATGGATCTGAACGT-1_10X54_1,AACAAAGAGAAACCCG-1_10X54_1,AACAAAGAGCACCTGC-1_10X54_1,AACAAAGCAATTGAGA-1_10X54_1
Serial_Number,,,,,,,,,,,...,,,,,,,,,,
Date_Captured,,,,,,,,,,,...,,,,,,,,,,
Species,,,,,,,,,,,...,,,,,,,,,,
Transcriptome,,,,,,,,,,,...,,,,,,,,,,
Strain,,,,,,,,,,,...,,,,,,,,,,


In [58]:
#meta_data_df.at['Serial_Number','AAACCCACAACAGTGG-1_10X54_1'] = 'TEST'

In [72]:
#[5]*len(meta_data_df.columns[meta_data_df.columns.str.contains(meta_data.keys()[0])])

In [85]:
# Print the positional index of every sample column in meta_data
for i in range(meta_data.shape[1]):
    print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


In [89]:
# All meta data rows for the cells whose name contains the first sample ID
meta_data_df.loc[
    meta_data_df.index,
    meta_data_df.columns[meta_data_df.columns.str.contains(meta_data.columns[0])],
]

Unnamed: 0,AAACCCACAACAGTGG-1_10X54_1,AAACCCACATGGCCCA-1_10X54_1,AAACCCAGTCCCTGAG-1_10X54_1,AAACGAACACTACAGT-1_10X54_1,AAACGAATCCCAGCGA-1_10X54_1,AAACGAATCTGCTTAT-1_10X54_1,AAACGCTAGCAGATAT-1_10X54_1,AAACGCTAGTGCTCAT-1_10X54_1,AAACGCTGTCCGTACG-1_10X54_1,AAACGCTGTGGCTACC-1_10X54_1,...,AAAGTGAGTTTGAACC-1_10X54_1,AAATGGAAGGAGTATT-1_10X54_1,AAATGGACAAGAGCTG-1_10X54_1,AAATGGAGTGACACGA-1_10X54_1,AAATGGAGTTGCACGC-1_10X54_1,AAATGGATCGTTTACT-1_10X54_1,AAATGGATCTGAACGT-1_10X54_1,AACAAAGAGAAACCCG-1_10X54_1,AACAAAGAGCACCTGC-1_10X54_1,AACAAAGCAATTGAGA-1_10X54_1
Serial_Number,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,...,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0
Date_Captured,43993.0,43993.0,43993.0,43993.0,43993.0,43993.0,43993.0,43993.0,43993.0,43993.0,...,43993.0,43993.0,43993.0,43993.0,43993.0,43993.0,43993.0,43993.0,43993.0,43993.0
Species,Mm,Mm,Mm,Mm,Mm,Mm,Mm,Mm,Mm,Mm,...,Mm,Mm,Mm,Mm,Mm,Mm,Mm,Mm,Mm,Mm
Transcriptome,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,...,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10
Strain,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,...,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6
Project,Cntnp_KO,Cntnp_KO,Cntnp_KO,Cntnp_KO,Cntnp_KO,Cntnp_KO,Cntnp_KO,Cntnp_KO,Cntnp_KO,Cntnp_KO,...,Cntnp_KO,Cntnp_KO,Cntnp_KO,Cntnp_KO,Cntnp_KO,Cntnp_KO,Cntnp_KO,Cntnp_KO,Cntnp_KO,Cntnp_KO
Group,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-F,...,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-F
ChipID,10X54,10X54,10X54,10X54,10X54,10X54,10X54,10X54,10X54,10X54,...,10X54,10X54,10X54,10X54,10X54,10X54,10X54,10X54,10X54,10X54
SampleID,10X54_1,10X54_1,10X54_1,10X54_1,10X54_1,10X54_1,10X54_1,10X54_1,10X54_1,10X54_1,...,10X54_1,10X54_1,10X54_1,10X54_1,10X54_1,10X54_1,10X54_1,10X54_1,10X54_1,10X54_1
DonorID,Cntnp-KO_2-9F,Cntnp-KO_2-9F,Cntnp-KO_2-9F,Cntnp-KO_2-9F,Cntnp-KO_2-9F,Cntnp-KO_2-9F,Cntnp-KO_2-9F,Cntnp-KO_2-9F,Cntnp-KO_2-9F,Cntnp-KO_2-9F,...,Cntnp-KO_2-9F,Cntnp-KO_2-9F,Cntnp-KO_2-9F,Cntnp-KO_2-9F,Cntnp-KO_2-9F,Cntnp-KO_2-9F,Cntnp-KO_2-9F,Cntnp-KO_2-9F,Cntnp-KO_2-9F,Cntnp-KO_2-9F


In [121]:
# Number of samples described in the meta data table
len(meta_data.columns)

20

In [116]:
# Does any cell column name contain the second sample ID?
bool(meta_data_df.columns.str.contains(meta_data.columns[1]).any())

False

In [117]:
# Create the per-cell meta data frame: one row per meta data feature (index of meta_data)
# and one column per cell of test_df. Cells whose sample is not in meta_data stay NaN.
meta_data_df = pd.DataFrame(index = meta_data.index, columns = test_df.columns)
for sample_id in meta_data.columns:
    # Cell names carry their sample as a '_<SampleID>' suffix (e.g. 'AAACCCACAACAGTGG-1_10X54_1').
    # Match that suffix literally: str.contains would interpret the ID as a regular expression
    # and would also match longer IDs that merely contain it (e.g. '10X54_1' inside '10X54_10').
    sample_cells = meta_data_df.columns[meta_data_df.columns.str.endswith('_' + sample_id)]
    if len(sample_cells) > 0:
        # Broadcast the sample's feature column explicitly to every one of its cells instead of
        # relying on pandas' implicit Series alignment in a two-dimensional .loc assignment.
        sample_values = meta_data[sample_id].to_numpy(dtype=object).reshape(-1, 1)
        meta_data_df.loc[:, sample_cells] = np.repeat(sample_values, len(sample_cells), axis=1)
meta_data_df

10X54_1
10X54_2
10x98_2
10x98_3
10X51_2
10X51_1
10X52_1
10X52_2
10X51_3
10X51_4
10X52_3
10X52_4
10X35_1
10X35_2
10X38_1
10X38_2
10X36_1
10X36_2
10X37_1
10X37_2


Unnamed: 0,AAACCCACAACAGTGG-1_10X54_1,AAACCCACATGGCCCA-1_10X54_1,AAACCCAGTCCCTGAG-1_10X54_1,AAACGAACACTACAGT-1_10X54_1,AAACGAATCCCAGCGA-1_10X54_1,AAACGAATCTGCTTAT-1_10X54_1,AAACGCTAGCAGATAT-1_10X54_1,AAACGCTAGTGCTCAT-1_10X54_1,AAACGCTGTCCGTACG-1_10X54_1,AAACGCTGTGGCTACC-1_10X54_1,...,AAAGTGAGTTTGAACC-1_10X54_1,AAATGGAAGGAGTATT-1_10X54_1,AAATGGACAAGAGCTG-1_10X54_1,AAATGGAGTGACACGA-1_10X54_1,AAATGGAGTTGCACGC-1_10X54_1,AAATGGATCGTTTACT-1_10X54_1,AAATGGATCTGAACGT-1_10X54_1,AACAAAGAGAAACCCG-1_10X54_1,AACAAAGAGCACCTGC-1_10X54_1,AACAAAGCAATTGAGA-1_10X54_1
Serial_Number,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,...,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0
Date_Captured,43993.0,43993.0,43993.0,43993.0,43993.0,43993.0,43993.0,43993.0,43993.0,43993.0,...,43993.0,43993.0,43993.0,43993.0,43993.0,43993.0,43993.0,43993.0,43993.0,43993.0
Species,Mm,Mm,Mm,Mm,Mm,Mm,Mm,Mm,Mm,Mm,...,Mm,Mm,Mm,Mm,Mm,Mm,Mm,Mm,Mm,Mm
Transcriptome,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,...,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10
Strain,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,...,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6
Project,Cntnp_KO,Cntnp_KO,Cntnp_KO,Cntnp_KO,Cntnp_KO,Cntnp_KO,Cntnp_KO,Cntnp_KO,Cntnp_KO,Cntnp_KO,...,Cntnp_KO,Cntnp_KO,Cntnp_KO,Cntnp_KO,Cntnp_KO,Cntnp_KO,Cntnp_KO,Cntnp_KO,Cntnp_KO,Cntnp_KO
Group,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-F,...,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-F
ChipID,10X54,10X54,10X54,10X54,10X54,10X54,10X54,10X54,10X54,10X54,...,10X54,10X54,10X54,10X54,10X54,10X54,10X54,10X54,10X54,10X54
SampleID,10X54_1,10X54_1,10X54_1,10X54_1,10X54_1,10X54_1,10X54_1,10X54_1,10X54_1,10X54_1,...,10X54_1,10X54_1,10X54_1,10X54_1,10X54_1,10X54_1,10X54_1,10X54_1,10X54_1,10X54_1
DonorID,Cntnp-KO_2-9F,Cntnp-KO_2-9F,Cntnp-KO_2-9F,Cntnp-KO_2-9F,Cntnp-KO_2-9F,Cntnp-KO_2-9F,Cntnp-KO_2-9F,Cntnp-KO_2-9F,Cntnp-KO_2-9F,Cntnp-KO_2-9F,...,Cntnp-KO_2-9F,Cntnp-KO_2-9F,Cntnp-KO_2-9F,Cntnp-KO_2-9F,Cntnp-KO_2-9F,Cntnp-KO_2-9F,Cntnp-KO_2-9F,Cntnp-KO_2-9F,Cntnp-KO_2-9F,Cntnp-KO_2-9F


In [49]:
# List the cell columns whose name contains the sample ID '10X54_1'
meta_data_df.columns[meta_data_df.columns.str.contains('10X54_1')]

Index(['AAACCCACAACAGTGG-1_10X54_1', 'AAACCCACATGGCCCA-1_10X54_1',
       'AAACCCAGTCCCTGAG-1_10X54_1', 'AAACGAACACTACAGT-1_10X54_1',
       'AAACGAATCCCAGCGA-1_10X54_1', 'AAACGAATCTGCTTAT-1_10X54_1',
       'AAACGCTAGCAGATAT-1_10X54_1', 'AAACGCTAGTGCTCAT-1_10X54_1',
       'AAACGCTGTCCGTACG-1_10X54_1', 'AAACGCTGTGGCTACC-1_10X54_1',
       'AAACGCTGTTGCAAGG-1_10X54_1', 'AAACGCTTCAAGTGGG-1_10X54_1',
       'AAACGCTTCTACCTTA-1_10X54_1', 'AAAGAACAGAAGCGGG-1_10X54_1',
       'AAAGAACAGAGTTCGG-1_10X54_1', 'AAAGAACAGCCTCAGC-1_10X54_1',
       'AAAGAACAGGTAGCCA-1_10X54_1', 'AAAGAACGTACGTACT-1_10X54_1',
       'AAAGAACGTGCCGGTT-1_10X54_1', 'AAAGAACGTGTCACAT-1_10X54_1',
       'AAAGAACGTTACGGAG-1_10X54_1', 'AAAGGATAGAACTCCT-1_10X54_1',
       'AAAGGATAGAAGCCTG-1_10X54_1', 'AAAGGATCAAGCCATT-1_10X54_1',
       'AAAGGATTCATTTACC-1_10X54_1', 'AAAGGGCAGTAAGAGG-1_10X54_1',
       'AAAGGGCCACACGGTC-1_10X54_1', 'AAAGGGCCAGACAAGC-1_10X54_1',
       'AAAGGGCCATGTGACT-1_10X54_1', 'AAAGGGCGTTCCGCGA-1_10X54

### 2. Calculate molecules/cell and genes/cell vectors, keep only cells with >2500 mol/cell and >2000 genes/cell

In [5]:
# Name of the first cell column of the expression matrix
test_df.columns[0]

'AAACCCACAACAGTGG-1_10X54_1'

In [6]:
# Column-wise sum of the count matrix, stored as a (1, n_cells) row vector
total_molecules_per_cell = test_df.sum(axis=0).to_numpy().reshape(1, -1)
print('total molecules per cell shape: ', total_molecules_per_cell.shape)
print('total molecules per cell: ', total_molecules_per_cell)

total molecules per cell shape:  (1, 50)
total molecules per cell:  [[ 0 13 21 21  7  8  1  4 19  5  9  3  2  4 10  6 19 19  0 21  5 22  2 12
   0  1  4 21  0 25  2 10  2 16  8  2  7  0 21 13  4 10 16 10  3  2  4 13
   8 13]]


In [7]:
# Binarize the counts: every strictly positive entry becomes 1, all other entries are kept as is
test_df_bool = test_df.mask(test_df.gt(0), 1)
#test_df_bool

In [8]:
# Column-wise sum of the binarized matrix = number of detected genes per cell, as a (1, n_cells) row vector
total_genes_per_cell = test_df_bool.sum(axis=0).to_numpy().reshape(1, -1)
print('total genes per cell shape: ', total_genes_per_cell.shape)
print('total genes per cell: ', total_genes_per_cell)

total genes per cell shape:  (1, 50)
total genes per cell:  [[ 0  7 10 11  5  5  1  4 11  5  7  2  2  4  7  6  9 12  0 13  5 10  2  9
   0  1  3  9  0 12  1  8  2  8  7  2  7  0 10  8  3  7  7  7  2  2  3  8
   7  9]]


In [9]:
# Cell filter (level 2): keep only the cells that exceed BOTH the molecule and the gene threshold.
# NOTE(review): both thresholds are 1 here, whereas the section heading states >2500 mol/cell and
# >2000 genes/cell -- presumably lowered for the small test matrix; confirm before using real data.
threshold_m = 1
threshold_g = 1
# The per-cell totals are (1, n_cells) row vectors; take row 0 to get one boolean per cell
mol_cell_mask = total_molecules_per_cell[0] > threshold_m
genes_cell_mask = total_genes_per_cell[0] > threshold_g
mol_AND_gene_cell_mask = mol_cell_mask & genes_cell_mask
print(mol_AND_gene_cell_mask)
# Boolean column selection on the original count matrix
test_df_l2 = test_df.loc[:, mol_AND_gene_cell_mask]
test_df_l2.head()

[False  True  True  True  True  True False  True  True  True  True  True
  True  True  True  True  True  True False  True  True  True  True  True
 False False  True  True False  True False  True  True  True  True  True
  True False  True  True  True  True  True  True  True  True  True  True
  True  True]


Unnamed: 0,AAACCCACATGGCCCA-1_10X54_1,AAACCCAGTCCCTGAG-1_10X54_1,AAACGAACACTACAGT-1_10X54_1,AAACGAATCCCAGCGA-1_10X54_1,AAACGAATCTGCTTAT-1_10X54_1,AAACGCTAGTGCTCAT-1_10X54_1,AAACGCTGTCCGTACG-1_10X54_1,AAACGCTGTGGCTACC-1_10X54_1,AAACGCTGTTGCAAGG-1_10X54_1,AAACGCTTCAAGTGGG-1_10X54_1,...,AAAGTGAGTTTGAACC-1_10X54_1,AAATGGAAGGAGTATT-1_10X54_1,AAATGGACAAGAGCTG-1_10X54_1,AAATGGAGTGACACGA-1_10X54_1,AAATGGAGTTGCACGC-1_10X54_1,AAATGGATCGTTTACT-1_10X54_1,AAATGGATCTGAACGT-1_10X54_1,AACAAAGAGAAACCCG-1_10X54_1,AACAAAGAGCACCTGC-1_10X54_1,AACAAAGCAATTGAGA-1_10X54_1
0610007P14Rik,1,5,0,0,0,0,2,1,0,0,...,0,0,0,0,2,0,0,1,0,0
0610009B22Rik,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,1
0610009L18Rik,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
0610009O20Rik,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,1,0,0
0610010F05Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


### 3. Gene Exclusion - exclude sex genes
#### Keep only genes expressed in >10 cells, <50% of all cells

In [11]:
# Restrict the binarized matrix to the cells that survived the level-2 cell filter
test_df_bool_l2 = test_df_bool[test_df_l2.columns]
test_df_bool_l2.head()

Unnamed: 0,AAACCCACATGGCCCA-1_10X54_1,AAACCCAGTCCCTGAG-1_10X54_1,AAACGAACACTACAGT-1_10X54_1,AAACGAATCCCAGCGA-1_10X54_1,AAACGAATCTGCTTAT-1_10X54_1,AAACGCTAGTGCTCAT-1_10X54_1,AAACGCTGTCCGTACG-1_10X54_1,AAACGCTGTGGCTACC-1_10X54_1,AAACGCTGTTGCAAGG-1_10X54_1,AAACGCTTCAAGTGGG-1_10X54_1,...,AAAGTGAGTTTGAACC-1_10X54_1,AAATGGAAGGAGTATT-1_10X54_1,AAATGGACAAGAGCTG-1_10X54_1,AAATGGAGTGACACGA-1_10X54_1,AAATGGAGTTGCACGC-1_10X54_1,AAATGGATCGTTTACT-1_10X54_1,AAATGGATCTGAACGT-1_10X54_1,AACAAAGAGAAACCCG-1_10X54_1,AACAAAGAGCACCTGC-1_10X54_1,AACAAAGCAATTGAGA-1_10X54_1
0610007P14Rik,1,1,0,0,0,0,1,1,0,0,...,0,0,0,0,1,0,0,1,0,0
0610009B22Rik,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
0610009L18Rik,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
0610009O20Rik,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,1,0,0
0610010F05Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [12]:
# Both level-2 frames must have the same (genes, cells) shape
print(test_df_bool_l2.shape, test_df_l2.shape, sep='\n')

(50, 42)
(50, 42)


In [13]:
# Row-wise sum of the level-2 binarized matrix = number of cells each gene is detected in,
# stored as a column vector of shape (n_genes, 1)
gene_sum_l2 = test_df_bool_l2.sum(axis=1).to_numpy().reshape(-1, 1)
gene_sum_l2.shape

(50, 1)

In [14]:
#gene_sum_l2

In [15]:
# Half of the number of level-2 cells (used as the upper bound of the gene filter)
test_df_l2.shape[1] / 2

21.0

In [16]:
# Gene filter (level 3): keep the genes detected in strictly more than
# gene_exclusion_lwr_bound cells and in strictly fewer than half of the level-2 cells.
test_df_l3 = pd.DataFrame(columns = test_df_l2.columns)  # empty placeholder with the level-2 cell columns
gene_exclusion_lwr_bound = 10
gene_exclusion_upper_bound = 0.5*test_df_l2.shape[1]
# gene_sum_l2 holds one count per gene; flatten it so the comparisons give one boolean per gene
# (vectorized, instead of looping over one-element arrays and relying on their truth value)
cells_per_gene = np.ravel(gene_sum_l2)
genes_to_keep_mask = (cells_per_gene > gene_exclusion_lwr_bound) & (cells_per_gene < gene_exclusion_upper_bound)
# Positional row indices of the kept genes, as a plain list of ints
genes_to_keep_indices = np.flatnonzero(genes_to_keep_mask).tolist()

print (genes_to_keep_indices)
#test_df_l3

[0, 6, 22, 27, 33, 41]


In [17]:
# Select the kept genes (positional row indices) from the level-2 count matrix
test_df_l3 = test_df_l2.iloc[genes_to_keep_indices]
test_df_l3

Unnamed: 0,AAACCCACATGGCCCA-1_10X54_1,AAACCCAGTCCCTGAG-1_10X54_1,AAACGAACACTACAGT-1_10X54_1,AAACGAATCCCAGCGA-1_10X54_1,AAACGAATCTGCTTAT-1_10X54_1,AAACGCTAGTGCTCAT-1_10X54_1,AAACGCTGTCCGTACG-1_10X54_1,AAACGCTGTGGCTACC-1_10X54_1,AAACGCTGTTGCAAGG-1_10X54_1,AAACGCTTCAAGTGGG-1_10X54_1,...,AAAGTGAGTTTGAACC-1_10X54_1,AAATGGAAGGAGTATT-1_10X54_1,AAATGGACAAGAGCTG-1_10X54_1,AAATGGAGTGACACGA-1_10X54_1,AAATGGAGTTGCACGC-1_10X54_1,AAATGGATCGTTTACT-1_10X54_1,AAATGGATCTGAACGT-1_10X54_1,AACAAAGAGAAACCCG-1_10X54_1,AACAAAGAGCACCTGC-1_10X54_1,AACAAAGCAATTGAGA-1_10X54_1
0610007P14Rik,1,5,0,0,0,0,2,1,0,0,...,0,0,0,0,2,0,0,1,0,0
0610011F06Rik,3,3,2,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1110001J03Rik,0,2,1,0,0,1,0,0,1,0,...,2,1,0,1,1,0,0,4,1,0
1110004E09Rik,0,0,0,1,1,0,2,0,1,0,...,0,1,1,0,0,0,0,0,0,0
1110008P14Rik,0,1,3,0,0,0,1,0,1,0,...,0,0,3,0,0,0,0,0,1,1
1110032A03Rik,1,0,3,0,0,0,0,0,0,2,...,0,0,0,1,0,0,0,1,0,0


In [24]:
# Check which container type pd.read_json produced for the meta data
type(meta_data)

pandas.core.frame.DataFrame

In [30]:
# Empty (all-NaN) meta data frame for the level-3 data: rows are the meta data features,
# columns are the cells kept in test_df_l3. The feature index is taken from meta_data itself
# rather than from one hard-coded sample column, so it no longer depends on '10X54_1' being present.
meta_data_df_l3 = pd.DataFrame(index = meta_data.index, columns = test_df_l3.columns)
meta_data_df_l3

Unnamed: 0,AAACCCACATGGCCCA-1_10X54_1,AAACCCAGTCCCTGAG-1_10X54_1,AAACGAACACTACAGT-1_10X54_1,AAACGAATCCCAGCGA-1_10X54_1,AAACGAATCTGCTTAT-1_10X54_1,AAACGCTAGTGCTCAT-1_10X54_1,AAACGCTGTCCGTACG-1_10X54_1,AAACGCTGTGGCTACC-1_10X54_1,AAACGCTGTTGCAAGG-1_10X54_1,AAACGCTTCAAGTGGG-1_10X54_1,...,AAAGTGAGTTTGAACC-1_10X54_1,AAATGGAAGGAGTATT-1_10X54_1,AAATGGACAAGAGCTG-1_10X54_1,AAATGGAGTGACACGA-1_10X54_1,AAATGGAGTTGCACGC-1_10X54_1,AAATGGATCGTTTACT-1_10X54_1,AAATGGATCTGAACGT-1_10X54_1,AACAAAGAGAAACCCG-1_10X54_1,AACAAAGAGCACCTGC-1_10X54_1,AACAAAGCAATTGAGA-1_10X54_1
Serial_Number,,,,,,,,,,,...,,,,,,,,,,
Date_Captured,,,,,,,,,,,...,,,,,,,,,,
Species,,,,,,,,,,,...,,,,,,,,,,
Transcriptome,,,,,,,,,,,...,,,,,,,,,,
Strain,,,,,,,,,,,...,,,,,,,,,,
Project,,,,,,,,,,,...,,,,,,,,,,
Group,,,,,,,,,,,...,,,,,,,,,,
ChipID,,,,,,,,,,,...,,,,,,,,,,
SampleID,,,,,,,,,,,...,,,,,,,,,,
DonorID,,,,,,,,,,,...,,,,,,,,,,


In [27]:
# Meta data feature names, read from the index of one sample's column
meta_data['10X54_1'].index

Index(['Serial_Number', 'Date_Captured', 'Species', 'Transcriptome', 'Strain',
       'Project', 'Group', 'ChipID', 'SampleID', 'DonorID', 'Age',
       'Num_Pooled_Animals', 'Sex', 'Tissue', 'Cell_Conc', 'Target_Num_Cells',
       'PCR_Cycles', 'Comments', 'cDNA_Lib_Ok', 'ngperul_cDNA',
       'Avesizebp_cDNAlib', 'Date', 'cDNAul', 'LIbConstructionComment',
       'ngperul_seqlib', 'lengthbp_seqlib', 'Sample_Index'],
      dtype='object')

In [25]:
# Display the meta data table (features as rows, samples as columns)
meta_data

Unnamed: 0,10X54_1,10X54_2,10x98_2,10x98_3,10X51_2,10X51_1,10X52_1,10X52_2,10X51_3,10X51_4,10X52_3,10X52_4,10X35_1,10X35_2,10X38_1,10X38_2,10X36_1,10X36_2,10X37_1,10X37_2
Serial_Number,106.0,107.0,212.0,213.0,98.0,97.0,101.0,102.0,99.0,100.0,103.0,104.0,63.0,64.0,69.0,70.0,65.0,66.0,67.0,68.0
Date_Captured,43993.0,43993.0,2021-03-22,2021-03-22,05/31/20,05/31/20,05/31/20,05/31/20,05/31/20,05/31/20,05/31/20,05/31/20,2019-11-20,2019-11-20,2019-12-01,2019-12-01,2019-11-24,2019-11-24,2019-11-25,2019-11-25
Species,Mm,Mm,Mm,Mm,Mm,Mm,Mm,Mm,Mm,Mm,Mm,Mm,Mm,Mm,Mm,Mm,Mm,Mm,Mm,Mm
Transcriptome,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10,Mm10
Strain,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,C57Bl/6,C57Bl/6,C57Bl/6,C57Bl/6,C57Bl/6,C57Bl/6,C57Bl/6,C57Bl/6,,,,,,,,
Project,Cntnp_KO,Cntnp_KO,Cntnp_KO,Cntnp_KO,Dimorph,Dimorph,Dimorph,Dimorph,Dimorph,Dimorph,Dimorph,Dimorph,Dimorph,Dimorph,Dimorph,Dimorph,Dimorph,Dimorph,Dimorph,Dimorph
Group,Cntnp-KO-F,Cntnp-KO-F,Cntnp-KO-M,Cntnp-KO-M,Breeder-F,Breeder-F,Breeder-F,Breeder-F,Breeder-M,Breeder-M,Breeder-M,Breeder-M,Naïve-F,Naïve-F,Naïve-F,Naïve-F,Naïve-M,Naïve-M,Naïve-M,Naïve-M
ChipID,10X54,10X54,10x98,10x98,10X51,10X51,10X52,10X52,10X51,10X51,10X52,10X52,10X35,10X35,10X38,10X38,10X36,10X36,10X37,10X37
SampleID,10X54_1,10X54_2,10x98_2,10x98_3,10X51_2,10X51_1,10X52_1,10X52_2,10X51_3,10X51_4,10X52_3,10X52_4,10X35_1,10X35_2,10X38_1,10X38_2,10X36_1,10X36_2,10X37_1,10X37_2
DonorID,Cntnp-KO_2-9F,Cntnp-KO_4-10F,Cntnp-KO_2_M,Cntnp-KO_3_M,DI-B1-F,DI-B1-F,DI-B3-F,DI-B3-F,DI-B1-M,DI-B1-M,DI-B3-M,DI-B3-M,"DI1,DI2","DI1,DI2",DI6,DI6,DI3,DI3,"DI4,DI5","DI4,DI5"


In [23]:
# Sex recorded for sample '10X54_1' (row label 'Sex', column label '10X54_1')
meta_data.loc['Sex', '10X54_1']

'F'

In [None]:
#use meta data to split dimorph_df into male and female data frames