# Development notebook for processing sexual dimorphism data
## This notebook takes as input the csv produced by 'sample_reader_nb', which contains the loaded and rearranged sexual dimorphism data matrix

In [1]:
# Isaac Berez
# 17.01.23

from scipy.io import mmread
import os
import glob
import pandas as pd
import numpy as np
from pandas_ods_reader import read_ods
from copy import deepcopy
import pprint
import json
import re
from datetime import datetime
import logging


import sample_reader as sr

### 1. Read in meta data and big data file

In [2]:
# Load the per-sample metadata table and preview its top-left 5x5 corner.
meta_data_path = '/bigdata/isaac/meta_data_dict.json'
meta_data = pd.read_json(meta_data_path)
meta_data.iloc[:5, :5]

Unnamed: 0,10X54_1,10X54_2,10x98_2,10x98_3,10X51_2
Serial_Number,106.0,107.0,212.0,213.0,98.0
Date_Captured,43993.0,43993.0,2021-03-22,2021-03-22,05/31/20
Species,Mm,Mm,Mm,Mm,Mm
Transcriptome,Mm10,Mm10,Mm10,Mm10,Mm10
Strain,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,C57Bl/6


In [3]:
# Work from the small test matrix; the full-matrix load stays disabled below.
#dimorph_df = pd.read_csv('/bigdata/isaac/dimorph_df.csv')
test_df_path = '/bigdata/isaac/test_df.csv'
test_df = pd.read_csv(test_df_path)
test_df

Unnamed: 0,AAACCCACAACAGTGG-1_10X54_1,AAACCCACATGGCCCA-1_10X54_1,AAACCCAGTCCCTGAG-1_10X54_1,AAACGAACACTACAGT-1_10X54_1,AAACGAATCCCAGCGA-1_10X54_1
0610007P14Rik,0,1,5,0,0
0610009B22Rik,0,0,0,1,0
0610009L18Rik,0,0,0,0,1
0610009O20Rik,0,0,0,0,0
0610010F05Rik,0,0,0,0,0


In [4]:
# Re-display the test matrix loaded in the previous cell (no changes applied).
test_df


Unnamed: 0,AAACCCACAACAGTGG-1_10X54_1,AAACCCACATGGCCCA-1_10X54_1,AAACCCAGTCCCTGAG-1_10X54_1,AAACGAACACTACAGT-1_10X54_1,AAACGAATCCCAGCGA-1_10X54_1
0610007P14Rik,0,1,5,0,0
0610009B22Rik,0,0,0,1,0
0610009L18Rik,0,0,0,0,1
0610009O20Rik,0,0,0,0,0
0610010F05Rik,0,0,0,0,0


In [5]:
# Build a modified copy of the test matrix: every zero entry in one chosen
# column is overwritten with 2, so the filtering steps below can be validated.
probe_column = 'AAACCCACAACAGTGG-1_10X54_1'
test_df_new = test_df.copy()
zero_rows = test_df[probe_column] == 0  # row mask taken from the unmodified frame
test_df_new.loc[zero_rows, probe_column] = 2
test_df_new

Unnamed: 0,AAACCCACAACAGTGG-1_10X54_1,AAACCCACATGGCCCA-1_10X54_1,AAACCCAGTCCCTGAG-1_10X54_1,AAACGAACACTACAGT-1_10X54_1,AAACGAATCCCAGCGA-1_10X54_1
0610007P14Rik,2,1,5,0,0
0610009B22Rik,2,0,0,1,0
0610009L18Rik,2,0,0,0,1
0610009O20Rik,2,0,0,0,0
0610010F05Rik,2,0,0,0,0


### 2. Calculate molecules/cell and genes/cell vectors, keep only cells with >2500 mol/cell and >2000 genes/cell

In [6]:
# Inspect the label of the first column of the test matrix.
test_df.columns[0]

'AAACCCACAACAGTGG-1_10X54_1'

In [7]:
# Sum each column of the modified test matrix, then store the totals as a
# single-row 2-D array of shape (1, number_of_columns).
molecule_sums = test_df_new.sum(axis=0)
total_molecules_per_cell = molecule_sums.to_numpy()[np.newaxis, :]
print('total molecules per cell shape: ', total_molecules_per_cell.shape)
print('total molecules per cell: ', total_molecules_per_cell)

total molecules per cell shape:  (1, 5)
total molecules per cell:  [[10  1  5  1  1]]


In [8]:
# Binarise the matrix: every entry greater than zero becomes 1; all other
# entries are left exactly as they were.
is_positive = test_df_new > 0
test_df_new_bool = test_df_new.mask(is_positive, other=1)
test_df_new_bool

Unnamed: 0,AAACCCACAACAGTGG-1_10X54_1,AAACCCACATGGCCCA-1_10X54_1,AAACCCAGTCCCTGAG-1_10X54_1,AAACGAACACTACAGT-1_10X54_1,AAACGAATCCCAGCGA-1_10X54_1
0610007P14Rik,1,1,1,0,0
0610009B22Rik,1,0,0,1,0
0610009L18Rik,1,0,0,0,1
0610009O20Rik,1,0,0,0,0
0610010F05Rik,1,0,0,0,0


In [9]:
# Sum each column of the binarised matrix (i.e. count the positive entries per
# column) and store the result as a single-row 2-D array.
gene_counts = test_df_new_bool.sum(axis=0)
total_genes_per_cell = gene_counts.to_numpy()[np.newaxis, :]
print('total genes per cell shape: ', total_genes_per_cell.shape)
print('total genes per cell: ', total_genes_per_cell)

total genes per cell shape:  (1, 5)
total genes per cell:  [[5 1 1 1 1]]


In [13]:
# Column filter: keep a column only when BOTH its molecule total and its gene
# total exceed their thresholds.
# NOTE(review): thresholds of 1 look like toy values for the small test frame;
# the section heading quotes >2500 mol/cell and >2000 genes/cell - confirm
# before running on the full matrix.
threshold_m = 1
threshold_g = 1
# The totals are stored as (1, n) arrays, so row 0 is the flat per-column vector.
mol_cell_mask = total_molecules_per_cell[0] > threshold_m
genes_cell_mask = total_genes_per_cell[0] > threshold_g
mol_AND_gene_cell_mask = mol_cell_mask & genes_cell_mask
print(mol_AND_gene_cell_mask)
test_df_new_l2 = test_df_new.loc[:, mol_AND_gene_cell_mask]
test_df_new_l2

[ True False False False False]


Unnamed: 0,AAACCCACAACAGTGG-1_10X54_1
0610007P14Rik,2
0610009B22Rik,2
0610009L18Rik,2
0610009O20Rik,2
0610010F05Rik,2


### 3. Gene Exclusion - exclude sex genes
#### Keep only genes expressed in >10 cells and in <50% of all cells