# Development notebook for processing sexual dimorphism data
## This notebook takes as input the csv produced from 'sample_reader_nb', which represents the load and rearranged sexual dimorphism data matrix

In [1]:
# Isaac Berez
# 17.01.23

from scipy.io import mmread
import os
import glob
import pandas as pd
import numpy as np
from pandas_ods_reader import read_ods
from copy import deepcopy
import pprint
import json
import re
from datetime import datetime
import logging


import sample_reader as sr

### 1. Read in meta data and big data file

In [2]:
meta_data = pd.read_json('/bigdata/isaac/meta_data_dict.json')
meta_data.iloc[:5,:5]

Unnamed: 0,10X54_1,10X54_2,10x98_2,10x98_3,10X51_2
Serial_Number,106.0,107.0,212.0,213.0,98.0
Date_Captured,43993.0,43993.0,2021-03-22,2021-03-22,05/31/20
Species,Mm,Mm,Mm,Mm,Mm
Transcriptome,Mm10,Mm10,Mm10,Mm10,Mm10
Strain,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,Cntnp-C57Bl/6,C57Bl/6


In [3]:
#dimorph_df = pd.read_csv('/bigdata/isaac/dimorph_df.csv')
test_df = pd.read_csv('/bigdata/isaac/test_df.csv')
test_df

Unnamed: 0,AAACCCACAACAGTGG-1_10X54_1,AAACCCACATGGCCCA-1_10X54_1,AAACCCAGTCCCTGAG-1_10X54_1,AAACGAACACTACAGT-1_10X54_1,AAACGAATCCCAGCGA-1_10X54_1
0610007P14Rik,0,1,5,0,0
0610009B22Rik,0,0,0,1,0
0610009L18Rik,0,0,0,0,1
0610009O20Rik,0,0,0,0,0
0610010F05Rik,0,0,0,0,0


### 2. Calculate molecules/cell and genes/cell vectors 
### expected shape: 1xtotal number of cells (columns) = 1x101317

In [4]:
test_df.columns[0]

'AAACCCACAACAGTGG-1_10X54_1'

In [5]:
total_molecules_per_cell = test_df.sum(axis=0)
total_molecules_per_cell = np.reshape(np.array(total_molecules_per_cell),(1,len(total_molecules_per_cell)))
print('total molecules per cell shape: ', total_molecules_per_cell.shape)
print('total molecules per cell: ', total_molecules_per_cell)

total molecules per cell shape:  (1, 5)
total molecules per cell:  [[0 1 5 1 1]]


In [6]:
test_df_bool = test_df.mask(test_df>0, other = 1)
test_df_bool

Unnamed: 0,AAACCCACAACAGTGG-1_10X54_1,AAACCCACATGGCCCA-1_10X54_1,AAACCCAGTCCCTGAG-1_10X54_1,AAACGAACACTACAGT-1_10X54_1,AAACGAATCCCAGCGA-1_10X54_1
0610007P14Rik,0,1,1,0,0
0610009B22Rik,0,0,0,1,0
0610009L18Rik,0,0,0,0,1
0610009O20Rik,0,0,0,0,0
0610010F05Rik,0,0,0,0,0


In [7]:
total_genes_per_cell = test_df_bool.sum(axis=0)
total_genes_per_cell = np.reshape(np.array(total_genes_per_cell),(1,len(total_genes_per_cell)))
print('total genes per cell shape: ', total_genes_per_cell.shape)
print('total genes per cell: ', total_genes_per_cell)

total genes per cell shape:  (1, 5)
total genes per cell:  [[0 1 1 1 1]]


### Keep only cells with >2500 mol/cell and >2000 genes/cell

In [29]:
test_df_copy = test_df.copy()
test_df_copy.drop(columns=test_df_copy.columns[0])

Unnamed: 0,"('AAACCCACATGGCCCA-110X54_1',)","('AAACCCAGTCCCTGAG-110X54_1',)","('AAACGAACACTACAGT-110X54_1',)","('AAACGAATCCCAGCGA-110X54_1',)"
"('0610007P14Rik',)",1,5,0,0
"('0610009B22Rik',)",0,0,1,0
"('0610009L18Rik',)",0,0,0,1
"('0610009O20Rik',)",0,0,0,0
"('0610010F05Rik',)",0,0,0,0


In [50]:
np.array(test_df[total_molecules_per_cell>1])

ValueError: Item wrong length 1 instead of 5.

In [49]:
test_df.loc[:,(total_molecules_per_cell>1)]

IndexError: Boolean index has wrong length: 1 instead of 5

In [None]:
#thresholds to change to 2500 and 2000 when using full datset
thresh_mol = 1
for i,v in enumerate(total_molecules_per_cell):
    #drop cell columns below threshold
    if v > thresh_mol:
        test_df_l2 = test_df
