### Step 1: Process seqscope-liver unspliced/spliced (un/sp) data

Process the un/sp output of the seqscope pipeline into dataframes.

Input: `input/unsp_dict.pkl` with `unspliced_all_pos`, etc.\
Output: `output_step1/unsp_dict.pkl` with `unsp_df`, etc.

In [1]:
import os
import pickle

import pandas as pd

#### Read un/sp raw data

In [2]:
# Load the (roughly) raw unspliced/spliced tables written by the seqscope pipeline.
# NOTE(review): pickle.load can execute arbitrary code, so only open trusted files.
filename = 'input/unsp_dict.pkl'
with open(filename, 'rb') as f:
    pickle_dict = pickle.load(f)

# Pull out the four entries this notebook works with.
unspliced_all_pos, spliced_all_pos, unspMat, spMat = (
    pickle_dict[key]
    for key in ('unspliced_all_pos', 'spliced_all_pos', 'unspMat', 'spMat')
)

# Quick look at what was loaded: the available keys, then the two table shapes.
print(pickle_dict.keys())
for table in (unspliced_all_pos, spliced_all_pos):
    print(table.shape)

dict_keys(['unspliced_all_pos', 'spliced_all_pos', 'unspMat', 'spMat', 'maxCooInt'])
(8335475, 9)
(8335475, 9)


#### Only keep valid tiles

In [4]:
# Stage 1: keep only the rows whose tile_miseq value is in tileList.
tileList = [2102, 2103, 2104, 2105, 2106, 2107, 2116, 2117, 2118, 2119]
unspliced_all_pos1 = unspliced_all_pos.loc[unspliced_all_pos['tile_miseq'].isin(tileList)]
spliced_all_pos1 = spliced_all_pos.loc[spliced_all_pos['tile_miseq'].isin(tileList)]
for kept in (unspliced_all_pos1, spliced_all_pos1):
    print(kept.shape)

# Stage 2: keep only the rows with umi > 0 (applied to each table independently,
# so the two results may end up with different row counts).
unspliced_all_pos2 = unspliced_all_pos1.loc[unspliced_all_pos1['umi'].gt(0)]
spliced_all_pos2 = spliced_all_pos1.loc[spliced_all_pos1['umi'].gt(0)]
for kept in (unspliced_all_pos2, spliced_all_pos2):
    print(kept.shape)
for kept in (unspliced_all_pos2, spliced_all_pos2):
    print(kept.head())

(6042177, 9)
(6042177, 9)
(1556983, 9)
(5850152, 9)
                    HDMI  lane_miseq  tile_miseq  x_miseq  y_miseq  \
0   AAAAAAAAAAAAAAAAAAAA         1.0      2104.0   6995.0  17656.0   
15  AAAAAAAAAGAGCTATAATT         1.0      2117.0  11128.0  15518.0   
16  AAAAAAAAAGAGTGAAAAAC         1.0      2102.0  28217.0  19164.0   
17  AAAAAAAAAGCGAGGGCAGA         1.0      2103.0  16048.0  18931.0   
20  AAAAAAAAAGGAGTACACTT         1.0      2117.0  25999.0  19552.0   

                 BARCODE  barcodeInd  barcode   umi  
0   AAAAAAAAAAAAAAAAAAAA           1        1  1853  
15  AAAAAAAAAGAGCTATAATT          88       88     1  
16  AAAAAAAAAGAGTGAAAAAC          90       90     1  
17  AAAAAAAAAGCGAGGGCAGA          99       99     1  
20  AAAAAAAAAGGAGTACACTT         106      106     1  
                   HDMI  lane_miseq  tile_miseq  x_miseq  y_miseq  \
0  AAAAAAAAAAAAAAAAAAAA         1.0      2104.0   6995.0  17656.0   
1  AAAAAAAAAAACTGCGTAGG         1.0      2119.0  18707.0  15408.0

#### Only keep useful columns

In [7]:
def _to_coord_df(pos_df):
    """Return a compact integer table with columns tile, x, y and umi.

    Parameters
    ----------
    pos_df : pandas.DataFrame
        Table that has the columns 'tile_miseq', 'x_miseq', 'y_miseq' and 'umi'.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns 'tile', 'x', 'y', 'umi' (in that order),
        each cast with ``astype(int)``. It is built from plain arrays, so it
        gets a fresh default index instead of the index of ``pos_df``.
    """
    source_columns = {'tile': 'tile_miseq', 'x': 'x_miseq', 'y': 'y_miseq', 'umi': 'umi'}
    return pd.DataFrame(
        {new_name: pos_df[old_name].to_numpy().astype(int)
         for new_name, old_name in source_columns.items()}
    )


# Same reduction for both tables: keep tile, x, y, umi as integers.
unsp_df = _to_coord_df(unspliced_all_pos2)
sp_df = _to_coord_df(spliced_all_pos2)

print(unsp_df.shape)
print(sp_df.shape)
print(unsp_df.head())
print(sp_df.head())

(1556983, 4)
(5850152, 4)
   tile      x      y   umi
0  2104   6995  17656  1853
1  2117  11128  15518     1
2  2102  28217  19164     1
3  2103  16048  18931     1
4  2117  25999  19552     1
   tile      x      y  umi
0  2104   6995  17656  199
1  2119  18707  15408    1
2  2119  20685   9510    1
3  2104  25803  11295    4
4  2104  10064  16363    4


#### Save the dataframes

In [10]:
# Destination pickle for the results of this step.
outfile = 'output_step1/unsp_dict.pkl'

# Create the output folder when it is missing; otherwise open() below raises
# FileNotFoundError on a fresh checkout, after all the work above has run.
out_dir = os.path.dirname(outfile)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# Bundle the two reduced dataframes with the unspMat/spMat objects and save them.
pickle_dict = {
    'unsp_df': unsp_df,
    'sp_df': sp_df,
    'unspMat': unspMat,
    'spMat': spMat,
}
with open(outfile, 'wb') as f:
    pickle.dump(pickle_dict, f)