In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load Sort-Seq count data and sequence data (two different files).
# Both files are read as header-less, whitespace-delimited tables.
# sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
ct_df = pd.read_csv('../examples/datafiles/sort_seq/full-wt/bin_counts.txt',
                    header=None, sep=r'\s+')
seq_df = pd.read_csv('../examples/datafiles/sort_seq/full-wt/rnap_sequences.txt',
                     header=None, sep=r'\s+')

# The two files are paired purely by row position, so a length mismatch would
# otherwise be silently NaN-padded by the column-wise concat below.
if len(ct_df) != len(seq_df):
    raise ValueError(
        f'Row count mismatch: {len(ct_df)} count rows vs {len(seq_df)} sequence rows'
    )

# Concatenate count and seq dataframes side by side and label columns sensibly:
# one bin_<n> column per count column, followed by the sequence column.
bin_cols = [f'bin_{n}' for n in range(ct_df.shape[1])]
matrix_df = pd.concat([ct_df, seq_df], axis=1)
matrix_df.columns = bin_cols + ['seq']

# Sum across all repeats of the same sequence, add a per-sequence total over
# all bins, and order sequences from most to least counted. The final
# reset_index() turns the 'seq' group key back into an ordinary column.
matrix_df = (
    matrix_df
    .groupby('seq')
    .sum()
    .assign(total=lambda counts: counts[bin_cols].sum(axis=1))
    .sort_values(by='total', ascending=False)
    .reset_index()
)

# Show dataframe
print(matrix_df.shape)
matrix_df.head()

(45778, 12)


Unnamed: 0,seq,bin_0,bin_1,bin_2,bin_3,bin_4,bin_5,bin_6,bin_7,bin_8,bin_9,total
0,GGCTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG,64,15,12,20,42,88,100,84,89,72,586
1,GGCTTTACACTTTATGTTTCCGGCTCGTATGTTGTGTGG,5,2,4,3,1,2,8,3,16,4,48
2,GGCTTTACACTTTATGCTTCCGTCTCGTATGTTGTGTGG,4,0,0,0,0,3,7,10,7,15,46
3,GGCTTTACATTTTATGCTTCCGGCTCGTATGTTGTGTGG,6,0,0,0,2,11,3,4,9,11,46
4,GGCTTTACACTTTATGCTTCCGACTCGTATGTTGTGTGG,2,2,3,0,0,6,6,10,10,4,43


In [4]:
# Melt dataframe into long format: one row per (seq, bin) pair with its count
# in 'ct'. Built as a single chain so no filtered slice is mutated in place
# (in-place sorting of a boolean-mask slice can raise SettingWithCopyWarning).
melt_df = (
    matrix_df
    .melt(id_vars=['seq'], value_vars=bin_cols, var_name='bin', value_name='ct')
    # Remove rows with ct=0
    .loc[lambda long_df: long_df['ct'] > 0]
    # Sort by descending ct and renumber rows 0..n-1
    .sort_values('ct', ascending=False)
    .reset_index(drop=True)
)

# Print results
print(melt_df.shape)
melt_df.head()

(49511, 3)


Unnamed: 0,seq,bin,ct
0,GGCTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG,bin_6,100
1,GGCTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG,bin_8,89
2,GGCTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG,bin_5,88
3,GGCTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG,bin_7,84
4,GGCTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG,bin_9,72


In [5]:
# Explode dataframe: repeat each (seq, bin) row of melt_df `ct` times, so the
# result has one row per unit of count. Rows are selected by position
# (np.arange + iloc), which does not depend on melt_df's index labels and
# avoids materialising a throwaway array per row in an object column.
exploded_df = (
    melt_df[['seq', 'bin']]
    .iloc[np.repeat(np.arange(len(melt_df)), melt_df['ct'].to_numpy())]
    # Sort by sequence and bin and reset index
    .sort_values(by=['seq', 'bin'])
    .reset_index(drop=True)
)

# Show dataframe
print(exploded_df.shape)
exploded_df.head()

(67933, 2)


Unnamed: 0,seq,bin
0,AAATACACACTTGCTGCTTCCGGCTCGTATGTTGTGTGG,bin_3
1,AAATTTACACTGTATGCTTCCGGCTCGCATGGCGTTTGC,bin_2
2,AAATTTACACTTTATGCATCAGACTCGTATGTTGTGTGG,bin_0
3,AAATTTACACTTTATGCTTCTGGCGCGTATGCGGCGTGG,bin_3
4,AACATTACATTTTATGCTTCCGGCTCGTATGGTGTGTGG,bin_1


In [6]:
# Count sequence-bin pairs: the number of rows in each (seq, bin) group of
# exploded_df is that pair's count. size() counts rows directly, so no copy
# of the frame with a helper column of ones is needed.
melt2_df = (
    exploded_df
    .groupby(['seq', 'bin'])
    .size()
    .reset_index(name='ct')
    # Sort by ct (largest first) and renumber rows 0..n-1
    .sort_values(by='ct', ascending=False)
    .reset_index(drop=True)
)

# Show dataframe
print(melt2_df.shape)
melt2_df.head()

(49511, 3)


Unnamed: 0,seq,bin,ct
0,GGCTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG,bin_6,100
1,GGCTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG,bin_8,89
2,GGCTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG,bin_5,88
3,GGCTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG,bin_7,84
4,GGCTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG,bin_9,72


In [19]:
# Turn back into a matrix df: one row per sequence, one column per bin.
matrix2_df = (
    melt2_df
    .pivot(index='seq', columns='bin', values='ct')
    # pivot only creates columns for bins that occur in melt2_df and orders
    # them lexicographically; reindexing to bin_cols restores any bin that has
    # no counts at all and keeps the column order used when bin_cols was built.
    .reindex(columns=bin_cols)
    # (seq, bin) pairs absent from melt2_df come back as NaN; they mean zero counts
    .fillna(0)
    .astype(int)
    .assign(total=lambda counts: counts.sum(axis=1))
    .sort_values(by='total', ascending=False)
    .reset_index()
)
# pivot leaves the columns axis named 'bin'; clear it so the header shows
# only the column labels.
matrix2_df.columns.name = None

print(matrix2_df.shape)
matrix2_df.head()

(45778, 12)


Unnamed: 0,seq,bin_0,bin_1,bin_2,bin_3,bin_4,bin_5,bin_6,bin_7,bin_8,bin_9,total
0,GGCTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG,64,15,12,20,42,88,100,84,89,72,586
1,GGCTTTACACTTTATGTTTCCGGCTCGTATGTTGTGTGG,5,2,4,3,1,2,8,3,16,4,48
2,GGCTTTACACTTTATGCTTCCGTCTCGTATGTTGTGTGG,4,0,0,0,0,3,7,10,7,15,46
3,GGCTTTACATTTTATGCTTCCGGCTCGTATGTTGTGTGG,6,0,0,0,2,11,3,4,9,11,46
4,GGCTTTACACTTTATGCTTCCGACTCGTATGTTGTGTGG,2,2,3,0,0,6,6,10,10,4,43
