In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import numbers
%matplotlib inline

In [18]:
## Make small function to transform from vector to matrix format for NA regression and NA model evaluation.

# Prototype function
def vec_data_to_mat_data(y_n, 
                         ct_n=None,
                         x_n=None):
    """
    Convert count data from vector ("long") format to matrix format.

    Observation n pairs an x-value x_n[n] with a bin number y_n[n] and
    a count ct_n[n]. The output has one row per distinct x-value and one
    column per distinct bin number occurring in y_n.
    
    parameters
    ----------
        
    y_n: (array-like of ints)
        List of N bin numbers y. Must be set by user. Values are cast
        to int.
        
    ct_n: (array-like of ints)
        List N counts, one for each (sequence,bin) pair.
        If None, a value of 1 will be assumed for all observations.
        If a (sequence,bin) pair occurs more than once, its counts
        are summed.
        
    x_n: (array-like)
        List of N sequences. If None, each y_n will be
        assumed to come from a unique sequence. 
            
    returns
    -------
    
    ct_my: (2D array of ints)
        Matrix of counts. Row m corresponds to x_m[m]; columns correspond
        to the distinct values of y_n in ascending order.
            
    x_m: (array)
        Corresponding list of x-values (distinct values of x_n, sorted).

    raises
    ------

    AssertionError
        If x_n or ct_n is provided and its length differs from len(y_n).
    """
    
    # Cast y as array of ints
    y_n = np.asarray(y_n).astype(int)
    N = len(y_n)
    
    # Cast x as array and check length
    if x_n is None:
        x_n = np.arange(N)
    else:
        x_n = np.asarray(x_n)
        assert len(x_n)==N, f'len(x_n)={len(x_n)} and len(y_n)={N} do not match.'

    # Get ct. Casting to a plain array matters: a pandas Series would otherwise
    # be aligned on its index when placed in the dataframe below, silently
    # turning counts into NaN (and then 0) if that index is not 0..N-1.
    if ct_n is None:
        ct_n = np.ones(N).astype(int)
    else:
        ct_n = np.asarray(ct_n)
        assert len(ct_n)==N, f'len(ct_n)={len(ct_n)} and len(y_n)={N} do not match.'
        
    # Create dataframe; all three columns are positional arrays of length N
    data_df = pd.DataFrame({'x': x_n, 'y': y_n, 'ct': ct_n})

    # Sum counts per (x, y) pair, then spread y across columns.
    # Unlike DataFrame.pivot, this tolerates repeated (x, y) pairs.
    ct_df = data_df.groupby(['x', 'y'])['ct'].sum().unstack(fill_value=0)

    # Get ct_my values
    ct_my = ct_df.values.astype(int)
    
    # Get x_m values
    x_m = ct_df.index.values
    
    return ct_my, x_m

In [19]:
# Load the raw Sort-Seq table (whitespace-delimited text file).
# sep=r'\s+' is the documented equivalent of delim_whitespace=True, which is
# deprecated as of pandas 2.2.
data_df = pd.read_csv('sortseq/full-500/data.txt', sep=r'\s+')

# Relabel columns as 'seq' followed by integer bin numbers 0..K-1, where K is
# the number of remaining columns in the file (5 for this dataset). Deriving K
# from the file avoids hand-editing the label list when switching datasets.
data_df.columns = ['seq'] + list(range(data_df.shape[1] - 1))

In [20]:
# Single-column frame holding the sequences; the column is labeled '0' and the
# frame gets a fresh default index because only the raw values are passed in.
seq_df = pd.DataFrame(
    data={'0': data_df.loc[:, 'seq'].values},
)

In [21]:
# Count columns only: every column of data_df after the first one.
ct_df = data_df.loc[:, data_df.columns[1:]]

In [22]:
# Alternative input (disabled): counts and sequences stored in two separate files.
# ct_df = pd.read_csv('../../examples/datafiles/sort_seq/full-wt/bin_counts.txt', header=None, delim_whitespace=True)
# seq_df = pd.read_csv('../../examples/datafiles/sort_seq/full-wt/rnap_sequences.txt', 
#                      header=None, delim_whitespace=True)

# One label per count column: bin_0, bin_1, ...
bin_cols = [f'bin_{n}' for n in range(ct_df.shape[1])]

# Put counts and sequences side by side, then apply the labels
matrix_df = pd.concat([ct_df, seq_df], axis=1)
matrix_df.columns = bin_cols + ['seq']

# Collapse repeated sequences by summing their counts per bin;
# 'seq' comes back as an ordinary column rather than the index.
matrix_df = matrix_df.groupby('seq').sum().reset_index()

# Disabled: total-count column and sort by it
#matrix_df['total'] = matrix_df[bin_cols].sum(axis=1)
#matrix_df.sort_values(by='total',ascending=False,inplace=True)

# Report size and preview the first rows
print(matrix_df.shape)
matrix_df.head()

(23251, 6)


Unnamed: 0,seq,bin_0,bin_1,bin_2,bin_3,bin_4
0,AAAAAATCTGTGTTTGCTCACCCATAAGGCACCGCCGGCTTTACAC...,0.0,1.0,0.0,0.0,0.0
1,AAAAAATGCGAGGTAGCTCACTCATTAGGAGTCCCAGGCTTTACAC...,0.0,0.0,0.0,1.0,0.0
2,AAAAAATGTCAGATTGCTCACTCATTAGGCACCCCGGGCTCTACAC...,0.0,1.0,0.0,0.0,0.0
3,AAAAAATGTCAGTTAGCTGACTCATTAGGCACCCCTGGCTTTACGT...,1.0,0.0,0.0,0.0,0.0
4,AAAAAATGTGACTTAGCTCACTCATTAGGTACCCCAGGCCTTGCAC...,0.0,0.0,0.0,0.0,1.0


In [23]:
#def maven_melt(data_df, b...)

# Melt to long format: one row per (seq, bin) pair with its count
melt_df = matrix_df.melt(id_vars=['seq'], value_vars=bin_cols, var_name='bin', value_name='ct')

# Keep only rows with ct > 0, sort by descending ct, and renumber rows.
# The calls are chained without inplace=True: in-place sort/reset on a
# boolean-filtered frame is what triggers pandas' SettingWithCopyWarning.
ix = melt_df['ct'] > 0
melt_df = (
    melt_df[ix]
    .sort_values('ct', ascending=False)
    .reset_index(drop=True)
)

# Print results
print(melt_df.shape)

# Build the vector-format table with integer bin numbers ('bin_3' -> 3).
# NOTE: this rebinds data_df, which until now held the raw table loaded from
# disk; cells that expect the raw table must be re-run from the load cell.
data_df = melt_df.assign(bin=[int(y.split('_')[1]) for y in melt_df['bin']])
data_df.head()

(23431, 3)


Unnamed: 0,seq,bin,ct
0,AGTTAATTTGAGTTCGCTCACTCAATAGGTACCCCACGCTTTACAC...,1,12.0
1,AATTAATGTGAGTTAGCTCGTTCATTAGGCACTCCAGGCTTAACAC...,1,9.0
2,AAGCCGTGTAAGTTAGCTCACTCATAAGGCACCCCAGGCTTTACAC...,1,8.0
3,AATTAATGTGAGTTGGCCCACTCTTTCGGCACCCCAGGCTTTACAC...,3,8.0
4,AATTATTGTGAGTTAGCTCACTCATTACGCACCCCACGCTTTACAC...,4,8.0


In [24]:
#data_df.to_csv('full-500.csv')

In [6]:
#pd.read_csv('Sort_seq_vector_fmt_input.csv',index_col=[0])

Unnamed: 0,seq,bin,ct
0,GGCTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG,6,100
1,GGCTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG,8,89
2,GGCTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG,5,88
3,GGCTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG,7,84
4,GGCTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG,9,72
...,...,...,...
49506,GGCTTTACTCATTAGGCTTCAGTCTCCAATGAGGTGTTG,3,1
49507,GGCTTTACTCATTATGCTTCCGGTTCGTATGTTGTGGGC,3,1
49508,GGCTTTACTCCTTCTATTTTCGGCTCGTATGTTGTGTGG,3,1
49509,GGCTTTACTCGTTATCATTCCGGCTCGTATGTGGTATGG,3,1


In [7]:
# Convert the (seq, bin, ct) vector-format table into a count matrix plus
# the matching array of sequences.
ct_my, x_m = vec_data_to_mat_data(
    y_n=data_df['bin'],
    ct_n=data_df['ct'],
    x_n=data_df['seq'],
)

print(ct_my)
print(x_m)

[[0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 2 0]
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 4]]
['AAATACACACTTGCTGCTTCCGGCTCGTATGTTGTGTGG'
 'AAATTTACACTGTATGCTTCCGGCTCGCATGGCGTTTGC'
 'AAATTTACACTTTATGCATCAGACTCGTATGTTGTGTGG' ...
 'TTTTTGACACTTTATGCTTCCGGCTCGTATACTGTGAGG'
 'TTTTTTACACTTTCTGCTTCCTGCTGGTAGGTTGCGTGC'
 'TTTTTTAGACTTTATGCTTTTCGCTCGTATGTTGTGTGG']


In [12]:
# Display the x-values (sequences) returned by vec_data_to_mat_data; entry m
# labels row m of ct_my.
x_m

array(['AAATACACACTTGCTGCTTCCGGCTCGTATGTTGTGTGG',
       'AAATTTACACTGTATGCTTCCGGCTCGCATGGCGTTTGC',
       'AAATTTACACTTTATGCATCAGACTCGTATGTTGTGTGG', ...,
       'TTTTTGACACTTTATGCTTCCGGCTCGTATACTGTGAGG',
       'TTTTTTACACTTTCTGCTTCCTGCTGGTAGGTTGCGTGC',
       'TTTTTTAGACTTTATGCTTTTCGCTCGTATGTTGTGTGG'], dtype=object)

--------

In [8]:
# Shape of the count matrix: (number of distinct x-values, number of distinct bins).
ct_my.shape

(45778, 10)

In [9]:
# self = DummyModel("GE")
# set_data(self=self,
#          data_df=data_df,
#          x_col="seq",
#          y_col="bin",
#          ct_col="ct")
# self.__dict__

In [10]:
# class DummyModel:
#     def __init__(self, regression_type):
#         self.regression_type = regression_type
#         print(f"Set self.regression_type={self.regression_type}.")

# # Prototype function
# def set_data(self,
#              data_df=None, 
#              x_col=None, 
#              y_col=None, 
#              ct_col=None,
#              y_to_keep=None):
#     """
    
#     parameters
#     ----------
    
#     self: (Model instance)
#         Model class instance that must have self.regression_type in ["GE","NA"].
    
#     data_df: (pd.DataFrame)
#         Data frame containing training sequences and measurements.
        
#     x_col: (string)
#         The column of data_df listing the sequences to analyze.
        
#     y_col: (string)
#         The column of data_df listing measurement values.
             
#     ct_col: (string)
#         The column of data_df listing the multiplicity of each (x,y) pair.
#         If None, a value of 1 will be assumed for all rows. 
        
#     y_to_keep: (list)
#         If out_format="matrix", or if y-values are non-numeric, each unique 
#         value of y will be replaced by a nonnegative integer k (the bin number). 
#         Setting y_to_keep allows the user to specify which y-values are mapped to which 
#         integers k, as k is set to be the index of y in this list. Excluding y-values from this
#         list will exclude corresponding observations from analysis. 
#         If this list is None, all data will be used and the mapping from 
#         y to k will be determined by sorting unique y-values in ascending order. 
#     """
    
#     # Make sure x_col and y_col are valid
#     assert x_col in data_df.columns, f"x_col={x_col} is not in data_df.columns={data_df.columns}."
#     assert y_col in data_df.columns, f"y_col={y_col} is not in data_df.columns={data_df.columns}."

#     # Trim data_df down and set columns to ["x", "y", "ct"].
#     if ct_col is None:
#         data_df = data_df[[x_col, y_col]].copy()
#         data_df.columns = ['x','y']
#         data_df['ct'] = 1
#     else:
#         assert ct_col in data_df.columns, f"ct_col={ct_col} is not in data_df.columns={data_df.columns}."
#         data_df = data_df[[x_col, y_col, ct_col]].copy()
#         data_df.columns = ['x','y','ct']
        
#     assert self.regression_type in ["GE","NA"]
    
#     # Replace y values with integers if 
#     # a) out_format=="matrix" or
#     # b) data_df['y'] contains non-numeric values
#     y_is_numeric = isinstance(data_df['y'].dtype, numbers.Number)
#     if self.regression_type=="NA" or not y_is_numeric:
        
#         # Discretize y
#         if y_to_keep is None:
#             y_to_keep = data_df['y'].unique()
#             y_to_keep.sort()
#         else:
#             assert set(y_to_keep) <= set(melt_df['bin'].values)

#         # Remove y-values not in list
#         num_bins = len(y_to_keep)
#         k_list = np.arange(num_bins)
#         y_to_k_dict = dict(zip(y_to_keep, k_list))
#         data_df['y'] = data_df['y'].map(y_to_k_dict)
#         data_df.dropna(inplace=True)
#         data_df['y'] = data_df['y'].astype(int)
    
#     # Transform to matrix format if requested
#     if self.regression_type=="NA":
        
#         # Turn back into a matrix df
#         data_df = data_df.pivot(index='x', columns='y', values='ct')
#         data_df = data_df.fillna(0).astype(int)
#         data_df['tot'] = data_df.sum(axis=1)
#         data_df.sort_values(by=['tot','x'], inplace=True, ascending=False)
#         data_df.reset_index(inplace=True)
#         data_df.columns.name=None
        
#         # Set attributes
#         self.data_df = data_df
#         self.x = data_df['x'].values
#         cols = [c for c in data_df.columns if not c in ['x','tot']]
#         self.y = data_df[cols].values.astype(int)
#         self.bin_names = y_to_keep
        
#     elif self.regression_type=="GE":
        
#         # Explode dataframe
#         data_df['tmp'] = [np.ones(ct) for ct in data_df['ct']]
#         data_df = data_df.explode('tmp')[['x','y']]
        
#         # Clean up data_df
#         data_df.sort_values(by=['y','x'], inplace=True, ascending=False)
#         data_df.reset_index(inplace=True, drop=True)
#         data_df.columns.name=None
        
#         # Set attributes
#         self.data_df = data_df
#         self.x = data_df['x'].values
#         self.y = data_df['y'].values.astype(float)
#     else:
#         assert False, "This line should never execute."
        
#     return self

In [11]:
# class DummyModel:
#     def __init__(self, regression_type):
#         self.regression_type = regression_type
#         print(f"Set self.regression_type={self.regression_type}.")
        
# # Prototype function
# def set_data(self,
#              data_df=None, 
#              x_col=None, 
#              y_col=None, 
#              ct_col=None):
#     """
    
#     parameters
#     ----------
    
#     self: (Model instance)
#         Model class instance that must have self.regression_type in ["GE","NA"].
    
#     data_df: (pd.DataFrame)
#         Data frame containing training sequences and measurements.
        
#     x_col: (string)
#         The column of data_df listing the sequences to analyze.
        
#     y_col: (string)
#         The column of data_df listing measurement values.
             
#     ct_col: (string)
#         The column of data_df listing the multiplicity of each (x,y) pair.
#         If None, a value of 1 will be assumed for all rows. 
#     """
    
#     # Make sure x_col and y_col are valid
#     assert x_col in data_df.columns, f"x_col={x_col} is not in data_df.columns={data_df.columns}."
#     assert y_col in data_df.columns, f"y_col={y_col} is not in data_df.columns={data_df.columns}."

#     # Trim data_df down and set columns to ["x", "y", "ct"].
#     if ct_col is None:
#         data_df = data_df[[x_col, y_col]].copy()
#         data_df.columns = ['x','y']
#         data_df['ct'] = 1
#     else:
#         assert ct_col in data_df.columns, f"ct_col={ct_col} is not in data_df.columns={data_df.columns}."
#         data_df = data_df[[x_col, y_col, ct_col]].copy()
#         data_df.columns = ['x','y','ct']
        
#     assert self.regression_type in ["GE","NA"]
    
    
#     # Transform to matrix format if requested
#     if self.regression_type=="NA":
        
#         # Cast y values as integers
#         data_df['y'] = data_df['y'].astype(int)
    
#         # Pivot dataframe
#         data_df = data_df.pivot(index='x', columns='y', values='ct')
#         data_df = data_df.fillna(0).astype(int)
        
#         # Clean dataframe
#         data_df.reset_index(inplace=True)
#         data_df.columns.name=None
        
#         # Set attributes
#         #self.data_df = data_df
#         self.x_seq = data_df['x'].values
#         cols = [c for c in data_df.columns if not c in ['x']]
#         self.y = data_df[cols].values.astype(int)
#         self.num_bins = self.y.shape[1]
#         self.N = self.y.ravel().sum()
        
#     elif self.regression_type=="GE":
        
#         # Explode dataframe
#         data_df['tmp'] = [np.ones(ct) for ct in data_df['ct']]
#         data_df = data_df.explode('tmp')[['x','y']]
        
#         # Clean up data_df
#         data_df.sort_values(by=['y','x'], inplace=True, ascending=False)
#         data_df.reset_index(inplace=True, drop=True)
#         data_df.columns.name=None
        
#         # Set attributes
#         #self.data_df = data_df
#         self.x_seq = data_df['x'].values
#         self.y = data_df['y'].values.astype(float)
#         self.N = len(self.y)
#     else:
#         assert False, "This line should never execute."
        
#     # Make sure all sequences have the same length
#     lengths = np.unique([len(seq) for seq in data_df['x']])
#     assert len(lengths)==1, f'Not all sequences are the same length; lengths={lengths}.'
#     self.L = lengths[0]
#     assert self.L > 0, 'Sequence length must be > 0.'