### Data Pre-Processing

In [None]:
# Import modules and libraries

import numpy as np
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from itertools import product
import matplotlib.pyplot as plt
import pandas as pd
from scipy.ndimage import zoom
from sklearn.impute import SimpleImputer
import seaborn as sns
from sklearn.preprocessing import StandardScaler
%pip install -U scikit-learn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
%pip install umap-learn;
%pip install umap-learn[plot]
import umap
import umap.plot

In [None]:
# Build the working dataframe "M". Each row holds the raw sequence, the column label of the first
# measured reactivity, the column label of the last measured reactivity and the sequence length,
# followed by n reactivity columns and n sequence columns (one nucleotide per column).

rows = len(pandas_df)
cols = len(pandas_df.columns)
n = 500 # normalization points - consider upscaling in final analysis to between 207-457 (length of test sequences)

# Pre-processing
# Clip reactivities to [0, 1]; NaN entries stay NaN.
# The clipped block is written back with a single positional assignment: the chained form
# `df.iloc[...][mask] = value` assigns into a temporary object and may never reach `pandas_df`.
# NOTE(review): positions 7:212 cover 205 columns, while the label slice
# 'reactivity_0001':'reactivity_0206' used below presumably covers 206 - confirm the window bounds.
react_window = pandas_df.iloc[:, 7:212].clip(lower=0, upper=1)
pandas_df.iloc[:, 7:212] = react_window.to_numpy()
react_mean = np.nanmean(react_window.to_numpy(dtype='float64'))

M_col_names = ['raw_seq','seq_start', 'seq_stop', 'seq_length']
init_cols = len(M_col_names)

# Add the n reactivity columns, then the n sequence columns
M_col_names.extend('reactivity_' + str(i) for i in range(1, n+1))
M_col_names.extend('seq_' + str(i) for i in range(1, n+1))

M = pd.DataFrame(columns = M_col_names, index = range(rows)) # Empty (all-NaN) frame, filled row by row below

seq_offset = init_cols + n # position of the first sequence column
sequences = pandas_df['sequence'].to_numpy()

# Full reactivity profile per row; NaN values are replaced by the global mean reactivity
react_full = pandas_df.loc[:, 'reactivity_0001':'reactivity_0206'].to_numpy(dtype='float64')
react_full = np.where(np.isnan(react_full), react_mean, react_full)

for i in range(rows):
  sequence = sequences[i]
  seq = list(sequence)
  seq_len = len(seq)

  # One nucleotide per column, then pad the remaining sequence columns with 'V' (vacant).
  # The padding starts directly after the last nucleotide so that no column is left unset.
  M.iloc[i, seq_offset:(seq_offset + seq_len)] = np.array(seq)
  M.iloc[i, (seq_offset + seq_len):(seq_offset + n)] = 'V'

  # Scalar writes go through .at: M.iloc[i]['col'] = value would assign into a temporary row object
  M.at[i, 'raw_seq'] = sequence
  M.at[i, 'seq_length'] = seq_len

  # Labels of the first/last non-NaN reactivity inside the window (None if the row has none)
  window_row = react_window.iloc[i]
  M.at[i, 'seq_start'] = window_row.first_valid_index()
  M.at[i, 'seq_stop'] = window_row.last_valid_index()

  M.iloc[i, init_cols:(init_cols + react_full.shape[1])] = react_full[i]

# Drop rows without any valid reactivity in the window (seq_start / seq_stop are missing for those).
# The SN_filter condition is not applied in this cell.
print(M.shape)
M = M.dropna(subset = ['seq_start', 'seq_stop', 'seq_length'])
print(M.shape)

# Fill the remaining NaN values (reactivity columns beyond the measured profile) with 1 (should react)
M = M.fillna(1)

# Reset index:
M = M.reset_index(drop=True)

rows = len(M)
cols = len(M.columns)

In [None]:
# Build the consensus ("average") sequence that every other sequence is compared against,
# e.g. {A,U,U,G,C ...}. "V" stands for a vacant (empty) nucleotide position.

avg_seq = []
seq_nuc = ["A", "U", "G", "C", "V"] # "V" is vacant nucleotide

for pos in range(n):
  column = M.iloc[:, (init_cols+n+pos)]
  # Occurrences of each symbol at this position, in the order given by seq_nuc
  counts = [(column == nuc).sum() for nuc in seq_nuc]
  top_count = max(counts)
  # When several symbols share the top count, the one listed last in seq_nuc is chosen
  winner = max(idx for idx, count in enumerate(counts) if count == top_count)
  avg_seq.append(seq_nuc[winner])

print(avg_seq)

### Clustering 4.1 - PCA

- Exploratory dimensionality reduction with PCA

In [None]:
# PCA analysis on reactivities

# Import packages
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

# NOTE(review): `Norm_react` is created in the "Baseline 3.1" cell further down in this notebook;
# that cell has to be executed before this one.

# Remove rows that carry too little data
print(Norm_react.shape)
Norm_react = Norm_react.dropna(how = 'all') # drop rows that are entirely NaN
print(Norm_react.shape)
Norm_react = Norm_react.dropna(thresh = 50) # keep only rows with at least 50 non-NaN values
print(Norm_react.shape)

# Pre process the remaining individual NaN values (column-wise mean imputation)
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
# End of pre-processing

# The frame is created empty and filled cell by cell, so convert to float explicitly
X = Norm_react.iloc[:, 3:n+3].to_numpy(dtype='float64')
X = imp.fit_transform(X)

n_components = 400
pca = PCA(n_components=n_components)
pca.fit(X)
print(pca.explained_variance_ratio_)

# ex_var_reposit[k] is the variance explained by the first k components (k = 0 .. n_components).
# The curve stops at the number of fitted components instead of being extended with a flat tail.
ex_var_reposit = np.concatenate(([0.0], np.cumsum(pca.explained_variance_ratio_))).tolist()
component_axis = range(len(ex_var_reposit))

seq_length_marker = 170 # reference line labelled 'seq_length = 170'
exp_var_marker = 85 # number of components at which the explained variance is read off

plt.figure(figsize = (14,6))
plt.plot(component_axis, ex_var_reposit, c = 'b', label = 'explained variance')
plt.title('Explained variance vs. # components for normalized reactivities')
plt.xlabel('# components')
plt.ylabel('Explained variance')
plt.axvline(x = seq_length_marker, c = 'g', linestyle = 'dashed', label = 'seq_length = 170')
plt.axvline(x = exp_var_marker, c = 'r', linestyle = 'dashed', label = 'exp_var = ' + str(ex_var_reposit[exp_var_marker]))
plt.axvline(x = 100, c = 'black') # NOTE(review): unlabelled reference line at 100 components - meaning not stated
plt.axhline(y = ex_var_reposit[exp_var_marker], c = 'r', linestyle = 'dashed')
plt.legend()
plt.show()


### Baseline 4.2 - UMAP

- Exploratory dimensionality reduction with UMAP clustering

In [None]:
# Import packages
import umap
import umap.plot

# Cite source for UMAP algorithm and implementation (kept as comments so the cell stays valid Python):
# @article{mcinnes2018umap-software,
#   title={UMAP: Uniform Manifold Approximation and Projection},
#   author={McInnes, Leland and Healy, John and Saul, Nathaniel and Grossberger, Lukas},
#   journal={The Journal of Open Source Software},
#   volume={3},
#   number={29},
#   pages={861},
#   year={2018} }

# UMAP analysis on the imputed reactivity matrix X (built in the PCA cell)
standard_embedding = umap.UMAP(random_state=42).fit_transform(X)


# Plot the results
# Colour by row position. The labels are sized from the embedding itself: X can have fewer rows
# than `rows` once rows were dropped, and scatter needs one colour value per point.
labels = np.arange(standard_embedding.shape[0])
plt.figure(figsize = [12, 12])
plt.scatter(standard_embedding[:, 0], standard_embedding[:, 1], c = labels, s=0.1, cmap='Spectral');
# Use 'total reactivity' as metric for labelling!
plt.show()

### Baseline 3.1

- Normalize the sequence length so that sequences of variable length (e.g., 178 or 206 nucleotides long) are resampled to a fixed length of 500 points, with interpolated values as needed.

- Clip the reactivity values between [0,1]

In [None]:
 # Reactivity of normalized sequence length starts here

rows = len(pandas_df)
cols = len(pandas_df.columns)
n = 500 # normalization points - consider upscaling in final analysis

col_names = ['seq_start', 'seq_stop', 'seq_length']
# Add new columns with normalized reactivity elements (n points per sequence)
for i in range(1, n+1):
  col_names.append('reactivity_' + str(i))

Norm_react = pd.DataFrame(columns = col_names, index = range(rows)) # Empty (all-NaN) frame for reactivities

sn_filter = pandas_df['SN_filter'].to_numpy()
sequences = pandas_df['sequence'].to_numpy()
react_window = pandas_df.iloc[:, 7:177] # columns searched for the first/last measured reactivity
x_interp = np.linspace(0,1,n) # common resampling grid, identical for every row

# Question - does NaN mean 0 or no sequence?
for i in range(rows):

  # Rows that fail the SN filter are left as all-NaN
  if sn_filter[i] != 1:
    continue

  window_row = react_window.iloc[i]
  seq_start = window_row.first_valid_index()
  seq_stop = window_row.last_valid_index()

  # No measured reactivity in the window: leave the row empty. With seq_stop = None the label
  # slice below would otherwise run to the end of the row.
  if seq_stop is None:
    continue

  # Scalar writes go through .at: Norm_react.iloc[i]['col'] = value would assign into a temporary row object
  Norm_react.at[i, 'seq_length'] = len(sequences[i])
  Norm_react.at[i, 'seq_start'] = seq_start
  Norm_react.at[i, 'seq_stop'] = seq_stop

  # Reactivities from the first position up to and including the last measured one
  react = pandas_df.iloc[i].loc['reactivity_0001':seq_stop]
  #react[react.isnull() == True] = 1 # Make NaN values equal to zero

  react = np.array(react, dtype='float64')
  react = np.clip(react, 0, 1) # clip values to between 0 and 1

  # Inspired by image resizing/compression: resample the profile onto n evenly spaced points.
  # NaN values inside the profile are not filled here and carry over into the resampled row.
  react_norm = np.interp(x_interp, np.linspace(0,1,len(react)), react)
  Norm_react.iloc[i,3:n+3] = react_norm

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=fe343e39-d2c0-4296-915d-091d9a42752d' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>