### Data Pre-Processing

In [None]:
# Import modules and libraries

import numpy as np
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from itertools import product
import matplotlib.pyplot as plt
import pandas as pd
from scipy.ndimage import zoom
from sklearn.impute import SimpleImputer
import seaborn as sns
from sklearn.preprocessing import StandardScaler
%pip install -U scikit-learn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
%pip install umap-learn;
%pip install umap-learn[plot]
import umap
import umap.plot

In [None]:
# Load the training data lazily with Dask

# NOTE(review): path_to_assignment_dir is not defined in the visible cells — confirm it is set before this cell runs.
train_csv_path = path_to_assignment_dir / "train_data.csv"
ddf = dd.read_csv(train_csv_path, sep = ",")

# Shuffled copy of the full table (fixed seed); only previewed here
shfl_ddf = ddf.sample(frac = 1, random_state = 42)
shfl_ddf.head()

In [None]:
# Split off a small (~3 %) subset that is cheap to materialise as a pandas DataFrame

# Fixed seed: without it the subset, and every statistic derived from it below,
# would differ on each Restart & Run All.
ddf_1, ddf_2 = ddf.random_split([0.03, 0.97], random_state = 42)

In [None]:
# Materialise the small Dask subset as an in-memory pandas DataFrame and report its (rows, columns)

pandas_df = ddf_1.compute()
print((len(pandas_df.index), len(pandas_df.columns)))

In [None]:
# Build the working dataframe "M": one row per high signal-to-noise sequence.
# Column layout:
#   raw_seq, seq_start, seq_stop, seq_length, mod_seq_length   -> the first `init_cols` columns
#   reactivity_1 .. reactivity_n   -> clipped reactivities (NaN -> global mean), padded with 1
#   seq_1 .. seq_n                 -> one nucleotide per column, padded with "V" (vacant)

rows = len(pandas_df)
cols = len(pandas_df.columns)
n = 500 # number of sequences in input model. Will be filled with vacant, "V" sequences if shorter than "n." Final analysis is between 207-457 (length of test sequences)

# Pre-processing
# Select the reactivity columns by label, the same span the per-row code relied on.
# NOTE(review): the previous positional slice 7:212 spans 205 columns, i.e. it stops one
# short of reactivity_0206 if reactivity_0001 sits at position 7 — confirm the schema.
react_cols = pandas_df.loc[:, 'reactivity_0001':'reactivity_0206'].columns

# Clip to [0, 1] (NaN stays NaN). Assigning the result back to the columns replaces the
# old chained `df.iloc[...][mask] = value`, which may write to a temporary copy.
# `.to_numpy()` avoids index alignment (the index coming out of Dask may hold duplicates).
pandas_df[react_cols] = pandas_df[react_cols].clip(lower = 0, upper = 1).to_numpy()
react_mean = np.nanmean(pandas_df[react_cols].to_numpy(dtype = 'float64'))

M_col_names = ['raw_seq','seq_start', 'seq_stop', 'seq_length', 'mod_seq_length']
init_cols = len(M_col_names)

# n reactivity columns followed by n per-nucleotide columns
M_col_names += ['reactivity_' + str(i) for i in range(1, n+1)]
M_col_names += ['seq_' + str(i) for i in range(1, n+1)]

# Only use high signal to noise data
high_sn = pandas_df[pandas_df['SN_filter'] == 1]
react_values = high_sn[react_cols].to_numpy(dtype = 'float64')

# Collect complete rows first and build the frame once; filling a 1005-column object
# frame cell by cell is slow, and `M.iloc[i]['col'] = x` is a chained assignment that is
# silently dropped under pandas copy-on-write.
records = []
for sequence, react in zip(high_sn['sequence'], react_values):

  measured = np.flatnonzero(~np.isnan(react))
  if measured.size == 0:
    continue # no measured reactivity at all: there is no seq_start / seq_stop to record

  seq_len = len(sequence)
  if seq_len > n or len(react) > n:
    raise ValueError('sequence or reactivity profile is longer than n = ' + str(n))

  # Labels of the first and last non-NaN reactivity
  seq_start = react_cols[measured[0]]
  seq_stop = react_cols[measured[-1]]

  # Measured span, taken from the numeric suffix of the two column labels
  mod_seq_length = np.float64(seq_stop.rsplit('_', 1)[-1]) - np.float64(seq_start.rsplit('_', 1)[-1])

  # NaN -> global mean reactivity; positions beyond the measured columns are padded with 1
  # (this is what the earlier blanket fillna(1) produced for those columns)
  react_filled = np.where(np.isnan(react), react_mean, react)
  react_padded = np.concatenate((react_filled, np.ones(n - len(react_filled))))

  # One nucleotide per column; "V" padding starts directly after the last nucleotide
  # (the previous slice started one column late and left a gap that was later filled with 1)
  seq_padded = list(sequence) + ['V'] * (n - seq_len)

  records.append([sequence, seq_start, seq_stop, seq_len, mod_seq_length, *react_padded, *seq_padded])

print((rows, len(M_col_names))) # shape before discarding low signal-to-noise rows

# dtype=object keeps the frame as later cells expect it: they overwrite the nucleotide
# letters with numbers in place.
M = pd.DataFrame(records, columns = M_col_names, dtype = object)
print(M.shape)

rows = len(M)
cols = len(M.columns)

In [None]:
# Build the consensus ("average") sequence that every other sequence is compared to, e.g. {A,U,U,G,C ...}.
# For each of the n positions, take the most frequent symbol; "V" marks a vacant (padded) position.

avg_seq = []
seq_nuc = ["A", "U", "G", "C", "V"] # "V" is vacant nucleotide

for pos in range(n):
  column = M.iloc[:, init_cols + n + pos]
  counts = [(column == nuc).sum() for nuc in seq_nuc] # same order as seq_nuc
  # Ties go to the symbol listed last in seq_nuc: argmax on the reversed counts
  # finds the last occurrence of the maximum.
  winner = len(counts) - 1 - int(np.argmax(counts[::-1]))
  avg_seq.append(seq_nuc[winner])

print(avg_seq)

In [None]:
# Mean reactivity at each of the n positions, taken over all rows of M

avg_react = np.array(
  [M.iloc[:, init_cols + pos].sum() / rows for pos in range(n)],
  dtype = 'float64',
)

In [None]:
# Cosine similarity between every row's reactivity profile and the position-wise average profile

react_matrix = M.iloc[:, init_cols:init_cols+n].to_numpy(dtype = 'float64')

if len(react_matrix):
  # One (rows x 1) call instead of stacking each row with the average and building a 2x2 matrix
  cos_sim_react = cosine_similarity(react_matrix, avg_react.reshape(1, -1)).ravel()
else:
  cos_sim_react = np.zeros(0) # nothing to compare; cosine_similarity rejects empty input

# Print the array computed in this cell (the name `cos_sim` is only created in a later cell)
print(cos_sim_react)

In [None]:
# Numeric version of the per-nucleotide columns, used for the cosine similarity against the "average" sequence.

method = 2 # Method 1 and method 2 differ in what numeric values are assigned to nucleotides

# NOTE: this is a plain assignment, not a copy — N and M are the same object at this point,
# so the in-place replacements below also overwrite the letters stored in M. Any later cell
# that reads M's seq_* columns therefore sees numbers, not letters.
N = M

# Letter -> number encodings, applied in the listed order
encodings = {
  1: {"V": 0, "A": 0.25, "U": 0.5, "G": 0.75, "C": 1},
  2: { # method based on number of possible hydrogen bond forming partners
    "V": 1,    # also consider = 0
    "A": 0.61, # also consider = 2
    "U": 0.15, # also consider = 4
    "G": 0.08, # also consider = 5
    "C": 0.5,  # also consider = 3
  },
}

for letter, value in encodings.get(method, {}).items():
  N.replace(letter, value, inplace = True)

# Keep only the n per-nucleotide columns (drop the metadata and reactivity columns)
print(N.shape)
N = N.drop(columns = N.columns[:n+init_cols])
print(N.shape)

In [None]:
# Cosine similarity between every numerically encoded sequence (rows of N) and the encoded "average" sequence

# Letter -> number encodings. These must be the same values that were substituted into N,
# otherwise the two vectors being compared live in different encodings. (Method 2 previously
# used 0/2/4/5/3 here while N was encoded with 1/0.61/0.15/0.08/0.5.)
seq_encodings = {
  1: {"V": 0, "A": 0.25, "U": 0.5, "G": 0.75, "C": 1},
  2: {"V": 1, "A": 0.61, "U": 0.15, "G": 0.08, "C": 0.5},
}

# Build a float array directly. Assigning numbers into np.array(avg_seq) writes into a
# 1-character string array, so 0.25 / 0.5 / 0.75 were all truncated to '0'.
encoding = seq_encodings[method]
avg_seq_cos = np.array([encoding[nuc] for nuc in avg_seq], dtype = 'float64')

print(avg_seq_cos)

seq_matrix = N.to_numpy(dtype = 'float64')

if len(seq_matrix):
  # One (rows x 1) call instead of stacking each row with the average sequence
  cos_sim = cosine_similarity(seq_matrix, avg_seq_cos.reshape(1, -1)).ravel() # Store the cosine similarity values
else:
  cos_sim = np.zeros(0) # nothing to compare; cosine_similarity rejects empty input

print(cos_sim)

In [None]:
# Determine Levenshtein distance against the average sequence
# Source: https://github.com/maxbachmann/Levenshtein.git
# Max Bachmann, 2021

%pip install levenshtein
from Levenshtein import distance

lev_vals = np.zeros(rows) # Store the Levenshtein distances against the average sequence

for i in range(rows):
  lev_vals[i] = distance(M.iloc[i]['raw_seq'], avg_seq)

### Clustering 4.1 - PCA

- Exploratory dimensionality reduction with PCA

In [None]:
# PCA analysis on reactivities

from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

# Reactivity block of M as a float matrix (M stores its values as objects).
# The former filter on `unique_seq_len` was dropped: that name is not defined anywhere in
# the visible notebook and its result was overwritten on the very next line.
X = M.iloc[:, init_cols:init_cols+n].to_numpy(dtype = 'float64')

# PCA cannot return more components than min(n_samples, n_features)
n_components = min(400, *X.shape)
pca = PCA(n_components = n_components)
pca.fit(X)

# ex_var_reposit[i] = variance explained by the first i components (entry 0 is 0).
# Entries past the number of fitted components repeat the total, as the slice-based
# sum did. A running sum replaces re-adding the whole prefix for every i.
cum_var = np.concatenate(([0.0], np.cumsum(pca.explained_variance_ratio_)))
ex_var_reposit = [cum_var[min(i, n_components)] for i in range(n)]


### Baseline 4.2 - UMAP

- Exploratory dimensionality reduction with UMAP clustering

In [None]:
# UMAP plot

# Numerically encoded nucleotide columns. N holds exactly these columns; reading them from M
# only works because an earlier cell replaced the letters in M in place.
X = N.to_numpy(dtype = 'float64')
UMAP_data = StandardScaler().fit_transform(X)
standard_embedding = umap.UMAP(random_state=42).fit_transform(UMAP_data)


# Plot the UMAP results

# Y determines how each point is colored on the UMAP plot.
# Other options: cos_sim, cos_sim_react, M['seq_length'] — update color_label to match.
# (The earlier `MAE_vals` is not defined in the visible notebook, so the cell could not run top to bottom.)
Y = lev_vals
color_label = 'Levenshtein distance to average sequence'
labels = Y

fig, ax = plt.subplots(figsize = (14, 10))
points = ax.scatter(standard_embedding[:, 0], standard_embedding[:, 1], c = labels, s=0.2, cmap='rainbow')
ax.set_title('UMAP of Numeric RNA Sequences', fontsize = 24)
ax.set_xlabel('UMAP 1', labelpad = 15, fontsize = 16)
ax.set_ylabel('UMAP 2', labelpad = 15, fontsize = 16)
fig.colorbar(points, ax = ax).set_label(color_label, labelpad = 15)
plt.show()

### Baseline 3.1

- Normalize the sequence length so that reactivity profiles of variable length (e.g., 178 or 206 nucleotides long) are resampled to a fixed length of 500 points, with values interpolated as needed.

- Clip the reactivity values between [0,1]

In [None]:
 Reactivitiy of normalized sequence length starts here

rows = len(pandas_df)
cols = len(pandas_df.columns)
n = 500 # normalization points - consider upscaling in final analysis

col_names = ['seq_start', 'seq_stop', 'seq_length']
# Add new columns with normalized reactivity elements (500 sequences long)
for i in range(1, n+1):
  col_names.append('reactivity_' + str(i))

Norm_react = pd.DataFrame(columns = col_names, index = range(rows)) # Create empty data frame for reactivities
Norm_react = pd.DataFrame(columns = col_names, index = range(rows)) # Create empty data frame for seq similarity

# Question - does NaN mean 0 or no sequence?
for i in range(rows):

  if pandas_df.iloc[i]['SN_filter'] == 1:

    seq_nans = pandas_df.iloc[i,7:177].isna().sum() # Sum all of the NaN's
    seq_len = len(pandas_df.iloc[i]['sequence'])

    Norm_react.iloc[i]['seq_length'] = seq_len

    seq_start = pandas_df.iloc[i, 7:177].first_valid_index()
    Norm_react.iloc[i]['seq_start'] = seq_start

    seq_stop = pandas_df.iloc[i, 7:177].last_valid_index()
    Norm_react.iloc[i]['seq_stop'] = seq_stop

    react = pandas_df.iloc[i]['reactivity_0001':seq_stop]
    #react[react.isnull() == True] = 1 # Make NaN values equal to zero

    react = np.array(react, dtype='float64')
    react = np.clip(react, 0, 1) # clip values to between 0 and 1
    # Inspied by image resizing/compression

    x_interp = np.linspace(0,1,n)

    react_norm = np.interp(x_interp, np.linspace(0,1,len(react)), react)
    Norm_react.iloc[i,3:n+3] = react_norm

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=fe343e39-d2c0-4296-915d-091d9a42752d' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>