In [2]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [25]:

# Lookup table keyed by lowercase RNA dinucleotide. encode_s consults it for
# the first dinucleotide of a sequence only.
# NOTE(review): presumably "terminal free energy" terms, judging by the name --
# confirm the source table and units.
# Layout: a dinucleotide and its reverse complement share a line; insertion
# order is unchanged.
ter_fe = {
    "aa": 0.03, "uu": 0.03,
    "au": 0.38,
    "ua": 0.16,
    "cu": 0.55, "ag": -0.19,
    "ca": 0.08, "ug": -0.05,
    "gu": 0.55, "ac": -0.05,
    "ga": 0.39, "uc": -0.20,
    "cg": -0.10,
    "gg": -0.16, "cc": 0.0,
    "gc": -0.12,
}

# Lookup table keyed by lowercase RNA dinucleotide. encode_s consults it for
# every dinucleotide after the first one.
# NOTE(review): presumably "internal free energy" terms, judging by the name --
# confirm the source table and units.
# Layout: a dinucleotide and its reverse complement share a line (and, in this
# table, the same value); insertion order is unchanged.
int_fe = {
    "aa": -0.87, "uu": -0.87,
    "au": -1.21,
    "ua": -1.12,
    "cu": -2.14, "ag": -2.14,
    "ca": -1.94, "ug": -1.94,
    "gu": -2.29, "ac": -2.29,
    "ga": -2.23, "uc": -2.23,
    "cg": -2.31,
    "gg": -3.18, "cc": -3.18,
    "gc": -3.34,
}

def encode_s(s: str):
    """Turn one nucleotide sequence into a flat list of numeric features.

    The last two characters of ``s`` are discarded, the remainder is
    lower-cased and every 't' is rewritten as 'u'. Each pair of adjacent
    characters is then looked up: the first pair in ``ter_fe``, every later
    pair in ``int_fe``.

    Returns:
        A list laid out as
        ``[sum of all pair values, sum of the first five pair values,
        sum of the last five pair values, g+c count of the whole trimmed
        sequence, g+c count of its first five characters, g+c count of its
        last five characters, *pair values in sequence order]``.

    Raises:
        KeyError: if a character pair is not a key of the lookup table used
            for its position.
    """
    seq = s[:-2].lower().replace('t', 'u')

    # Overlapping adjacent pairs: seq[0:2], seq[1:3], ...
    pairs = [seq[k:k + 2] for k in range(len(seq) - 1)]

    # Only the very first pair uses ter_fe; all others use int_fe.
    energies = [
        (ter_fe if k == 0 else int_fe)[pair]
        for k, pair in enumerate(pairs)
    ]

    def gc_total(fragment):
        # Number of 'g' plus number of 'c' characters in the fragment.
        return fragment.count('g') + fragment.count('c')

    summary = [
        sum(energies),
        sum(energies[:5]),
        sum(energies[-5:]),
        gc_total(seq),
        gc_total(seq[:5]),
        gc_total(seq[-5:]),
    ]
    return summary + energies

# Load the labelled sequences; the 'Sequence' and 'Activity' columns are used.
df = pd.read_csv('rna.csv')

# Element-wise wrapper so encode_s can be applied to an object array of strings.
encode_u = np.frompyfunc(encode_s, 1, 1)

# encode_s yields 6 summary features plus one value per dinucleotide; 18
# dinucleotides corresponds to 21-character input sequences.
N_FEATURES = 6 + 18

X = np.array(encode_u(df['Sequence'].to_numpy()).tolist(), dtype=float)
# Validate the width explicitly instead of reshaping: a blind
# reshape(-1, N_FEATURES) can silently re-chunk rows (and misalign X with y)
# when sequences do not have the expected length.
if X.ndim != 2 or X.shape[1] != N_FEATURES:
    raise ValueError(
        f"expected a feature matrix of shape (n, {N_FEATURES}), got {X.shape}; "
        "check the sequence lengths in 'rna.csv'"
    )

# Column vector, because MinMaxScaler works on 2-D input.
y = df['Activity'].to_numpy().reshape(-1, 1)

# Hold out 20% for testing, then 25% of the remainder for validation
# (60/20/20 overall). The raw targets are split first so the scaler below can
# be fitted on the training portion only; row assignment depends only on the
# number of samples and random_state, so the partitions are unchanged.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42)

# Fit the target scaler on training targets only, so the min/max of the
# validation and test targets do not leak into the scaling.
scaler_y = MinMaxScaler()
y_train = scaler_y.fit_transform(y_train)
y_val = scaler_y.transform(y_val)
y_test = scaler_y.transform(y_test)

# Whole target vector on the same (training-derived) scale, kept under its
# original name for any code that still refers to it.
y_scaled = scaler_y.transform(y)

In [26]:
# Hyper-parameters handed to lgb.LGBMRegressor in the next cell.
params = dict(
    boosting_type='gbdt',
    num_leaves=15,
    max_depth=-1,
    learning_rate=0.1,
    n_estimators=100,
    subsample_for_bin=200,
    objective=None,
    class_weight=None,
    random_state=42,
)

In [27]:
# Build the regressor directly from the shared hyper-parameter dict.
model = lgb.LGBMRegressor(**params)
# y_train is an (n, 1) column because the target scaler needs 2-D input.
# Flatten it here: the estimator expects a 1-D target and otherwise emits a
# column-vector conversion warning on every fit.
model.fit(X_train, y_train.ravel())

  y = column_or_1d(y, warn=True)


LGBMRegressor(num_leaves=15, random_state=42, subsample_for_bin=200)

In [28]:
# Predictions for the held-out test features; compared against y_test below.
y_pred = model.predict(X_test)

In [29]:
# 2x2 correlation matrix between the training-set predictions and the
# flattened training targets; the off-diagonal entry is their Pearson r.
np.corrcoef(model.predict(X_train), y_train.ravel())

array([[1.        , 0.87138232],
       [0.87138232, 1.        ]])

In [30]:
# 2x2 correlation matrix between the test-set predictions and the flattened
# test targets; the off-diagonal entry is their Pearson r.
np.corrcoef(y_pred, y_test.ravel())

array([[1.        , 0.57092744],
       [0.57092744, 1.        ]])