In [25]:
import pandas as pd

In [26]:
df_original = pd.read_csv("data/train_data.csv")
n_original = df_original.shape[0]
df_submit = pd.read_csv("data/sample_submission.csv")
df = pd.concat([df_original, df_submit], axis=0).reset_index(drop=True)

In [27]:
def siRNA_feat_builder(s: pd.Series, anti: bool = False):
    name = "anti" if anti else "sense"
    df = s.to_frame()
    df[f"feat_siRNA_{name}_seq_len"] = s.str.len()
    for pos in [0, -1]:
        for c in list("AUGC"):
            df[f"feat_siRNA_{name}_seq_{c}_{'front' if pos == 0 else 'back'}"] = (
                s.str[pos] == c
            )
    df[f"feat_siRNA_{name}_seq_pattern_1"] = s.str.startswith("AA") & s.str.endswith(
        "UU"
    )
    df[f"feat_siRNA_{name}_seq_pattern_2"] = s.str.startswith("GA") & s.str.endswith(
        "UU"
    )
    df[f"feat_siRNA_{name}_seq_pattern_3"] = s.str.startswith("CA") & s.str.endswith(
        "UU"
    )
    df[f"feat_siRNA_{name}_seq_pattern_4"] = s.str.startswith("UA") & s.str.endswith(
        "UU"
    )
    df[f"feat_siRNA_{name}_seq_pattern_5"] = s.str.startswith("UU") & s.str.endswith(
        "AA"
    )
    df[f"feat_siRNA_{name}_seq_pattern_6"] = s.str.startswith("UU") & s.str.endswith(
        "GA"
    )
    df[f"feat_siRNA_{name}_seq_pattern_7"] = s.str.startswith("UU") & s.str.endswith(
        "CA"
    )
    df[f"feat_siRNA_{name}_seq_pattern_8"] = s.str.startswith("UU") & s.str.endswith(
        "UA"
    )
    df[f"feat_siRNA_{name}_seq_pattern_9"] = s.str[1] == "A"
    df[f"feat_siRNA_{name}_seq_pattern_10"] = s.str[-2] == "A"
    df[f"feat_siRNA_{name}_seq_pattern_GC_frac"] = (
        s.str.contains("G") + s.str.contains("C")
    ) / s.str.len()
    return df.iloc[:, 1:]

In [28]:
df_publication_id = pd.get_dummies(df.publication_id)
df_publication_id.columns = [
    f"feat_publication_id_{c}" for c in df_publication_id.columns
]
df_gene_target_symbol_name = pd.get_dummies(df.gene_target_symbol_name)
df_gene_target_symbol_name.columns = [
    f"feat_gene_target_symbol_name_{c}" for c in df_gene_target_symbol_name.columns
]
df_gene_target_ncbi_id = pd.get_dummies(df.gene_target_ncbi_id)
df_gene_target_ncbi_id.columns = [
    f"feat_gene_target_ncbi_id_{c}" for c in df_gene_target_ncbi_id.columns
]
df_gene_target_species = pd.get_dummies(df.gene_target_species)
df_gene_target_species.columns = [
    f"feat_gene_target_species_{c}" for c in df_gene_target_species.columns
]
siRNA_duplex_id_values = df.siRNA_duplex_id.str.split("-|\.").str[1].astype("int")
siRNA_duplex_id_values = (siRNA_duplex_id_values - siRNA_duplex_id_values.min()) / (
    siRNA_duplex_id_values.max() - siRNA_duplex_id_values.min()
)
df_siRNA_duplex_id = pd.DataFrame(siRNA_duplex_id_values)
df_cell_line_donor = pd.get_dummies(df.cell_line_donor)
df_cell_line_donor.columns = [
    f"feat_cell_line_donor_{c}" for c in df_cell_line_donor.columns
]
df_cell_line_donor["feat_cell_line_donor_hepatocytes"] = (
    (df.cell_line_donor.str.contains("Hepatocytes")).fillna(False).astype("int")
)
df_cell_line_donor["feat_cell_line_donor_cells"] = (
    df.cell_line_donor.str.contains("Cells").fillna(False).astype("int")
)
df_siRNA_concentration = df.siRNA_concentration.to_frame()
df_Transfection_method = pd.get_dummies(df.Transfection_method)
df_Transfection_method.columns = [
    f"feat_Transfection_method_{c}" for c in df_Transfection_method.columns
]
df_Duration_after_transfection_h = pd.get_dummies(df.Duration_after_transfection_h)
df_Duration_after_transfection_h.columns = [
    f"feat_Duration_after_transfection_h_{c}"
    for c in df_Duration_after_transfection_h.columns
]
feats = pd.concat(
    [
        df_publication_id,
        df_gene_target_symbol_name,
        df_gene_target_ncbi_id,
        df_gene_target_species,
        df_siRNA_duplex_id,
        df_cell_line_donor,
        df_siRNA_concentration,
        df_Transfection_method,
        df_Duration_after_transfection_h,
        siRNA_feat_builder(df.siRNA_sense_seq, False),
        siRNA_feat_builder(df.siRNA_antisense_seq, True),
        df.iloc[:, -1].to_frame(),
    ],
    axis=1,
)

In [29]:
import lightgbm as lgb
from lightgbm import LGBMRegressor
import optuna
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
import numpy as np
from sklearn.metrics import mean_absolute_error
from lightgbm import early_stopping, log_evaluation


X_train, X_test, y_train, y_test = train_test_split(
    feats.iloc[:n_original, :-1],
    feats.iloc[:n_original, -1],
    test_size=0.2,
    random_state=42,
)

In [30]:
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# 自定义评价函数
def calculate_metrics(preds, data, threshold=30):
    y_pred = preds
    y_true = data.get_label()
    mae = np.mean(np.abs(y_true - y_pred))
    
    y_true_binary = ((y_true <= threshold) & (y_true >= 0)).astype(int)
    y_pred_binary = ((y_pred <= threshold) & (y_pred >= 0)).astype(int)

    mask = (y_pred >= 0) & (y_pred <= threshold)
    range_mae = mean_absolute_error(y_true[mask], y_pred[mask]) if np.sum(mask) > 0 else 100

    precision = (np.array(y_pred_binary) & y_true_binary).sum() / np.sum(y_pred_binary) if np.sum(y_pred_binary) > 0 else 0
    recall = (np.array(y_pred_binary) & y_true_binary).sum() / np.sum(y_true_binary) if np.sum(y_true_binary) > 0 else 0

    if precision + recall == 0:
        f1 = 0
    else:
        f1 = 2 * precision * recall / (precision + recall)
    
    score = (1 - mae / 100) * 0.5 + (1 - range_mae / 100) * f1 * 0.5
    return "custom_score", score, True

# 自适应学习率回调函数
def adaptive_learning_rate(decay_rate=0.8, patience=50):
    best_score = float("-inf")
    wait = 0

    def callback(env):
        nonlocal best_score, wait
        current_score = env.evaluation_result_list[-1][2]
        current_lr = env.model.params.get('learning_rate')

        if current_score > best_score:
            best_score = current_score
            wait = 0
        else:
            wait += 1

        if wait >= patience:
            new_lr = float(current_lr) * decay_rate
            wait = 0
            env.model.params['learning_rate'] = new_lr
            print(f"Learning rate adjusted to {env.model.params.get('learning_rate')}")

    return callback

# 训练函数

def train(feats, n_original):
    n_splits = 10
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    gbms = []

    def objective(trial):
        params = {
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
            'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
            'min_child_samples': trial.suggest_int('min_child_samples', 20, 100),
            'objective': 'regression',
            'metric': 'rmse'
        }
        
        model = LGBMRegressor(**params)
        
        # Initialize the scores
        scores = []
        
        for train_idx, val_idx in kf.split(feats.iloc[:n_original, :]):
            X_train, X_val = feats.iloc[train_idx, :-1], feats.iloc[val_idx, :-1]
            y_train, y_val = feats.iloc[train_idx, -1], feats.iloc[val_idx, -1]
            
            # 使用早停功能
            model.fit(
                X_train, y_train,
                eval_set=[(X_val, y_val)],
                eval_metric='rmse',
                callbacks=[early_stopping(stopping_rounds=100, verbose=False)]
            )
            valid_score = model.best_score_['valid_0']['rmse']
            scores.append(valid_score)
        
        return np.mean(scores)
    
    # Optimize with Optuna
    study = optuna.create_study(direction='minimize')  # We want to minimize RMSE
    study.optimize(objective, n_trials=100)

    print('Best trial:')
    trial = study.best_trial
    print('Value: {}'.format(trial.value))
    print('Params: ')
    for key, value in trial.params.items():
        print('    {}: {}'.format(key, value))
    
    # Train the final model with the best parameters on the entire data
    best_params = trial.params
    for fold, (train_idx, val_idx) in enumerate(kf.split(feats.iloc[:n_original, :]), 1):
        print(f"Starting fold {fold} with best parameters")
        X_train, X_val = feats.iloc[train_idx, :-1], feats.iloc[val_idx, :-1]
        y_train, y_val = feats.iloc[train_idx, -1], feats.iloc[val_idx, -1]
        
        gbm = LGBMRegressor(**best_params)
        gbm.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric='rmse',
            callbacks=[early_stopping(stopping_rounds=100, verbose=200)]
        )

        valid_score = gbm.best_score_['valid_0']['rmse']
        print(f"Fold {fold} best valid score: {valid_score}")
        gbms.append(gbm)

    return gbms

# 进行模型训练
trained_gbms = train(feats, n_original)

[I 2024-08-20 22:14:46,815] A new study created in memory with name: no-name-0d11ab56-9361-40f6-883d-07d234a3bbb2
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001201 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 618
[LightGBM] [Info] Number of data points in the train set: 23203, number of used features: 169
[LightGBM] [Info] Start training from score 54.346793
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000683 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 636
[LightGBM] [Info] Number of data points in the train set: 23203, number of used features: 178
[LightGBM] [Info] Start training from score 54.494719
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000930 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not eno

[I 2024-08-20 22:14:50,952] Trial 0 finished with value: 23.049096228741593 and parameters: {'max_depth': 3, 'learning_rate': 0.07553399293380524, 'n_estimators': 1760, 'min_child_samples': 88}. Best is trial 0 with value: 23.049096228741593.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001096 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 656
[LightGBM] [Info] Number of data points in the train set: 23203, number of used features: 188


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),


[LightGBM] [Info] Start training from score 54.346793
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000706 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 652
[LightGBM] [Info] Number of data points in the train set: 23203, number of used features: 186
[LightGBM] [Info] Start training from score 54.494719
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000991 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 649
[LightGBM] [Info] Number of data points in the train set: 23204, number of used features: 184
[LightGBM] [Info] Start training from score 54.494735
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000930 seconds.
You can set `force_row_wise=

[I 2024-08-20 22:14:55,021] Trial 1 finished with value: 22.734247066742935 and parameters: {'max_depth': 4, 'learning_rate': 0.05693417772227005, 'n_estimators': 1435, 'min_child_samples': 69}. Best is trial 1 with value: 22.734247066742935.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000884 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 23203, number of used features: 197
[LightGBM] [Info] Start training from score 54.346793


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001018 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 23203, number of used features: 197
[LightGBM] [Info] Start training from score 54.494719
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001196 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 675
[LightGBM] [Info] Number of data points in the train set: 23204, number of used features: 197
[LightGBM] [Info] Start training from score 54.494735


[W 2024-08-20 22:14:57,022] Trial 2 failed with parameters: {'max_depth': 10, 'learning_rate': 0.007031839513742786, 'n_estimators': 1268, 'min_child_samples': 40} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "C:\Users\Administrator\AppData\Roaming\Python\Python311\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\Administrator\AppData\Local\Temp\ipykernel_5912\513215281.py", line 79, in objective
    model.fit(
  File "C:\Users\Administrator\AppData\Roaming\Python\Python311\site-packages\lightgbm\sklearn.py", line 1173, in fit
    super().fit(
  File "C:\Users\Administrator\AppData\Roaming\Python\Python311\site-packages\lightgbm\sklearn.py", line 954, in fit
    self._Booster = train(
                    ^^^^^^
  File "C:\Users\Administrator\AppData\Roaming\Python\Python311\site-packages\lightgbm\engine.py", line 307, in train
    boost

KeyboardInterrupt: 

In [None]:
# 预测并保存结果
y_pred = np.mean([gbm.predict(feats.iloc[n_original:, :-1]) for gbm in trained_gbms], axis=0)

In [None]:
df_submit["mRNA_remaining_pct"] = y_pred
df_submit.to_csv("submission.csv", index=False)