# Sentence Extraction from PMC articles

In [11]:
import lexas.sentence
import lexas.relation_extraction
import torch

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [12]:
# Step 1: Extracting result sections from the articles
# Reads the articles under ./articles/ and writes their result sections
# to ./data/result_sections.txt via 'extract_results'.
lexas.sentence.extract_results(article_dir="./articles/", output_file="./data/result_sections.txt")

  0%|          | 0/12 [00:00<?, ?it/s]

In [4]:
# Step 2: Masking gene terms and experiments
# 'mask_gene_experiment' takes the extracted result sections and writes a copy
# in which gene terms and experiments are replaced with MASK tokens.
lexas.sentence.mask_gene_experiment(input_file_path="./data/result_sections.txt", output_file_path="./data/masked_sentences.txt")

0it [00:00, ?it/s]

In [5]:
# Step 3: Relation extraction using BioBERT
# 'predict' runs the BioBERT relation model over the masked sentences on the
# selected device and writes its predictions to masked_sentences_bert.txt.
lexas.relation_extraction.predict(device=device, input_filepath="./data/masked_sentences.txt", output_filepath="./data/masked_sentences_bert.txt")

1517it [03:20,  7.55it/s]


# Prediction model for genes

In [1]:
import lexas.prediction
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [3]:
# Step 1: Extracting context from experiments
# 'extract_context_from_experiments' reads the BioBERT predictions and writes,
# for each experiment mention, the context it appeared in to a CSV file that
# the XGBoost steps below consume.
lexas.prediction.extract_context_from_experiments(input_file="./data/masked_sentences_bert.txt", output_file="./data/experiments_for_xgboost.csv")

824it [00:00, 181261.16it/s]

Done!





In [2]:
# Step 2: Loading feature data
# The 'feature_load' function is used to load feature data from various resources.
# It is called without arguments and its return value is not captured here.
# NOTE(review): presumably the loaded features are cached inside lexas.prediction
# for the later select_features / construct_csr_matrix calls -- confirm against
# the library. This cell must run before Step 3.
lexas.prediction.feature_load()

Loading categorical features...
Loading numerical features...
Loading string11_rwr.txt...
Loading funcoup5_rwr.txt...
Loading gosemsim.txt...


In [3]:
# Step 3: Selecting features
# Choose which categorical and numerical feature groups the model will use.
cat_use = [
    'Chromosome',
    'GO',
    'MGI',
    'HPO',
    'OMIM',
    'TF',
    'iRefIndex',
    'Localization',
    'WebSter',
]
num_use = [
    'Tissue_expression',
    'Cancer_expression',
    'DepMap',
    'Word2Vec',
]
# Extra feature groups; not passed to select_features, but handed to later
# steps through their 'additional_features' argument.
plus = [
    "String",
    "Funcoup",
    "GOSemSim",
]
feature_list, gene_cat, gene_num = lexas.prediction.select_features(
    cat_use,
    num_use,
)

# Quick sanity check of what was selected (first ten entries only).
print("List of features: ", feature_list[:10])
print("\nFeatures assigned to a gene: ", gene_cat["CDK1"][:10])
print("\nNumerical features assigned to a gene: ", gene_num.keys())

List of features:  ['10p', '10q', '11p', '11q', '12p', '12q', '13q', '14p', '14q', '15q']

Features assigned to a gene:  ['10q', 'GO:0046686', 'GO:0065003', 'GO:0005634', 'GO:0030261', 'GO:0004674', 'GO:0000086', 'GO:0007098', 'GO:0060045', 'GO:0006281']

Numerical features assigned to a gene:  dict_keys(['Tissue_expression', 'Cancer_expression', 'DepMap', 'Word2Vec'])


In [4]:
# Step 4: Constructing the CSR matrix
# Build positive/negative experiment tuples from the CSV, then turn them into
# the feature matrix X and label vector y that XGBoost is trained on.
# NOTE(review): 1990 and 2018 look like the publication-year range to sample
# from, and negative_sampling=3 like the negatives-per-positive ratio -- confirm
# against lexas.prediction.generate_experiment_tuples.
path_to_csv = "./data/experiments_for_xgboost.csv"
posi_tuple, nega_tuple = lexas.prediction.generate_experiment_tuples(
    path_to_csv,
    1990,
    2018,
    negative_sampling=3,
)
X, y = lexas.prediction.construct_csr_matrix(
    posi_tuple,
    nega_tuple,
    gene_cat,
    feature_list,
    gene_num,
    additional_features=plus,
)

793it [00:00, 757995.23it/s]

Constructing CSR matrix...  




Done


In [5]:
# Step 5: Train-Test split
# Hold out 20% of the samples for evaluation. The split is stratified on the
# labels and seeded (random_state=42) so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [9]:
# Step 6: Model Training
# Train an XGBoost binary classifier on the training split and persist it with
# joblib so that the prediction section can reload it from ./model/.
import os

from joblib import dump

model = xgb.XGBClassifier(
    objective="binary:logistic",
    alpha=1e-3,
    min_child_weight=3,
    max_depth=10,
    # Upper bound only: early stopping (below) ends training much sooner.
    n_estimators=40000,
    n_jobs=-1,
    eta=0.03,
    # Configured on the estimator instead of being passed to fit(): the
    # fit-time keyword was deprecated in XGBoost 1.6 and dropped in later releases.
    early_stopping_rounds=3,
)
# NOTE(review): the held-out test split doubles as the early-stopping
# validation set, so scores measured on X_test afterwards are optimistic.
# Consider carving a separate validation split out of the training data.
model.fit(X_train, y_train, eval_set=[(X_test, y_test)])

# Save the model to a file
model_path = "./model/xgboost.joblib"
# Create ./model/ on a fresh checkout; otherwise dump() fails after training.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
dump(model, model_path)



[0]	validation_0-logloss:0.68517
[1]	validation_0-logloss:0.68236
[2]	validation_0-logloss:0.67584
[3]	validation_0-logloss:0.66625
[4]	validation_0-logloss:0.65965
[5]	validation_0-logloss:0.65783
[6]	validation_0-logloss:0.65276
[7]	validation_0-logloss:0.65167
[8]	validation_0-logloss:0.64617
[9]	validation_0-logloss:0.64540
[10]	validation_0-logloss:0.63756
[11]	validation_0-logloss:0.63376
[12]	validation_0-logloss:0.63340
[13]	validation_0-logloss:0.63384
[14]	validation_0-logloss:0.63589
[15]	validation_0-logloss:0.63258
[16]	validation_0-logloss:0.63111
[17]	validation_0-logloss:0.62823
[18]	validation_0-logloss:0.62956
[19]	validation_0-logloss:0.62701
[20]	validation_0-logloss:0.62595
[21]	validation_0-logloss:0.62376
[22]	validation_0-logloss:0.62526
[23]	validation_0-logloss:0.62813
[24]	validation_0-logloss:0.62342
[25]	validation_0-logloss:0.62498
[26]	validation_0-logloss:0.62800


['./model/xgboost.joblib']

# Gene prediction for the next experiment

In [10]:
import os
import lexas.prediction
import pandas as pd
from joblib import load

# Step 1: Load model
# 'model_name' also names the score column and the result directory below.
model_name = "xgboost"
model_filepath = f"./model/{model_name}.joblib"

# Fail fast with a readable message when the trained model has not been saved.
if not os.path.exists(model_filepath):
    raise Exception("Model file does not exist: " + model_filepath)
model = load(model_filepath)

In [19]:
# Step 2: Generate scores
# Score all genes in relation to the query gene with the loaded XGBoost model.
# Relies on gene_cat, feature_list, gene_num and plus from the training section.
query = "CEP152"
scores = lexas.prediction.generate_scores(
    query,
    model_name,
    model,
    gene_cat,
    feature_list,
    gene_num,
    additional_features=plus,
)

In [20]:
# Step 3: Display result
# Save the scores (in their original order) to ./result/<model_name>/<query>.csv,
# then show the ten rows with the highest XGBoost score.
output_dir = f"./result/{model_name}"
os.makedirs(output_dir, exist_ok=True)

df = pd.DataFrame(scores)
df.to_csv(os.path.join(output_dir, query + ".csv"), index=False)
df.sort_values(by=model_name, ascending=False).head(10)

Unnamed: 0,Symbol,xgboost
1799,BUB1B-PAK6,0.649517
17144,SLC8A3,0.649517
5551,FAM86C1P,0.649517
17598,SMTNL2,0.649517
6965,GRPEL1,0.637327
6050,FN1,0.637327
15751,REM1,0.637327
11877,MST1R,0.637327
7753,IARS2,0.637327
6889,GPRIN2,0.637327
