# Sentence Extraction from PMC articles

In [11]:
import lexas.sentence
import lexas.relation_extraction
import torch

# Pick the torch device: the CUDA GPU when torch reports one as available, the CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [12]:
# Step 1: Extract the result sections from the articles
# 'extract_results' is pointed at the article directory ("./articles/") and writes
# the extracted result sections to "./data/result_sections.txt".
lexas.sentence.extract_results(article_dir="./articles/", output_file="./data/result_sections.txt")

  0%|          | 0/12 [00:00<?, ?it/s]

In [4]:
# Step 2: Mask gene terms and experiments
# 'mask_gene_experiment' reads the extracted result sections and writes a copy in which
# gene terms and experiments are replaced with MASK tokens.
lexas.sentence.mask_gene_experiment(
    output_file_path="./data/masked_sentences.txt",
    input_file_path="./data/result_sections.txt",
)

0it [00:00, ?it/s]

In [5]:
# Step 3: Relation extraction with BioBERT
# 'predict' runs the BioBERT relation-extraction model over the masked sentences on the
# selected torch device and writes its predictions to the output file.
lexas.relation_extraction.predict(
    input_filepath="./data/masked_sentences.txt",
    output_filepath="./data/masked_sentences_bert.txt",
    device=device,
)

1517it [03:20,  7.55it/s]


# Prediction model for genes

In [2]:
import os
import pickle

import xgboost as xgb
from sklearn.model_selection import train_test_split

import lexas.prediction

In [3]:
# Step 1: Extract the context of each experiment
# 'extract_context_from_experiments' reads the BioBERT predictions and writes, as a CSV,
# the context in which each experiment mention was made.
lexas.prediction.extract_context_from_experiments(
    output_file="./data/experiments_for_xgboost.csv",
    input_file="./data/masked_sentences_bert.txt",
)

824it [00:00, 181261.16it/s]

Done!





In [4]:
# Step 2: Loading feature data
# The 'feature_load' function is used to load feature data from various resources.
# It is called without arguments and its return value is not captured here; the
# features are obtained afterwards through 'lexas.prediction.select_features'
# (presumably from state kept inside the lexas.prediction module — TODO confirm).
lexas.prediction.feature_load()

Loading categorical features...
Loading numerical features...
Loading string11_rwr.txt...
Loading funcoup5_rwr.txt...
Loading gosemsim.txt...


In [5]:
# Step 3: Select the features used by the model
# 'select_features' receives the names of the categorical and numerical feature sources
# and returns the feature list together with the per-gene categorical and numerical
# feature lookups.
cat_use = [
    "Chromosome", "GO", "MGI", "HPO", "OMIM",
    "TF", "iRefIndex", "Localization", "WebSter",
]
num_use = ["Tissue_expression", "Cancer_expression", "DepMap", "Word2Vec"]
# Extra feature sources: not passed to select_features, but handed to later steps
# through their `additional_features` argument.
plus = ["String", "Funcoup", "GOSemSim"]
feature_list, gene_cat, gene_num = lexas.prediction.select_features(cat_use, num_use)

# Show a small sample of what was selected
print("List of features: ", feature_list[:10])
print("\nFeatures assigned to a gene: ", gene_cat["CDK1"][:10])
print("\nNumerical features assigned to a gene: ", gene_num.keys())

List of features:  ['10p', '10q', '11p', '11q', '12p', '12q', '13q', '14p', '14q', '15q']

Features assigned to a gene:  ['10q', 'GO:0046686', 'GO:0065003', 'GO:0005634', 'GO:0030261', 'GO:0004674', 'GO:0000086', 'GO:0007098', 'GO:0060045', 'GO:0006281']

Numerical features assigned to a gene:  dict_keys(['Tissue_expression', 'Cancer_expression', 'DepMap', 'Word2Vec'])


In [49]:
# Step 4: Build the CSR matrix
# 'generate_experiment_tuples' produces the positive and negative tuples from the
# experiment CSV; 'construct_csr_matrix' then turns them, together with the selected
# features, into the X / y pair that the XGBoost model is trained on.
# NOTE(review): 1990 and 2018 look like a year range and negative_sampling=3 like a
# negatives-per-positive ratio — confirm against lexas.prediction.
path_to_csv = "./data/experiments_for_xgboost.csv"
posi_tuple, nega_tuple = lexas.prediction.generate_experiment_tuples(
    path_to_csv,
    1990,
    2018,
    negative_sampling=3,
)
X, y = lexas.prediction.construct_csr_matrix(
    posi_tuple,
    nega_tuple,
    gene_cat,
    feature_list,
    gene_num,
    additional_features=plus,
)

Constructing CSR matrix...  Done


In [50]:
# Step 5: Train-Test split
# Hold out 20% of the samples for testing. The split is stratified on y and seeded
# (random_state=42) so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [51]:
# Step 6: Model Training
# Train an XGBClassifier with early stopping, then persist it with pickle.
# NOTE(review): the held-out test split is also used as the early-stopping
# validation set (eval_set), so any score later reported on X_test is
# optimistically biased; consider carving out a separate validation split.
# NOTE(review): xgboost >= 2.0 no longer accepts `early_stopping_rounds` in
# fit(); on those versions pass it to the XGBClassifier constructor instead.

# Create the output directory up front so that a long training run cannot be
# lost to a missing directory at save time.
os.makedirs("./model", exist_ok=True)

model = xgb.XGBClassifier(
    objective="binary:logistic",
    alpha=1e-3,
    min_child_weight=3,
    max_depth=10,
    # Upper bound only; early stopping ends training once the validation
    # logloss has not improved for 3 consecutive rounds.
    n_estimators=40000,
    n_jobs=-1,
    eta=0.03,
)
model.fit(X_train, y_train, early_stopping_rounds=3, eval_set=[(X_test, y_test)])

# A context manager guarantees the file is flushed and closed, instead of
# leaving the handle to the garbage collector.
with open("./model/xgboost.pickle", "wb") as model_file:
    pickle.dump(model, model_file)

[0]	validation_0-logloss:0.68921
[1]	validation_0-logloss:0.68372
[2]	validation_0-logloss:0.67863
[3]	validation_0-logloss:0.67482
[4]	validation_0-logloss:0.67037
[5]	validation_0-logloss:0.66708
[6]	validation_0-logloss:0.66397
[7]	validation_0-logloss:0.66194
[8]	validation_0-logloss:0.65564
[9]	validation_0-logloss:0.65401
[10]	validation_0-logloss:0.65295




[11]	validation_0-logloss:0.65033
[12]	validation_0-logloss:0.64793
[13]	validation_0-logloss:0.64554
[14]	validation_0-logloss:0.64461
[15]	validation_0-logloss:0.64185
[16]	validation_0-logloss:0.64201
[17]	validation_0-logloss:0.63922
[18]	validation_0-logloss:0.64004
[19]	validation_0-logloss:0.63187
[20]	validation_0-logloss:0.63142
[21]	validation_0-logloss:0.62520
[22]	validation_0-logloss:0.62588
[23]	validation_0-logloss:0.61977
[24]	validation_0-logloss:0.61879
[25]	validation_0-logloss:0.61912
[26]	validation_0-logloss:0.61847
[27]	validation_0-logloss:0.61328
[28]	validation_0-logloss:0.61145
[29]	validation_0-logloss:0.61081
[30]	validation_0-logloss:0.60920
[31]	validation_0-logloss:0.60930
[32]	validation_0-logloss:0.60737
[33]	validation_0-logloss:0.60001
[34]	validation_0-logloss:0.60029
[35]	validation_0-logloss:0.59325
[36]	validation_0-logloss:0.59201
[37]	validation_0-logloss:0.59231
[38]	validation_0-logloss:0.59044
[39]	validation_0-logloss:0.58854
[40]	validatio

# Gene prediction for the next experiment

In [54]:
import os
import pickle
import lexas.prediction
import pandas as pd

# Step 1: Load model
# Load the pickled XGBoost model from ./model/<model_name>.pickle.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# model files that you created yourself or otherwise trust.
model_name = "xgboost"
model_filepath = f"./model/{model_name}.pickle"
if not os.path.exists(model_filepath):
    # FileNotFoundError is a subclass of Exception, so handlers written for the
    # previous generic Exception still catch it.
    raise FileNotFoundError("Model file does not exist: " + model_filepath)
# The context manager closes the file handle even if unpickling fails.
with open(model_filepath, "rb") as model_file:
    model = pickle.load(model_file)

In [67]:
# Step 2: Generate scores
# 'generate_scores' uses the loaded XGBoost model and the selected features to score
# all genes in relation to the query gene.
query = "CEP152"
scores = lexas.prediction.generate_scores(
    query,
    model_name,
    model,
    gene_cat,
    feature_list,
    gene_num,
    additional_features=plus,
)

In [68]:
# Step 3: Save and display the result
# The scores are written (in their original order) to ./result/<model_name>/<query>.csv;
# the cell then displays the ten rows with the highest model score.
output_dir = f"./result/{model_name}"
os.makedirs(output_dir, exist_ok=True)

df = pd.DataFrame(scores)
df.to_csv(os.path.join(output_dir, f"{query}.csv"), index=False)
df.sort_values(by=model_name, ascending=False).head(10)

Unnamed: 0,Symbol,xgboost
14237,PIFO,0.620295
3418,CNN2,0.599426
5346,ESRRA,0.595573
9774,LYZ,0.59385
13765,PARP3,0.58756
22774,HP,0.58528
5433,F2,0.579334
22326,DBI,0.57263
18622,STON1,0.57263
17483,SLX1B-SULT1A4,0.571906


In [62]:
# Display the 100 rows with the highest model score.
df.sort_values(by=model_name, ascending=False).head(100)

Unnamed: 0,Symbol,xgboost
2586,CCNQ,0.771731
12765,NR1D1,0.769125
8385,ITGB3,0.767917
16049,RNF207,0.761760
16336,RTL8A,0.761760
...,...,...
5502,FAM43A,0.749447
17364,SLC35E4,0.749447
2991,CETN4P,0.749447
8451,JDP2,0.749447
