# Sentence Extraction from PMC articles

In [18]:
import lexas.sentence
import lexas.relation_extraction
import torch

# Pick the compute device: use CUDA when torch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [19]:
# Step 1: Extracting result sections from the articles
# 'extract_results' is given the article directory and the path of the file
# that will hold the extracted results sections.
article_dir = "./articles/"
result_sections_path = "./data/result_sections.txt"
lexas.sentence.extract_results(article_dir=article_dir, output_file=result_sections_path)

  0%|          | 0/7 [00:00<?, ?it/s]

In [20]:
# Step 2: Masking gene terms and experiments
# 'mask_gene_experiment' replaces gene terms and experiments with MASK tokens.
# Its input is the result-sections file from Step 1; the masked sentences are
# written to a separate file.
result_sections_path = "./data/result_sections.txt"
masked_sentences_path = "./data/masked_sentences.txt"
lexas.sentence.mask_gene_experiment(
    input_file_path=result_sections_path,
    output_file_path=masked_sentences_path,
)

Initializing dictionaries...
Done


0it [00:00, ?it/s]

In [21]:
# Step 3: Relation extraction using BioBERT
# 'predict' runs the BioBERT model over the masked sentences on the device
# selected at the top of the notebook and writes its predictions to a new file.
masked_sentences_path = "./data/masked_sentences.txt"
bert_predictions_path = "./data/masked_sentences_bert.txt"
lexas.relation_extraction.predict(
    device=device,
    input_filepath=masked_sentences_path,
    output_filepath=bert_predictions_path,
)

764it [01:30,  8.48it/s]


# Prediction model for genes

In [22]:
import lexas.prediction
import xgboost as xgb
import pickle
from sklearn.model_selection import train_test_split

In [23]:
# Step 1: Extracting context from experiments
# 'extract_context_from_experiments' takes the BioBERT prediction file and
# extracts the context in which each experiment mention was made, writing the
# result as a CSV for the XGBoost stage.
bert_predictions_path = "./data/masked_sentences_bert.txt"
experiments_csv_path = "./data/experiments_for_xgboost.csv"
lexas.prediction.extract_context_from_experiments(
    input_file=bert_predictions_path,
    output_file=experiments_csv_path,
)

455it [00:00, 172270.11it/s]

Done!





In [24]:
# Step 2: Loading feature data
# The 'feature_load' function is used to load feature data from various resources.
# It is called with no arguments and its return value is discarded, so this cell
# is run purely for its side effects.
# NOTE(review): presumably the loaded data is kept inside 'lexas.prediction' for
# the later 'select_features' call -- confirm against the library.
lexas.prediction.feature_load()

In [25]:
# Step 3: Selecting features
# 'select_features' receives the categorical and numerical feature groups to use.
# 'plus' is not passed here; it is handed to later steps as 'additional_features'.
cat_use = [
    "Chromosome",
    "GO",
    "MGI",
    "HPO",
    "OMIM",
    "TF",
    "iRefIndex",
    "Localization",
    "WebSter",
]
num_use = [
    "Tissue_expression",
    "Cancer_expression",
    "DepMap",
    "Word2Vec",
]
plus = ["String", "Funcoup", "GOSemSim"]
feature_list, gene_cat, gene_num = lexas.prediction.select_features(cat_use, num_use)

# Print feature information: a short preview of the feature list, the features
# attached to one example gene, and the names of the numerical feature groups.
preview_count = 10
example_gene = "CDK1"
print("List of features: ", feature_list[:preview_count])
print("\nFeatures assigned to a gene: ", gene_cat[example_gene][:preview_count])
print("\nNumerical features assigned to a gene: ", gene_num.keys())

List of features:  ['10p', '10q', '11p', '11q', '12p', '12q', '13q', '14p', '14q', '15q']

Features assigned to a gene:  ['10q', 'GO:0046686', 'GO:0065003', 'GO:0005634', 'GO:0030261', 'GO:0004674', 'GO:0000086', 'GO:0007098', 'GO:0060045', 'GO:0006281']

Numerical features assigned to a gene:  dict_keys(['Tissue_expression', 'Cancer_expression', 'DepMap', 'Word2Vec'])


In [26]:
# Step 4: Constructing the CSR matrix
# 'generate_experiment_tuples' builds positive and negative tuples from the
# experiment CSV, and 'construct_csr_matrix' transforms them, together with the
# selected features, into a format that can be processed by the XGBoost model.

# 'path_to_csv' was previously undefined, so this cell failed with a NameError
# on a fresh kernel. It points at the CSV written by
# 'extract_context_from_experiments' in Step 1 of this section.
path_to_csv = "./data/experiments_for_xgboost.csv"

# NOTE(review): 1990 and 2021 look like the first and last publication year to
# include, and negative_sampling=3 like the number of negatives per positive --
# confirm against the lexas documentation.
posi_tuple, nega_tuple = lexas.prediction.generate_experiment_tuples(
    path_to_csv, 1990, 2021, negative_sampling=3
)
X, y = lexas.prediction.construct_csr_matrix(
    posi_tuple,
    nega_tuple,
    gene_cat,
    feature_list,
    gene_num,
    additional_features=plus,
)

Constructing CSR matrix...  Done


In [32]:
# Step 5: Train-Test split
# Hold out 20% of the samples for evaluation. The split is stratified on the
# labels and uses a fixed random_state so it is reproducible.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [41]:
# Step 6: Model Training
# The XGBClassifier is used to train a model on the data. The model is then saved using the pickle library.
model = xgb.XGBClassifier(
    objective="binary:logistic",
    alpha=1e-3,
    min_child_weight=3,
    max_depth=10,
    n_estimators=40000,  # upper bound only; early stopping ends training much sooner
    n_jobs=-1,
    eta=0.03,
)

# Training stops once the validation log-loss has not improved for 3 rounds.
# NOTE(review): the held-out test split doubles as the early-stopping validation
# set here, so metrics later computed on X_test will be optimistic -- consider a
# separate validation split.
# NOTE(review): newer XGBoost releases expect 'early_stopping_rounds' on the
# constructor rather than on fit() -- confirm against the installed version.
model.fit(X_train, y_train, early_stopping_rounds=3, eval_set=[(X_test, y_test)])

# Use a context manager so the file handle is flushed and closed deterministically
# (the previous bare open() call left the handle to the garbage collector).
with open("./model/xgboost.pickle", "wb") as model_file:
    pickle.dump(model, model_file)

[0]	validation_0-logloss:0.68504
[1]	validation_0-logloss:0.67846
[2]	validation_0-logloss:0.67326
[3]	validation_0-logloss:0.66693
[4]	validation_0-logloss:0.66131
[5]	validation_0-logloss:0.65650
[6]	validation_0-logloss:0.65148
[7]	validation_0-logloss:0.64710
[8]	validation_0-logloss:0.64310
[9]	validation_0-logloss:0.63918
[10]	validation_0-logloss:0.63690
[11]	validation_0-logloss:0.63330
[12]	validation_0-logloss:0.62897
[13]	validation_0-logloss:0.62554
[14]	validation_0-logloss:0.62268
[15]	validation_0-logloss:0.61899
[16]	validation_0-logloss:0.61661
[17]	validation_0-logloss:0.61360
[18]	validation_0-logloss:0.61155
[19]	validation_0-logloss:0.60876
[20]	validation_0-logloss:0.60703
[21]	validation_0-logloss:0.60226
[22]	validation_0-logloss:0.60157




[23]	validation_0-logloss:0.59998
[24]	validation_0-logloss:0.59813
[25]	validation_0-logloss:0.59750
[26]	validation_0-logloss:0.59669
[27]	validation_0-logloss:0.59548
[28]	validation_0-logloss:0.59278
[29]	validation_0-logloss:0.58873
[30]	validation_0-logloss:0.58810
[31]	validation_0-logloss:0.58652
[32]	validation_0-logloss:0.58852
[33]	validation_0-logloss:0.58574
[34]	validation_0-logloss:0.58396
[35]	validation_0-logloss:0.58206
[36]	validation_0-logloss:0.57975
[37]	validation_0-logloss:0.58184
[38]	validation_0-logloss:0.58252
[39]	validation_0-logloss:0.58183


# Gene prediction for the next experiment

In [44]:
import os
import pickle
import lexas.prediction
import pandas as pd

# Step 1: Load model
# The 'pickle.load' function is used to load the previously saved XGBoost model.
model_filepath = "./model/xgboost.pickle"
if not os.path.exists(model_filepath):
    # FileNotFoundError is still an Exception subclass, so existing handlers keep working.
    raise FileNotFoundError("Model file does not exist: " + model_filepath)

# SECURITY: unpickling can execute arbitrary code; only load model files that
# were produced by this notebook or come from a trusted source.
# The context manager closes the file handle as soon as the model is loaded.
with open(model_filepath, "rb") as model_file:
    model = pickle.load(model_file)

In [45]:
# Step 2: Generate scores
# 'generate_scores' scores all genes in relation to the query gene, using the
# loaded XGBoost model and the same feature selection as during training.
query = "CEP63"
models = dict(xgboost=model)
scores = lexas.prediction.generate_scores(
    query,
    models,
    gene_cat,
    feature_list,
    gene_num,
    additional_features=plus,
)

In [46]:
# Step 3: Display result
# Build a DataFrame from the scores, order it by the XGBoost score from highest
# to lowest, and show the ten top-ranked rows as the cell output.
df = pd.DataFrame(scores)
ranked = df.sort_values(by="xgboost", ascending=False)
ranked.head(10)

Unnamed: 0,Symbol,xgboost
2939,CEP63,0.736764
2937,CEP57,0.6997
2913,CENPJ,0.690875
2941,CEP70,0.663008
3080,CHCHD2,0.645934
22507,NBN,0.625187
1797,BUB1,0.625187
12328,NDE1,0.618029
7682,HSPA8,0.616741
12933,NUP62,0.616092
