# Sentence Extraction from PMC articles

In [1]:
import tqdm
import lexas.sentence
article_dir = "./articles/"

In [2]:
#Result extraction
import os
with open("./data/result_sections.txt", "w") as f:
    for file in tqdm.tqdm(os.listdir(article_dir)):
        pmcid = file.split(".")[0]
        year,sentences = lexas.sentence.parse(os.path.join(article_dir,file))
        segmented_sentences = lexas.sentence.segmentation(sentences)
        if year != 0:
            f.write("\t".join([year, pmcid, segmented_sentences]) + "\n")

100%|██████████| 7/7 [00:06<00:00,  1.11it/s]


In [3]:
#Masking gene terms and experiments
dic_hgnc,dic_expe = lexas.sentence.initialize_dictionaries()
with open("./data/masked_sentences.txt", "w") as f:
    with open("./data/result_sections.txt", "r") as f2:    
        for line in tqdm.tqdm(f2):
            year,pmcid,sentences = line.strip("\n").split("\t")
            for sentence in sentences.split("#####"):
                masked = lexas.sentence.mask(sentence,dic_hgnc,dic_expe)
                for m in masked:
                    f.write("\t".join([year,pmcid] + m[1:3]+[sentence] + m[0:1])+"\n")

7it [00:00, 74.46it/s]


In [2]:
#Relation extraction using bio-BERT
import lexas.relation_extraction
import torch
device=torch.device("cpu")#device=torch.device("cuda")
input_file="./data/masked_sentences.txt"
output_file="./data/masked_sentences_bert.txt"

lexas.relation_extraction.predict(device,input_file,output_file)

# Prediction model for genes

In [2]:
import lexas.prediction

In [1]:
#Extraction of experment_context
input_file="./data/masked_sentences_bert.txt"
output_file="./data/experiments_for_xgboost.csv"
lexas.prediction.experiment_context(input_file,output_file)

NameError: name 'lexas' is not defined

In [3]:
#Loading features
lexas.prediction.feature_load()
symbols = lexas.prediction.symbols

cat_use = ['Chromosome', 'GO', 'MGI', 'HPO', 'OMIM', 'TF', 'iRefIndex', 'Localization', 'WebSter']
num_use = ['Tissue_expression', 'Cancer_expression', 'DepMap', 'Word2Vec']

feature_list,all_cat,cat_num,all_num = lexas.prediction.choose_feature(cat_use,num_use)

Loading categorical features...
Loading numerical features...
Loading STRING...


In [4]:
#Constracting csr matrix
path_to_csv ="./data/experiments_for_xgboost.csv"

#Experiment tuples for training 
#from 2010-2020
#Return tuples of positive examples and 10 times the number of negative examples
posi_tuple,nega_tuple = lexas.prediction.make_tuple(path_to_csv,2010,2020,sampling=None)

#Experiment tuples for validation
#from 2021-2023
posi_tuple_dev,nega_tuple_dev = lexas.prediction.make_tuple(path_to_csv,2021,2023,sampling=None)

#Constructing CSR sparse matrix for training
X,y = lexas.prediction.make_csr(posi_tuple,nega_tuple,all_cat,cat_num,all_num)
X_dev,y_dev = lexas.prediction.make_csr(posi_tuple_dev,nega_tuple_dev,all_cat,cat_num,all_num)

Constructing CSR matrix...  Done
Constructing CSR matrix...  Done


In [8]:
#Training a SVM model
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV
import pickle

reg = 0.001
lrn = SGDClassifier(loss='hinge', alpha=reg,penalty='l2')
lrn.fit(X,y)
calibrator = CalibratedClassifierCV(lrn, cv='prefit')
model=calibrator.fit(X,y)
pickle.dump(model, open("./model/svm_{}.pickle".format(reg), "wb"))

#AUC
from sklearn.metrics import roc_auc_score
y_pred = model.predict_proba(X_dev)[:,1]
print("AUC:",roc_auc_score(y_dev,y_pred))

AUC:  0.6839728564638003


# Gene prediction for the next experiment

In [5]:
#Before running, please load the features
import os
import lexas.prediction
import pickle
model = pickle.load(open("./model/svm_0.001.pickle","rb"))
models={"svm-0.01":model}
query="CEP57"

scores = lexas.prediction.scoring(query,models,all_cat,cat_num,all_num)

In [6]:
#Result
import pandas as pd
df = pd.DataFrame(scores)
df

Unnamed: 0,Symbol,svm-0.01
0,A1BG,0.137148
1,A1BG-AS1,0.042312
2,A1CF,0.061554
3,A2M,0.024529
4,A2ML1,0.013632
...,...,...
22938,PEPN,0.042129
22939,PYK,0.042086
22940,TAX,0.148369
22941,TMPRSS3,0.015149
