In [1]:
from os import listdir
from os.path import isfile, isdir, join
from lxml import etree
import pandas as pd
import tarfile
import gzip
import time
import csv
import re
import sys
import math
import nltk
import string
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
import pyltr

# TESTING pyltr with LETOR

In [None]:
folder="/Users/ari/Downloads/MQ2007/Fold1"

In [None]:
with open(join(folder,'train.txt')) as trainfile, open(join(folder,'vali.txt')) as valifile, open(join(folder,'test.txt')) as evalfile:
    TX, Ty, Tqids, _ = pyltr.data.letor.read_dataset(trainfile)
    VX, Vy, Vqids, _ = pyltr.data.letor.read_dataset(valifile)
    EX, Ey, Eqids, _ = pyltr.data.letor.read_dataset(evalfile)

In [None]:
metric = pyltr.metrics.NDCG(k=10)

# Only needed if you want to perform validation (early stopping & trimming)
monitor = pyltr.models.monitors.ValidationMonitor(VX, Vy, Vqids, metric=metric, stop_after=250)

model = pyltr.models.LambdaMART(
    metric=metric,
    n_estimators=1000,
    learning_rate=0.02,
    max_features=0.5,
    query_subsample=0.5,
    max_leaf_nodes=10,
    min_samples_leaf=64,
    verbose=1,
)

model.fit(TX, Ty, Tqids, monitor=monitor)

In [None]:
Epred = model.predict(EX)
print('Random ranking:', metric.calc_mean_random(Eqids, Ey))
print('Our model:', metric.calc_mean(Eqids, Ey, Epred))

# Reading GS Files

In [3]:
gsPath = "/Users/ari/Downloads/TREC/trec2018/results/goldstandard"
trainYear = "2017"
testYear = "2018"

In [4]:
gsTrainFile = join(join(gsPath, trainYear),"20180622processedGoldStandardXMLTXT.tsv")
gsTestFile = join(join(gsPath, testYear),"20190111processedGoldStandardPub2018.tsv")

In [5]:
trainData = pd.read_csv(gsTrainFile, sep = '\t', encoding='utf8', dtype={'trec_doc_id':object})
trainData.fillna("", inplace=True)
trainData.head()

Unnamed: 0.1,Unnamed: 0,trec_topic_number,trec_doc_id,pm_rel_desc,disease_desc,gene1_annotation_desc,gene1_name,gene2_annotation_desc,gene2_name,gene3_annotation_desc,...,title,abstract,major_mesh,minor_mesh,trec_topic_disease,trec_topic_age,trec_topic_sex,trec_topic_other1,trec_topic_other2,trec_topic_other3
0,0,1,10065107,Human PM,Exact,Missing Gene,CDK4 Amplification,,,,...,[A case of metastatic liposarcoma originating ...,We reported a 36-year-old woman with metastati...,,Adult;Antineoplastic Combined Chemotherapy Pro...,Liposarcoma,38-year-old,male,GERD,,
1,1,1,10101594,Human PM,More General,Exact,CDK4 Amplification,,,,...,Analysis of SAS gene and CDK4 and MDM2 protein...,The region q13-15 of chromosome 12 contains SA...,Nuclear Proteins,"Adolescent;Adult;Chromosomes, Human, Pair 12;C...",Liposarcoma,38-year-old,male,GERD,,
2,2,1,10220412,Human PM,More Specific,Missing Gene,CDK4 Amplification,,,,...,Induction of a secreted protein by the myxoid ...,"The TLS-CHOP oncoprotein, found in the majorit...",CCAAT-Enhancer-Binding Proteins;Gene Expressio...,"Animals;Cells, Cultured;Cloning, Molecular;DNA...",Liposarcoma,38-year-old,male,GERD,,
3,3,1,10323080,Human PM,More General,Exact,CDK4 Amplification,,,,...,"Mutations of TP53, amplification of EGFR, MDM2...",We investigated the frequency and mutual relat...,Gene Deletion;Nuclear Proteins,Cyclin-Dependent Kinase 4;Cyclin-Dependent Kin...,Liposarcoma,38-year-old,male,GERD,,
4,4,1,10466061,Human PM,Exact,Missing Gene,CDK4 Amplification,,,,...,[A case of advanced retroperitoneal dedifferen...,We report a case of retroperitoneal dedifferen...,,"Antineoplastic Agents, Alkylating/administrati...",Liposarcoma,38-year-old,male,GERD,,


In [6]:
testValData = pd.read_csv(gsTestFile, sep = '\t', encoding='utf8', dtype={'trec_doc_id':object})
testValData.fillna("", inplace=True)
testValData.head()

Unnamed: 0.1,Unnamed: 0,trec_topic_number,trec_doc_id,pm_rel_desc,disease_desc,gene1_annotation_desc,gene1_name,gene2_annotation_desc,gene2_name,gene3_annotation_desc,...,demographics_desc,other_desc,relevance_score,title,abstract,major_mesh,minor_mesh,trec_topic_disease,trec_topic_age,trec_topic_sex
0,0,1,1007359,Human PM,More Specific,Missing Gene,BRAF (V600E),,,,...,Matches,Not Discussed,0,[Primary multiple malignant melanomas of unusu...,"In 1975, 117 patients with malignant melanoma ...","Melanoma/pathology;Neoplasms, Multiple Primary...",Adult;Aged;Female;Humans;Male;Middle Aged;Skin...,melanoma,64-year-old,male
1,1,1,10833951,Human PM,More Specific,Missing Gene,BRAF (V600E),,,,...,Excludes,Not Discussed,0,[Malignant melanomas and young men].,Medullary thyroid cancer (MTC) is a distinct C...,Melanoma/epidemiology;Melanoma/etiology;Melano...,Adult;Age Factors;Humans;Male;Risk Factors,melanoma,64-year-old,male
2,2,1,11381855,Human PM,More Specific,Missing Gene,BRAF (V600E),,,,...,Excludes,Not Discussed,0,Update on malignant melanoma in children.,Malignant melanoma is a rare event in children...,Melanoma/diagnosis;Melanoma/etiology;Melanoma/...,"Adolescent;Child;Child, Preschool;Female;Human...",melanoma,64-year-old,male
3,3,1,1234252,Not PM,,,,,,,...,,,0,[Triage].,"Multiple endocrine neoplasia, type 2B (MEN 2B)...",Disasters;Emergency Medical Services;Triage,Austria;Humans,melanoma,64-year-old,male
4,4,1,1234878,Not PM,,,,,,,...,,,0,Malignant melanoma--an overview.,"Multiple endocrine neoplasia, type 2B (MEN 2B)...",Melanoma/diagnosis;Melanoma/therapy;Skin Neopl...,Female;Humans;Immunotherapy,melanoma,64-year-old,male


# Preprocessing the data

## Functions to tokenize, remove stop words, get stemms

In [7]:
# Get Stopwords
nltk.download('stopwords')
nltk.download('punkt')
stopWords = stopwords.words('english')

def tokenizePorter(text):
    tokens = word_tokenize(text)
    stems = []
    stemmer = PorterStemmer()
    for item in tokens:
        if item not in stopWords: 
            stems.append(stemmer.stem(item))
    return ' '.join(stems)

def tokenizeSnowball(text):
    tokens = word_tokenize(text)
    stems = []
    stemmer = SnowballStemmer("english")
    for item in tokens:
        if item not in stopWords: 
            stems.append(stemmer.stem(item))
    return ' '.join(stems)

[nltk_data] Downloading package stopwords to /Users/ari/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/ari/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## TrainData

In [8]:
pd.options.display.max_rows
pd.set_option('display.max_colwidth', -1)

# Preprocessing the Text
removePunctuation = str.maketrans('\n', ' ', string.punctuation)

# Transforms the text to lower case, remove punctuations, get the stemms of words 
trainData['title_abstract_mesh'] = trainData[['title', 'abstract', "major_mesh", "minor_mesh"]].apply(lambda x: ''.join(re.sub(r';|\/', ' ', x.to_string(index=False).lower()).translate(removePunctuation)), axis=1)
trainData['title_abstract_mesh_stemmed'] = trainData['title_abstract_mesh'].apply(tokenizeSnowball)

np.unique(trainData['pm_rel_desc'])

trainData["qid"] = "0"
trainData["qid"][trainData["pm_rel_desc"] == "Human PM"] = "1"
trainData["qid"][trainData["pm_rel_desc"] == "Animal PM"] = "1"

trainDataSliced = trainData[['relevance_score','qid','title_abstract_mesh_stemmed']]
trainDocId = trainData[['trec_doc_id']]
trainDataSliced.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,relevance_score,qid,title_abstract_mesh_stemmed
0,0,1,case metastat liposarcoma origin retroperitoneum success treat combin chemotherapi report 36yearold woman metastat liposarcoma origin retroperitoneum respond well adjuv chemotherapi primari tumor remov surgeri two month later patient develop metastasi brain lung four month later metastat liposarcoma brain general extrem rare patient treat combin chemotherapi use cyclophosphamid vincristin adriamycin dacarbazin cyvad examin former two drug altern vindesin ifosfamid anoth regimen cisplatin etoposid given threeweek interv result metastas total disappear recurr lesion note two year although role chemotherapi liposarcoma well defin littl data support use adjuv set combin chemotherapi seem effect advanc liposarcoma adult antineoplast combin chemotherapi protocol therapeut use brain neoplasm drug therapi brain neoplasm secondari cyclophosphamid administr dosag dacarbazin administr dosag doxorubicin administr dosag drug administr schedul femal human liposarcoma drug therapi liposarcoma secondari lung neoplasm drug therapi lung neoplasm secondari remiss induct retroperiton neoplasm patholog vincristin administr dosag
1,1,1,analysi sas gene cdk4 mdm2 protein lowgrad osteosarcoma region q1315 chromosom 12 contain sas cdk4 mdm2 gene rearrang amplifi varieti human sarcoma studi evalu sas gene amplif mdm2 cdk4 protein express 20 tumor sampl central lowgrad osteosarcoma 16 primari 3 recurr 1 lung metastasi sas amplif analyz quantit polymeras chain reaction pcr paraffinembed sampl mdm2 cdk4 protein express evalu immunohistochemistri mdm2 cdk4 protein found strong express 35 65 respect sampl sas found amplifi 15 sampl find indic gene may involv tumorigenesi progress lowgrad osteosarcoma nuclear protein adolesc adult chromosom human pair 12 cyclindepend kinas 4 cyclindepend kinas genet cyclindepend kinas metabol dna neoplasm isol purif femal gene express human immunohistochemistri male membran protein genet membran protein metabol middl age neoplasm protein genet neoplasm protein metabol osteosarcoma genet osteosarcoma metabol osteosarcoma patholog polymeras chain reaction protooncogen protein genet protooncogen protein metabol protooncogen protein cmdm2 tetraspanin
2,0,1,induct secret protein myxoid liposarcoma oncogen tlschop oncoprotein found major human myxoid liposarcoma consist fusion transcript factor chop gadd153 n terminus rnabind protein tls fus clinic correl vitro transform assay indic n terminus tls play import role oncogenesi tlschop howev activ attribut oncoprotein inhibit bind transcript factor c ebp class certain adipogen target gene function tlschop share nononcogen chop protein report isol gene dol54 activ primari fibroblast express tlschop dol54 express neoplast compon human myxoid liposarcoma increas tumorigen cell inject nude mice activ dol54 requir intact dnabind dimer domain tlschop suitabl cellular dimer partner depend tls n terminus normal adipocyt differenti associ earli transient express dol54 gene encod secret protein tight associ cell surfac extracellular matrix tlschop thus lead unschedul express gene normal associ adipocyt differenti ccaatenhancerbind protein gene express regul neoplast rnabind protein fus anim cell cultur clone molecular dnabind protein genet dnabind protein metabol fibroblast metabol human liposarcoma myxoid genet liposarcoma myxoid metabol mice molecular sequenc data neoplasm protein genet neoplasm protein secret nuclear protein genet oncogen protein fusion genet transcript factor chop
3,1,1,mutat tp53 amplif egfr mdm2 cdk4 delet cdkn2a malign astrocytoma investig frequenc mutual relationship molecular alter 33 malign astrocytoma 28 glioblastoma 5 anaplast astrocytoma genet alter analyz delet cdkn2a p16 gene tp53 mutat amplif egfr mdm2 cdk4 common genet alter egfr amplif reveal 15 case 45 tp53 mutat identifi 9 case 27 cdkn2 p16 delet detect 13 case 41 either mdm2 cdk4 amplif less frequent identifi 4 12 1 3 case respect 15 case show amplif egfr 9 cdkn2 p16 delet 60 p 004 hand cdkn2 p16 delet egfr amplif rare occur tp53 mutat 2 14 case cdkn2 p16 delet 14 result confirm exist least two differ pathway lead format glioblastoma gene delet nuclear protein cyclindepend kinas 4 cyclindepend kinas inhibitor p16 genet cyclindepend kinas genet glioblastoma genet human mutat neoplasm protein genet polymeras chain reaction protooncogen protein genet protooncogen protein cmdm2 receptor epiderm growth factor genet retrospect studi tumor suppressor protein p53 genet
4,0,1,case advanc retroperiton dedifferenti liposarcoma treat effect highdos ifosfamid report case retroperiton dedifferenti liposarcoma treat effect highdos ifosfamid 59yearold man receiv tumorectomi right nephrectomi retroperiton liposarcoma twentytwo month oper liver metastasi resect incomplet three month later right pleural retroperiton periton metastas appear 6 cycl highdos ifosfamid therapi tumor reduc partial ten cycl chemotherapi administ tumor show regrowth 14 month administr high dose ifosfamid combin chemotherapi ifosfamid doxorubicin etoposid effect regrowth tumor antineoplast agent alkyl administr dosag antineoplast combin chemotherapi protocol therapeut use combin modal therapi drug administr schedul human ifosfamid administr dosag liposarcoma drug therapi liposarcoma secondari liposarcoma surgeri liver neoplasm secondari male middl age periton neoplasm secondari pleural neoplasm secondari retroperiton neoplasm drug therapi retroperiton neoplasm patholog retroperiton neoplasm surgeri


## Test Data Only (No Validation)

In [None]:
pd.options.display.max_rows
pd.set_option('display.max_colwidth', -1)

# Preprocessing the Text
removePunctuation = str.maketrans('\n', ' ', string.punctuation)

# Transforms the text to lower case, remove punctuations, get the stemms of words 
testValData['title_abstract_mesh'] = testValData[['title', 'abstract', "major_mesh", "minor_mesh"]].apply(lambda x: ''.join(re.sub(r';|\/', ' ', x.to_string(index=False).lower()).translate(removePunctuation)), axis=1)
testValData['title_abstract_mesh_stemmed'] = testValData['title_abstract_mesh'].apply(tokenizeSnowball)

np.unique(testValData['pm_rel_desc'])

testValData["qid"] = "0"
testValData["qid"][testValData["pm_rel_desc"] == "Human PM"] = "1"
testValData["qid"][testValData["pm_rel_desc"] == "Animal PM"] = "1"

testData = testValData[['relevance_score','qid','title_abstract_mesh_stemmed']]
testDocId = testValData[['trec_doc_id']]
testData.head()

## Test and Validation Data

In [9]:
pd.options.display.max_rows
pd.set_option('display.max_colwidth', -1)

# Preprocessing the Text
removePunctuation = str.maketrans('\n', ' ', string.punctuation)

# Transforms the text to lower case, remove punctuations, get the stemms of words 
testValData['title_abstract_mesh'] = testValData[['title', 'abstract', "major_mesh", "minor_mesh"]].apply(lambda x: ''.join(re.sub(r';|\/', ' ', x.to_string(index=False).lower()).translate(removePunctuation)), axis=1)
testValData['title_abstract_mesh_stemmed'] = testValData['title_abstract_mesh'].apply(tokenizeSnowball)

np.unique(testValData['pm_rel_desc'])

testValData["qid"] = "0"
testValData["qid"][testValData["pm_rel_desc"] == "Human PM"] = "1"
testValData["qid"][testValData["pm_rel_desc"] == "Animal PM"] = "1"

testValDataSliced = testValData[['relevance_score','qid','title_abstract_mesh_stemmed', 'trec_doc_id']]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [10]:
testValDataSliced.head(1)

Unnamed: 0,relevance_score,qid,title_abstract_mesh_stemmed,trec_doc_id
0,0,1,primari multipl malign melanoma unusu long durat 1975 117 patient malign melanoma attent depart dermatolog univers cologn three superfici spread melanoma ssm long periodon patient probabl 25 year patient anoth primari malign melanoma develop though metastasi appear ssm give protect melanoma superfici nodular type comparison 70yearold patient pigment tumour 40 year sudden spread five year ago howev least metastas regress form vitiligolik lesion surround deposit melanin macrophag case spontan regress protect patient new metastas lymph node probabl liver well melanoma patholog neoplasm multipl primari patholog skin neoplasm patholog adult age femal human male middl age skin patholog time factor,1007359


In [11]:
# split into validation and testing
testDataSliced, valDataSliced, yT, yV = train_test_split(testValDataSliced, testValDataSliced['qid'], test_size=0.25)
testDataSliced.shape

(16821, 4)

In [12]:
valDataSliced.shape

(5608, 4)

In [13]:
valDataSliced.head(1)

Unnamed: 0,relevance_score,qid,title_abstract_mesh_stemmed,trec_doc_id
22094,0,1,incorpor flt3 inhibitor acut myeloid leukemia treatment regimen fmsliketyrosin kinase3 flt3 mutat found 30 case acut myeloid leukemia confer increas relaps rate reduc overal surviv target tyrosin kinas direct inhibit focus preclin clinic research aml sever molecul clinic develop inhibit flt3 vari degre specif preclin model suggest compound enhanc cytotox convent chemotherapeut flt3 mutant leukemia cell pharmacodynam interact flt3 inhibitor chemotherapi appear sequenc depend flt3 inhibitor use prior chemotherapi antagon display flt3 inhibit institut exposur chemotherapi synergist cytotox seen combin flt3 inhibitor chemotherapi also complic potenti pharmacokinet obstacl plasma protein bind pglycoprotein interact ongo futur studi aim incorpor flt3 inhibitor convent induct consolid therapi specif patient flt3 mutant aml antineoplast agent therapeut use drug interact enzym inhibitor pharmacokinet enzym inhibitor therapeut use human leukemia myeloid acut drug therapi fmslike tyrosin kinas 3 antagonist inhibitor,18452067


In [14]:
testDocId = testDataSliced[['trec_doc_id']]
testData = testDataSliced[['relevance_score','qid','title_abstract_mesh_stemmed']]
testData.shape

(16821, 3)

In [15]:
valDocId = valDataSliced[['trec_doc_id']]
valData = valDataSliced[['relevance_score','qid','title_abstract_mesh_stemmed']]
valData.shape

(5608, 3)

## TDIFD weighting

### N_word = 1000 and min_df

In [16]:
min_df = 0.1
n_words = 10000
tvec = TfidfVectorizer(max_features = n_words)

In [17]:
tvec = TfidfVectorizer(max_features = n_words, min_df = min_df)

### Train File

In [18]:
trainWeights = tvec.fit_transform(trainDataSliced['title_abstract_mesh_stemmed'])
trainScore = pd.DataFrame(trainWeights.toarray(), columns=tvec.get_feature_names())
trainVoc = tvec.vocabulary_

In [19]:
trainData.shape

(22642, 27)

In [20]:
resTrain = pd.concat([trainDataSliced, trainScore, trainDocId], axis=1)
trainFinal = resTrain.drop(['title_abstract_mesh_stemmed'], axis=1)
trainFinal = trainFinal.sort_values('qid')
trainFinal.head(1)

Unnamed: 0,relevance_score,qid,10,12,20,80,activ,addit,adenocarcinoma,adult,...,trial,tumor,two,type,tyrosin,use,well,without,year,trec_doc_id
18731,0,0,0.0,0.372092,0.0,0.05491,0.044095,0.106621,0.0,0.044824,...,0.0,0.02902,0.051121,0.0,0.0,0.0,0.0,0.0,0.0,2064725


In [21]:
trainFinal.shape

(22642, 212)

In [22]:
rankTrain = trainFinal.to_dict('records')

In [23]:
f = open("train.txt", "w")

for item in rankTrain:
    for i,val in item.items():
        if(i == "relevance_score"):
            f.write(str(val)+" ")
        elif(i == "trec_doc_id"):
            f.write('# '+str(val))
        elif(i == "qid"):
            f.write(str(i)+":"+str(val)+" ")
        else:
            j = tvec.vocabulary_[i] + 1
            f.write(str(j)+":"+str(val)+" ")
    f.write("\n")    
f.close()

### Test File

In [24]:
tvec = TfidfVectorizer(vocabulary = trainVoc)
testWeights = tvec.fit_transform(testData['title_abstract_mesh_stemmed'])
testScore = pd.DataFrame(testWeights.toarray(), columns=tvec.get_feature_names())
testScore.fillna("", inplace=True)

In [25]:
# if split into test and val
testData.reset_index(drop=True, inplace=True)
testScore.reset_index(drop=True, inplace=True)
testDocId.reset_index(drop=True, inplace=True)

In [26]:
resTest = pd.concat([testData, testScore, testDocId], axis=1)
resTest.shape

(16821, 213)

In [27]:
testFinal = resTest.drop(['title_abstract_mesh_stemmed'], axis=1)
testFinal = testFinal.sort_values('qid')
testFinal.head(1)

Unnamed: 0,relevance_score,qid,10,12,20,80,activ,addit,adenocarcinoma,adult,...,trial,tumor,two,type,tyrosin,use,well,without,year,trec_doc_id
0,0,0,0.0,0.0,0.0,0.073791,0.0,0.0,0.0,0.115353,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.084053,0.0,26818209


In [28]:
rankTest = testFinal.to_dict('records')

In [29]:
f = open("test.txt", "w")

for item in rankTest:
    for i,val in item.items():
        if(i == "relevance_score"):
            f.write(str(val)+" ")
        elif(i == "trec_doc_id"):
            f.write('# '+str(val))
        elif(i == "qid"):
            f.write(str(i)+":"+str(val)+" ")
        else:
            j = tvec.vocabulary_[i] + 1
            f.write(str(j)+":"+str(val)+" ")
    f.write("\n")    
f.close()

### Evaluation File

In [30]:
tvec = TfidfVectorizer(vocabulary = trainVoc)
valWeights = tvec.fit_transform(valData['title_abstract_mesh_stemmed'])
valScore = pd.DataFrame(valWeights.toarray(), columns=tvec.get_feature_names())

In [31]:
valData.reset_index(drop=True, inplace=True)
valScore.reset_index(drop=True, inplace=True)
valDocId.reset_index(drop=True, inplace=True)

In [32]:
resVal = pd.concat([valData, valScore, valDocId], axis=1)
valFinal = resVal.drop(['title_abstract_mesh_stemmed'], axis=1)
valFinal = valFinal.sort_values('qid')
valFinal.shape

(5608, 212)

In [33]:
rankVal = valFinal.to_dict('records')

In [34]:
f = open("vali.txt", "w")

for item in rankTest:
    for i,val in item.items():
        if(i == "relevance_score"):
            f.write(str(val)+" ")
        elif(i == "trec_doc_id"):
            f.write('# '+str(val))
        elif(i == "qid"):
            f.write(str(i)+":"+str(val)+" ")
        else:
            j = tvec.vocabulary_[i] + 1
            f.write(str(j)+":"+str(val)+" ")
    f.write("\n")    
f.close()

# L2R

In [54]:
%ls

Articles-tfidf.ipynb                  Result-Analysis.ipynb
CombineKeyWords.ipynb                 disease1718Diff.ipynb
L2R.ipynb                             pm_study.ipynb
PreProcess_GoldStandard_TXT.ipynb     test.txt
PreProcess_GoldStandard_XML.ipynb     train.txt
PreProcess_GoldStandard_XML_CT.ipynb  vali.txt


In [55]:
metric = pyltr.metrics.NDCG(k=10)

model = pyltr.models.LambdaMART(
    metric=metric,
    n_estimators=1000,
    learning_rate=0.02,
    max_features=0.5,
    query_subsample=0.5,
    max_leaf_nodes=10,
    min_samples_leaf=64,
    verbose=1,
)

# No validation

In [None]:
with open('train.txt') as trainfile, open('test.txt') as evalfile:
    TrainX, Trainy, TrainQids, _ = pyltr.data.letor.read_dataset(trainfile)
    EvalX, Evaly, EvalQids, _ = pyltr.data.letor.read_dataset(evalfile)

In [None]:
# without validation
model.fit(TrainX, Trainy, TrainQids)

# With Validation

In [56]:
with open('train.txt') as trainfile, open('vali.txt') as valifile, open('test.txt') as evalfile:
    TrainX, Trainy, TrainQids, _ = pyltr.data.letor.read_dataset(trainfile)
    ValX, Valy, ValQids, _ = pyltr.data.letor.read_dataset(valifile)
    EvalX, Evaly, EvalQids, _ = pyltr.data.letor.read_dataset(evalfile)

In [57]:
# Only needed if you want to perform validation (early stopping & trimming)
monitor = pyltr.models.monitors.ValidationMonitor(ValX, Valy, ValQids, metric=metric, stop_after=250)

In [58]:
# with validation
model.fit(TrainX, Trainy, TrainQids, monitor=monitor)

 Iter  Train score  OOB Improve    Remaining                           Monitor Output 
    1       0.0000       0.0000        4.83m      C:      0.0000 B:      0.0000 S:  0
    2       0.0000       0.0000        4.53m      C:      0.0000 B:      0.0000 S:  1
    3       0.0000       0.0000        6.53m      C:      0.0000 B:      0.0000 S:  2
    4       0.0000       0.0000        7.57m      C:      0.0000 B:      0.0000 S:  3
    5       0.0000       0.0000        6.87m      C:      0.0000 B:      0.0000 S:  4
    6       0.0000       0.0000        6.42m      C:      0.0000 B:      0.0000 S:  5
    7       0.2201       0.0000        7.08m      C:      0.0000 B:      0.0000 S:  6
    8       0.0000       0.0000        6.83m      C:      0.0000 B:      0.0000 S:  7
    9       0.2201       0.0000        7.43m      C:      0.0000 B:      0.0000 S:  8
   10       0.2201       0.0000        7.83m      C:      0.0000 B:      0.0000 S:  9
   15       0.0000       0.0000        8.24m      C: 

<pyltr.models.lambdamart.LambdaMART at 0x1a126eb4e0>

In [59]:
Epred = model.predict(EvalX)
print('Random ranking:', metric.calc_mean_random(EvalQids, Evaly))
print('Our model:', metric.calc_mean(EvalQids, Evaly, Epred))

Random ranking: 0.22469509267262078
Our model: 0.3436666355597751


In [60]:
print(model.feature_importances_)

[0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 5.18572342e-02 8.53630134e-03 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 8.66623063e-03
 0.00000000e+00 0.00000000e+00 1.34518633e-06 2.82658096e-05
 3.64541710e-03 0.00000000e+00 1.44015742e-02 0.00000000e+00
 0.00000000e+00 0.00000000e+00 8.69227036e-02 1.08284719e-06
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 2.24912207e-03 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 1.98512508e-05 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 6.50529001e-05 0.00000000e+00
 0.00000000e+00 0.000000

In [61]:
listFeatures = np.argsort(model.feature_importances_)
listFeatures

array([  0, 128, 129, 132, 133, 134, 135, 136, 137, 139, 140, 141, 142,
       143, 145, 146, 147, 149, 150, 151, 127, 126, 125, 124,  97,  98,
       101, 102, 103, 207, 105, 106, 107, 153, 111, 113, 114, 115, 118,
       119, 120, 121, 122, 123, 112,  96, 154, 156, 183, 184, 185, 187,
       188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 202,
       204, 206, 181, 180, 179, 178, 157, 158, 159, 160, 161, 162, 163,
       164, 165, 155, 166, 168, 169, 170, 171, 172, 173, 174, 175, 177,
       167,  95, 104,  48,  32,  33,  36,  37,  38,  39,  41,  42,  43,
        31,  44,  46,  47,  93,  49,  50,  51,  52,  53,  54,  45,  55,
        29,  24,   1,   2,   3,   4,   5,   6,   7,   8,   9,  25,  10,
        12,  13,  14,  15,  18,  19,  20,  21,  22,  11,  56, 208,  58,
        80,  81,  82,  83,  76,  75,  74,  84,  73,  71,  70,  77,  69,
        68,  85,  66,  92,  91,  59,  60,  61,  67,  63,  89,  64,  88,
        87,  86,  65,  78,  35,  26,  72,  99,  79, 109,  94,  5

In [62]:
for feature in listFeatures:
    voc = feature-1
    for key,value in tvec.vocabulary_.items():
        if value == voc:
            print(value, key)

127 outcom
128 overal
131 patholog
132 pathway
133 patient
134 perform
135 pharmacolog
136 polymeras
138 posit
139 potenti
140 predict
141 present
142 previous
144 prognosi
145 prognost
146 progress
148 protein
149 protooncogen
150 provid
126 one
125 oncogen
124 occur
123 observ
96 line
97 local
100 major
101 male
102 malign
206 well
104 may
105 mechan
106 metabol
152 ras
110 mice
112 model
113 molecular
114 month
117 neoplasm
118 neoplast
119 new
120 normal
121 novel
122 number
111 middl
95 level
153 rate
155 recent
182 stage
183 status
184 studi
186 suppressor
187 surgeri
188 surgic
189 surviv
190 system
191 target
192 test
193 therapeut
194 therapi
195 three
196 time
197 tissu
198 treat
201 tumor
203 type
205 use
180 small
179 similar
178 signific
177 signal
156 receptor
157 recurr
158 regul
159 relat
160 remain
161 report
162 resect
163 resist
164 respect
154 reaction
165 respons
167 reveal
168 review
169 risk
170 role
171 sampl
172 select
173 sensit
174 sequenc
176 show
166 result