In [1]:
from os import listdir
from os.path import isfile, isdir, join
from lxml import etree
import pandas as pd
import tarfile
import gzip
import time
import csv
import re
import sys
import math
import nltk
import string
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
import pyltr

# TESTING pyltr with LETOR

In [37]:
folder="/Users/ari/Downloads/MQ2007/Fold1"

In [81]:
with open(join(folder,'train.txt')) as trainfile, open(join(folder,'vali.txt')) as valifile, open(join(folder,'test.txt')) as evalfile:
    TX, Ty, Tqids, _ = pyltr.data.letor.read_dataset(trainfile)
    VX, Vy, Vqids, _ = pyltr.data.letor.read_dataset(valifile)
    EX, Ey, Eqids, _ = pyltr.data.letor.read_dataset(evalfile)

In [97]:
TX

array([[0.      , 0.      , 0.      , ..., 0.      , 0.      , 0.      ],
       [0.03131 , 0.666667, 0.5     , ..., 0.333333, 0.448276, 0.      ],
       [0.078682, 0.166667, 0.5     , ..., 0.833333, 0.678161, 0.      ],
       ...,
       [0.762295, 0.      , 0.      , ..., 0.5     , 0.686275, 0.      ],
       [0.02459 , 0.      , 0.      , ..., 0.5     , 0.352941, 0.      ],
       [0.663934, 0.      , 0.      , ..., 0.5     , 0.431373, 0.      ]])

In [98]:
Ty

array([0., 1., 1., ..., 0., 0., 0.])

In [92]:
Tqids

array(['10', '10', '10', ..., '6000', '6000', '6000'], dtype='<U4')

In [None]:
metric = pyltr.metrics.NDCG(k=10)

# Only needed if you want to perform validation (early stopping & trimming)
monitor = pyltr.models.monitors.ValidationMonitor(
    VX, Vy, Vqids, metric=metric, stop_after=250)

model = pyltr.models.LambdaMART(
    metric=metric,
    n_estimators=1000,
    learning_rate=0.02,
    max_features=0.5,
    query_subsample=0.5,
    max_leaf_nodes=10,
    min_samples_leaf=64,
    verbose=1,
)

model.fit(TX, Ty, Tqids, monitor=monitor)

In [None]:
Epred = model.predict(EX)
print('Random ranking:', metric.calc_mean_random(Eqids, Ey))
print('Our model:', metric.calc_mean(Eqids, Ey, Epred))

# Reading GS Files

In [3]:
gsPath = "/Users/ari/Downloads/TREC/trec2018/results/goldstandard"
trainYear = "2017"
testYear = "2018"

In [4]:
gsTrainFile = join(join(gsPath, trainYear),"20180622processedGoldStandardXMLTXT.tsv")
gsTestFile = join(join(gsPath, testYear),"20190111processedGoldStandardPub2018.tsv")

In [5]:
trainData = pd.read_csv(gsTrainFile, sep = '\t', encoding='utf8', dtype={'trec_doc_id':object})
trainData.fillna("", inplace=True) # ?
trainData.head()

Unnamed: 0.1,Unnamed: 0,trec_topic_number,trec_doc_id,pm_rel_desc,disease_desc,gene1_annotation_desc,gene1_name,gene2_annotation_desc,gene2_name,gene3_annotation_desc,...,title,abstract,major_mesh,minor_mesh,trec_topic_disease,trec_topic_age,trec_topic_sex,trec_topic_other1,trec_topic_other2,trec_topic_other3
0,0,1,10065107,Human PM,Exact,Missing Gene,CDK4 Amplification,,,,...,[A case of metastatic liposarcoma originating ...,We reported a 36-year-old woman with metastati...,,Adult;Antineoplastic Combined Chemotherapy Pro...,Liposarcoma,38-year-old,male,GERD,,
1,1,1,10101594,Human PM,More General,Exact,CDK4 Amplification,,,,...,Analysis of SAS gene and CDK4 and MDM2 protein...,The region q13-15 of chromosome 12 contains SA...,Nuclear Proteins,"Adolescent;Adult;Chromosomes, Human, Pair 12;C...",Liposarcoma,38-year-old,male,GERD,,
2,2,1,10220412,Human PM,More Specific,Missing Gene,CDK4 Amplification,,,,...,Induction of a secreted protein by the myxoid ...,"The TLS-CHOP oncoprotein, found in the majorit...",CCAAT-Enhancer-Binding Proteins;Gene Expressio...,"Animals;Cells, Cultured;Cloning, Molecular;DNA...",Liposarcoma,38-year-old,male,GERD,,
3,3,1,10323080,Human PM,More General,Exact,CDK4 Amplification,,,,...,"Mutations of TP53, amplification of EGFR, MDM2...",We investigated the frequency and mutual relat...,Gene Deletion;Nuclear Proteins,Cyclin-Dependent Kinase 4;Cyclin-Dependent Kin...,Liposarcoma,38-year-old,male,GERD,,
4,4,1,10466061,Human PM,Exact,Missing Gene,CDK4 Amplification,,,,...,[A case of advanced retroperitoneal dedifferen...,We report a case of retroperitoneal dedifferen...,,"Antineoplastic Agents, Alkylating/administrati...",Liposarcoma,38-year-old,male,GERD,,


In [6]:
testData = pd.read_csv(gsTestFile, sep = '\t', encoding='utf8', dtype={'trec_doc_id':object})
testData.fillna("", inplace=True) # ?
testData.head()

Unnamed: 0.1,Unnamed: 0,trec_topic_number,trec_doc_id,pm_rel_desc,disease_desc,gene1_annotation_desc,gene1_name,gene2_annotation_desc,gene2_name,gene3_annotation_desc,...,demographics_desc,other_desc,relevance_score,title,abstract,major_mesh,minor_mesh,trec_topic_disease,trec_topic_age,trec_topic_sex
0,0,1,1007359,Human PM,More Specific,Missing Gene,BRAF (V600E),,,,...,Matches,Not Discussed,0,[Primary multiple malignant melanomas of unusu...,"In 1975, 117 patients with malignant melanoma ...","Melanoma/pathology;Neoplasms, Multiple Primary...",Adult;Aged;Female;Humans;Male;Middle Aged;Skin...,melanoma,64-year-old,male
1,1,1,10833951,Human PM,More Specific,Missing Gene,BRAF (V600E),,,,...,Excludes,Not Discussed,0,[Malignant melanomas and young men].,Medullary thyroid cancer (MTC) is a distinct C...,Melanoma/epidemiology;Melanoma/etiology;Melano...,Adult;Age Factors;Humans;Male;Risk Factors,melanoma,64-year-old,male
2,2,1,11381855,Human PM,More Specific,Missing Gene,BRAF (V600E),,,,...,Excludes,Not Discussed,0,Update on malignant melanoma in children.,Malignant melanoma is a rare event in children...,Melanoma/diagnosis;Melanoma/etiology;Melanoma/...,"Adolescent;Child;Child, Preschool;Female;Human...",melanoma,64-year-old,male
3,3,1,1234252,Not PM,,,,,,,...,,,0,[Triage].,"Multiple endocrine neoplasia, type 2B (MEN 2B)...",Disasters;Emergency Medical Services;Triage,Austria;Humans,melanoma,64-year-old,male
4,4,1,1234878,Not PM,,,,,,,...,,,0,Malignant melanoma--an overview.,"Multiple endocrine neoplasia, type 2B (MEN 2B)...",Melanoma/diagnosis;Melanoma/therapy;Skin Neopl...,Female;Humans;Immunotherapy,melanoma,64-year-old,male


# Preprocessing the data

# Functions to tokenize, remove stop words, get stemms

In [7]:
# Get Stopwords
nltk.download('stopwords')
nltk.download('punkt')
stopWords = stopwords.words('english')

def tokenizePorter(text):
    tokens = word_tokenize(text)
    stems = []
    stemmer = PorterStemmer()
    for item in tokens:
        if item not in stopWords: 
            stems.append(stemmer.stem(item))
    return ' '.join(stems)

def tokenizeSnowball(text):
    tokens = word_tokenize(text)
    stems = []
    stemmer = SnowballStemmer("english")
    for item in tokens:
        if item not in stopWords: 
            stems.append(stemmer.stem(item))
    return ' '.join(stems)

[nltk_data] Downloading package stopwords to /Users/ari/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/ari/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# TrainData

In [8]:
pd.options.display.max_rows
pd.set_option('display.max_colwidth', -1)

# Preprocessing the Text
removePunctuation = str.maketrans('\n', ' ', string.punctuation)

# Transforms the text to lower case, remove punctuations, get the stemms of words 
trainData['title_abstract_mesh'] = trainData[['title', 'abstract', "major_mesh", "minor_mesh"]].apply(lambda x: ''.join(re.sub(r';|\/', ' ', x.to_string(index=False).lower()).translate(removePunctuation)), axis=1)
trainData['title_abstract_mesh_stemmed'] = trainData['title_abstract_mesh'].apply(tokenizeSnowball)

np.unique(trainData['pm_rel_desc'])

trainData["qid"] = "0"
trainData["qid"][trainData["pm_rel_desc"] == "Human PM"] = "1"
trainData["qid"][trainData["pm_rel_desc"] == "Animal PM"] = "1"

trainDataSliced = trainData[['relevance_score','qid','title_abstract_mesh_stemmed']]
trainDocId = trainData[['trec_doc_id']]
trainDataSliced.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,relevance_score,qid,title_abstract_mesh_stemmed
0,0,1,case metastat liposarcoma origin retroperitoneum success treat combin chemotherapi report 36yearold woman metastat liposarcoma origin retroperitoneum respond well adjuv chemotherapi primari tumor remov surgeri two month later patient develop metastasi brain lung four month later metastat liposarcoma brain general extrem rare patient treat combin chemotherapi use cyclophosphamid vincristin adriamycin dacarbazin cyvad examin former two drug altern vindesin ifosfamid anoth regimen cisplatin etoposid given threeweek interv result metastas total disappear recurr lesion note two year although role chemotherapi liposarcoma well defin littl data support use adjuv set combin chemotherapi seem effect advanc liposarcoma adult antineoplast combin chemotherapi protocol therapeut use brain neoplasm drug therapi brain neoplasm secondari cyclophosphamid administr dosag dacarbazin administr dosag doxorubicin administr dosag drug administr schedul femal human liposarcoma drug therapi liposarcoma secondari lung neoplasm drug therapi lung neoplasm secondari remiss induct retroperiton neoplasm patholog vincristin administr dosag
1,1,1,analysi sas gene cdk4 mdm2 protein lowgrad osteosarcoma region q1315 chromosom 12 contain sas cdk4 mdm2 gene rearrang amplifi varieti human sarcoma studi evalu sas gene amplif mdm2 cdk4 protein express 20 tumor sampl central lowgrad osteosarcoma 16 primari 3 recurr 1 lung metastasi sas amplif analyz quantit polymeras chain reaction pcr paraffinembed sampl mdm2 cdk4 protein express evalu immunohistochemistri mdm2 cdk4 protein found strong express 35 65 respect sampl sas found amplifi 15 sampl find indic gene may involv tumorigenesi progress lowgrad osteosarcoma nuclear protein adolesc adult chromosom human pair 12 cyclindepend kinas 4 cyclindepend kinas genet cyclindepend kinas metabol dna neoplasm isol purif femal gene express human immunohistochemistri male membran protein genet membran protein metabol middl age neoplasm protein genet neoplasm protein metabol osteosarcoma genet osteosarcoma metabol osteosarcoma patholog polymeras chain reaction protooncogen protein genet protooncogen protein metabol protooncogen protein cmdm2 tetraspanin
2,0,1,induct secret protein myxoid liposarcoma oncogen tlschop oncoprotein found major human myxoid liposarcoma consist fusion transcript factor chop gadd153 n terminus rnabind protein tls fus clinic correl vitro transform assay indic n terminus tls play import role oncogenesi tlschop howev activ attribut oncoprotein inhibit bind transcript factor c ebp class certain adipogen target gene function tlschop share nononcogen chop protein report isol gene dol54 activ primari fibroblast express tlschop dol54 express neoplast compon human myxoid liposarcoma increas tumorigen cell inject nude mice activ dol54 requir intact dnabind dimer domain tlschop suitabl cellular dimer partner depend tls n terminus normal adipocyt differenti associ earli transient express dol54 gene encod secret protein tight associ cell surfac extracellular matrix tlschop thus lead unschedul express gene normal associ adipocyt differenti ccaatenhancerbind protein gene express regul neoplast rnabind protein fus anim cell cultur clone molecular dnabind protein genet dnabind protein metabol fibroblast metabol human liposarcoma myxoid genet liposarcoma myxoid metabol mice molecular sequenc data neoplasm protein genet neoplasm protein secret nuclear protein genet oncogen protein fusion genet transcript factor chop
3,1,1,mutat tp53 amplif egfr mdm2 cdk4 delet cdkn2a malign astrocytoma investig frequenc mutual relationship molecular alter 33 malign astrocytoma 28 glioblastoma 5 anaplast astrocytoma genet alter analyz delet cdkn2a p16 gene tp53 mutat amplif egfr mdm2 cdk4 common genet alter egfr amplif reveal 15 case 45 tp53 mutat identifi 9 case 27 cdkn2 p16 delet detect 13 case 41 either mdm2 cdk4 amplif less frequent identifi 4 12 1 3 case respect 15 case show amplif egfr 9 cdkn2 p16 delet 60 p 004 hand cdkn2 p16 delet egfr amplif rare occur tp53 mutat 2 14 case cdkn2 p16 delet 14 result confirm exist least two differ pathway lead format glioblastoma gene delet nuclear protein cyclindepend kinas 4 cyclindepend kinas inhibitor p16 genet cyclindepend kinas genet glioblastoma genet human mutat neoplasm protein genet polymeras chain reaction protooncogen protein genet protooncogen protein cmdm2 receptor epiderm growth factor genet retrospect studi tumor suppressor protein p53 genet
4,0,1,case advanc retroperiton dedifferenti liposarcoma treat effect highdos ifosfamid report case retroperiton dedifferenti liposarcoma treat effect highdos ifosfamid 59yearold man receiv tumorectomi right nephrectomi retroperiton liposarcoma twentytwo month oper liver metastasi resect incomplet three month later right pleural retroperiton periton metastas appear 6 cycl highdos ifosfamid therapi tumor reduc partial ten cycl chemotherapi administ tumor show regrowth 14 month administr high dose ifosfamid combin chemotherapi ifosfamid doxorubicin etoposid effect regrowth tumor antineoplast agent alkyl administr dosag antineoplast combin chemotherapi protocol therapeut use combin modal therapi drug administr schedul human ifosfamid administr dosag liposarcoma drug therapi liposarcoma secondari liposarcoma surgeri liver neoplasm secondari male middl age periton neoplasm secondari pleural neoplasm secondari retroperiton neoplasm drug therapi retroperiton neoplasm patholog retroperiton neoplasm surgeri


# TestData

In [9]:
pd.options.display.max_rows
pd.set_option('display.max_colwidth', -1)

# Preprocessing the Text
removePunctuation = str.maketrans('\n', ' ', string.punctuation)

# Transforms the text to lower case, remove punctuations, get the stemms of words 
testData['title_abstract_mesh'] = testData[['title', 'abstract', "major_mesh", "minor_mesh"]].apply(lambda x: ''.join(re.sub(r';|\/', ' ', x.to_string(index=False).lower()).translate(removePunctuation)), axis=1)
testData['title_abstract_mesh_stemmed'] = testData['title_abstract_mesh'].apply(tokenizeSnowball)

np.unique(testData['pm_rel_desc'])

testData["qid"] = "0"
testData["qid"][testData["pm_rel_desc"] == "Human PM"] = "1"
testData["qid"][testData["pm_rel_desc"] == "Animal PM"] = "1"

testDataSliced = testData[['relevance_score','qid','title_abstract_mesh_stemmed']]
testDocId = testData[['trec_doc_id']]
testDataSliced.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,relevance_score,qid,title_abstract_mesh_stemmed
0,0,1,primari multipl malign melanoma unusu long durat 1975 117 patient malign melanoma attent depart dermatolog univers cologn three superfici spread melanoma ssm long periodon patient probabl 25 year patient anoth primari malign melanoma develop though metastasi appear ssm give protect melanoma superfici nodular type comparison 70yearold patient pigment tumour 40 year sudden spread five year ago howev least metastas regress form vitiligolik lesion surround deposit melanin macrophag case spontan regress protect patient new metastas lymph node probabl liver well melanoma patholog neoplasm multipl primari patholog skin neoplasm patholog adult age femal human male middl age skin patholog time factor
1,0,1,malign melanoma young men medullari thyroid cancer mtc distinct ccell tumor thyroid review oncogenesi manag sporad tumor tumor aris part specif inherit syndrom ret protooncogen play role develop inherit form mtc becom import clinic manag patient famili recognit high rate region nodal involv led lymphadenectomi strong consid patient undergo thyroidectomi mtc melanoma epidemiolog melanoma etiolog melanoma prevent control skin neoplasm epidemiolog skin neoplasm etiolog skin neoplasm prevent control adult age factor human male risk factor
2,0,1,updat malign melanoma children malign melanoma rare event children yet overal incid consist risen past 20 year thus likelihood pediatr patient develop malign melanoma increas previous bulk lesion estim occur children larg congenit melanocyt nevi recent report howev highlight new risk factor malign melanoma children demystifi entiti previous believ grave prognosi knowledg risk factor particip public health effort toward prevent earli intervent help practition protect pediatr patient malign melanoma diagnosi melanoma etiolog melanoma therapi skin neoplasm diagnosi skin neoplasm etiolog skin neoplasm therapi adolesc child child preschool femal human male risk factor
3,0,0,triag multipl endocrin neoplasia type 2b men 2b phenotyp variant group autosomaldomin neurocristopathi men 2b associ medullari thyroid carcinoma pheochromocytoma oral ocular alimentari submucos ganglioneuroma marfanoid bodi featur approxim 50 case thought spontan mutat ret protooncogen ret 21exon gene encod tyrosin kinas receptor codon 918 germ line mutat convert high conserv methionin threonin intracellular tyrosin kinas portion receptor ret identifi 95 patient men 2b mutat easili detect direct deoxyribonucl acid sequenc restrict enzym fok 1 analysi amplifi polymeras chain reaction product ret gene normal express oral gastrointestin submucos neural ganglia codon 918 mutat thought caus neuroma virtu transform activ ganglia identifi clinic featur men 2b 11yearold boy oral pathologist led confirm mutat analysi genet test avail patient later date mother underw thyroidectomi base sole biochem test result indic patient codon 918 mutat wherea phenotyp normal mother father older brother normal ret analys studi famili demonstr mutant allel deriv father possibl acquisit spermatogenesi believ mother affect patient normal absenc phenotyp featur men 2b normal genotyp suggest calcitonin abnorm minim evid ccell hyperplasia inconsequenti molecular analysi ret abnorm like supplant biochem method diagnosi patient men 2b disast emerg medic servic triag austria human
4,0,0,malign melanomaan overview multipl endocrin neoplasia type 2b men 2b phenotyp variant group autosomaldomin neurocristopathi men 2b associ medullari thyroid carcinoma pheochromocytoma oral ocular alimentari submucos ganglioneuroma marfanoid bodi featur approxim 50 case thought spontan mutat ret protooncogen ret 21exon gene encod tyrosin kinas receptor codon 918 germ line mutat convert high conserv methionin threonin intracellular tyrosin kinas portion receptor ret identifi 95 patient men 2b mutat easili detect direct deoxyribonucl acid sequenc restrict enzym fok 1 analysi amplifi polymeras chain reaction product ret gene normal express oral gastrointestin submucos neural ganglia codon 918 mutat thought caus neuroma virtu transform activ ganglia identifi clinic featur men 2b 11yearold boy oral pathologist led confirm mutat analysi genet test avail patient later date mother underw thyroidectomi base sole biochem test result indic patient codon 918 mutat wherea phenotyp normal mother father older brother normal ret analys studi famili demonstr mutant allel deriv father possibl acquisit spermatogenesi believ mother affect patient normal absenc phenotyp featur men 2b normal genotyp suggest calcitonin abnorm minim evid ccell hyperplasia inconsequenti molecular analysi ret abnorm like supplant biochem method diagnosi patient men 2b melanoma diagnosi melanoma therapi skin neoplasm diagnosi skin neoplasm therapi femal human immunotherapi


# TDIFD weighting

In [10]:
n_words = 10000
tvec = TfidfVectorizer(max_features = n_words)

In [25]:
trainWeights = tvec.fit_transform(trainDataSliced['title_abstract_mesh_stemmed'])
trainScore = pd.DataFrame(trainWeights.toarray(), columns=tvec.get_feature_names())
trainVoc = tvec.vocabulary_

In [102]:
resTrain = pd.concat([trainDataSliced, trainScore, trainDocId], axis=1)
trainFinal = resTrain.drop(['title_abstract_mesh_stemmed'], axis=1)
trainFinal = trainFinal.sort_values('qid')
trainFinal.head(1)

Unnamed: 0,relevance_score,qid,00,000,0000,000001,00001,00002,00003,00004,...,β1,βcatenin,βcell,γh2ax,γtmt,δmcv,μg,μm,χ2,trec_doc_id
18731,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2064725


In [103]:
rankTrain = trainFinal.to_dict('records')

In [104]:
f = open("train.txt", "w")

for item in rankTrain:
    j = 1
    for i,val in item.items():
        if(i == "relevance_score"):
            f.write(str(val)+" ")
        elif(i == "trec_doc_id"):
            f.write('# '+str(val))
        elif(i == "qid"):
            f.write(str(i)+":"+str(val)+" ")
        else:
            f.write(str(j)+":"+str(val)+" ")
            j+=1
    f.write("\n")    
f.close()

In [26]:
tvec = TfidfVectorizer(vocabulary = trainVoc)
testWeights = tvec.fit_transform(testDataSliced['title_abstract_mesh_stemmed'])
testScore = pd.DataFrame(testWeights.toarray(), columns=tvec.get_feature_names())

In [99]:
resTest = pd.concat([testDataSliced, testScore, testDocId], axis=1)
testFinal = resTest.drop(['title_abstract_mesh_stemmed'], axis=1)
testFinal = testFinal.sort_values('qid')
testFinal.head(1)

Unnamed: 0,relevance_score,qid,00,000,0000,000001,00001,00002,00003,00004,...,β1,βcatenin,βcell,γh2ax,γtmt,δmcv,μg,μm,χ2,trec_doc_id
11214,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23431193


In [115]:
rankTest = testFinal.to_dict('records')

In [116]:
f = open("test.txt", "w")

for item in rankTest:
    j = 1
    for i,val in item.items():
        if(i == "relevance_score"):
            f.write(str(val)+" ")
        elif(i == "trec_doc_id"):
            f.write('# '+str(val))
        elif(i == "qid"):
            f.write(str(i)+":"+str(val)+" ")
        else:
            f.write(str(j)+":"+str(val)+" ")
            j+=1
    f.write("\n")    
f.close()

# L2R

In [108]:
%ls

Articles-tfidf.ipynb                  PreProcess_GoldStandard_XML.ipynb
CombineKeyWords.ipynb                 PreProcess_GoldStandard_XML_CT.ipynb
Disease1718Diff.ipynb                 Result-Analysis.ipynb
L2R.ipynb                             test.txt
PM_Study.ipynb                        train.txt
PreProcess_GoldStandard_TXT.ipynb


In [117]:
with open('train.txt') as trainfile, open('test.txt') as evalfile:
    TrainX, Trainy, TrainQids, _ = pyltr.data.letor.read_dataset(trainfile)
    EvalX, Evaly, EvalQids, _ = pyltr.data.letor.read_dataset(evalfile)

In [109]:
TrainX

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [110]:
Trainy

array([0., 0., 0., ..., 0., 0., 2.])

In [111]:
TrainQids

array(['0', '0', '0', ..., '1', '1', '1'], dtype='<U1')

In [112]:
metric = pyltr.metrics.NDCG(k=10)

# Only needed if you want to perform validation (early stopping & trimming)
# monitor = pyltr.models.monitors.ValidationMonitor(
#    VX, Vy, Vqids, metric=metric, stop_after=250)

model = pyltr.models.LambdaMART(
    metric=metric,
    n_estimators=1000,x
    learning_rate=0.02,
    max_features=0.5,
    query_subsample=0.5,
    max_leaf_nodes=10,
    min_samples_leaf=64,
    verbose=1,
)

model.fit(TrainX, Trainy, TrainQids)

 Iter  Train score  OOB Improve    Remaining                           Monitor Output 
    1       0.0000       0.0000      458.98m                                         
    2       0.0000       0.0000      417.15m                                         
    3       0.0000       0.0000      404.37m                                         
    4       0.0000       0.0000      416.99m                                         
    5       0.0000       0.0000      407.62m                                         
    6       0.0000       0.0000      401.20m                                         
    7       0.0000       0.0000      396.14m                                         
    8       0.0000       0.0000      393.31m                                         
    9       0.0000       0.0000      400.44m                                         
   10       0.0000       0.0000      396.72m                                         
   15       0.5638       0.0000      406.67m         

<pyltr.models.lambdamart.LambdaMART at 0x112888780>

In [118]:
Epred = model.predict(EvalX)
print('Random ranking:', metric.calc_mean_random(EvalQids, Evaly))
print('Our model:', metric.calc_mean(EvalQids, Evaly, Epred))

Random ranking: 0.22535414859786068
Our model: 0.3876683119885543


In [None]:
print(model.feature_importances_)