In [1]:
from os import listdir
from os.path import isfile, isdir, join
from lxml import etree
import pandas as pd
import tarfile
import json
import gzip
import time
import csv
import re
import sys
import math
import nltk
import string
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
import pyltr

# TESTING pyltr with LETOR

In [None]:
folder="/Users/ari/Downloads/MQ2007/Fold1"

In [None]:
with open(join(folder,'train.txt')) as trainfile, open(join(folder,'vali.txt')) as valifile, open(join(folder,'test.txt')) as evalfile:
    TX, Ty, Tqids, _ = pyltr.data.letor.read_dataset(trainfile)
    VX, Vy, Vqids, _ = pyltr.data.letor.read_dataset(valifile)
    EX, Ey, Eqids, _ = pyltr.data.letor.read_dataset(evalfile)

In [None]:
metric = pyltr.metrics.NDCG(k=10)

# Only needed if you want to perform validation (early stopping & trimming)
monitor = pyltr.models.monitors.ValidationMonitor(VX, Vy, Vqids, metric=metric, stop_after=250)

model = pyltr.models.LambdaMART(
    metric=metric,
    n_estimators=1000,
    learning_rate=0.02,
    max_features=0.5,
    query_subsample=0.5,
    max_leaf_nodes=10,
    min_samples_leaf=64,
    verbose=1,
)

model.fit(TX, Ty, Tqids, monitor=monitor)

In [None]:
Epred = model.predict(EX)
print('Random ranking:', metric.calc_mean_random(Eqids, Ey))
print('Our model:', metric.calc_mean(Eqids, Ey, Epred))

# Reading GS Files

In [3]:
gsPath = "/Users/ari/Downloads/TREC/trec2018/results/goldstandard"
trainYear = "2017"
testYear = "2018"

In [4]:
gsTrainFile = join(join(gsPath, trainYear),"20180622processedGoldStandardXMLTXT.tsv")
gsTestFile = join(join(gsPath, testYear),"20190111processedGoldStandardPub2018.tsv")

In [5]:
trainData = pd.read_csv(gsTrainFile, sep = '\t', encoding='utf8', dtype={'trec_doc_id':object})
trainData.fillna("", inplace=True)
trainData.head(1)

Unnamed: 0.1,Unnamed: 0,trec_topic_number,trec_doc_id,pm_rel_desc,disease_desc,gene1_annotation_desc,gene1_name,gene2_annotation_desc,gene2_name,gene3_annotation_desc,...,title,abstract,major_mesh,minor_mesh,trec_topic_disease,trec_topic_age,trec_topic_sex,trec_topic_other1,trec_topic_other2,trec_topic_other3
0,0,1,10065107,Human PM,Exact,Missing Gene,CDK4 Amplification,,,,...,[A case of metastatic liposarcoma originating ...,We reported a 36-year-old woman with metastati...,,Adult;Antineoplastic Combined Chemotherapy Pro...,Liposarcoma,38-year-old,male,GERD,,


In [6]:
testValData = pd.read_csv(gsTestFile, sep = '\t', encoding='utf8', dtype={'trec_doc_id':object})
testValData.fillna("", inplace=True)
testValData.head(1)

Unnamed: 0.1,Unnamed: 0,trec_topic_number,trec_doc_id,pm_rel_desc,disease_desc,gene1_annotation_desc,gene1_name,gene2_annotation_desc,gene2_name,gene3_annotation_desc,...,demographics_desc,other_desc,relevance_score,title,abstract,major_mesh,minor_mesh,trec_topic_disease,trec_topic_age,trec_topic_sex
0,0,1,1007359,Human PM,More Specific,Missing Gene,BRAF (V600E),,,,...,Matches,Not Discussed,0,[Primary multiple malignant melanomas of unusu...,"In 1975, 117 patients with malignant melanoma ...","Melanoma/pathology;Neoplasms, Multiple Primary...",Adult;Aged;Female;Humans;Male;Middle Aged;Skin...,melanoma,64-year-old,male


# Preprocessing the data

## Functions to tokenize, remove stop words, get stemms

In [7]:
# Get Stopwords
nltk.download('stopwords')
nltk.download('punkt')
stopWords = stopwords.words('english')

def tokenizePorter(text):
    tokens = word_tokenize(text)
    stems = []
    stemmer = PorterStemmer()
    for item in tokens:
        if item not in stopWords: 
            stems.append(stemmer.stem(item))
    return ' '.join(stems)

def tokenizeSnowball(text):
    tokens = word_tokenize(text)
    stems = []
    stemmer = SnowballStemmer("english")
    for item in tokens:
        if item not in stopWords: 
            stems.append(stemmer.stem(item))
    return ' '.join(stems)

[nltk_data] Downloading package stopwords to /Users/ari/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/ari/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## TrainData

In [8]:
with open('/Users/ari/Downloads/TREC/trec-pm/resources/lexigramOutputTopics2017.json') as f:
    exPandedDisease = json.load(f)

In [9]:
pd.options.display.max_rows
pd.set_option('display.max_colwidth', -1)

# Preprocessing the Text
removePunctuation = str.maketrans('\n', ' ', string.punctuation)

# Transforms the text to lower case, remove punctuations, get the stemms of words 
trainData['title_abstract_mesh'] = trainData[['title', 'abstract', "major_mesh", "minor_mesh"]].apply(lambda x: ''.join(re.sub(r';|\/', ' ', x.to_string(index=False).lower()).translate(removePunctuation)), axis=1)
trainData['title_abstract_mesh_stemmed'] = trainData['title_abstract_mesh'].apply(tokenizeSnowball)

In [10]:
# Transforms the text to lower case, remove punctuations, get the stemms of words 
trainData['title_stemmed'] = trainData[['title']].apply(lambda x: ''.join(re.sub(r';|\/', ' ', x.to_string(index=False).lower()).translate(removePunctuation)), axis=1).apply(tokenizeSnowball)
trainData['abstract_stemmed'] = trainData[['abstract']].apply(lambda x: ''.join(re.sub(r';|\/', ' ', x.to_string(index=False).lower()).translate(removePunctuation)), axis=1).apply(tokenizeSnowball)
trainData['mesh_stemmed'] = trainData[['major_mesh', 'minor_mesh']].apply(lambda x: ''.join(re.sub(r';|\/', ' ', x.to_string(index=False).lower()).translate(removePunctuation)), axis=1).apply(tokenizeSnowball)
trainData['disease_stemmed'] = trainData[['trec_topic_disease']].apply(lambda x: ''.join(re.sub(r';|\/', ' ', x.to_string(index=False).lower()).translate(removePunctuation)), axis=1).apply(tokenizeSnowball)

In [11]:
# Defining query ids
trainData["qid"] = trainData["trec_topic_number"].astype(str)+str(trainYear)

In [12]:
trainDocId = trainData[['trec_doc_id']]
trainDataSliced = trainData[['relevance_score','qid', 'title_stemmed', 'abstract_stemmed', 'mesh_stemmed', 'title_abstract_mesh_stemmed', 'disease_stemmed']]
trainDataSliced.head(1)

Unnamed: 0,relevance_score,qid,title_stemmed,abstract_stemmed,mesh_stemmed,title_abstract_mesh_stemmed,disease_stemmed
0,0,12017,case metastat liposarcoma origin retroperitoneum success treat combin chemotherapi,report 36yearold woman metastat liposarcoma origin retroperitoneum respond well adjuv chemotherapi primari tumor remov surgeri two month later patient develop metastasi brain lung four month later metastat liposarcoma brain general extrem rare patient treat combin chemotherapi use cyclophosphamid vincristin adriamycin dacarbazin cyvad examin former two drug altern vindesin ifosfamid anoth regimen cisplatin etoposid given threeweek interv result metastas total disappear recurr lesion note two year although role chemotherapi liposarcoma well defin littl data support use adjuv set combin chemotherapi seem effect advanc liposarcoma,adult antineoplast combin chemotherapi protocol therapeut use brain neoplasm drug therapi brain neoplasm secondari cyclophosphamid administr dosag dacarbazin administr dosag doxorubicin administr dosag drug administr schedul femal human liposarcoma drug therapi liposarcoma secondari lung neoplasm drug therapi lung neoplasm secondari remiss induct retroperiton neoplasm patholog vincristin administr dosag,case metastat liposarcoma origin retroperitoneum success treat combin chemotherapi report 36yearold woman metastat liposarcoma origin retroperitoneum respond well adjuv chemotherapi primari tumor remov surgeri two month later patient develop metastasi brain lung four month later metastat liposarcoma brain general extrem rare patient treat combin chemotherapi use cyclophosphamid vincristin adriamycin dacarbazin cyvad examin former two drug altern vindesin ifosfamid anoth regimen cisplatin etoposid given threeweek interv result metastas total disappear recurr lesion note two year although role chemotherapi liposarcoma well defin littl data support use adjuv set combin chemotherapi seem effect advanc liposarcoma adult antineoplast combin chemotherapi protocol therapeut use brain neoplasm drug therapi brain neoplasm secondari cyclophosphamid administr dosag dacarbazin administr dosag doxorubicin administr dosag drug administr schedul femal human liposarcoma drug therapi liposarcoma secondari lung neoplasm drug therapi lung neoplasm secondari remiss induct retroperiton neoplasm patholog vincristin administr dosag,liposarcoma


In [27]:
def calculateTFs(terms, target):
    vectorizer = CountVectorizer(vocabulary = terms)
    transformed_data = vectorizer.fit_transform(target)

    score = pd.DataFrame(transformed_data.toarray(), columns=vectorizer.get_feature_names())
    scoreDict = score.to_dict('records')
    return scoreDict
    
def termTF(termFreq, index, term):
    tf = termFreq[index][term]
    return(tf)

## Feature TF for Disease in Title, Abstract, Mesh and Combined

### TF Title

In [28]:
diseases = trainDataSliced['disease_stemmed'].unique()
termFreqTitle = calculateTFs(diseases, trainDataSliced['title_stemmed'])

trainDataSliced['disease_title_tf'] = trainDataSliced.apply(lambda row: termTF(termFreqTitle, row.name, row['disease_stemmed']), axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


### TF Abstract

In [38]:
termFreqAbst = calculateTFs(diseases, trainDataSliced['abstract_stemmed'])
trainDataSliced['disease_abstract_tf'] = trainDataSliced.apply(lambda row: termTF(termFreqAbst, row.name, row['disease_stemmed']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


### TF Mesh

In [40]:
termFreqMesh = calculateTFs(diseases, trainDataSliced['mesh_stemmed'])
trainDataSliced['disease_mesh_tf'] = trainDataSliced.apply(lambda row: termTF(termFreqMesh, row.name, row['disease_stemmed']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


### TF Combined

In [42]:
termFreqCombined = calculateTFs(diseases, trainDataSliced['title_abstract_mesh_stemmed'])
trainDataSliced['disease_combined_tf'] = trainDataSliced.apply(lambda row: termTF(termFreqCombined, row.name, row['disease_stemmed']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [43]:
trainDataSliced.head(1)
test = trainDataSliced.loc[(trainDataSliced['disease_stemmed'] == 'melanoma') & (trainDataSliced['disease_title_tf'] == 2)]
test.head(4)

Unnamed: 0,relevance_score,qid,title_stemmed,abstract_stemmed,mesh_stemmed,title_abstract_mesh_stemmed,disease_stemmed,disease_title_tf,disease_abstract_tf,disease_mesh_tf,disease_combined_tf
2612,1,52017,locus link p16 modifi melanoma risk dutch famili atyp multipl mole melanoma fammm syndrom famili,cdkn2a gene encod cell cycl inhibitor p16 show mutat mani 9p21link melanoma famili dutch melanoma famili segreg uniqu founder mutat p16leiden encod truncat nonfunct p16 protein high variabl risk p16leiden carrier develop melanoma suggest role genet environment factor hypothes 9p21 gene cdkn2a may relev remain 9p21link melanoma famili without p16 mutat may also act risk modifi p16leiden carrier haplotyp analysi 9p21 perform use microsatellit marker six p16leiden famili origin founder popul p16leiden carrier two famili share unexpect larg founder haplotyp approxim 20cm around cdkn2a most proxim direct melanomaposit p16leiden carrier famili show extens proxim haplotyp compar melanomaneg p16leiden carrier famili addit p16leiden famili less heavili affect melanoma show shorter haplotyp share exclud region proxim cdkn2a presenc gene involv melanoma suscept proxim cdkn2a corrobor somat delet 9p tumor frequent includ cdkn2a proxim chromosom area instead result provid candid region gene map p16negat 9p21link melanoma famili guid search risk modifi melanoma develop,allel altern splice genet chromosom human pair 9 genet cyclindepend kinas inhibitor p16 genet dysplast nevus syndrom genet femal genet linkag genet genet marker heterozygot detect human male middl age netherland pedigre risk factor,locus link p16 modifi melanoma risk dutch famili atyp multipl mole melanoma fammm syndrom famili cdkn2a gene encod cell cycl inhibitor p16 show mutat mani 9p21link melanoma famili dutch melanoma famili segreg uniqu founder mutat p16leiden encod truncat nonfunct p16 protein high variabl risk p16leiden carrier develop melanoma suggest role genet environment factor hypothes 9p21 gene cdkn2a may relev remain 9p21link melanoma famili without p16 mutat may also act risk modifi p16leiden carrier haplotyp analysi 9p21 perform use microsatellit marker six p16leiden famili origin founder popul p16leiden carrier two famili share unexpect larg founder haplotyp approxim 20cm around cdkn2a most proxim direct melanomaposit p16leiden carrier famili show extens proxim haplotyp compar melanomaneg p16leiden carrier famili addit p16leiden famili less heavili affect melanoma show shorter haplotyp share exclud region proxim cdkn2a presenc gene involv melanoma suscept proxim cdkn2a corrobor somat delet 9p tumor frequent includ cdkn2a proxim chromosom area instead result provid candid region gene map p16negat 9p21link melanoma famili guid search risk modifi melanoma develop allel altern splice genet chromosom human pair 9 genet cyclindepend kinas inhibitor p16 genet dysplast nevus syndrom genet femal genet linkag genet genet marker heterozygot detect human male middl age netherland pedigre risk factor,melanoma,2,8,0,10
2616,0,52017,vaccin melanoma patient interleukin 4 genetransduc allogen melanoma cell,human melanoma line genet modifi releas interleukin 4 il4 util immun advanc melanoma patient order elicit increas specif antimelanoma immun respons may affect distant lesion twelv metastat melanoma patient inject subcutan least three time 5 x 107 il4 genetransduc irradi allogen melanoma cell per dose system local toxic mild consist transient fever erythema swell indur vaccin site two mix complet partial clinic respons record assess immun respons vaccin patient serolog cellmedi activ evalu antibodi alloantigen could detect 2 11 patient test mix tumorlymphocyt cultur perform util autolog allogen hlaa2match melanoma line simul target signific increas ifngamma releas detect 7 11 case postvaccin lymphocyt stimul untransduc allomelanoma cell howev induct specif recognit autolog melanoma cell pbls obtain vaccin one six case studi respons involv melanoma peptid melana mart12735 recogn hlaa2restrict fashion result indic vaccin allogen melanoma cell releas il4 local expand cell respons antigen autolog untransduc tumor although minor patient,genet therapi adult age autoantibodi blood cancer vaccin administr dosag cytotox immunolog femal histocompat antigen class immunolog human interferongamma metabol interleukin4 blood interleukin4 genet interleukin6 blood lymphocyt cultur test mix male melanoma genet melanoma immunolog melanoma therapi middl age tumor cell cultur,vaccin melanoma patient interleukin 4 genetransduc allogen melanoma cell human melanoma line genet modifi releas interleukin 4 il4 util immun advanc melanoma patient order elicit increas specif antimelanoma immun respons may affect distant lesion twelv metastat melanoma patient inject subcutan least three time 5 x 107 il4 genetransduc irradi allogen melanoma cell per dose system local toxic mild consist transient fever erythema swell indur vaccin site two mix complet partial clinic respons record assess immun respons vaccin patient serolog cellmedi activ evalu antibodi alloantigen could detect 2 11 patient test mix tumorlymphocyt cultur perform util autolog allogen hlaa2match melanoma line simul target signific increas ifngamma releas detect 7 11 case postvaccin lymphocyt stimul untransduc allomelanoma cell howev induct specif recognit autolog melanoma cell pbls obtain vaccin one six case studi respons involv melanoma peptid melana mart12735 recogn hlaa2restrict fashion result indic vaccin allogen melanoma cell releas il4 local expand cell respons antigen autolog untransduc tumor although minor patient genet therapi adult age autoantibodi blood cancer vaccin administr dosag cytotox immunolog femal histocompat antigen class immunolog human interferongamma metabol interleukin4 blood interleukin4 genet interleukin6 blood lymphocyt cultur test mix male melanoma genet melanoma immunolog melanoma therapi middl age tumor cell cultur,melanoma,2,8,3,13
2626,0,52017,cdkn2a mutat spanish cutan malign melanoma famili patient multipl melanoma neoplasia,cdkn2a gene implic cutan malign melanoma cmm 40 famili linkag chromosom 9p21 small proport famili mutat cdk4 gene order estim import gene predisposit cmm spanish famili patient analys ssca total 56 subject belong 34 cmm famili nine patient multipl cmm neoplasia detect germlin cdkn2a mutat six 34 famili 17 frameshift mutat 358delg four missens mutat g59v g101w two case d84i r87w identifi five cmm patient differ famili 14 carri a148t variant known affect p16 activ mutat detect patient multipl cmm neoplasm found mutat either exon 1 beta cdkn2a gene exon 2a cdk4 linkag analysi 9p21 region show exclus one famili cmm four famili cmm dysplast naevi studi indic small role cdkn2a spanish cmm famili suggest gene also respons cmm predisposit,mutat carrier protein genet chromosom human pair 9 genet cyclindepend kinas inhibitor p16 cyclindepend kinas antagonist inhibitor dysplast nevus syndrom genet femal germlin mutat human male melanoma genet neoplasm multipl primari genet skin neoplasm genet spain,cdkn2a mutat spanish cutan malign melanoma famili patient multipl melanoma neoplasia cdkn2a gene implic cutan malign melanoma cmm 40 famili linkag chromosom 9p21 small proport famili mutat cdk4 gene order estim import gene predisposit cmm spanish famili patient analys ssca total 56 subject belong 34 cmm famili nine patient multipl cmm neoplasia detect germlin cdkn2a mutat six 34 famili 17 frameshift mutat 358delg four missens mutat g59v g101w two case d84i r87w identifi five cmm patient differ famili 14 carri a148t variant known affect p16 activ mutat detect patient multipl cmm neoplasm found mutat either exon 1 beta cdkn2a gene exon 2a cdk4 linkag analysi 9p21 region show exclus one famili cmm four famili cmm dysplast naevi studi indic small role cdkn2a spanish cmm famili suggest gene also respons cmm predisposit mutat carrier protein genet chromosom human pair 9 genet cyclindepend kinas inhibitor p16 cyclindepend kinas antagonist inhibitor dysplast nevus syndrom genet femal germlin mutat human male melanoma genet neoplasm multipl primari genet skin neoplasm genet spain,melanoma,2,1,1,4
2627,0,52017,larg delet chromosom 9p cutan malign melanoma identifi patient high risk develop metastas hospit clinic malign melanoma group univers barcelona,cutan malign melanoma cmm aggress tumour high metastat potenti delet chromosom 9p detect cmm involv cdkn2a p14arf gene loss heterozygos loh 16 microsatellit marker 9p mutat cdkn2a p14arf gene previous studi 32 melanoma patient group 9p delet detect 15 primari tumour 455 correl clinic outcom 5 year compar classic prognost factor eight 32 patient develop metastas 25 metastas detect within 768 day initi diagnosi patient without metastas last monitor least 1621 day diagnosi none 21 patient eight microsatellit conserv develop metastas wherea eight patient develop metastas eight marker delet sensit analysi predict metastas 100 specif 84 wherea sensit sampl use breslow thick 3 mm 625 specif 68 loh eight 9p microsatellit marker therefor use prognost factor predict develop metastas first 4463 year 16212294 day,chromosom delet chromosom map chromosom human pair 9 genet dna mutat analysi dna neoplasm analysi followup studi genet marker human loss heterozygos melanoma genet melanoma secondari microsatellit repeat neoplasm metastasi sensit specif skin neoplasm genet,larg delet chromosom 9p cutan malign melanoma identifi patient high risk develop metastas hospit clinic malign melanoma group univers barcelona cutan malign melanoma cmm aggress tumour high metastat potenti delet chromosom 9p detect cmm involv cdkn2a p14arf gene loss heterozygos loh 16 microsatellit marker 9p mutat cdkn2a p14arf gene previous studi 32 melanoma patient group 9p delet detect 15 primari tumour 455 correl clinic outcom 5 year compar classic prognost factor eight 32 patient develop metastas 25 metastas detect within 768 day initi diagnosi patient without metastas last monitor least 1621 day diagnosi none 21 patient eight microsatellit conserv develop metastas wherea eight patient develop metastas eight marker delet sensit analysi predict metastas 100 specif 84 wherea sensit sampl use breslow thick 3 mm 625 specif 68 loh eight 9p microsatellit marker therefor use prognost factor predict develop metastas first 4463 year 16212294 day chromosom delet chromosom map chromosom human pair 9 genet dna mutat analysi dna neoplasm analysi followup studi genet marker human loss heterozygos melanoma genet melanoma secondari microsatellit repeat neoplasm metastasi sensit specif skin neoplasm genet,melanoma,2,2,2,6


## Test and Validation Data

In [None]:
testDataSetSliced = []
valDataSetSliced = []

In [None]:
pd.options.display.max_rows
pd.set_option('display.max_colwidth', -1)

# Preprocessing the Text
removePunctuation = str.maketrans('\n', ' ', string.punctuation)

# Transforms the text to lower case, remove punctuations, get the stemms of words 
testValData['title_abstract_mesh'] = testValData[['title', 'abstract', "major_mesh", "minor_mesh"]].apply(lambda x: ''.join(re.sub(r';|\/', ' ', x.to_string(index=False).lower()).translate(removePunctuation)), axis=1)
testValData['title_abstract_mesh_stemmed'] = testValData['title_abstract_mesh'].apply(tokenizeSnowball)

np.unique(testValData['pm_rel_desc'])

testValData["qid"] = "0"
testValData["qid"][testValData["pm_rel_desc"] == "Human PM"] = "1"
testValData["qid"][testValData["pm_rel_desc"] == "Animal PM"] = "1"

testValDataSliced = testValData[['relevance_score','qid','title_abstract_mesh_stemmed', 'trec_doc_id']]

In [None]:
testValDataSliced.head(1)

### Only Testind and *no* Validation Dataset

In [None]:
testValDataSliced.shape

In [None]:
testDataSetSliced.append(testValDataSliced)

### One Testing and Validation Datasets

In [None]:
# split into validation and testing
testDataSliced, valDataSliced, yT, yV = train_test_split(testValDataSliced, testValDataSliced['qid'], test_size=0.5)

In [None]:
testDataSetSliced.append(testDataSliced)
valDataSetSliced.append(valDataSliced)

In [None]:
testDataSetSliced[0].head(1)

In [None]:
valDataSetSliced[0].shape

### K-Fold Test and Validation Datasets

In [None]:
from sklearn.model_selection import KFold

testDataSetSliced = []
valDataSetSliced = []

kf = KFold(n_splits=10, random_state=123, shuffle=True)
for train_index, test_index in kf.split(testValDataSliced):
    testDataSetSliced.append(testValDataSliced.iloc[train_index])
    valDataSetSliced.append(testValDataSliced.iloc[test_index])

In [None]:
testDataSetSliced[0].shape

In [None]:
testDataSetSliced[0].head(1)

In [None]:
valDataSetSliced[0].shape

In [None]:
valDataSetSliced[0].head(1)

## TDIFD weighting

### N_word = 1000 and min_df

In [None]:
min_df = 0.1
n_words = 10000
tvec = TfidfVectorizer(max_features = n_words)

In [None]:
tvec = TfidfVectorizer(max_features = n_words, min_df = min_df)

### Train File

In [None]:
trainWeights = tvec.fit_transform(trainDataSliced['title_abstract_mesh_stemmed'])
trainScore = pd.DataFrame(trainWeights.toarray(), columns=tvec.get_feature_names())
trainVoc = tvec.vocabulary_

In [None]:
print(trainVoc)

In [None]:
trainData.shape

In [None]:
resTrain = pd.concat([trainDataSliced, trainScore, trainDocId], axis=1)
trainFinal = resTrain.drop(['title_abstract_mesh_stemmed'], axis=1)
trainFinal = trainFinal.sort_values('qid')
trainFinal.head(1)

In [None]:
trainFinal.shape

In [None]:
rankTrain = trainFinal.to_dict('records')

In [None]:
f = open("train.txt", "w")

for item in rankTrain:
    for i,val in item.items():
        if(i == "relevance_score"):
            f.write(str(val)+" ")
        elif(i == "trec_doc_id"):
            f.write('# '+str(val))
        elif(i == "qid"):
            f.write(str(i)+":"+str(val)+" ")
        else:
            j = tvec.vocabulary_[i] + 1
            f.write(str(j)+":"+str(val)+" ")
    f.write("\n")    
f.close()

### Test Files

In [None]:
sets = 0
for testDataSliced in testDataSetSliced:
    testDocId = testDataSliced[['trec_doc_id']]
    testData = testDataSliced[['relevance_score','qid','title_abstract_mesh_stemmed']]
    
    tvec = TfidfVectorizer(vocabulary = trainVoc)
    testWeights = tvec.fit_transform(testData['title_abstract_mesh_stemmed'])
    testScore = pd.DataFrame(testWeights.toarray(), columns=tvec.get_feature_names())
    
    testData.reset_index(drop=True, inplace=True)
    testScore.reset_index(drop=True, inplace=True)
    testDocId.reset_index(drop=True, inplace=True)
    
    resTest = pd.concat([testData, testScore, testDocId], axis=1)    
    testFinal = resTest.drop(['title_abstract_mesh_stemmed'], axis=1)
    testFinal = testFinal.sort_values('qid')

    rankTest = testFinal.to_dict('records')
    
    f = open("test"+str(sets)+".txt", "w")

    for item in rankTest:
        for i,val in item.items():
            if(i == "relevance_score"):
                f.write(str(val)+" ")
            elif(i == "trec_doc_id"):
                f.write('# '+str(val))
            elif(i == "qid"):
                f.write(str(i)+":"+str(val)+" ")
            else:
                j = tvec.vocabulary_[i] + 1
                f.write(str(j)+":"+str(val)+" ")
        f.write("\n")    
    f.close()
    sets = sets + 1

### Validation Files

In [None]:
sets = 0
for valDataSliced in valDataSetSliced:
    valDocId = valDataSliced[['trec_doc_id']]
    valData = valDataSliced[['relevance_score','qid','title_abstract_mesh_stemmed']]

    tvec = TfidfVectorizer(vocabulary = trainVoc)
    valWeights = tvec.fit_transform(valData['title_abstract_mesh_stemmed'])
    valScore = pd.DataFrame(valWeights.toarray(), columns=tvec.get_feature_names())
    
    valData.reset_index(drop=True, inplace=True)
    valScore.reset_index(drop=True, inplace=True)
    valDocId.reset_index(drop=True, inplace=True)

    resVal = pd.concat([valData, valScore, valDocId], axis=1)
    valFinal = resVal.drop(['title_abstract_mesh_stemmed'], axis=1)
    valFinal = valFinal.sort_values('qid')

    rankVal = valFinal.to_dict('records')

    f = open("vali"+str(sets)+".txt", "w")

    for item in rankVal:
        for i,val in item.items():
            if(i == "relevance_score"):
                f.write(str(val)+" ")
            elif(i == "trec_doc_id"):
                f.write('# '+str(val))
            elif(i == "qid"):
                f.write(str(i)+":"+str(val)+" ")
            else:
                j = tvec.vocabulary_[i] + 1
                f.write(str(j)+":"+str(val)+" ")
        f.write("\n")    
    f.close()
    sets = sets + 1

# L2R

In [None]:
%ls

# No validation

In [None]:
count = 0
randonRanking = []
ourRanking = []
allFeatures = []

while count < sets:
    metric = pyltr.metrics.NDCG(k=10)

    model = pyltr.models.LambdaMART(
        metric=metric,
        n_estimators=1000,
        learning_rate=0.02,
        max_features=0.5,
        query_subsample=0.5,
        max_leaf_nodes=10,
        min_samples_leaf=64,
        verbose=1,
    )

    print("Fold: "+str(count))
    with open('train.txt') as trainfile, open('test'+str(count)+'.txt') as evalfile:
        TrainX, Trainy, TrainQids, _ = pyltr.data.letor.read_dataset(trainfile)
        EvalX, Evaly, EvalQids, _ = pyltr.data.letor.read_dataset(evalfile)
        
    model.fit(TrainX, Trainy, TrainQids)
    Epred = model.predict(EvalX)
    randonRanking.append(metric.calc_mean_random(EvalQids, Evaly))
    ourRanking.append(metric.calc_mean(EvalQids, Evaly, Epred))
    
    # features
    nonZero = np.nonzero(model.feature_importances_)
    for i in nonZero:
        nonZeros = i.tolist()
        
    listFeatures = np.argsort(model.feature_importances_)
        
    for feature in listFeatures:
        if (feature in nonZeros) and (feature not in allFeatures):
            allFeatures.append(feature)
    
    count+=1

# With Validation

In [None]:
count = 0
randonRanking = []
ourRanking = []

while count < sets:
    metric = pyltr.metrics.NDCG(k=10)

    model = pyltr.models.LambdaMART(
        metric=metric,
        n_estimators=1000,
        learning_rate=0.02,
        max_features=0.5,
        query_subsample=0.5,
        max_leaf_nodes=10,
        min_samples_leaf=64,
        verbose=1,
    )

    print("Fold: "+str(count))
    with open('train.txt') as trainfile, open('vali'+str(count)+'.txt') as valifile, open('test'+str(count)+'.txt') as evalfile:
        TrainX, Trainy, TrainQids, _ = pyltr.data.letor.read_dataset(trainfile)
        ValX, Valy, ValQids, _ = pyltr.data.letor.read_dataset(valifile)
        EvalX, Evaly, EvalQids, _ = pyltr.data.letor.read_dataset(evalfile)
        
    monitor = pyltr.models.monitors.ValidationMonitor(ValX, Valy, ValQids, metric=metric, stop_after=250)
    model.fit(TrainX, Trainy, TrainQids, monitor=monitor)
    Epred = model.predict(EvalX)
    randonRanking.append(metric.calc_mean_random(EvalQids, Evaly))
    ourRanking.append(metric.calc_mean(EvalQids, Evaly, Epred))
    
    # features
    nonZero = np.nonzero(model.feature_importances_)
    for i in nonZero:
        nonZeros = i.tolist()
        
    listFeatures = np.argsort(model.feature_importances_)
        
    for feature in listFeatures:
        if (feature in nonZeros) and (feature not in allFeatures):
            allFeatures.append(feature)
    
    count+=1

In [None]:
print(randonRanking)

In [None]:
numpR = np.asarray(randonRanking)
np.mean(numpR)

In [None]:
print(ourRanking)

In [None]:
numpR = np.asarray(ourRanking)
np.mean(numpR)

In [None]:
len(allFeatures)

In [None]:
for feature in allFeatures:
    voc = feature-1
    for key,value in tvec.vocabulary_.items():
        if value == voc:
            print(value, key)