In [1]:
import numpy as np
import pandas as pd
import pyterrier as pt
import os
from sklearn.model_selection import train_test_split

In [2]:
if not pt.started():
    pt.init()

PyTerrier 0.9.1 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7



# <span Style='font-family: Georgia, serif; color:orange'> **Read in Docset, Index, and Qrels**

### <span Style='font-family: Georgia, serif; color:orange'> **Docset**

In [14]:
docset = pd.read_csv('final_curriculum_data\\final_docset.csv', index_col=0)
docset['cip'] = docset['cip'].astype(str)
docset['docno'] = [i for i in range(1, len(docset)+1)]

cip_titles = pd.read_csv('final_curriculum_data\\cip_names.csv')[['Title', 'CIP Code']]
cip_titles['CIP Code'] = [i[2:-1] if i[2] != '0' else i[3:-1] for i in cip_titles['CIP Code']]
cip_titles['CIP Code'] = [i[:-1] if i[-1] == '0' else i for i in cip_titles['CIP Code']]
docset = docset[docset['cip'].isin(cip_titles['CIP Code'])]
docset['cip_name'] = [cip_titles[cip_titles['CIP Code']==i].Title.iloc[0] for i in docset.cip]

### <span Style='font-family: Georgia, serif; color:orange'> **Index**

In [16]:
cwd = os.getcwd()
pt_index_path = cwd + '\\curriculum_docs'

docset = docset.drop(columns=['institutions', 'programs', 'degree_levels'])
docset_dicts = docset.to_dict(orient='records')

if not os.path.exists(pt_index_path + '\\data_1.properties'):
    indexer = pt.IterDictIndexer(pt_index_path,
                                 overwrite=True, meta={'docno': 20, 'cip': 20, 'courses':60000},
                                 blocks=True)
    index_ref = indexer.index(docset_dicts, fields=['descriptions'])
else:
    index_ref = pt.IndexRef.of(pt_index_path + "\\data_1.properties")
index = pt.IndexFactory.of(index_ref)


### <span Style='font-family: Georgia, serif; color:orange'> **Qrels**

In [17]:
qrels = pd.read_csv('training_qrels_annotated.csv')
qrels = qrels.drop(columns='cip_name')
query_dict = {}
docid = 0
for i in qrels['query'].unique():
    query_dict[i] = docid
    docid +=1

qid = []
query = []
for k,v in query_dict.items():
    qid.append(v)
    query.append(k)

topics = pd.DataFrame()
topics['qid'] = qid
topics['query'] = query
topics = topics.astype(str)

qids = []
for i, r in qrels.iterrows():
    for k, v in query_dict.items():
        if r['query'] == k:
            qids.append(v)

qrels['qid'] = qids

docnos = []
for i in qrels.cip_code:
    docnos.append(docset[docset['cip']==str(i)].docno.iloc[0])
qrels['docno'] = docnos
qrels = qrels.drop(columns=['query'])
qrels = qrels[['qid', 'docno', 'assigned_score']]
qrels = qrels.rename(columns={'assigned_score':'label'})
qrels['qid'] = qrels['qid'].astype(str)
qrels['docno'] = qrels['docno'].astype(str)

qrels.sample(10)

Unnamed: 0,qid,docno,label
947,41,181,1
2608,113,10,1
1639,72,9,5
1396,63,187,1
1272,57,43,5
1208,52,4,1
840,37,4,1
186,8,123,1
1978,86,131,1
2654,116,125,3


# <span Style='font-family: Georgia, serif; color:orange'> **IR Development and Testing**

### <span Style='font-family: Georgia, serif; color:orange'> **Baseline**

<span Style='font-family: Georgia, serif; color:orange'> Creating a baseline for performance to compare with the 3 basic models (tfidf, bm25, and pl2) as well as sequential dependence and query expansion (separately) on those models. TF-IDF is the *most* basic and not super useful so I chose not to include sdm and qe for it in this baseline.

In [6]:
bm25 = pt.BatchRetrieve(index, wmodel='BM25')
tfidf = pt.BatchRetrieve(index, wmodel='TF_IDF')
pl2 = pt.BatchRetrieve(index, wmodel='PL2')
sdm = pt.rewrite.SDM(prox_model='pBiL')
qe = pt.rewrite.Bo1QueryExpansion(index)

pt.Experiment(
    [tfidf, bm25, pl2, sdm >> bm25, pt.BatchRetrieve(index, wmodel='BM25', controls={"qemodel" : "Bo1", "qe" : "on"}), sdm >> pl2, pt.BatchRetrieve(index, wmodel='PL2', controls={"qemodel" : "Bo1", "qe" : "on"})],
    topics,
    qrels,
    names = ['tfidf', 'bm25', 'pl2', 'bm25_sequential_dependece', 'bm25_query_expansion', 'pl2_sequential_dependence', 'pl2_query_expansion'],
    eval_metrics=['map_cut_20', 'ndcg_cut_20']
)

Unnamed: 0,name,map_cut_20,ndcg_cut_20
0,tfidf,0.387926,0.490109
1,bm25,0.549556,0.622506
2,pl2,0.628619,0.649296
3,bm25_sequential_dependece,0.542522,0.621995
4,bm25_query_expansion,0.347584,0.493596
5,pl2_sequential_dependence,0.625436,0.648451
6,pl2_query_expansion,0.493598,0.5678


### <span Style='font-family: Georgia, serif; color:orange'> **L2R**

#### <span Style='font-family: Georgia, serif; color:orange'> **Feature Configuration**

In [10]:
SEED=42

train_topics, test_topics = train_test_split(topics, test_size=.30, random_state=SEED)

feats = pl2 >> pt.text.get_text(index, ['courses']) >> (
    pt.transformer.IdentityTransformer()
    **
    (sdm >> pl2) #sequential independence pl2
    **
    (pt.text.scorer(body_attr="courses", takes='docs', wmodel='BM25')) #course titles score
    **
    (bm25) #bm25 score
    **
    (pt.BatchRetrieve(index, wmodel="BM25", controls={"qemodel" : "Bo1", "qe" : "on"})) #query expansion
)

fnames = ['PL2', 'SDM', 'Course Titles Score', 'BM25', 'Query Exp']

### <span Style='font-family: Georgia, serif; color:orange'> **Random Forest**

In [8]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=400, verbose=1, random_state=SEED, n_jobs=2)
rf_pipe = feats >> pt.ltr.apply_learned_model(rf)
rf_pipe.fit(train_topics, qrels)

  for column, value in meta_column[1].iteritems():




[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.4s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    1.8s
[Parallel(n_jobs=2)]: Done 400 out of 400 | elapsed:    3.8s finished


In [9]:
pt.Experiment(
    [bm25, pl2, rf_pipe],
    topics,
    qrels,
    names = ['bm25', 'pl2', 'random_forest'],
    eval_metrics=['map_cut_20', 'ndcg_cut_20']
)

  for column, value in meta_column[1].iteritems():




[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 400 out of 400 | elapsed:    0.1s finished


Unnamed: 0,name,map_cut_20,ndcg_cut_20
0,bm25,0.549556,0.622506
1,pl2,0.628619,0.649296
2,random_forest,0.727277,0.882115


In [29]:
new_query = 'animals'

results = (rf_pipe%20).search(new_query).drop(columns='docid')
results['cip_name'] = [docset[docset['docno']==int(i)].cip_name.iloc[0] for i in results.docno]
results



  for column, value in meta_column[1].iteritems():
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 400 out of 400 | elapsed:    0.0s finished


Unnamed: 0,qid,docno,score,query,courses,features,rank,cip_name
10,1,82,2.265,animals,foundation computer graphic technology geometr...,"[1.8513459501072271, 1.8513459501072271, 1.146...",0,Graphic Communications.
11,1,188,2.055,animals,agr introduction agricultural industry ag...,"[1.8471620817218235, 1.8471620817218235, 1.079...",1,"Agriculture, General."
1,1,81,1.82,animals,introduction animal agriculture orientation an...,"[2.924398015982549, 2.924398015982549, 1.16952...",2,Animal Sciences.
18,1,49,1.29,animals,principle foundation physical education nutrit...,"[1.368264921729335, 1.368264921729335, 0.0, 0....",3,"Sports, Kinesiology, and Physical Education/Fi..."
21,1,133,1.2775,animals,introduction human science essential college r...,"[1.287405746272816, 1.287405746272816, 0.0, 0....",4,"Human Development, Family Studies, and Related..."
9,1,23,1.21,animals,introductory physics fall sel introductory phy...,"[1.8563196509865316, 1.8563196509865316, 0.0, ...",5,Physics.
20,1,181,1.1075,animals,brass class woodwind class credit brass cl...,"[1.3572469943350152, 1.3572469943350152, 0.0, ...",6,"Intelligence, Command Control and Information ..."
19,1,174,1.0975,animals,calculus physical sciences engineering cal...,"[1.3633556111834886, 1.3633556111834886, 0.0, ...",7,Systems Engineering.
7,1,131,1.085,animals,structure property material material property ...,"[2.076801116989909, 2.076801116989909, 0.0, 0....",8,Materials Engineering.
13,1,6,0.9575,animals,general chemistry fall sel general chemistry i...,"[1.5150134176107442, 1.5150134176107442, 0.0, ...",9,Chemistry.
