In [1]:
import os
import pandas as pd
import csv
import re
import sys
import math
import nltk
import string
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# Reading the Data

In [2]:
# Load the data
data = pd.read_csv("/Users/ari/Downloads/TREC/trec2018/results/goldstandard/2017/20180712processedGoldStandardCT.tsv", sep="\t", encoding="utf-8")
data.shape

(13441, 35)

# Functions to tokenize, remove stop words, get stemms

In [3]:
# Get Stopwords
nltk.download('stopwords')
nltk.download('punkt')
stopWords = stopwords.words('english')

def tokenizePorter(text):
    tokens = word_tokenize(text)
    stems = []
    stemmer = PorterStemmer()
    for item in tokens:
        if item not in stopWords: 
            stems.append(stemmer.stem(item))
    return ' '.join(stems)

def tokenizeSnowball(text):
    tokens = word_tokenize(text)
    stems = []
    stemmer = SnowballStemmer("english")
    for item in tokens:
        if item not in stopWords: 
            stems.append(stemmer.stem(item))
    return ' '.join(stems)

[nltk_data] Downloading package stopwords to /Users/ari/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/ari/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Preprocessing the data

In [4]:
pd.options.display.max_rows
pd.set_option('display.max_colwidth', -1)

# Preprocessing the Text
removePunctuation = str.maketrans('\n', ' ', string.punctuation)

# Transforms the text to lower case, remove punctuations, get the stemms of words 
data['mixed_text'] = data[["brief_title", "official_title", "brief_summary", "detailed_description", "phase", "study_type", "study_design_info", "outcomes", "conditions", "arm_groups", "drug_interventions", "other_interventions", "inclusion_criteria", "mesh_terms_conditions", "mesh_terms_interventions"]].apply(lambda x: ''.join(re.sub(r';|\/', ' ', x.to_string(index=False).lower()).translate(removePunctuation)), axis=1)

In [5]:
data['mixedt_text_stemmed'] = data['mixed_text'].apply(tokenizeSnowball)

# Splitting Sets First into PM and not PM

## PM dataset

In [6]:
pmSet = data[data['pm_rel_desc'].str.contains('Human PM|Animal PM', regex=True)]
pmSet.shape

(3961, 37)

## Not PM dataset

In [7]:
notPmSet = data[data['pm_rel_desc'].str.contains('Not PM', regex=True)]
notPmSet.shape

(9480, 37)

## TFIDF weighting

In [8]:
def tfidfMeanWeight(data):
    tvec = TfidfVectorizer()
    tvecWeights = tvec.fit_transform(data['mixedt_text_stemmed'])

    weights = np.asarray(tvecWeights.mean(axis=0)).ravel().tolist()
    weightsDf = pd.DataFrame({'term': tvec.get_feature_names(), 'weight': weights})
    return weightsDf

In [9]:
weightsPM = tfidfMeanWeight(pmSet)
topPM = weightsPM.sort_values(by='weight', ascending=False).head(10)
topPM

Unnamed: 0,term,weight
22514,patient,0.082823
6826,cancer,0.060596
27902,studi,0.051326
29632,treatment,0.044898
9616,day,0.039847
29906,tumor,0.039777
18067,lung,0.034999
10351,diseas,0.03345
10620,dose,0.033166
25824,respons,0.032847


In [10]:
weightsNotPM = tfidfMeanWeight(notPmSet)
topNotPM = weightsNotPM.sort_values(by='weight', ascending=False).head(10)
topNotPM

Unnamed: 0,term,weight
39865,patient,0.072459
12404,cancer,0.060955
49326,studi,0.041281
32046,lung,0.037495
52440,treatment,0.036861
11688,breast,0.032831
39437,pancreat,0.031872
52912,tumor,0.030595
17263,day,0.030382
55095,week,0.028939


In [11]:
mergedOnlyPM = pd.merge(topPM, topNotPM, on="term", how="outer", suffixes=["_pm", "_notpm"])
mergedOnlyPM

Unnamed: 0,term,weight_pm,weight_notpm
0,patient,0.082823,0.072459
1,cancer,0.060596,0.060955
2,studi,0.051326,0.041281
3,treatment,0.044898,0.036861
4,day,0.039847,0.030382
5,tumor,0.039777,0.030595
6,lung,0.034999,0.037495
7,diseas,0.03345,
8,dose,0.033166,
9,respons,0.032847,


# Splitting DataSets Later

In [12]:
# Based on: https://buhrmann.github.io/tfidf-analysis.html

def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df

def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=100):
    ''' Return the top n features that on average are most important amongst documents in rows
        indentified by indices in grp_ids. '''
    if grp_ids:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

def top_feats_by_class(Weights, data, features, min_tfidf=0.1, top_n=100):
    ''' Return a list of dfs, where each df holds top_n features and their mean tfidf value
        calculated across documents with the same class label. '''
    dfs = []
    labels = np.unique(data)
    for label in labels:
        ids = np.where(data==label)
        feats_df = top_mean_feats(Weights, features, ids, min_tfidf=min_tfidf, top_n=top_n)
        feats_df.label = label
        dfs.append(feats_df)
    return dfs

def top_feats_pm_notpm(Weights, data, features, min_tfidf=0.1, top_n=50):
    ''' Return a list of dfs, where each df holds top_n features and their mean tfidf value
        calculated across documents with the same class label. '''
    dfs = []

    ids = np.where(data=="Human PM") or np.where(data=="Animal PM")
    feats_df = top_mean_feats(Weights, features, ids, min_tfidf=min_tfidf, top_n=top_n)
    feats_df.label = "PM"
    dfs.append(feats_df)

    ids = np.where(data=="Not PM")
    feats_df = top_mean_feats(Weights, features, ids, min_tfidf=min_tfidf, top_n=top_n)
    feats_df.label = "Not PM"
    dfs.append(feats_df)

    return dfs

In [13]:
vec = TfidfVectorizer(max_features=50000)
tvecWeights = vec.fit_transform(data['mixedt_text_stemmed'])
features = vec.get_feature_names()

# Human PM, Animal PM, Not PM

In [14]:
dfs = top_feats_by_class(tvecWeights, data["pm_rel_desc"], features)
newDict = {}
for df in dfs:
    print(df.label)
    print(df)
    print("\n")

mergedPm = pd.merge(dfs[0], dfs[1], on="feature", how="outer", suffixes=["_animal", "_human"])
mergedPmNotPm = pd.merge(mergedPm, dfs[2], on="feature", how="outer", suffixes=["_pm", "_not_pm"])

with pd.option_context('display.max_rows', None, 'display.max_columns', 5):
    display(mergedPmNotPm)

Animal PM
            feature     tfidf
0   cmet             0.307024
1   brca1            0.251008
2   brca2            0.204697
3   mutat            0.189137
4   mcn              0.156417
5   egfr             0.136247
6   erlotinib        0.129628
7   ipmn             0.125918
8   pancreat         0.125156
9   nsclc            0.106059
10  express          0.104116
11  amplif           0.103530
12  sish             0.084292
13  panin            0.071599
14  gene             0.071233
15  jew              0.061600
16  heterozygot      0.060490
17  real             0.059965
18  model            0.057997
19  p53              0.056282
20  pdac             0.056171
21  founder          0.055376
22  1st              0.054667
23  mous             0.051764
24  lung             0.051445
25  mucin            0.051110
26  ashkenazi        0.050554
27  displata         0.000000
28  disprov          0.000000
29  disperz          0.000000
..      ...               ...
70  disfigur         0.000000


Unnamed: 0,feature,tfidf_animal,tfidf_human,tfidf
0,cmet,0.307024,,
1,brca1,0.251008,0.004373,
2,brca2,0.204697,,
3,mutat,0.189137,0.020998,0.006324
4,mcn,0.156417,,
5,egfr,0.136247,0.014264,0.006159
6,erlotinib,0.129628,0.017416,0.007379
7,ipmn,0.125918,,
8,pancreat,0.125156,0.017673,0.028917
9,nsclc,0.106059,0.009287,0.005472


## PM and NOT PM

In [15]:
dfs = top_feats_pm_notpm(tvecWeights, data["pm_rel_desc"], features)
newDict = {}

for df in dfs:
    print(df.label)
    print(df)
    print("\n")

merged = pd.merge(dfs[0], dfs[1], on="feature", how="outer", suffixes=["_pm", "_notpm"])

with pd.option_context('display.max_rows', None, 'display.max_columns', 5):
    display(merged)

PM
        feature     tfidf
0   patient      0.044177
1   cancer       0.027114
2   lung         0.025747
3   mutat        0.020998
4   breast       0.020553
5   pancreat     0.017673
6   erlotinib    0.017416
7   gastric      0.016865
8   cetuximab    0.016142
9   day          0.016082
10  dose         0.015798
11  colorect     0.014573
12  egfr         0.014264
13  subject      0.013126
14  panitumumab  0.012737
15  kras         0.012685
16  melanoma     0.012386
17  tumor        0.012360
18  cycl         0.011093
19  braf         0.010568
20  week         0.009791
21  particip     0.009769
22  trametinib   0.009751
23  trastuzumab  0.009700
24  gemcitabin   0.009655
25  date         0.009554
26  nsclc        0.009287
27  irinotecan   0.009187
28  lapatinib    0.009119
29  imatinib     0.009096
30  paclitaxel   0.009026
31  her2         0.008951
32  bevacizumab  0.008818
33  treatment    0.008596
34  must         0.008430
35  studi        0.008261
36  respons      0.008239
37  part 

Unnamed: 0,feature,tfidf_pm,tfidf_notpm
0,patient,0.044177,0.037382
1,cancer,0.027114,0.029562
2,lung,0.025747,0.030157
3,mutat,0.020998,0.006324
4,breast,0.020553,0.028381
5,pancreat,0.017673,0.028917
6,erlotinib,0.017416,0.007379
7,gastric,0.016865,
8,cetuximab,0.016142,0.007162
9,day,0.016082,0.009893


# Relevance Score

In [16]:
dfs = top_feats_by_class(tvecWeights, data["relevance_score"], features)
for df in dfs:
    print(df.label)
    print(df)
    print("\n")

0
            feature     tfidf
0   patient          0.038712
1   cancer           0.029908
2   lung             0.028769
3   breast           0.027201
4   pancreat         0.027013
5   gemcitabin       0.017251
6   melanoma         0.013144
7   colorect         0.011614
8   prostat          0.011153
9   day              0.011126
10  vaccin           0.010532
11  dose             0.009816
12  week             0.009688
13  cetuximab        0.009330
14  mutat            0.008957
15  cell             0.008668
16  paclitaxel       0.008492
17  erlotinib        0.008439
18  cisplatin        0.008216
19  tumor            0.008149
20  gastric          0.008133
21  colon            0.007999
22  subject          0.007788
23  bevacizumab      0.007699
24  chemotherapi     0.007692
25  adenocarcinoma   0.007585
26  cervic           0.007484
27  cycl             0.006884
28  radiat           0.006804
29  resect           0.006799
..     ...                ...
70  lesion           0.004220
71  bili

## Topic

In [17]:
dfs = top_feats_by_class(tvecWeights, data["trec_topic_disease"], features)
for df in dfs:
    print(df.label)
    print(df)
    print("\n")

Ampullary carcinoma
           feature     tfidf
0   patient         0.043386
1   cetuximab       0.036274
2   kras            0.030986
3   panitumumab     0.027730
4   colorect        0.026740
5   biliari         0.025561
6   gemcitabin      0.024949
7   irinotecan      0.022815
8   pancreat        0.020430
9   cisplatin       0.018677
10  carcinoma       0.018203
11  erlotinib       0.016998
12  oxaliplatin     0.016157
13  day             0.015633
14  sorafenib       0.014810
15  bevacizumab     0.014565
16  dose            0.014311
17  mutat           0.013948
18  hydrochlorid    0.013313
19  stent           0.013119
20  capecitabin     0.012897
21  bile            0.012789
22  cancer          0.012459
23  tract           0.012017
24  duct            0.011358
25  subject         0.011210
26  hepatocellular  0.011140
27  lung            0.011070
28  selumetinib     0.010890
29  vitamin         0.010355
..      ...              ...
70  month           0.005692
71  sbrt            0.0

## Topic + PM and NOT PM

In [18]:
data['topic_pm'] = data[['trec_topic_disease', 'pm_rel_desc']].apply(lambda x: ''.join(x.to_string(index=False).replace("\n"," ")), axis=1)
data.head()

dfs = top_feats_by_class(tvecWeights, data["topic_pm"], features)
for df in dfs:
    print(df.label)
    print(df)
    print("\n")

Ampullary carcinoma Human PM
               feature     tfidf
0   kras                0.066255
1   cetuximab           0.059741
2   panitumumab         0.058668
3   colorect            0.055468
4   patient             0.042013
5   irinotecan          0.038026
6   mutat               0.026131
7   bevacizumab         0.022371
8   wildtyp             0.020960
9   erlotinib           0.020480
10  subject             0.019632
11  date                0.019273
12  day                 0.017945
13  dose                0.017450
14  lung                0.016630
15  selumetinib         0.016015
16  gemcitabin          0.015317
17  cancer              0.015260
18  azd6244             0.015161
19  folfiri             0.014009
20  arm                 0.014002
21  braf                0.013933
22  mfolfox6            0.013816
23  oxaliplatin         0.012611
24  studi               0.011806
25  part                0.011537
26  mek162              0.011250
27  random              0.011235
28  codon     

## Topic + Relevance

In [19]:
data['topic_relevance'] = data[['trec_topic_disease', 'relevance_score']].apply(lambda x: ''.join(x.to_string(index=False).replace("\n"," ")), axis=1)
data.head()

dfs = top_feats_by_class(tvecWeights, data["topic_relevance"], features)
for df in dfs:
    print(df.label)
    print(df)
    print("\n")

Ampullary carcinoma 0
               feature     tfidf
0   patient             0.043970
1   cetuximab           0.036968
2   kras                0.029412
3   colorect            0.027251
4   panitumumab         0.025965
5   gemcitabin          0.023271
6   irinotecan          0.023252
7   biliari             0.022739
8   pancreat            0.020821
9   carcinoma           0.018552
10  cisplatin           0.018364
11  erlotinib           0.016548
12  day                 0.015932
13  sorafenib           0.015093
14  oxaliplatin         0.014645
15  dose                0.014585
16  bevacizumab         0.014290
17  mutat               0.013409
18  stent               0.013370
19  bile                0.013034
20  hydrochlorid        0.012459
21  capecitabin         0.012245
22  cancer              0.011990
23  duct                0.011576
24  subject             0.011425
25  hepatocellular      0.011353
26  lung                0.011282
27  selumetinib         0.011098
28  vitamin          