In [6]:
import os
import pandas as pd
import csv
import re
import sys
import math
import nltk
import string
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# Reading the Data

In [3]:
# Load the data
data = pd.read_csv("/Users/ari/Downloads/TREC/trec2018/results/goldstandard/20180622processedGoldStandardXMLTXT.tsv", sep="\t", encoding="utf-8")
data.shape

(22642, 24)

# Functions to tokenize, remove stop words, get stemms

In [7]:
# Get Stopwords
nltk.download('stopwords')
nltk.download('punkt')
stopWords = stopwords.words('english')

def tokenizePorter(text):
    tokens = word_tokenize(text)
    stems = []
    stemmer = PorterStemmer()
    for item in tokens:
        if item not in stopWords: 
            stems.append(stemmer.stem(item))
    return ' '.join(stems)

def tokenizeSnowball(text):
    tokens = word_tokenize(text)
    stems = []
    stemmer = SnowballStemmer("english")
    for item in tokens:
        if item not in stopWords: 
            stems.append(stemmer.stem(item))
    return ' '.join(stems)

[nltk_data] Downloading package stopwords to /Users/ari/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/ari/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Preprocessing the data

In [8]:
pd.options.display.max_rows
pd.set_option('display.max_colwidth', -1)

# Preprocessing the Text
removePunctuation = str.maketrans('\n', ' ', string.punctuation)

# Transforms the text to lower case, remove punctuations, get the stemms of words 
data['title_abstract_mesh'] = data[['title', 'abstract', "major_mesh", "minor_mesh"]].apply(lambda x: ''.join(re.sub(r';|\/', ' ', x.to_string(index=False).lower()).translate(removePunctuation)), axis=1)
data['title_abstract_mesh_stemmed'] = data['title_abstract_mesh'].apply(tokenizeSnowball)

# Splitting Sets First into PM and not PM

## PM dataset

In [9]:
pmSet = data[data['pm_rel_desc'].str.contains('Human PM|Animal PM', regex=True)]
pmSet.shape

(9274, 26)

## Not PM dataset

In [10]:
notPmSet = data[data['pm_rel_desc'].str.contains('Not PM', regex=True)]
notPmSet.shape

(13368, 26)

## TFIDF weighting

In [11]:
def tfidfMeanWeight(data):
    tvec = TfidfVectorizer()
    tvecWeights = tvec.fit_transform(data['title_abstract_mesh_stemmed'])

    weights = np.asarray(tvecWeights.mean(axis=0)).ravel().tolist()
    weightsDf = pd.DataFrame({'term': tvec.get_feature_names(), 'weight': weights})
    return weightsDf

In [12]:
weightsPM = tfidfMeanWeight(pmSet)
topPM = weightsPM.sort_values(by='weight', ascending=False).head(10)
topPM

Unnamed: 0,term,weight
25839,mutat,0.058655
18273,genet,0.047187
10921,cancer,0.044421
23535,lung,0.041661
11639,cell,0.038988
29735,patient,0.037879
38116,tumor,0.03506
29461,pancreat,0.034632
26667,neoplasm,0.033646
18219,gene,0.032012


In [13]:
weightsNotPM = tfidfMeanWeight(notPmSet)
topNotPM = weightsNotPM.sort_values(by='weight', ascending=False).head(10)
topNotPM

Unnamed: 0,term,weight
14181,cancer,0.046933
31444,lung,0.043569
24167,genet,0.035412
34529,mutat,0.035066
39137,pancreat,0.034958
15157,cell,0.034422
39550,patient,0.03385
35582,neoplasm,0.032403
13480,breast,0.030068
51186,tumor,0.026645


In [14]:
mergedOnlyPM = pd.merge(topPM, topNotPM, on="term", how="outer", suffixes=["_pm", "_notpm"])
mergedOnlyPM

Unnamed: 0,term,weight_pm,weight_notpm
0,mutat,0.058655,0.035066
1,genet,0.047187,0.035412
2,cancer,0.044421,0.046933
3,lung,0.041661,0.043569
4,cell,0.038988,0.034422
5,patient,0.037879,0.03385
6,tumor,0.03506,0.026645
7,pancreat,0.034632,0.034958
8,neoplasm,0.033646,0.032403
9,gene,0.032012,


# Splitting DataSets Later

In [48]:
# Based on: https://buhrmann.github.io/tfidf-analysis.html

def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df

def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=100):
    ''' Return the top n features that on average are most important amongst documents in rows
        indentified by indices in grp_ids. '''
    if grp_ids:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

def top_feats_by_class(Weights, data, features, min_tfidf=0.1, top_n=100):
    ''' Return a list of dfs, where each df holds top_n features and their mean tfidf value
        calculated across documents with the same class label. '''
    dfs = []
    labels = np.unique(data)
    for label in labels:
        ids = np.where(data==label)
        feats_df = top_mean_feats(Weights, features, ids, min_tfidf=min_tfidf, top_n=top_n)
        feats_df.label = label
        dfs.append(feats_df)
    return dfs

def top_feats_pm_notpm(Weights, data, features, min_tfidf=0.1, top_n=50):
    ''' Return a list of dfs, where each df holds top_n features and their mean tfidf value
        calculated across documents with the same class label. '''
    dfs = []

    ids = np.where(data=="Human PM") or np.where(data=="Animal PM")
    feats_df = top_mean_feats(Weights, features, ids, min_tfidf=min_tfidf, top_n=top_n)
    feats_df.label = "PM"
    dfs.append(feats_df)

    ids = np.where(data=="Not PM")
    feats_df = top_mean_feats(Weights, features, ids, min_tfidf=min_tfidf, top_n=top_n)
    feats_df.label = "Not PM"
    dfs.append(feats_df)

    return dfs

In [22]:
vec = TfidfVectorizer(max_features=50000)
tvecWeights = vec.fit_transform(data['title_abstract_mesh_stemmed'])
features = vec.get_feature_names()

# Human PM , Animal PM , Not PM

In [52]:
dfs = top_feats_by_class(tvecWeights, data["pm_rel_desc"], features)
newDict = {}
for df in dfs:
    print(df.label)
    print(df)
    print("\n")

mergedPm = pd.merge(dfs[0], dfs[1], on="feature", how="outer", suffixes=["_animal", "_human"])
mergedPmNotPm = pd.merge(mergedPm, dfs[2], on="feature", how="outer", suffixes=["_pm", "_not_pm"])

with pd.option_context('display.max_rows', None, 'display.max_columns', 5):
    display(mergedPmNotPm)

Animal PM
           feature     tfidf
0   mice            0.059852
1   pancreat        0.046508
2   cell            0.037718
3   pten            0.036709
4   mammari         0.031489
5   lung            0.031467
6   prostat         0.021944
7   model           0.019341
8   genet           0.019157
9   tumor           0.017949
10  metabol         0.017948
11  egfr            0.017106
12  rat             0.016239
13  kinas           0.014713
14  mutat           0.014242
15  brca2           0.014105
16  melanoma        0.013803
17  mous            0.013522
18  inhibitor       0.013171
19  p53             0.012722
20  kras            0.012465
21  transgen        0.012034
22  canin           0.011585
23  erbb3           0.011546
24  protein         0.011526
25  cyclin          0.011242
26  pharmacolog     0.011190
27  xenograft       0.010661
28  drug            0.010335
29  breast          0.010300
..     ...               ...
70  ckit            0.005739
71  gastric         0.005556
72  

Unnamed: 0,feature,tfidf_animal,tfidf_human,tfidf
0,mice,0.059852,,0.00462
1,pancreat,0.046508,0.030915,0.033372
2,cell,0.037718,0.020554,0.018346
3,pten,0.036709,0.025177,0.006693
4,mammari,0.031489,,
5,lung,0.031467,0.036029,0.04002
6,prostat,0.021944,0.009123,0.010855
7,model,0.019341,,
8,genet,0.019157,0.027178,0.015998
9,tumor,0.017949,0.012861,0.00671


## PM and NOT PM

In [51]:
dfs = top_feats_pm_notpm(tvecWeights, data["pm_rel_desc"], features)
newDict = {}

for df in dfs:
    print(df.label)
    print(df)
    print("\n")

merged = pd.merge(dfs[0], dfs[1], on="feature", how="outer", suffixes=["_pm", "_notpm"])

with pd.option_context('display.max_rows', None, 'display.max_columns', 5):
    display(merged)

PM
           feature     tfidf
0   mutat           0.056355
1   lung            0.036029
2   pancreat        0.030915
3   braf            0.027252
4   genet           0.027178
5   breast          0.026754
6   kras            0.026518
7   pten            0.025177
8   egfr            0.021907
9   cancer          0.021083
10  cell            0.020554
11  melanoma        0.019038
12  patient         0.017371
13  gene            0.016799
14  gist            0.016123
15  gastrointestin  0.015972
16  express         0.014858
17  adenocarcinoma  0.014041
18  colorect        0.013945
19  amplif          0.013899
20  stromal         0.013594
21  gastric         0.013327
22  tumor           0.012861
23  carcinoma       0.012530
24  metabol         0.011639
25  her2            0.010680
26  meningioma      0.010642
27  pik3ca          0.010637
28  nsclc           0.010609
29  protein         0.010491
30  alk             0.010309
31  brca2           0.010293
32  inhibitor       0.010261
33  exon   

Unnamed: 0,feature,tfidf_pm,tfidf_notpm
0,mutat,0.056355,0.022716
1,lung,0.036029,0.04002
2,pancreat,0.030915,0.033372
3,braf,0.027252,0.009931
4,genet,0.027178,0.015998
5,breast,0.026754,0.027501
6,kras,0.026518,0.009047
7,pten,0.025177,0.006693
8,egfr,0.021907,0.012885
9,cancer,0.021083,0.025234


# Relevance Score

In [39]:
dfs = top_feats_by_class(tvecWeights, data["relevance_score"], features)
for df in dfs:
    print(df.label)
    print(df)
    print("\n")

0
               feature     tfidf
0   lung                0.037347
1   pancreat            0.032917
2   mutat               0.029756
3   breast              0.027406
4   cancer              0.024608
5   melanoma            0.019754
6   cell                0.019450
7   genet               0.018790
8   adenocarcinoma      0.016645
9   egfr                0.013275
10  braf                0.013001
11  kras                0.012900
12  patient             0.012770
13  gene                0.012677
14  carcinoma           0.012356
15  therapi             0.011346
16  colorect            0.011095
17  amplif              0.010976
18  cholangiocarcinoma  0.010880
19  prostat             0.010633
20  pten                0.010310
21  liposarcoma         0.010206
22  metabol             0.009866
23  express             0.009666
24  cervic              0.008810


1
           feature     tfidf
0   mutat           0.066257
1   gist            0.062621
2   gastrointestin  0.057976
3   stromal         

## Topic

In [40]:
dfs = top_feats_by_class(tvecWeights, data["trec_topic_disease"], features)
for df in dfs:
    print(df.label)
    print(df)
    print("\n")

Ampullary carcinoma
                    feature     tfidf
0   kras                     0.107207
1   mutat                    0.084404
2   ampullari                0.070329
3   carcinoma                0.041416
4   colorect                 0.035666
5   ampulla                  0.035512
6   vater                    0.035300
7   braf                     0.033489
8   bile                     0.032803
9   duct                     0.032118
10  genet                    0.029692
11  lung                     0.027207
12  adenocarcinoma           0.024505
13  patient                  0.024274
14  codon                    0.022912
15  egfr                     0.022642
16  pancreat                 0.022241
17  adenoma                  0.015951
18  duoden                   0.013711
19  surviv                   0.012248
20  crc                      0.012011
21  pik3ca                   0.011639
22  pancreaticoduodenectomi  0.011597
23  surgeri                  0.011568
24  protein                  0

## Topic + PM and NOT PM

In [56]:
data['topic_pm'] = data[['trec_topic_disease', 'pm_rel_desc']].apply(lambda x: ''.join(x.to_string(index=False).replace("\n"," ")), axis=1)
data.head()

dfs = top_feats_by_class(tvecWeights, data["topic_pm"], features)
for df in dfs:
    print(df.label)
    print(df)
    print("\n")

Ampullary carcinoma Animal PM
     feature     tfidf
0   kras      0.154483
1   mutat     0.119733
2   lung      0.105826
3   mice      0.098220
4   tsc1      0.096302
5   hara      0.092761
6   acc       0.081894
7   acf       0.075303
8   c118s     0.073966
9   vaccin    0.069739
10  adca      0.059469
11  kras2     0.056055
12  krasc118  0.055475
13  genet     0.053311
14  pancreat  0.052090
15  liver     0.051964
16  tsc2      0.050916
17  braf      0.049109
18  urethan   0.048215
19  tumour    0.046571
20  allel     0.046183
21  codon     0.045366
22  felin     0.041938
23  rsai      0.041796
24  pas1      0.039646


Ampullary carcinoma Human PM
           feature     tfidf
0   kras            0.171165
1   mutat           0.122209
2   colorect        0.052676
3   braf            0.043236
4   lung            0.042590
5   genet           0.040705
6   egfr            0.035300
7   patient         0.033925
8   ampullari       0.033575
9   adenocarcinoma  0.031095
10  codon           0.

## Topic + Relevance

In [54]:
data['topic_relevance'] = data[['trec_topic_disease', 'relevance_score']].apply(lambda x: ''.join(x.to_string(index=False).replace("\n"," ")), axis=1)
data.head()

dfs = top_feats_by_class(tvecWeights, data["topic_relevance"], features)
for df in dfs:
    print(df.label)
    print(df)
    print("\n")

Ampullary carcinoma 0
                    feature     tfidf
0   kras                     0.104617
1   mutat                    0.080921
2   ampullari                0.069858
3   carcinoma                0.041920
4   colorect                 0.037551
5   ampulla                  0.036104
6   vater                    0.035162
7   braf                     0.033528
8   bile                     0.032384
9   duct                     0.031757
10  lung                     0.028925
11  genet                    0.027904
12  patient                  0.023985
13  adenocarcinoma           0.023905
14  egfr                     0.023344
15  codon                    0.023285
16  pancreat                 0.019052
17  adenoma                  0.016362
18  crc                      0.012770
19  duoden                   0.012523
20  surviv                   0.012332
21  pancreaticoduodenectomi  0.012329
22  pik3ca                   0.012182
23  surgeri                  0.012098
24  cell                    