In [268]:
import os
import pandas as pd
import csv
import re
import sys
import math
import nltk
import string
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# Reading the Data

In [269]:
# Load the data
data = pd.read_csv("Y:\TREC\Results\XML-Extraction\\20180622processedGoldStandardXMLTXT.tsv", sep="\t", encoding="utf-8", usecols=["trec_doc_id", "pm_rel_desc", "title", "abstract", "trec_topic_disease", "major_mesh", "minor_mesh", "relevance_score"])
data.shape

(22642, 8)

# Functions to tokenize, remove stop words, get stemms

In [226]:
# Get Stopwords
nltk.download('stopwords')
stopWords = stopwords.words('english')

def tokenizePorter(text):
    tokens = word_tokenize(text)
    stems = []
    stemmer = PorterStemmer()
    for item in tokens:
        if item not in stopWords: 
            stems.append(stemmer.stem(item))
    return ' '.join(stems)

def tokenizeSnowball(text):
    tokens = word_tokenize(text)
    stems = []
    stemmer = SnowballStemmer("english")
    for item in tokens:
        if item not in stopWords: 
            stems.append(stemmer.stem(item))
    return ' '.join(stems)

[nltk_data] Downloading package stopwords to C:\Users\Ariane.Morassi-
[nltk_data]     Sasso\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Preprocessing the data

In [227]:
pd.options.display.max_rows
pd.set_option('display.max_colwidth', -1)

# Preprocessing the Text
removePunctuation = str.maketrans('\n', ' ', string.punctuation)

# Transforms the text to lower case, remove punctuations, get the stemms of words 
data['title_abstract_mesh'] = data[['title', 'abstract', "major_mesh", "minor_mesh"]].apply(lambda x: ''.join(re.sub(r';|\/', ' ', x.to_string(index=False).lower()).translate(removePunctuation)), axis=1)
data['title_abstract_mesh_stemmed'] = data['title_abstract_mesh'].apply(tokenizeSnowball)

# Mean tfidf weight for each term

In [247]:
def tfidfMeanWeight(data):
    tvec = TfidfVectorizer()
    tvecWeights = tvec.fit_transform(data['title_abstract_mesh_stemmed'])

    weights = np.asarray(tvecWeights.mean(axis=0)).ravel().tolist()
    weightsDf = pd.DataFrame({'term': tvec.get_feature_names(), 'weight': weights})
    return weightsDf

# Weights for the whole dataset

In [248]:
weightsAll = tfidfMeanWeight(data)
weightsAll.sort_values(by='weight', ascending=False).head(20)

Unnamed: 0,term,weight
45979,mutat,0.045897
19518,cancer,0.045132
41916,lung,0.042312
32436,genet,0.040634
20802,cell,0.035848
52756,patient,0.034837
52243,pancreat,0.034346
47425,neoplasm,0.032374
67944,tumor,0.029981
18485,breast,0.029008


# Relevant Abstracts dataset

In [262]:
relevantSet = data[(data['relevance_score'] == 2) | (data['relevance_score'] == 1)]
relevantSet.shape

(3875, 10)

In [263]:
weightsPM = tfidfMeanWeight(relevantSet)
weightsPM.sort_values(by='weight', ascending=False).head(20)

Unnamed: 0,term,weight
14536,mutat,0.064559
10333,genet,0.049565
13229,lung,0.048107
16725,patient,0.043664
6162,cancer,0.04315
21333,tumor,0.04082
6538,cell,0.039161
15005,neoplasm,0.035812
16563,pancreat,0.035335
5817,braf,0.033422


# Non Relevant Abstracts dataset

In [264]:
nonRelevantSet = data[(data['relevance_score'] == 0)]
nonRelevantSet.shape

(18767, 10)

In [265]:
weightsPM = tfidfMeanWeight(nonRelevantSet)
weightsPM.sort_values(by='weight', ascending=False).head(20)

Unnamed: 0,term,weight
17507,cancer,0.046172
38149,lung,0.041596
41902,mutat,0.041289
29436,genet,0.038911
18694,cell,0.035647
47554,pancreat,0.034528
48031,patient,0.033593
43178,neoplasm,0.03213
16551,breast,0.029607
62056,tumor,0.027944


# PM dataset

In [239]:
pmSet = data[data['pm_rel_desc'].str.contains('Human PM|Animal PM', regex=True)]
pmSet.shape

(9274, 10)

In [244]:
weightsPM = tfidfMeanWeight(pmSet)
weightsPM.sort_values(by='weight', ascending=False).head(20)

Unnamed: 0,term,weight
25839,mutat,0.058655
18273,genet,0.047187
10921,cancer,0.044421
23535,lung,0.041661
11639,cell,0.038988
29735,patient,0.037879
38116,tumor,0.03506
29461,pancreat,0.034632
26667,neoplasm,0.033646
18219,gene,0.032012


# Not PM dataset

In [242]:
notPmSet = data[data['pm_rel_desc'].str.contains('Not PM', regex=True)]
notPmSet.shape

(13368, 10)

In [246]:
weightsNotPM = tfidfMeanWeight(notPmSet)
weightsNotPM.sort_values(by='weight', ascending=False).head(20)

Unnamed: 0,term,weight
14181,cancer,0.046933
31444,lung,0.043569
24167,genet,0.035412
34529,mutat,0.035066
39137,pancreat,0.034958
15157,cell,0.034422
39550,patient,0.03385
35582,neoplasm,0.032403
13480,breast,0.030068
51186,tumor,0.026645


# Special dataset formed by two rows (all pm abstracts + all not pm abstracts)

In [292]:
mergedAbstractsPM = ""
for text in pmSet['title_abstract_mesh_stemmed']:
    mergedAbstractsPM += text

mergedAbstractsNotPM = ""
for text in notPmSet['title_abstract_mesh_stemmed']:
    mergedAbstractsNotPM += text

In [295]:
d = {'type': ["PM", "Not PM"], 'title_abstract_mesh_stemmed': [mergedAbstractsPM, mergedAbstractsNotPM]}
allPm = pd.DataFrame(data=d)

In [324]:
tvec = TfidfVectorizer()
tvecWeights = tvec.fit_transform(allPm['title_abstract_mesh_stemmed'])
features = tvec.get_feature_names()

def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Top tfidf features in specific document (matrix row) '''
    row = np.squeeze(Xtr[row_id].toarray())
    return top_tfidf_feats(row, features, top_n)

def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df

# PM

In [328]:
top_feats_in_doc(tvecWeights, features, 0, 25)

Unnamed: 0,feature,tfidf
0,mutat,0.319858
1,cancer,0.292163
2,genet,0.27535
3,cell,0.240872
4,tumor,0.235291
5,patient,0.233811
6,neoplasm,0.224249
7,gene,0.180704
8,protein,0.17696
9,express,0.148013


# Not PM

In [329]:
# Not PM
top_feats_in_doc(tvecWeights, features, 1, 25)

Unnamed: 0,feature,tfidf
0,cancer,0.345626
1,patient,0.263449
2,neoplasm,0.247574
3,cell,0.246325
4,genet,0.20701
5,lung,0.198118
6,tumor,0.190536
7,mutat,0.180612
8,gene,0.142494
9,protein,0.138998
