In [357]:
import os
import pandas as pd
import csv
import re
import sys
import math
import nltk
import string
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# Reading the Data

In [358]:
# Load the data
data = pd.read_csv("Y:\TREC\Results\XML-Extraction\\20180622processedGoldStandardXMLTXT.tsv", sep="\t", encoding="utf-8", usecols=["trec_doc_id", "pm_rel_desc", "title", "abstract", "trec_topic_disease", "major_mesh", "minor_mesh", "relevance_score"])
data.shape

(22642, 8)

# Functions to tokenize, remove stop words, get stemms

In [360]:
# Get Stopwords
nltk.download('stopwords')
stopWords = stopwords.words('english')

def tokenizePorter(text):
    tokens = word_tokenize(text)
    stems = []
    stemmer = PorterStemmer()
    for item in tokens:
        if item not in stopWords: 
            stems.append(stemmer.stem(item))
    return ' '.join(stems)

def tokenizeSnowball(text):
    tokens = word_tokenize(text)
    stems = []
    stemmer = SnowballStemmer("english")
    for item in tokens:
        if item not in stopWords: 
            stems.append(stemmer.stem(item))
    return ' '.join(stems)

[nltk_data] Downloading package stopwords to C:\Users\Ariane.Morassi-
[nltk_data]     Sasso\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Preprocessing the data

In [361]:
pd.options.display.max_rows
pd.set_option('display.max_colwidth', -1)

# Preprocessing the Text
removePunctuation = str.maketrans('\n', ' ', string.punctuation)

# Transforms the text to lower case, remove punctuations, get the stemms of words 
data['title_abstract_mesh'] = data[['title', 'abstract', "major_mesh", "minor_mesh"]].apply(lambda x: ''.join(re.sub(r';|\/', ' ', x.to_string(index=False).lower()).translate(removePunctuation)), axis=1)
data['title_abstract_mesh_stemmed'] = data['title_abstract_mesh'].apply(tokenizeSnowball)

# Mean tfidf weight for each term

In [362]:
def tfidfMeanWeight(data):
    tvec = TfidfVectorizer()
    tvecWeights = tvec.fit_transform(data['title_abstract_mesh_stemmed'])

    weights = np.asarray(tvecWeights.mean(axis=0)).ravel().tolist()
    weightsDf = pd.DataFrame({'term': tvec.get_feature_names(), 'weight': weights})
    return weightsDf

# Weights for the whole dataset

In [363]:
weightsAll = tfidfMeanWeight(data)
weightsAll.sort_values(by='weight', ascending=False).head(20)

Unnamed: 0,term,weight
45979,mutat,0.045897
19518,cancer,0.045132
41916,lung,0.042312
32436,genet,0.040634
20802,cell,0.035848
52756,patient,0.034837
52243,pancreat,0.034346
47425,neoplasm,0.032374
67944,tumor,0.029981
18485,breast,0.029008


# Relevant Abstracts dataset

In [366]:
relevantSet = data[(data['relevance_score'] == 2) | (data['relevance_score'] == 1)]
relevantSet.shape

(3875, 10)

In [367]:
weightsPM = tfidfMeanWeight(relevantSet)
relSet = weightsPM.sort_values(by='weight', ascending=False).head(20)

# Non Relevant Abstracts dataset

In [368]:
nonRelevantSet = data[(data['relevance_score'] == 0)]
nonRelevantSet.shape

(18767, 10)

In [369]:
weightsPM = tfidfMeanWeight(nonRelevantSet)
nonRelSet = weightsPM.sort_values(by='weight', ascending=False).head(20)

In [370]:
mergedRelNonRel = pd.merge(relSet, nonRelSet, on="term", how="outer", suffixes=["_relevant", "_irrelevant"])
mergedRelNonRel

Unnamed: 0,term,weight_relevant,weight_irrelevant
0,mutat,0.064559,0.041289
1,genet,0.049565,0.038911
2,lung,0.048107,0.041596
3,patient,0.043664,0.033593
4,cancer,0.04315,0.046172
5,tumor,0.04082,0.027944
6,cell,0.039161,0.035647
7,neoplasm,0.035812,0.03213
8,pancreat,0.035335,0.034528
9,braf,0.033422,


# PM dataset

In [371]:
pmSet = data[data['pm_rel_desc'].str.contains('Human PM|Animal PM', regex=True)]
pmSet.shape

(9274, 10)

In [372]:
weightsPM = tfidfMeanWeight(pmSet)
topPMonly = weightsPM.sort_values(by='weight', ascending=False).head(20)

# Not PM dataset

In [373]:
notPmSet = data[data['pm_rel_desc'].str.contains('Not PM', regex=True)]
notPmSet.shape

(13368, 10)

In [374]:
weightsNotPM = tfidfMeanWeight(notPmSet)
topNotPMonly = weightsNotPM.sort_values(by='weight', ascending=False).head(20)

In [375]:
mergedOnlyPM = pd.merge(topPMonly, topNotPMonly, on="term", how="outer", suffixes=["_pm", "_notpm"])
mergedOnlyPM

Unnamed: 0,term,weight_pm,weight_notpm
0,mutat,0.058655,0.035066
1,genet,0.047187,0.035412
2,cancer,0.044421,0.046933
3,lung,0.041661,0.043569
4,cell,0.038988,0.034422
5,patient,0.037879,0.03385
6,tumor,0.03506,0.026645
7,pancreat,0.034632,0.034958
8,neoplasm,0.033646,0.032403
9,gene,0.032012,0.025071


# Special dataset formed by two rows (all pm abstracts + all not pm abstracts)

In [376]:
mergedAbstractsPM = ""
for text in pmSet['title_abstract_mesh_stemmed']:
    mergedAbstractsPM += text

mergedAbstractsNotPM = ""
for text in notPmSet['title_abstract_mesh_stemmed']:
    mergedAbstractsNotPM += text

In [377]:
d = {'type': ["PM", "Not PM"], 'title_abstract_mesh_stemmed': [mergedAbstractsPM, mergedAbstractsNotPM]}
allPm = pd.DataFrame(data=d)

In [378]:
tvec = TfidfVectorizer()
tvecWeights = tvec.fit_transform(allPm['title_abstract_mesh_stemmed'])
features = tvec.get_feature_names()

def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Top tfidf features in specific document (matrix row) '''
    row = np.squeeze(Xtr[row_id].toarray())
    return top_tfidf_feats(row, features, top_n)

def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df

# PM

In [379]:
topPM = top_feats_in_doc(tvecWeights, features, 0, 25)

# Not PM

In [380]:
topNotPM = top_feats_in_doc(tvecWeights, features, 1, 25)

In [381]:
mergedAllPM = pd.merge(topPM, topNotPM, on="feature", how="outer", suffixes=["_pm", "_notpm"])
mergedAllPM

Unnamed: 0,feature,tfidf_pm,tfidf_notpm
0,mutat,0.319858,0.180612
1,cancer,0.292163,0.345626
2,genet,0.27535,0.20701
3,cell,0.240872,0.246325
4,tumor,0.235291,0.190536
5,patient,0.233811,0.263449
6,neoplasm,0.224249,0.247574
7,gene,0.180704,0.142494
8,protein,0.17696,0.138998
9,express,0.148013,0.108107


In [411]:
# Based on: https://buhrmann.github.io/tfidf-analysis.html

def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df

def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Top tfidf features in specific document (matrix row) '''
    row = np.squeeze(Xtr[row_id].toarray())
    return top_tfidf_feats(row, features, top_n)

def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    ''' Return the top n features that on average are most important amongst documents in rows
        indentified by indices in grp_ids. '''
    if grp_ids:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

def top_feats_by_class(Xtr, y, features, min_tfidf=0.1, top_n=25):
    ''' Return a list of dfs, where each df holds top_n features and their mean tfidf value
        calculated across documents with the same class label. '''
    dfs = []
    labels = np.unique(y)
    for label in labels:
        ids = np.where(y==label)
        print(ids)
        feats_df = top_mean_feats(Xtr, features, ids, min_tfidf=min_tfidf, top_n=top_n)
        # feats_df.label = label
        # dfs.append(feats_df)
    return dfs

In [412]:
tvec = TfidfVectorizer()
tvecWeights = tvec.fit_transform(data['title_abstract_mesh_stemmed'])
features = tvec.get_feature_names()

dfs = []
dfs = top_feats_by_class(tvecWeights, data["relevance_score"], features)

(array([    0,     2,     4, ..., 22633, 22635, 22639], dtype=int64),)


MemoryError: 