In [23]:
import pandas as pd

### Cleaning

In [24]:
# let's take a look at our dataset
documents = pd.read_json("dataset-mie.json")
documents.head()

Unnamed: 0,title,year,journal_issn,language,abstract,doi,pmid,citation_count,IOSPressVolume,publication_type,authors,keywords,topics,affiliation_countries,affiliations
0,Using an open source observational tool to mea...,2009,0926-9630,eng,Computerization of general practice is an inte...,,19745467,4,150,[Journal Article],"[De Lusignan, S, Kumarapeli, P, Debar, S, Kush...","[Attitude to Computers, Computer Systems, Deci...","[EPR systems, consultation, primary care compu...",[united kingdom],"[St George's University of London, London SW17..."
1,"Portable devices, sensors and networks: wirele...",2009,0926-9630,eng,The 21st century healthcare systems aim at inv...,,19745466,7,150,"[Journal Article, Research Support, Non-U.S. G...","[Pharow, P, Blobel, B, Ruotsalainen, P, Peters...","[Health Services, Humans, Internet, Precision ...","[health services, Portable devices, portable d...",[germany],"[eHealth Competence Center, Regensburg Univers..."
2,Archetype-based knowledge management for seman...,2009,0926-9630,eng,Formal modeling of clinical content that can b...,,19745465,13,150,[Journal Article],"[Garde, S, Chen, R, Leslie, H, Beale, T, McNic...","[Medical Record Linkage, Medical Records Syste...","[compliance templates, templates, archetype re...",[united kingdom],"[Ocean Informatics, London, UK. sebastian.gard..."
3,Is there a common background to support better...,2009,0926-9630,eng,The workshop is proposed by the EFMI WG Health...,,19745464,0,150,[Journal Article],"[Stoicu-Tivadar, L, Blobel, B, Kern, J, Masic,...","[Education, Europe, Humans, International Coop...","[healthcare services, better healthcare, healt...",[romania],"[University Politehnica Timisoara, Romania. st..."
4,Digital pathology in Europe: coordinating pati...,2009,0926-9630,eng,The COST Action IC0604 Telepathology Network ...,,19745463,7,150,[Journal Article],"[Garcia Rojo, M, Punys, V, Slodkowska, J, Schr...","[Biomedical Research, Europe, Humans, Medical ...","[research efforts, Anatomic Pathology, Patholo...",[spain],"[Hospital General de Ciudad Real, 13005 Ciudad..."


In [25]:
# explore dataset for different language types
documents["language"].value_counts()

language
eng    4565
ger      41
Name: count, dtype: int64

In [None]:
# only look at english doucments
documents = documents[documents['language'] == 'eng'][['abstract', 'keywords', 'topics', 'pmid']]
documents = documents.set_index('pmid')
documents = documents[len(documents['abstract']) > 0]
documents.head()

Unnamed: 0_level_0,abstract,keywords,topics
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
19745467,Computerization of general practice is an inte...,"[Attitude to Computers, Computer Systems, Deci...","[EPR systems, consultation, primary care compu..."
19745466,The 21st century healthcare systems aim at inv...,"[Health Services, Humans, Internet, Precision ...","[health services, Portable devices, portable d..."
19745465,Formal modeling of clinical content that can b...,"[Medical Record Linkage, Medical Records Syste...","[compliance templates, templates, archetype re..."
19745464,The workshop is proposed by the EFMI WG Health...,"[Education, Europe, Humans, International Coop...","[healthcare services, better healthcare, healt..."
19745463,The COST Action IC0604 Telepathology Network ...,"[Biomedical Research, Europe, Humans, Medical ...","[research efforts, Anatomic Pathology, Patholo..."


In [86]:
# remove the documents that are empty
documents = documents[documents["abstract"] != '']
documents.shape[0]

4497

In [30]:
words_per_document = documents["abstract"].apply(len)
words_per_document.mean()

881.7986856516977

In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import string
import re
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import TweetTokenizer, RegexpTokenizer
import nltk

In [94]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/johndriscoll/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/johndriscoll/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [36]:
[SnowballStemmer('english').stem(word) for word in ['classify', 'classifier', 'classifying']]

['classifi', 'classifi', 'classifi']

In [64]:
def expandContractions(text, c_re=c_re):
    def replace(match):
        return c_dict[match.group(0)]
    return c_re.sub(replace, text).split(" ")

expandContractions("won't")

['will', 'not']

In [101]:
'has' in stop_words

True

In [135]:
#tokenize

# Contraction map
c_dict = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "I would",
    "i'd've": "I would have",
    "i'll": "I will",
    "i'll've": "I will have",
    "i'm": "I am",
    "i've": "I have",
    "isn't": "is not",
    "it'd": "it had",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there had",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'alls": "you alls",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you had",
    "you'd've": "you would have",
    "you'll": "you you will",
    "you'll've": "you you will have",
    "you're": "you are",
    "you've": "you have"
}

# Compiling the contraction dict
c_re = re.compile('(%s)' % '|'.join(c_dict.keys()))

# List of stop words
add_stop = ['said', 'say', '...', 'like']
stop_words = ENGLISH_STOP_WORDS.union(add_stop).union(nltk.corpus.stopwords.words('english'))

# List of punctuation
punc = list(set(string.punctuation))


# Splits words on white spaces (leaves contractions intact) and splits out
# trailing punctuation
def casual_tokenizer(text):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)
    return tokens


def expandContractions(text, c_re=c_re):
    def replace(match):
        return c_dict[match.group(0)]
    return c_re.sub(replace, text)#.split(" ")

def strip_punc(word):
        for c in punc:
            word = word.replace(c, "")
        return word

def process_text(text):
    wnl = WordNetLemmatizer()
    text = casual_tokenizer(text)
    text = [each.lower() for each in text]
    #text = [re.sub('[0-9]+', '', each) for each in text] #removes all numbers from words
    text = [each for each in text if not any(c.isdigit() for c in each)] #removes all words with numbers
    """We usually remove numbers when we do text clustering or getting keyphrases as we numbers doesn’t give much importance to get the main words. """
    text = [expandContractions(each, c_re=c_re) for each in text]
    # temp = []
    # for each in text:
    #     temp += expandContractions(each, c_re=c_re)
    # text = temp
    #have to remove stopwords before lemmatizing them 
    text = [w for w in text if w not in stop_words]
    text = [wnl.lemmatize(each) if each else each for each in text]
    text = [w for w in text if w not in punc]
    text = [strip_punc(w) for w in text]
    text = [w for w in text if w not in stop_words]
    text = [each for each in text if len(each) > 1]
    text = [each for each in text if ' ' not in each]
    return text


def top_words(topic, n_top_words):
    return topic.argsort()[:-n_top_words - 1:-1]  


def topic_table(model, feature_names, n_top_words):
    topics = {}
    for topic_idx, topic in enumerate(model.components_):
        t = (topic_idx)
        topics[t] = [feature_names[i] for i in top_words(topic, n_top_words)]
    return pd.DataFrame(topics)


def whitespace_tokenizer(text): 
    pattern = r"(?u)\b\w\w+\b" 
    tokenizer_regex = RegexpTokenizer(pattern)
    tokens = tokenizer_regex.tokenize(text)
    return tokens


# Funtion to remove duplicate words
def unique_words(text): 
    ulist = []
    [ulist.append(x) for x in text if x not in ulist]
    return ulist


def word_count(text):
    return len(str(text).split(' '))

In [57]:
wnl = WordNetLemmatizer()

In [136]:
### tokenize the abstracts
documents['processed_abstracts'] = documents['abstract'].apply(process_text)
documents['processed_abstracts']

pmid
19745467    [computerization, general, practice, internati...
19745466    [century, healthcare, aim, involving, citizen,...
19745465    [formal, modeling, clinical, content, availabl...
19745464    [workshop, proposed, efmi, wg, health, informa...
19745463    [cost, action, telepathology, network, europe,...
                                  ...                        
39176482    [study, advance, utility, synthetic, study, da...
39176481    [administrable, dose, form, obtained, transfor...
39176480    [key, research, area, kras, identified, establ...
39176479    [paper, present, versatile, solution, formally...
39176478    [international, classification, icd, icd, sexs...
Name: processed_abstracts, Length: 4497, dtype: object

In [81]:
from collections import Counter

In [137]:
# Get the top 20 most common words among all the articles
p_text = documents['processed_abstracts']

# Flaten the list of lists
p_text = [item for sublist in p_text for item in sublist]

# Top 20
top_20 = pd.DataFrame(
    Counter(p_text).most_common(20),
    columns=['word', 'frequency']
)

top_20

Unnamed: 0,word,frequency
0,data,5334
1,patient,3944
2,health,3506
3,information,3211
4,clinical,2455
5,medical,2383
6,study,2352
7,care,2063
8,model,2000
9,result,1791


In [116]:
import numpy as np

In [139]:
1600/18400

0.08695652173913043

In [138]:
np.unique(p_text), len(np.unique(p_text))

(array(['aa', 'aaa', 'aact', ..., 'österle', 'αu', 'βblockers'],
       dtype='<U60'),
 18400)

In [None]:
tfidf_vectorizer = TfidfVectorizer(
    min_df=3,
    max_df=0.85,
    max_features=5000,
    ngram_range=(1, 2),
    preprocessor=' '.join
)

tfidf = tfidf_vectorizer.fit_transform(texts)

### Run NMF