In [1]:
import re
import random
import string
import pandas as pd
import numpy as np

from gensim.models import Word2Vec

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.feature_extraction.text import CountVectorizer

from wordcloud import WordCloud

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

# NLTK resources used further down: the tokenizer models (word_tokenize),
# WordNet (WordNetLemmatizer) and the stop-word lists (stopwords.words).
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# One seed for every random number generator the notebook touches directly.
SEED = 2
random.seed(SEED)
np.random.seed(SEED)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hecto\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hecto\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def _parse_medline_record(entry):
    """Split one record into ``{tag: [value, ...]}``, folding continuation lines."""
    field_start = re.compile(r'^([A-Z]{2,4})\s*-\s(.*)$')
    continuation = re.compile(r'^\s{6}')

    fields = {}
    current_tag = None
    for line in entry.split('\n'):
        started = field_start.match(line)
        if started:
            current_tag = started.group(1)
            fields.setdefault(current_tag, []).append(started.group(2).strip())
        elif current_tag is not None and continuation.match(line):
            # Six leading whitespace characters: wrapped text of the field above.
            fields[current_tag][-1] += ' ' + line.strip()
        else:
            current_tag = None
    return fields


def _last_or_na(fields, tag, missing='NA'):
    """Return the last value recorded for ``tag``, or ``missing`` if absent/empty."""
    values = fields.get(tag)
    if values and values[-1] != '':
        return values[-1]
    return missing


def extract_article_data(data):
    """Parse a tagged, blank-line-separated article export into parallel lists.

    Each record is a run of lines separated from the next record by an empty
    line. A field starts with a tag (``PMID``, ``DP``, ``JT``, ``TI``, ``PT``,
    ``LA``, ``AB``) followed by ``- `` and its value; lines that start with six
    whitespace characters continue the field above them.

    Every field is read regardless of its position in the record, wrapped
    titles and abstracts are joined with single spaces, and whitespace-only
    records are skipped.

    Parameters
    ----------
    data : str
        Full text of the export.

    Returns
    -------
    tuple of list
        ``(pmids, titles, pyears, abstracts, arttypes, langs, journals)``.
        All lists have one item per record. Missing values are the string
        ``'NA'``, except a missing title, which is ``''``. Years are ``int``
        (the first four-digit number on the ``DP`` line). When a tag repeats
        inside a record the last occurrence is used, except ``AB`` where the
        first occurrence is used.
    """
    pmids, pyears, journals, titles = [], [], [], []
    abstracts, arttypes, langs = [], [], []

    for entry in data.split('\n\n'):
        # A trailing or doubled blank line would otherwise become an all-'NA' row.
        if not entry.strip():
            continue

        fields = _parse_medline_record(entry)

        pyear = ''
        for published in fields.get('DP', []):
            year_match = re.search(r'\d{4}', published)
            if year_match:
                pyear = year_match.group()

        first_abstract = fields['AB'][0] if fields.get('AB') else ''

        pmids.append(_last_or_na(fields, 'PMID'))
        titles.append(_last_or_na(fields, 'TI', missing=''))
        pyears.append(int(pyear) if pyear != '' else 'NA')
        journals.append(_last_or_na(fields, 'JT'))
        arttypes.append(_last_or_na(fields, 'PT'))
        langs.append(_last_or_na(fields, 'LA'))
        abstracts.append(first_abstract if first_abstract != '' else 'NA')

    return pmids, titles, pyears, abstracts, arttypes, langs, journals

In [3]:
# Read the whole export into memory as one UTF-8 string.
file_path = 'set3.txt'

with open(file_path, encoding='utf-8') as handle:
    data = handle.read()

# Peek at the beginning of the raw text.
data[:500]

'PMID- 33735179\nOWN - NLM\nSTAT- MEDLINE\nDCOM- 20210726\nLR  - 20210726\nIS  - 1545-7885 (Electronic)\nIS  - 1544-9173 (Print)\nIS  - 1544-9173 (Linking)\nVI  - 19\nIP  - 3\nDP  - 2021 Mar\nTI  - Is "bioinformatics" dead?\nPG  - e3001165\nLID - 10.1371/journal.pbio.3001165 [doi]\nLID - e3001165\nAB  - Why would a computational biologist with 40 years of research experience say \n      bioinformatics is dead? The short answer is, in being the Founding Dean of a new \n      School of Data Science, what we do sudd'

In [4]:
pmids, titles, pyears, abstracts, arttypes, langs, journals = extract_article_data(data)

# Placeholder for columns that are not extracted from the file.
na_column = ['NA'] * len(pmids)

# pandas requires every column to have the same length; if the article-type
# list does not line up with the PMIDs, use the placeholder instead.
arttype_column = arttypes if len(arttypes) == len(pmids) else list(na_column)

# NOTE(review): 'pmcCitationCount' had no value in the original dict literal;
# it is filled with the 'NA' placeholder here, like 'Authors' - confirm.
article_data = {
    'PMID': pmids,
    'YearPub': pyears,
    'Journal': journals,
    'Authors': na_column,
    'Title': titles,
    'Abstract': abstracts,
    'articleType': arttype_column,
    'language': langs,
    'pmcCitationCount': na_column,
}

article_df = pd.DataFrame(article_data)

article_df.head()

Unnamed: 0,PMID,Title,YearPub,Abstract
0,33735179,"Is ""bioinformatics"" dead?",2021,Why would a computational biologist with 40 ye...
1,24348234,Education in computational biology today and t...,2013,
2,28899250,"Again, What Is Nursing Science?",2017,"This article again asks, What is nursing scien..."
3,23012581,Perspectives on an education in computational ...,2012,The mainstream application of massively parall...
4,33228539,Microbiology research at the systems biology a...,2020,


In [None]:
# Keep only the articles that have an abstract. `.copy()` makes the result an
# independent frame, so columns can be added to it later without writing
# through a slice of the unfiltered frame.
article_df = article_df[article_df['Abstract'] != "NA"].copy()

# Abstracts as a NumPy array of unicode strings.
corpus = article_df['Abstract'].values.astype('U')
corpus[2]

In [None]:
Lemmatizer = WordNetLemmatizer()

# Corpus-specific words that are removed in addition to NLTK's English stop words.
# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated ("associated" "risk" -> "associatedrisk").
domain_stopwords = [
    "study", "group", "patient", "used",
    "disease", "effect", "method", "also",
    "result", "two", "may", "level",
    "participant", "treatment", "associated",
    "risk", "however", "year",
    "the", "this", "using", "showed", "analysis", "text",
    "abstract", "figure", "article", "genomics",
    "student", "research", "genetic", "bioinformatics",
    "bioinformatic", "health", "biology", "science",
    "genomic", "data", "education", "genetics",
    "gene", "genome", "nursing", "knowledge", "information",
]

custom_stopwords = set(stopwords.words("english") + domain_stopwords)

# Lemmatize the stop words the same way the abstracts are lemmatized, then
# store them as a de-duplicated list in a stable (sorted) order.
custom_stopwords = sorted({Lemmatizer.lemmatize(word) for word in custom_stopwords})

In [None]:
def clean_text(corpus):
    """Normalise each abstract into a space-separated string of kept lemmas.

    For every text: lower-case it, delete ``string.punctuation`` characters,
    replace every remaining non-letter (digits, newlines, ...) with a space,
    lemmatize each word with the module-level ``Lemmatizer`` and keep the
    lemmas that are longer than three characters and not in
    ``custom_stopwords``.

    Parameters
    ----------
    corpus : iterable of str
        Raw abstract texts.

    Returns
    -------
    list of str
        One cleaned string per input text. Every string starts with a space
        and each kept word is preceded by a space (so a text with no kept
        words yields ``' '``).
    """
    # Build the lookup structures once per call instead of once per word.
    stopword_lookup = set(custom_stopwords)
    drop_punctuation = str.maketrans('', '', string.punctuation)

    abstracts = []
    for line in corpus:
        line = line.lower().translate(drop_punctuation)
        line = re.sub('[^A-Za-z]', ' ', line)

        kept = []
        for word in line.split():
            word = Lemmatizer.lemmatize(word)
            if len(word) > 3 and word not in stopword_lookup:
                kept.append(word)

        # Same layout as before: one leading space plus " word" per kept word.
        abstracts.append(' ' + ''.join(' ' + word for word in kept))
    return abstracts

In [None]:
# Cleaned abstracts, in the same order as `corpus`.
abstract_list = clean_text(corpus)

# Spot-check one cleaned abstract.
abstract_list[2]

In [None]:
def tokenizer(abst_list):
    """Tokenize each text with ``word_tokenize`` and keep tokens longer than 3 characters.

    Parameters
    ----------
    abst_list : iterable of str
        Texts to tokenize.

    Returns
    -------
    list of list of str
        One token list per input text, in input order.
    """
    return [
        [token for token in word_tokenize(document) if len(token) > 3]
        for document in abst_list
    ]

In [None]:
# One token list per abstract.
abstract_tokens = tokenizer(abstract_list)

# Train word embeddings on the tokenized abstracts with a single worker and
# the notebook-wide seed.
model = Word2Vec(
    sentences=abstract_tokens,
    workers=1,
    seed=SEED,
)

In [None]:
def vectorize(list_of_docs, model):
    """Represent each document as the mean of its tokens' word vectors.

    Parameters
    ----------
    list_of_docs : iterable of list of str
        Tokenized documents.
    model : object
        Must expose ``vector_size`` (int) and ``wv``, a mapping-like object
        supporting ``token in model.wv`` and ``model.wv[token]``.

    Returns
    -------
    list of numpy.ndarray
        One vector of length ``model.vector_size`` per document. Documents
        with no token known to ``model.wv`` (including empty documents) get a
        zero vector.
    """
    features = []

    for tokens in list_of_docs:
        # The membership test already guarantees the lookup succeeds.
        vectors = [model.wv[token] for token in tokens if token in model.wv]
        if vectors:
            features.append(np.asarray(vectors).mean(axis=0))
        else:
            # Fresh array per document so results never share storage.
            features.append(np.zeros(model.vector_size))
    return features

# One averaged embedding per abstract.
vectorized_docs = vectorize(abstract_tokens, model=model)

# (number of documents, embedding dimension)
len(vectorized_docs), len(vectorized_docs[0])

In [None]:
# Candidate numbers of clusters to try.
range_n_clusters = list(range(2, 11))

# Fit one KMeans model per candidate, in order (`fit` returns the fitted estimator).
clusters = [
    KMeans(n_clusters=candidate, random_state=5).fit(vectorized_docs)
    for candidate in range_n_clusters
]

# Per-model bookkeeping, aligned index-by-index with `clusters`.
n_cluster = list(range_n_clusters)
inertia_vals = [fitted.inertia_ for fitted in clusters]

range_n_clusters

In [None]:
# Elbow plot: inertia against the number of clusters each model was actually
# fitted with (n_cluster), rather than a separate hand-written range.
plt.plot(n_cluster, inertia_vals, marker='*')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
# Report the silhouette score of every fitted model. Iterating over the lists
# themselves keeps this in step with however many models were fitted.
for cluster_count, fitted_model in zip(n_cluster, clusters):
    print("----------------------")
    print('cluster', cluster_count)
    print(fitted_model)
    print("Silhouette score: ", silhouette_score(vectorized_docs, fitted_model.predict(vectorized_docs)))

In [None]:
# Number of clusters used for the final model.
k = 2

# `fit` returns the estimator itself, so construction and fitting can be chained.
k_means = KMeans(n_clusters=k, random_state=5).fit(vectorized_docs)
k_means

In [None]:
pca = PCA(n_components=2, random_state=5)

# Fit the projection on the documents only ...
reduced_vectors = pca.fit_transform(vectorized_docs)
# ... and reuse that same fitted projection for the cluster centres. Calling
# fit_transform again would refit PCA on the centres alone and place them in
# a different coordinate system from the documents.
reduced_clusters = pca.transform(k_means.cluster_centers_)

In [None]:
# Documents in the 2-D PCA plane, coloured by their k-means label.
doc_x, doc_y = reduced_vectors[:, 0], reduced_vectors[:, 1]
plt.scatter(doc_x, doc_y, c=k_means.labels_)

# Cluster centres drawn on top as large red crosses.
center_x, center_y = reduced_clusters[:, 0], reduced_clusters[:, 1]
plt.scatter(center_x, center_y, marker='x', s=150, c='r')

In [None]:
# Attach the cleaned text and the cluster label to every article. `assign`
# returns a new frame, so nothing is written into a filtered slice in place
# and re-running the cell gives the same result.
article_df = article_df.assign(
    preprocessed_abstract=abstract_list,
    labels=k_means.labels_,
)
article_df['labels'].value_counts()

In [None]:
def get_top_n_words(corpus, n):
    """Return the ``n`` most frequent terms in ``corpus``.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count terms in (tokenized by ``CountVectorizer`` defaults).
    n : int
        Number of terms to return.

    Returns
    -------
    list of tuple
        ``(term, total_count)`` pairs sorted by descending total count.
    """
    vectorizer = CountVectorizer()
    term_counts = vectorizer.fit_transform(corpus)
    # Column sums: total occurrences of each vocabulary term over all documents.
    totals = term_counts.sum(axis=0)
    ranked = sorted(
        ((term, totals[0, column]) for term, column in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]

In [None]:
def get_wordcloud(text):
    """Render ``text`` as a word cloud on a white background and show it.

    Parameters
    ----------
    text : str
        Text whose words are drawn; collocations are disabled.
    """
    cloud = WordCloud(collocations=False, background_color='white')
    cloud.generate(text)

    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

In [None]:
# For every cluster: draw a word cloud of its abstracts and list its most
# frequent words. The loop follows `k`, the number of clusters of the final model.
for clus in range(k):
    cluster_abstracts = article_df.loc[article_df["labels"] == clus, "preprocessed_abstract"]

    # Join with an explicit separator so the last word of one abstract can
    # never be glued to the first word of the next.
    get_wordcloud(' '.join(cluster_abstracts))

    words = [word for word, _count in get_top_n_words(cluster_abstracts, 10)]
    print ("Top 10 words from cluster",clus,":")
    print (words)

In [None]:
# Articles assigned to cluster 0.
article_df.loc[article_df['labels'] == 0]

In [None]:
# Articles assigned to cluster 1.
article_df.loc[article_df['labels'] == 1]

In [None]:
# Set every row's 'Journal' to 1. A scalar is broadcast to all rows, so no
# index alignment is involved (a freshly built Series is indexed 0..n-1 and
# would not line up with a filtered frame).
# NOTE(review): this overwrites the parsed journal names with a constant and
# the original referenced an undefined `df` - confirm this cell is still wanted.
article_df = article_df.assign(Journal=1)