In [1]:
#conda install -c conda-forge spacy-model-en_core_web_sm
import re
import pandas as pd
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

import spacy  # For preprocessing


import logging

logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

# Convert the abstracts of a MEDLINE-format text export into a CSV file
filename = 'pubmed-genesprote-set.txt' # Change filename to match your file!


def _close_record(abstract_parts, keyword_parts):
    """Return the ``(abstract, keywords)`` pair for one finished record.

    A record without abstract text yields ``('EMPTY', 'EMPTY')`` so that the
    two output columns always stay aligned.
    """
    if not abstract_parts:
        return 'EMPTY', 'EMPTY'
    return ' '.join(abstract_parts), ' | '.join(keyword_parts)


def parse_medline_abstracts(lines):
    """Collect the abstract and keywords of every record in a MEDLINE export.

    Parameters
    ----------
    lines : iterable of str
        Lines of the export, without their trailing newline.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(abstract_list, keyword_list)`` with one entry per record. A new
        record starts at each line beginning with ``PMID``. Keywords of a
        record are joined with ``' | '``.
    """
    abstract_list, keyword_list = [], []
    abstract_parts, keyword_parts = [], []
    in_abstract = False   # True while inside an AB field and its continuation lines
    record_open = False   # True once a PMID line has been seen

    for line in lines:
        if line.startswith('    '):
            # Continuation line: only the abstract is continued across lines.
            # Pieces are stripped and later joined with a single space so the
            # words on either side of a line break are not fused together.
            if in_abstract:
                piece = line[6:].strip()
                if piece:
                    abstract_parts.append(piece)
            continue

        # Any line carrying a field tag ends the current abstract.
        in_abstract = False

        if line.startswith('PMID'):
            # A new publication starts: store the previous one, then reset.
            if record_open or abstract_parts:
                abstract, keywords = _close_record(abstract_parts, keyword_parts)
                abstract_list.append(abstract)
                keyword_list.append(keywords)
            abstract_parts, keyword_parts = [], []
            record_open = True
        elif line.startswith('AB'):
            piece = line[6:].strip()
            if piece:
                abstract_parts.append(piece)
            in_abstract = True
        elif line.startswith('OT '):
            # The keyword text starts at column 6; drop a leading '*' marker.
            # Slicing (rather than indexing) keeps short lines from raising.
            keyword = line[6:]
            if keyword.startswith('*'):
                keyword = keyword[1:]
            keyword = keyword.strip()
            if keyword:
                keyword_parts.append(keyword)

    # Store the last record through the same path as all the others, so it
    # gets the same separator handling and the same 'EMPTY' placeholder.
    if record_open or abstract_parts:
        abstract, keywords = _close_record(abstract_parts, keyword_parts)
        abstract_list.append(abstract)
        keyword_list.append(keywords)

    return abstract_list, keyword_list


# Read the export line by line. pd.read_csv(..., sep='\n') is avoided because
# newer pandas releases reject '\n' as a separator with a ValueError.
with open(filename, encoding='utf-8') as medline_file:
    medline_lines = [line.rstrip('\r\n') for line in medline_file]

# Keep track of keywords and abstracts
abstract_list, keyword_list = parse_medline_abstracts(medline_lines)

print('Looped through records and found ' + str(len(keyword_list)) + ' abstracts.')
result_df = pd.DataFrame({'abstract': abstract_list, 'keywords': keyword_list})

result_df.to_csv('woundabstracts.csv', index=False, encoding='utf-8')


Looped through records and found 10000 abstracts.


In [2]:

# Load the abstracts CSV, discard rows with a missing value and renumber the
# remaining rows from zero.
df = (
    pd.read_csv('woundabstracts.csv')
    .dropna()
    .reset_index(drop=True)
)

# English spaCy pipeline with the "ner" and "parser" components disabled for speed.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
def cleaning(doc):
    """Lemmatise a document and drop its stop words.

    Parameters
    ----------
    doc : iterable of tokens
        Each token must expose ``lemma_`` and ``is_stop``.

    Returns
    -------
    str or None
        The remaining lemmas joined by single spaces, or ``None`` when two or
        fewer lemmas remain.
    """
    lemmas = [token.lemma_ for token in doc if not token.is_stop]
    if len(lemmas) <= 2:
        # Too short to be a useful training sentence.
        return None
    return ' '.join(lemmas)
# Lazily normalise each abstract: every run of characters other than ASCII
# letters and apostrophes becomes one space, then the text is lower-cased.
brief_cleaning = (
    re.sub("[^A-Za-z']+", ' ', str(raw_abstract)).lower()
    for raw_abstract in df['abstract']
)

t = time()

# Feed the normalised texts through spaCy in batches and lemmatise each document.
txt = list(map(cleaning, nlp.pipe(brief_cleaning, batch_size=5000)))

elapsed_minutes = round((time() - t) / 60, 2)
print('Time to clean up everything: {} mins'.format(elapsed_minutes))

# cleaning() returns None for documents that end up too short; dropna()
# removes those rows before duplicate abstracts are discarded.
df_clean = pd.DataFrame({'abstract': txt}).dropna().drop_duplicates()
print(df_clean.shape)

# One list of tokens per remaining abstract.
sent = [cleaned.split() for cleaned in df_clean['abstract']]
print(sent[:2])
import multiprocessing

from gensim.models import Word2Vec

cores = multiprocessing.cpu_count()  # Count the number of cores in a computer

# Leave one core free for the rest of the system, but always keep at least one
# worker: `cores - 1` is 0 on a single-core machine, which would leave no
# worker thread to do the training.
w2v_model = Word2Vec(
    min_count=3,
    window=2,
    vector_size=50,
    sample=6e-5,
    alpha=0.03,
    min_alpha=0.0007,
    negative=20,
    workers=max(1, cores - 1),
)

# Build the vocabulary from the tokenised abstracts.
t = time()

w2v_model.build_vocab(sent, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

# Train on the same corpus for 30 epochs.
t = time()

w2v_model.train(sent, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

# Quick sanity check: tokens most similar to "wound".
print(w2v_model.wv.most_similar(positive=["wound"]))





Time to clean up everything: 0.57 mins
(2207, 1)
[['wound', 'healing', 'important', 'physiological', 'process', 'maintain', 'integrity', 'skin', 'trauma', 'accident', 'intent', 'procedure', 'normal', 'wound', 'healing', 'involve', 'successive', 'overlap', 'phase', 'include', 'hemostasis', 'inflammatory', 'phase', 'proliferative', 'phase', 'remodeling', 'phase', 'aberration', 'wound', 'healing', 'excessive', 'wound', 'heal', 'hypertrophic', 'scar', 'keloid', 'chronic', 'wound', 'ulcer', 'impair', 'normal', 'physical', 'function', 'large', 'number', 'sophisticated', 'experimental', 'study', 'provide', 'insight', 'wound', 'healing', 'article', 'highlight', 'information', 'main', 'text', 'include', 'wound', 'healing', 'ii', 'wound', 'healing', 'fetus', 'adult', 'iii', 'prostaglandin', 'wound', 'healing', 'iv', 'pathogenesis', 'excessive', 'wound', 'healing', 'v', 'epidemiology', 'excessive', 'wound', 'healing', 'vi', 'vitro', 'vivo', 'study', 'excessive', 'wound', 'healing', 'vii', 'stem',

INFO - 19:53:58: Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=50, alpha=0.03)', 'datetime': '2022-04-07T19:53:58.301609', 'gensim': '4.1.2', 'python': '3.8.12 (default, Oct 12 2021, 03:01:40) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'created'}
INFO - 19:53:58: collecting all words and their counts
INFO - 19:53:58: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 19:53:58: collected 13655 word types from a corpus of 306191 raw words and 2207 sentences
INFO - 19:53:58: Creating a fresh vocabulary
INFO - 19:53:58: Word2Vec lifecycle event {'msg': 'effective_min_count=3 retains 7208 unique words (52.786525082387406%% of original 13655, drops 6447)', 'datetime': '2022-04-07T19:53:58.403567', 'gensim': '4.1.2', 'python': '3.8.12 (default, Oct 12 2021, 03:01:40) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'prepare_vocab'}
INFO - 19:53:58: Word2Vec lifecycle event {'msg': 'e

Time to build vocab: 0.0 mins


INFO - 19:53:58: worker thread finished; awaiting finish of 4 more threads
INFO - 19:53:58: worker thread finished; awaiting finish of 3 more threads
INFO - 19:53:58: worker thread finished; awaiting finish of 2 more threads
INFO - 19:53:58: worker thread finished; awaiting finish of 1 more threads
INFO - 19:53:58: worker thread finished; awaiting finish of 0 more threads
INFO - 19:53:58: EPOCH - 1 : training on 306191 raw words (142700 effective words) took 0.2s, 726606 effective words/s
INFO - 19:53:59: worker thread finished; awaiting finish of 6 more threads
INFO - 19:53:59: worker thread finished; awaiting finish of 5 more threads
INFO - 19:53:59: worker thread finished; awaiting finish of 4 more threads
INFO - 19:53:59: worker thread finished; awaiting finish of 3 more threads
INFO - 19:53:59: worker thread finished; awaiting finish of 2 more threads
INFO - 19:53:59: worker thread finished; awaiting finish of 1 more threads
INFO - 19:53:59: worker thread finished; awaiting finish

INFO - 19:54:02: worker thread finished; awaiting finish of 6 more threads
INFO - 19:54:02: worker thread finished; awaiting finish of 5 more threads
INFO - 19:54:02: worker thread finished; awaiting finish of 4 more threads
INFO - 19:54:02: worker thread finished; awaiting finish of 3 more threads
INFO - 19:54:02: worker thread finished; awaiting finish of 2 more threads
INFO - 19:54:02: worker thread finished; awaiting finish of 1 more threads
INFO - 19:54:02: worker thread finished; awaiting finish of 0 more threads
INFO - 19:54:02: EPOCH - 14 : training on 306191 raw words (142420 effective words) took 0.3s, 548794 effective words/s
INFO - 19:54:02: worker thread finished; awaiting finish of 6 more threads
INFO - 19:54:02: worker thread finished; awaiting finish of 5 more threads
INFO - 19:54:02: worker thread finished; awaiting finish of 4 more threads
INFO - 19:54:02: worker thread finished; awaiting finish of 3 more threads
INFO - 19:54:02: worker thread finished; awaiting finis

INFO - 19:54:05: EPOCH - 26 : training on 306191 raw words (142475 effective words) took 0.2s, 593401 effective words/s
INFO - 19:54:05: worker thread finished; awaiting finish of 6 more threads
INFO - 19:54:05: worker thread finished; awaiting finish of 5 more threads
INFO - 19:54:05: worker thread finished; awaiting finish of 4 more threads
INFO - 19:54:05: worker thread finished; awaiting finish of 3 more threads
INFO - 19:54:05: worker thread finished; awaiting finish of 2 more threads
INFO - 19:54:05: worker thread finished; awaiting finish of 1 more threads
INFO - 19:54:05: worker thread finished; awaiting finish of 0 more threads
INFO - 19:54:05: EPOCH - 27 : training on 306191 raw words (142726 effective words) took 0.2s, 582561 effective words/s
INFO - 19:54:06: worker thread finished; awaiting finish of 6 more threads
INFO - 19:54:06: worker thread finished; awaiting finish of 5 more threads
INFO - 19:54:06: worker thread finished; awaiting finish of 4 more threads
INFO - 19:

Time to train the model: 0.14 mins
[('healing', 0.8176631331443787), ('heal', 0.7974416613578796), ('acceleration', 0.7646549344062805), ('wh', 0.7627471089363098), ('cutaneous', 0.7290781140327454), ('accelerate', 0.7165963053703308), ('vob', 0.7119753956794739), ('prompt', 0.7103709578514099), ('fast', 0.7019315958023071), ('reepithelization', 0.6986245512962341)]


In [3]:
# Show the tokens most similar to "wound" together with their similarity scores.
print(w2v_model.wv.most_similar(positive=["wound"]))

[('healing', 0.8176631331443787), ('heal', 0.7974416613578796), ('acceleration', 0.7646549344062805), ('wh', 0.7627471089363098), ('cutaneous', 0.7290781140327454), ('accelerate', 0.7165963053703308), ('vob', 0.7119753956794739), ('prompt', 0.7103709578514099), ('fast', 0.7019315958023071), ('reepithelization', 0.6986245512962341)]


In [4]:
# Print the complete token -> index mapping of the trained vocabulary.
word_vectors = w2v_model.wv
print(word_vectors.key_to_index)

# Similarity between the vectors of "wound" and "healing".
word_vectors.similarity("wound", 'healing')


{'wound': 0, 'healing': 1, 'cell': 2, 'group': 3, 'study': 4, 'tissue': 5, 'skin': 6, 'factor': 7, 'expression': 8, 'result': 9, 'effect': 10, 'collagen': 11, 'growth': 12, 'treatment': 13, 'increase': 14, 'day': 15, 'protein': 16, 'show': 17, 'control': 18, 'fibroblast': 19, 'process': 20, 'p': 21, 'mouse': 22, 'model': 23, 'proliferation': 24, 'promote': 25, 'treat': 26, 'migration': 27, 'rat': 28, 'compare': 29, 'level': 30, 'significantly': 31, 'human': 32, 'high': 33, 'repair': 34, 'induce': 35, 'inflammatory': 36, 'role': 37, 'diabetic': 38, 'method': 39, 'heal': 40, 'il': 41, 'potential': 42, 'gene': 43, 'hydrogel': 44, 'vitro': 45, 'formation': 46, 'activity': 47, 'analysis': 48, 'vivo': 49, 'conclusion': 50, 'evaluate': 51, 'matrix': 52, 'regeneration': 53, 'keratinocyte': 54, 'demonstrate': 55, 'patient': 56, 'tgf': 57, 'injury': 58, 'investigate': 59, 'accelerate': 60, 'application': 61, 'scaffold': 62, 'time': 63, 'type': 64, 'mechanism': 65, 'response': 66, 'signal': 67, '

0.8176631

In [5]:
# read_file = pd.read_csv ('proteins.txt',sep='\n',header=None)
# read_file.columns = ['Name']
# read_file.to_csv ('protein.csv', index=None)


In [6]:
# read_file = pd.read_csv ('genes.txt',sep='\n',header=None)
# read_file.columns = ['Name']
# read_file.to_csv ('gene.csv', index=None)

In [7]:
# Load the protein and gene name tables (later cells read their 'Name' column).
proteins, genes = (
    pd.read_csv(csv_path) for csv_path in ('protein.csv', 'gene.csv')
)

In [8]:
# Normalise each gene name the same way on every run: remove spaces and
# lower-case. A list (not a one-shot generator) is handed to DataFrame.
genes = pd.DataFrame(
    {'Name': [re.sub(" ", '', str(name)).lower() for name in genes['Name']]}
)

# Print the similarity to "wound" for every gene name known to the model.
for name in genes['Name']:
    # Skip names that are not in the vocabulary. An explicit membership test
    # replaces the former bare `except:`, which also hid unrelated errors
    # (for example "wound" itself missing from the model).
    if name not in w2v_model.wv.key_to_index:
        continue
    print(w2v_model.wv.similarity("wound", name), " ", end='')
    print(name)

0.020083994  antigen
0.46437544  apc
0.15897432  ar
0.34920043  chaperone
0.24896464  cyp
0.35889423  cytokine
0.323793  ecd
0.052909985  enzyme
0.32212725  fas
0.4042222  fgg
-0.07143191  fusion
0.18506783  mutant
0.104458034  mutation
0.012628656  genome
0.45848256  genotype
0.22674103  gst
0.2798312  hbegf
0.28035277  human
0.07933154  kit
0.31689298  mal
0.32676414  microrna
0.36117655  mmp
0.3533918  nat
0.13698491  neuropeptide
0.38684675  oat
0.13907573  poly
0.29004866  sea
0.10615881  structural


In [9]:
# Normalise each protein name the same way on every run: remove spaces and
# lower-case. A list (not a one-shot generator) is handed to DataFrame.
proteins = pd.DataFrame(
    {'Name': [re.sub(" ", '', str(name)).lower() for name in proteins['Name']]}
)

# Print the similarity to "wound" for every protein name known to the model.
for name in proteins['Name']:
    # Skip names that are not in the vocabulary. An explicit membership test
    # replaces the former bare `except:`, which also hid unrelated errors
    # (for example "wound" itself missing from the model).
    if name not in w2v_model.wv.key_to_index:
        continue
    print(w2v_model.wv.similarity("wound", name), " ", end='')
    print(name)

0.18761311  adhesion
0.07572391  adhesive
0.15370989  alpha
0.39958796  amyloid
0.45824456  animal
0.1982137  beta
0.39357454  coagulation
0.22080112  collagen
0.22113352  cytokeratin
0.35889423  cytokine
0.36413145  dysf
0.24617344  dysferlin
0.09992669  elastin
0.5616356  erythropoietin
0.037524328  f
0.35292676  ferritin
0.027093388  fibrinogen
-0.07143191  fusion
-0.0007917881  glycosaminoglycans
0.4721472  gse
0.02578574  hemoglobin
0.03645341  heparin
0.21898253  insulin
0.09609364  keratin
0.36049956  latex
0.06845178  lectin
0.37183213  lens
0.4754085  lyst
0.03888295  messenger
0.3459046  osteocalcin
0.11132097  osteopontin
0.516757  perforin
0.3115009  pigment
0.24592942  prealbumin
0.15414575  procollagen
0.03216675  c
0.03216675  c
0.1431383  carbonyl
-0.005921446  family
0.034056917  hormone
0.04802285  nitrogen
0.25875115  pattern
0.15133888  s
-0.14701825  proteoglycan
0.21725295  proteome
-0.074939415  receptor
0.42006296  respiratory
0.3763293  scaffold
0.28624082  sec

In [10]:
# Similarity between the vectors of "cytokine" and "collagen" (displayed as the cell result).
w2v_model.wv.similarity("cytokine", 'collagen')

0.20952192