In [15]:
#conda install -c conda-forge spacy-model-en_core_web_sm
import re
import pandas as pd
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

import spacy  # For preprocessing


import logging

logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)


##Stage1: convert pubmed text to csv. this section can be commented if we have the csv file

#convert abstracts to csv file from txt
filename = 'pubmed-genesprote-set.txt' # this was downloaded from pubmed

df = pd.read_csv(filename, sep='\n', header=None)
# Keep track of keywords and abstracts
keyword_list = []
abstract_list = []
pmid_list=[]

keywords = ''
abstract = ''
for index, element in enumerate(df[0]):
    # Enumerate all rows
    if element[0:2] == 'AB':
        # Add to the abstract string
        abstract += element[6:]
        counter = 1
        while True:
            new_element = df[0][index + counter]
            if new_element[0:4] != '    ':
                break
            else:
                abstract += new_element[6:]
                counter += 1
                

    elif element[0:3] == 'OT ':
        # Add to the keyword string, 6 if no *, 7 if *
        if element[6] == '*':
            keywords += ' | ' + element[7:]
        else:
            keywords += ' | ' + element[6:]
            
    elif element[0:4] == 'PMID' and index > 0:
        # If no abstract, add empty fields
        if len(abstract) == 0:
            keyword_list.append('EMPTY')
            abstract_list.append('EMPTY')
        else:
            # New publication, append keywords and abstract and clear
            pmid_list.append(element[6:])
            keyword_list.append(keywords[3:])
            abstract_list.append(abstract)
            
        keywords = ''
        abstract = ''

# Append last keywords and abstract
keyword_list.append(keywords)
abstract_list.append(abstract)

print('Looped through records and found ' + str(len(keyword_list)) + ' abstracts.')
result_df = pd.DataFrame({'abstract': abstract_list, 'keywords': keyword_list})

result_df.to_csv('woundabstracts.csv', index=False, encoding='utf-8')
print(len(pmid_list))
for i in range(0,len(pmid_list),100):    
        f=open('pmids/pmid'+str(i)+'.txt', 'w')
        for line in pmid_list[i:i+99]:
            f.write(line+"\n")
        f.close()

Looped through records and found 10000 abstracts.
8617


In [2]:

#stage 2:

df =pd.read_csv('woundabstracts.csv')

##print(df['abstract'])
df=df.dropna().reset_index(drop=True)
##send=[row.split(' ') for row in df]
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"]) # disabling Named Entity Recognition for speed

# this function removes stop words and lemmatizes the words
def cleaning(doc):
    txt=[token.lemma_ for token in doc if not token.is_stop]
    if len(txt) > 2:
        return ' '.join(txt)
brief_cleaning = (re.sub("[^A-Za-z1-9']+", ' ', str(row)).lower() for row in df['abstract'])
t = time()

txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

df_clean = pd.DataFrame({'abstract': txt})
df_clean = df_clean.dropna().drop_duplicates()
print(df_clean.shape)

# convert the dataframe to list of lists, so that we can use it as input in Word2Vec
sent = [row.split() for row in df_clean['abstract']]
print(sent[:2])

import multiprocessing

from gensim.models import Word2Vec
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

w2v_model = Word2Vec(min_count=3,window=2,vector_size=50,sample=6e-5, alpha=0.03, min_alpha=0.0007, negative=20,workers=cores-1)

t = time()

w2v_model.build_vocab(sent, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

t = time()

w2v_model.train(sent, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

##w2v_model.init_sims(replace=True)

# displays the words that were most similar to the word Wound.
print(w2v_model.wv.most_similar(positive=["wound"]))
##w2v_model.wv.similarity("moe_'s", 'tavern')





Time to clean up everything: 1.61 mins
(2207, 1)
[['wound', 'healing', 'important', 'physiological', 'process', 'maintain', 'integrity', 'skin', 'trauma', 'accident', 'intent', 'procedure', 'normal', 'wound', 'healing', 'involve', 'successive', 'overlap', 'phase', 'include', 'hemostasis', 'inflammatory', 'phase', 'proliferative', 'phase', 'remodeling', 'phase', 'aberration', 'wound', 'healing', 'excessive', 'wound', 'heal', 'hypertrophic', 'scar', 'keloid', 'chronic', 'wound', 'ulcer', 'impair', 'normal', 'physical', 'function', 'large', 'number', 'sophisticated', 'experimental', 'study', 'provide', 'insight', 'wound', 'healing', 'article', 'highlight', 'information', '2', '1', 'main', 'text', 'include', 'wound', 'healing', 'ii', 'wound', 'healing', 'fetus', 'adult', 'iii', 'prostaglandin', 'wound', 'healing', 'iv', 'pathogenesis', 'excessive', 'wound', 'healing', 'v', 'epidemiology', 'excessive', 'wound', 'healing', 'vi', 'vitro', 'vivo', 'study', 'excessive', 'wound', 'healing', 'vii

INFO - 16:55:49: Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=50, alpha=0.03)', 'datetime': '2022-04-08T16:55:49.375604', 'gensim': '4.1.2', 'python': '3.8.12 (default, Oct 12 2021, 03:01:40) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'created'}
INFO - 16:55:49: collecting all words and their counts
INFO - 16:55:49: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 16:55:49: collected 14966 word types from a corpus of 322018 raw words and 2207 sentences
INFO - 16:55:49: Creating a fresh vocabulary
INFO - 16:55:49: Word2Vec lifecycle event {'msg': 'effective_min_count=3 retains 7702 unique words (51.46331685153014%% of original 14966, drops 7264)', 'datetime': '2022-04-08T16:55:49.675145', 'gensim': '4.1.2', 'python': '3.8.12 (default, Oct 12 2021, 03:01:40) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'prepare_vocab'}
INFO - 16:55:49: Word2Vec lifecycle event {'msg': 'ef

Time to build vocab: 0.01 mins


INFO - 16:55:50: worker thread finished; awaiting finish of 6 more threads
INFO - 16:55:50: worker thread finished; awaiting finish of 5 more threads
INFO - 16:55:50: worker thread finished; awaiting finish of 4 more threads
INFO - 16:55:50: worker thread finished; awaiting finish of 3 more threads
INFO - 16:55:50: worker thread finished; awaiting finish of 2 more threads
INFO - 16:55:50: worker thread finished; awaiting finish of 1 more threads
INFO - 16:55:50: worker thread finished; awaiting finish of 0 more threads
INFO - 16:55:50: EPOCH - 1 : training on 322018 raw words (150692 effective words) took 0.4s, 365098 effective words/s
INFO - 16:55:50: worker thread finished; awaiting finish of 6 more threads
INFO - 16:55:50: worker thread finished; awaiting finish of 5 more threads
INFO - 16:55:50: worker thread finished; awaiting finish of 4 more threads
INFO - 16:55:50: worker thread finished; awaiting finish of 3 more threads
INFO - 16:55:50: worker thread finished; awaiting finish

INFO - 16:55:55: EPOCH - 13 : training on 322018 raw words (150720 effective words) took 0.4s, 360041 effective words/s
INFO - 16:55:56: worker thread finished; awaiting finish of 6 more threads
INFO - 16:55:56: worker thread finished; awaiting finish of 5 more threads
INFO - 16:55:56: worker thread finished; awaiting finish of 4 more threads
INFO - 16:55:56: worker thread finished; awaiting finish of 3 more threads
INFO - 16:55:56: worker thread finished; awaiting finish of 2 more threads
INFO - 16:55:56: worker thread finished; awaiting finish of 1 more threads
INFO - 16:55:56: worker thread finished; awaiting finish of 0 more threads
INFO - 16:55:56: EPOCH - 14 : training on 322018 raw words (150890 effective words) took 0.4s, 362988 effective words/s
INFO - 16:55:56: worker thread finished; awaiting finish of 6 more threads
INFO - 16:55:56: worker thread finished; awaiting finish of 5 more threads
INFO - 16:55:56: worker thread finished; awaiting finish of 4 more threads
INFO - 16:

INFO - 16:56:01: worker thread finished; awaiting finish of 1 more threads
INFO - 16:56:01: worker thread finished; awaiting finish of 0 more threads
INFO - 16:56:01: EPOCH - 26 : training on 322018 raw words (150560 effective words) took 0.4s, 373143 effective words/s
INFO - 16:56:01: worker thread finished; awaiting finish of 6 more threads
INFO - 16:56:01: worker thread finished; awaiting finish of 5 more threads
INFO - 16:56:01: worker thread finished; awaiting finish of 4 more threads
INFO - 16:56:01: worker thread finished; awaiting finish of 3 more threads
INFO - 16:56:01: worker thread finished; awaiting finish of 2 more threads
INFO - 16:56:01: worker thread finished; awaiting finish of 1 more threads
INFO - 16:56:01: worker thread finished; awaiting finish of 0 more threads
INFO - 16:56:01: EPOCH - 27 : training on 322018 raw words (150486 effective words) took 0.4s, 375326 effective words/s
INFO - 16:56:02: worker thread finished; awaiting finish of 6 more threads
INFO - 16:

Time to train the model: 0.22 mins
[('healing', 0.8282631635665894), ('heal', 0.7697956562042236), ('accelerate', 0.703738808631897), ('acceleration', 0.700387179851532), ('closure', 0.6994804739952087), ('cutaneous', 0.6984421610832214), ('epithelialization', 0.6783369183540344), ('fast', 0.670318603515625), ('process', 0.6583046913146973), ('prompt', 0.6541516780853271)]


In [3]:
print(w2v_model.wv.most_similar(positive=["wound"]))

[('healing', 0.8282631635665894), ('heal', 0.7697956562042236), ('accelerate', 0.703738808631897), ('acceleration', 0.700387179851532), ('closure', 0.6994804739952087), ('cutaneous', 0.6984421610832214), ('epithelialization', 0.6783369183540344), ('fast', 0.670318603515625), ('process', 0.6583046913146973), ('prompt', 0.6541516780853271)]


In [4]:
#the words that were captured by the model
print(w2v_model.wv.key_to_index)

#returns the probability of finding wound and healing together
w2v_model.wv.similarity("wound", 'healing')


{'wound': 0, 'healing': 1, 'cell': 2, '1': 3, 'group': 4, 'study': 5, 'tissue': 6, 'skin': 7, '2': 8, 'factor': 9, 'expression': 10, 'result': 11, 'effect': 12, 'collagen': 13, 'growth': 14, 'treatment': 15, 'increase': 16, 'day': 17, 'protein': 18, 'show': 19, '3': 20, 'control': 21, '5': 22, 'fibroblast': 23, 'process': 24, 'mouse': 25, 'model': 26, 'proliferation': 27, 'promote': 28, 'treat': 29, 'migration': 30, 'rat': 31, 'compare': 32, 'level': 33, 'significantly': 34, 'p': 35, 'human': 36, 'high': 37, 'repair': 38, '4': 39, 'induce': 40, 'inflammatory': 41, 'role': 42, 'diabetic': 43, 'method': 44, 'heal': 45, 'potential': 46, 'gene': 47, 'hydrogel': 48, 'vitro': 49, 'formation': 50, 'activity': 51, '6': 52, 'analysis': 53, '7': 54, 'il': 55, 'vivo': 56, 'conclusion': 57, 'evaluate': 58, 'matrix': 59, 'regeneration': 60, 'keratinocyte': 61, 'demonstrate': 62, 'patient': 63, 'tgf': 64, 'injury': 65, 'investigate': 66, 'accelerate': 67, 'application': 68, 'scaffold': 69, 'time': 7

0.8282633

In [5]:
# read_file = pd.read_csv ('proteins.txt',sep='\n',header=None)
# read_file.columns = ['Name']
# read_file.to_csv ('protein.csv', index=None)


In [6]:
# read_file = pd.read_csv ('genes.txt',sep='\n',header=None)
# read_file.columns = ['Name']
# read_file.to_csv ('gene.csv', index=None)

In [18]:
#read the list of protein and gene names
proteins=pd.read_csv('protein.csv')
genes=pd.read_csv('gene.csv')
genes=genes.dropna().reset_index(drop=True)

Unnamed: 0,Name
0,17betaHSD
1,ABHD10
2,ABHD11
3,ABRAXAS2
4,ACADL
...,...
1244,ZNHIT1
1245,ZNHIT2
1246,ZSWIM1
1247,ZSWIM2


In [8]:
# Now check the similarity of each gene with the word Wound.
brief_cleaning = (re.sub(" ", '', str(row)).lower() for row in genes['Name'])
genes=pd.DataFrame({'Name': brief_cleaning})
woundgenelist=[]
for index, row in genes. iterrows():
    try:
        #print(row['Name']+"--")
        print(w2v_model.wv.similarity("wound",row['Name'])," ",end='')
    except:
        continue
    print(row['Name'])
    woundgenelist.append(row['Name'])

-0.06210877  antigen
0.33054984  apc
0.18564358  ar
0.4115094  ccr4
0.29456285  chaperone
0.31324786  crtac1
0.34622633  cytokine
0.29174197  ecd
0.070893124  enzyme
0.2568543  fas
0.3806056  fgg
-0.049625967  fusion
0.29773426  mutant
0.16334134  mutation
0.028555278  genome
0.43478543  genotype
0.30309278  gst
0.33079416  hbegf
0.19336562  human
0.10279703  kit
0.2797026  mal
0.25706524  microrna
0.3627792  mmp
0.3532332  nat
0.18404989  neuropeptide
0.40758902  oat
0.2597593  pdlim5
0.14127308  piezo1
0.09608696  poly
0.26757577  sea
0.48015898  smoc1
0.08402481  structural
0.34528744  tlr5


In [9]:
# Now check the similarity of each protein with the word Wound.
brief_cleaning = (re.sub(" ", '', str(row)).lower() for row in proteins['Name'])
proteins=pd.DataFrame({'Name': brief_cleaning})
woundproteinlist=[]
for index, row in proteins. iterrows():
    try:
        print(w2v_model.wv.similarity("wound",row['Name'])," ",end='')
    except:
        continue
    print(row['Name'])
    woundproteinlist.append(row['Name'])

0.13513903  adhesion
0.1056684  adhesive
0.13862103  alpha
0.40003067  amyloid
0.4055408  animal
0.1523302  beta
0.2953376  coagulation
0.20858525  collagen
0.14605263  cytokeratin
0.34622633  cytokine
0.349051  dysf
0.23021291  dysferlin
0.11935705  elastin
0.39132467  erythropoietin
0.04685525  f
0.31176072  ferritin
0.019891318  fibrinogen
-0.049625967  fusion
0.067769736  glycosaminoglycans
0.08188351  hemoglobin
0.104874805  heparin
0.41047177  insulin
0.14686608  keratin
0.2351565  latex
0.15533501  lectin
0.3486858  lens
0.49209905  lyst
0.068279795  messenger
0.27521804  osteocalcin
0.14280039  osteopontin
0.447207  perforin
0.30273432  pigment
0.30152965  prealbumin
0.14161403  procollagen
0.00023548305  c
0.00023548305  c
0.19592702  carbonyl
-0.011103079  family
0.034150273  hormone
0.11046398  nitrogen
0.1605269  pattern
0.27179062  s
-0.03196948  proteoglycan
0.20911288  proteome
0.01478523  receptor
0.34165075  respiratory
0.3670912  scaffold
0.27464893  secretory
0.19039

In [17]:
##Now check the similarity between each protein and genes that were similar with wound.
print("\t    Protein\tgene\n")
for i in woundgenelist:
    for j in woundproteinlist:
        print(w2v_model.wv.similarity(i,j)," ",end=' ')
        print(j,"--",i)
        

	    Protein	gene

0.42242092   adhesion -- antigen
0.12334871   adhesive -- antigen
0.5473466   alpha -- antigen
0.5020989   amyloid -- antigen
-0.022818007   animal -- antigen
0.5165346   beta -- antigen
0.33238888   coagulation -- antigen
0.19451895   collagen -- antigen
0.76373535   cytokeratin -- antigen
0.26335105   cytokine -- antigen
0.52053416   dysf -- antigen
0.5183158   dysferlin -- antigen
0.13652647   elastin -- antigen
0.5036062   erythropoietin -- antigen
0.35378256   f -- antigen
0.5239065   ferritin -- antigen
0.24725594   fibrinogen -- antigen
0.25909555   fusion -- antigen
0.31444257   glycosaminoglycans -- antigen
0.2744521   hemoglobin -- antigen
0.17329979   heparin -- antigen
0.16390018   insulin -- antigen
0.19330584   keratin -- antigen
0.34998152   latex -- antigen
0.25534445   lectin -- antigen
0.4498467   lens -- antigen
0.40016904   lyst -- antigen
0.49210125   messenger -- antigen
0.6751852   osteocalcin -- antigen
0.59878635   osteopontin -- antigen
0.44

0.48263395   nitrogen -- crtac1
0.44097087   pattern -- crtac1
0.26054022   s -- crtac1
0.529034   proteoglycan -- crtac1
0.744551   proteome -- crtac1
0.46895227   receptor -- crtac1
0.7904924   respiratory -- crtac1
0.44368643   scaffold -- crtac1
0.72242874   secretory -- crtac1
0.3177041   silk -- crtac1
0.51964223   smad2 -- crtac1
0.77826613   smad4 -- crtac1
0.5021701   structural -- crtac1
0.44997108   synthetic -- crtac1
0.3861274   thymosin -- crtac1
0.74236155   transferrin -- crtac1
0.73028445   tropomyosin -- crtac1
0.8859427   ubiquitin -- crtac1
0.75067174   urine -- crtac1
0.63361037   vimentin -- crtac1
0.73717654   vitronectin -- crtac1
0.30460393   adhesion -- cytokine
0.33211583   adhesive -- cytokine
0.3956407   alpha -- cytokine
0.39468342   amyloid -- cytokine
-0.0040859003   animal -- cytokine
0.47276518   beta -- cytokine
0.49439934   coagulation -- cytokine
0.17959036   collagen -- cytokine
0.26881233   cytokeratin -- cytokine
1.0   cytokine -- cytokine
0.2513

0.05229403   insulin -- mutant
0.1502271   keratin -- mutant
0.25114122   latex -- mutant
0.3797458   lectin -- mutant
0.54171646   lens -- mutant
0.8153476   lyst -- mutant
0.28888237   messenger -- mutant
0.3738416   osteocalcin -- mutant
0.46726403   osteopontin -- mutant
0.5821438   perforin -- mutant
0.53926075   pigment -- mutant
0.32724082   prealbumin -- mutant
0.36789304   procollagen -- mutant
0.29133058   c -- mutant
0.29133058   c -- mutant
0.12761518   carbonyl -- mutant
0.16984954   family -- mutant
-0.028392624   hormone -- mutant
0.11737634   nitrogen -- mutant
0.38281408   pattern -- mutant
0.22504093   s -- mutant
0.18661433   proteoglycan -- mutant
0.49130544   proteome -- mutant
0.3509711   receptor -- mutant
0.47767442   respiratory -- mutant
0.11967364   scaffold -- mutant
0.34800285   secretory -- mutant
0.13463151   silk -- mutant
0.3873355   smad2 -- mutant
0.51329386   smad4 -- mutant
0.20582213   structural -- mutant
0.05133016   synthetic -- mutant
0.0382812

0.70417273   smad4 -- hbegf
0.32957345   structural -- hbegf
0.35942653   synthetic -- hbegf
0.42811558   thymosin -- hbegf
0.7423674   transferrin -- hbegf
0.6365121   tropomyosin -- hbegf
0.8118584   ubiquitin -- hbegf
0.56709856   urine -- hbegf
0.49401   vimentin -- hbegf
0.7221571   vitronectin -- hbegf
0.31507346   adhesion -- human
0.17457302   adhesive -- human
0.09930311   alpha -- human
0.28690025   amyloid -- human
0.19294366   animal -- human
0.22018862   beta -- human
0.14751928   coagulation -- human
0.068775184   collagen -- human
0.16240235   cytokeratin -- human
0.270767   cytokine -- human
0.31429404   dysf -- human
0.24021865   dysferlin -- human
0.089630365   elastin -- human
0.5335285   erythropoietin -- human
0.055157267   f -- human
0.32020834   ferritin -- human
0.21022072   fibrinogen -- human
0.3370374   fusion -- human
0.12764898   glycosaminoglycans -- human
-0.056650043   hemoglobin -- human
0.3460936   heparin -- human
0.22068642   insulin -- human
0.43800

0.6835297   erythropoietin -- neuropeptide
0.06925214   f -- neuropeptide
0.60147107   ferritin -- neuropeptide
0.43240476   fibrinogen -- neuropeptide
0.50457954   fusion -- neuropeptide
0.4774242   glycosaminoglycans -- neuropeptide
0.12046111   hemoglobin -- neuropeptide
0.40698087   heparin -- neuropeptide
0.4685275   insulin -- neuropeptide
0.40844142   keratin -- neuropeptide
0.53293216   latex -- neuropeptide
0.62940407   lectin -- neuropeptide
0.5237194   lens -- neuropeptide
0.6038368   lyst -- neuropeptide
0.34307766   messenger -- neuropeptide
0.64607173   osteocalcin -- neuropeptide
0.46995285   osteopontin -- neuropeptide
0.6133086   perforin -- neuropeptide
0.5961139   pigment -- neuropeptide
0.5248334   prealbumin -- neuropeptide
0.43380034   procollagen -- neuropeptide
0.3546018   c -- neuropeptide
0.3546018   c -- neuropeptide
0.16793288   carbonyl -- neuropeptide
0.6117805   family -- neuropeptide
0.5367788   hormone -- neuropeptide
0.24189326   nitrogen -- neuropepti

0.8107159   smad4 -- smoc1
0.35609162   structural -- smoc1
0.18851025   synthetic -- smoc1
0.3831611   thymosin -- smoc1
0.7959815   transferrin -- smoc1
0.7497201   tropomyosin -- smoc1
0.8758912   ubiquitin -- smoc1
0.61506045   urine -- smoc1
0.5578456   vimentin -- smoc1
0.66972816   vitronectin -- smoc1
0.4988346   adhesion -- structural
0.5078004   adhesive -- structural
0.08261898   alpha -- structural
0.40871593   amyloid -- structural
0.041766018   animal -- structural
0.18644518   beta -- structural
0.51160145   coagulation -- structural
0.49271947   collagen -- structural
0.28604117   cytokeratin -- structural
0.21414034   cytokine -- structural
0.38988984   dysf -- structural
0.33480558   dysferlin -- structural
0.47010994   elastin -- structural
0.17903677   erythropoietin -- structural
0.090868145   f -- structural
0.23386484   ferritin -- structural
0.4923634   fibrinogen -- structural
0.2246244   fusion -- structural
0.5947753   glycosaminoglycans -- structural
0.08166