As part of training a model, I included word vectors in the training data to improve performance. 
The word vectors were created from the text of the original research publications and saved as word_vecs.txt. 

In [1]:
import spacy
import string
import csv

import pandas as pd
import numpy as np
import re
import glob
import random
from random import sample
import json
import jsonlines
import pysbd
seg = pysbd.Segmenter(language='en', clean=False)
import spacy
from spacy import displacy

In [2]:
# Collect the path of every entry under ./papers_train; each path is later
# opened and parsed as JSON by the extraction helpers.
papers_train = glob.glob('./papers_train/*')
# NOTE(review): the message says "TEST" although the paths come from
# ./papers_train — confirm which label is intended.
print(f"Creating Vectors using {len(papers_train)} TEST papers")

Creating Vectors using 400 TEST papers


In [3]:
## Pull the figure/table entries (FIGREF*/TABREF* keys) from each publication's `ref_entries`

def RefEntriesParse(file):
    """Return the figure/table reference keys found in one paper's JSON file.

    Parameters
    ----------
    file : str
        Path to a JSON file containing a ``pdf_parse.ref_entries`` mapping.

    Returns
    -------
    list of str
        The keys of ``ref_entries`` that start with ``"FIGREF"`` or
        ``"TABREF"``, in the order they appear in the file.

    Raises
    ------
    KeyError
        If ``pdf_parse`` or ``ref_entries`` is missing from the document.
    """
    # Read as UTF-8 explicitly: without `encoding`, `open` uses the platform
    # default, which can fail or mis-decode non-ASCII text.
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # Only the keys are needed, so iterate the mapping directly.
    return [key for key in data["pdf_parse"]["ref_entries"]
            if key.startswith(("FIGREF", "TABREF"))]

def json2text(file, refID):
    """Load one ``ref_entries`` record of a paper as a pandas DataFrame.

    Note: a later cell defines another ``json2text`` that takes only a path;
    once that cell has run, this two-argument version is shadowed.

    Parameters
    ----------
    file : str
        Path to a JSON file containing a ``pdf_parse.ref_entries`` mapping.
    refID : str
        Key of the entry to load (e.g. one returned by ``RefEntriesParse``).

    Returns
    -------
    pandas.DataFrame
        ``pd.json_normalize`` of the selected entry.

    Raises
    ------
    KeyError
        If ``pdf_parse``, ``ref_entries`` or ``refID`` is missing.
    """
    # Read as UTF-8 explicitly: without `encoding`, `open` uses the platform
    # default, which can fail or mis-decode non-ASCII text.
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return pd.json_normalize(data["pdf_parse"]["ref_entries"][refID])

def text2sentences(ref_text_df):
    """Join the ``text`` column of a reference-entry frame into one string.

    Rows are concatenated in order, separated by a single space.
    """
    pieces = [chunk for chunk in ref_text_df.text]
    joined = " ".join(pieces)
    return joined

def sentences2sentencelist(text):
    """Split `text` into cleaned sentences using the module-level segmenter `seg`.

    Each sentence has its leading non-word characters removed and every run
    of whitespace collapsed to a single space.  Returns a list of strings.
    """
    cleaned = []
    for raw_sentence in seg.segment(text):
        without_leading_junk = re.sub(r"^\W+", "", raw_sentence)
        cleaned.append(re.sub(r"\s+", " ", without_leading_junk))
    return cleaned



In [4]:
# Build the sentence corpus, starting with the figure/table reference entries
# of every paper.
mylist = []
for file in papers_train:
    reflist = RefEntriesParse(file)
    for refID in reflist:
        # Best-effort: an entry that cannot be processed is reported and
        # skipped.  Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop the loop.
        try:
            ref_text_df = json2text(file, refID)
            sentences_text = text2sentences(ref_text_df)
            sentencelist = sentences2sentencelist(sentences_text)
            mylist.extend(sentencelist)
        except Exception:
            print(file + " returns error.")

In [5]:
# Number of sentences collected so far (after the reference-entry pass).
print(len(mylist))

21795


In [6]:
## Pull data from the publications' body text

def json2text(json_file_path):
    """Load the body text of one paper as a pandas DataFrame.

    Note: this definition replaces the earlier two-argument ``json2text``
    (file, refID) defined in a previous cell.

    Parameters
    ----------
    json_file_path : str
        Path to a JSON file containing a ``pdf_parse.body_text`` entry.

    Returns
    -------
    pandas.DataFrame
        ``pd.json_normalize`` of ``pdf_parse.body_text``.

    Raises
    ------
    KeyError
        If ``pdf_parse`` or ``body_text`` is missing from the document.
    """
    # Read as UTF-8 explicitly: without `encoding`, `open` uses the platform
    # default, which can fail or mis-decode non-ASCII text.
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return pd.json_normalize(data["pdf_parse"]["body_text"])

def text2sentences(body_text_df):
    """Merge every body-text paragraph into one space-separated string.

    Reads the ``text`` column of `body_text_df` in row order.  The text is
    returned unmodified; parenthesised spans are not stripped.
    """
    paragraphs = tuple(body_text_df.text)
    return " ".join(paragraphs)

def sentences2sentencelist(text):
    """Segment `text` with the module-level `seg` and normalise each sentence.

    This repeats the definition from the earlier cell: leading non-word
    characters are dropped and whitespace runs become a single space.
    Returns a list of strings.
    """
    def _tidy(sentence):
        # Strip leading non-word characters first, then collapse whitespace.
        stripped = re.sub(r"^\W+", "", sentence)
        return re.sub(r"\s+", " ", stripped)

    return [_tidy(sentence) for sentence in seg.segment(text)]

In [7]:
# Second pass: append the body-text sentences of every paper to `mylist`.
# NOTE: this extends the existing list, so re-running this cell on its own
# adds the body-text sentences a second time.
for file in papers_train:
    # Best-effort: a paper that cannot be processed is reported and skipped.
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still stop the loop.
    try:
        body_text_df = json2text(file)  # DataFrame of body-text paragraphs
        sentencetext = text2sentences(body_text_df)  # one long string
        sentencelist = sentences2sentencelist(sentencetext)  # list of strings
        mylist.extend(sentencelist)
    except Exception:
        print(file + " returns error.")

./papers_train\0e8472a7-dd33-2379-a88f-58c1237d7b90.json returns error.


In [8]:
# Total number of sentences after the body-text pass has been appended.
print(len(mylist))

115544


In [9]:
## Start creating word vectors

# Load spaCy's "en_core_web_sm" pipeline; `nlp` is applied to each corpus
# sentence below and its `doc.sents` become the training sentences.
nlp = spacy.load("en_core_web_sm")

In [10]:
## Exclude common stopwords

# Lower-case words dropped from every sentence before training.  The
# preprocessing loop compares them against lower-cased, whitespace-separated
# tokens, i.e. before punctuation is stripped.
stopwords = ["i","me","my","myself","we","our","ours","ourselves","you","your","yours","yourself","yourselves",
             "he","him","his","himself","she","her","hers","herself","it","its","itself","they","them","their",
             "theirs","themselves","what","which","who","whom","this","that","these","those","am","is","are","was",
             "were","be","been","being","have","has","had","having","do","does","did","doing","a","an","the","and",
             "but","if","or","because","as","until","while","of","at","by","for","with","about","against","between",
             "into","through","during","before","after","above","below","to","from","up","down","in","out","on","off",
             "over","under","again","further","then","once","here","there","when","where","why","how","all","any","both",
             "each","few","more","most","other","some","such","no","nor","not","only","own","same","so","than","too","very",
             "s","t","can","will","just","don","should","now"]

In [11]:
# Accumulator for the tokenised sentences (one list of words per sentence);
# filled by the preprocessing loop below.
sentences = []

In [12]:
# NOTE(review): this module-level `corpus` is not read by any later cell shown
# here (create_wordvecs has its own `corpus` parameter) — presumably a leftover.
corpus = ""

In [13]:
# `corpus_data` is a second name for the same list object as `mylist`
# (no copy is made).
corpus_data = mylist
        
# Spot-check one sentence and confirm the container type and size.
print(corpus_data[114])
print(type(corpus_data))
print(len(corpus_data))

A chipmunk of the quadrivittatus group (see A. H. Howell, Revision of the American Chipmunks, North American Fauna, no. 52, pp.
<class 'list'>
115544


In [14]:
# Turn every corpus sentence into a list of words for Word2Vec.
# Start from an empty list so that re-running this cell does not append a
# second copy of the corpus to `sentences`.
sentences = []

# Hoisted out of the loop: set membership is O(1) (the stopword list would
# otherwise be scanned for every word) and the translation table is constant.
stopword_set = set(stopwords)
strip_punctuation = str.maketrans('', '', string.punctuation)

for corpus_sentence in corpus_data:
    # Lower-case and drop stopwords.  NOTE: tokens are compared before
    # punctuation is removed, so a stopword with punctuation attached
    # (e.g. "the,") is kept.
    kept_words = [word for word in corpus_sentence.lower().split()
                  if word not in stopword_set]

    # Let spaCy split the filtered text into sentences; each one becomes a
    # training example with the characters in string.punctuation removed.
    doc = nlp(" ".join(kept_words))
    for sent in doc.sents:
        sentences.append(sent.text.translate(strip_punctuation).split())

In [15]:
# Number of tokenised sentences, followed by one example (a list of words).
print(len(sentences)) 
print(sentences[3])

126557
['b', 'model', 'bform', 'dna', 'methylated', 'cytosines', 'two', 'selfcomplementary', 'cpg', 'sequences']


In [16]:
def create_wordvecs(corpus, model_name):
    """Train a Word2Vec model on `corpus` and save its vectors as text.

    Frequent word pairs are first merged into single tokens with gensim's
    phrase detection; the model is then trained on the transformed sentences.

    Parameters
    ----------
    corpus : list of list of str
        Tokenised sentences (``len(corpus)`` is printed as a progress note).
    model_name : str
        Output path without extension; the vectors are written to
        ``f"{model_name}.txt"`` in word2vec text format.

    Returns
    -------
    None
    """
    from gensim.models.word2vec import Word2Vec
    from gensim.models.phrases import Phrases, Phraser
    from collections import defaultdict

    print (len(corpus))

    # Collect co-occurrence statistics for adjacent word pairs.
    phrases = Phrases(corpus, min_count=30, progress_per=10000)
    print ("Made Phrases")

    # Frozen version of the phrase model, used to transform the corpus.
    bigram = Phraser(phrases)
    print ("Made Bigrams")

    # Apply the frozen transformer (it was previously built but never used)
    # and materialise the result once: the sentences are iterated for the
    # frequency count, the vocabulary build and every training epoch, and a
    # lazy transformation would redo the phrase detection on each pass.
    sentences = list(bigram[corpus])
    print ("Found sentences")

    # Count distinct tokens after phrase merging (reported only).
    word_freq = defaultdict(int)
    for sent in sentences:
        for token in sent:
            word_freq[token] += 1

    print (len(word_freq))
    print ("Training model now...")
    w2v_model = Word2Vec(min_count=1,
                        window=2,
                        vector_size=100,
                        sample=6e-5,
                        alpha=0.03,
                        min_alpha=0.0007,
                        negative=20)
    w2v_model.build_vocab(sentences, progress_per=10000)
    # corpus_count is set by build_vocab above.
    w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
    w2v_model.wv.save_word2vec_format(f"{model_name}.txt")
# Train on the tokenised sentences and write the vectors to word_vecs.txt.
create_wordvecs(sentences, "word_vecs")

126557
Made Phrases
Made Bigrams
Found sentences
103894
Training model now...


In [17]:
# Read the saved vector file back for inspection.  `data` is bound here to
# the list of lines of the file.
with open ("word_vecs.txt", "r", encoding='utf8') as f:
    data = f.readlines()
    # First line of the file — presumably the header (vocabulary size and
    # vector dimensionality); the recorded output shows "103894 100".
    print (data[0])

103894 100



In [18]:
# Show one line of the file: a word followed by its vector components.
print (data[32])

eg -0.563712 0.34385306 1.0083059 -1.0323424 0.28970072 0.40693393 -1.2952132 1.5048887 0.33374208 -0.32921082 0.73212796 -0.51552105 0.39277503 0.6580963 0.9097592 -0.3348809 -0.19832456 -0.62542444 -0.81215334 1.221862 0.7266369 -0.14894828 0.3656226 0.3982717 0.17695053 -0.105870664 -0.4493262 0.90788645 -0.6102477 0.46399173 0.003759885 -0.91160566 -0.09504749 -0.25322843 0.51737046 0.009254119 1.0340173 -0.7096859 0.6994959 0.2039257 -0.07492078 -0.8316059 1.0323558 0.35563767 0.43284747 0.14053516 0.51764256 -0.531015 0.76414245 -0.31208822 -0.4826614 0.44210315 -0.23457038 -0.29152867 0.37288314 0.15079196 -0.6035176 0.2249789 -0.06084129 0.5014859 -0.02693485 0.6821231 -0.45123282 -0.8941378 1.0836748 -0.001101461 0.253695 0.54476374 -0.083126426 -0.14241752 -0.48215535 -0.5296194 0.4788979 0.041297078 1.0733358 -0.7410006 0.6069829 0.51520765 -0.092621356 -0.4145046 -0.11635243 -0.24391577 0.3782005 -0.1578647 1.1697338 -0.69408745 -0.14193478 -0.17343818 -1.0569074 0.13193585