Packages

In [None]:
import nltk
import spacy
import pandas as pd
import re
from gensim.models.phrases import Phrases, Phraser
from collections import defaultdict
import multiprocessing
from gensim.models import Word2Vec

Import Data

In [None]:
# Load the arXiv metadata snapshot from the working directory into a DataFrame.
# NOTE(review): if this file is newline-delimited JSON (one record per line),
# read_json needs lines=True to parse it — confirm against the actual file.
df = pd.read_json("./arxiv-metadata-oai-snapshot.json")
# Preview the first rows; as the cell's last expression this is what gets displayed.
df.head()

Prepping Data

In [None]:
# Discard rows with any missing value, renumber the remaining rows, then
# order them by publication year.
df = (
    df.dropna()
      .reset_index(drop=True)
      .sort_values(by="year")
)
# Lower-cased copy of the summary text.
abstracts = df["summary"].str.lower()

Cleaning Data

In [None]:
# Load spaCy's English pipeline with the 'ner' and 'parser' components
# disabled; the cleaning function below only reads lemmas and stop-word flags.
# NOTE(review): the 'en' name is a spaCy 2.x shortcut link — confirm the
# installed spaCy version still resolves it (newer releases expect a full
# package name such as 'en_core_web_sm').
nlp = spacy.load('en', disable=['ner', 'parser'])

In [None]:
def deep_clean(document):
    """Lemmatize a parsed document and strip its stop words.

    Args:
        document: An iterable of tokens exposing ``lemma_`` and ``is_stop``.

    Returns:
        The lemmas of all non-stop-word tokens joined by single spaces, or
        ``None`` when two or fewer lemmas survive (such documents are too
        short to be useful and are dropped later).
    """
    lemmas = []
    for token in document:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    # Guard clause: very short documents yield no usable context.
    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

In [None]:
# Collapse every run of characters other than ASCII letters and apostrophes
# into a single space, then lower-case. The pattern is compiled once instead
# of being looked up for every row, and the texts stay in a generator so they
# are streamed into the pipeline rather than materialized up front.
non_alpha = re.compile(r"[^A-Za-z']+")
prep_cleaning = (non_alpha.sub(' ', str(row)).lower() for row in df['summary'])
# Run the texts through spaCy in batches of 5000 and clean each parsed doc.
# `n_threads` is intentionally not passed: it has been a deprecated no-op
# since spaCy 2.1 and is rejected outright by spaCy 3.
cleaned_text = [deep_clean(doc) for doc in nlp.pipe(prep_cleaning, batch_size=5000)]

In [None]:
# Wrap the cleaned texts in a single-column frame, then discard the documents
# that were rejected during cleaning (None) and any exact duplicates.
clean_df = (
    pd.DataFrame({'clean': cleaned_text})
    .dropna()
    .drop_duplicates()
)

Phrases

In [None]:
# Tokenize each cleaned document on whitespace: one list of words per document.
process_phrases = list(map(str.split, clean_df['clean']))

In [None]:
# Collect phrase (bigram) statistics over the tokenized documents; min_count=30
# and progress_per=10000 are passed straight through to gensim's Phrases.
phrases = Phrases(process_phrases, min_count=30, progress_per=10000)
# Wrap the collected statistics in a Phraser, which is what gets applied to the corpus.
bigram = Phraser(phrases)

In [None]:
# Apply the bigram model to the whole tokenized corpus; the result is the
# corpus used for the frequency count and for Word2Vec training below.
sentences = bigram[process_phrases]

Word Frequencies

In [None]:
# Count how often each token (word or detected phrase) occurs in the corpus.
frequencies = defaultdict(int)
for sentence in sentences:
    for token in sentence:
        frequencies[token] += 1

# Report the vocabulary size. A bare `len(frequencies)` in the middle of a
# cell is evaluated and thrown away, so it has to be printed explicitly.
print(len(frequencies))

# Tokens ordered from most to least frequent; as the last expression of the
# cell this is the value the notebook displays.
sorted(frequencies, key=frequencies.get, reverse=True)

Training

In [None]:
cores = multiprocessing.cpu_count()

# Configure the (still untrained) Word2Vec model.
# Two cores are left free for the rest of the system, but the worker count is
# clamped to at least 1: on a 1- or 2-core machine `cores - 2` would be zero
# or negative, leaving the trainer with no worker threads.
# NOTE(review): `size` is the gensim 3.x keyword for the vector dimensionality
# (gensim 4 renamed it to `vector_size`) — confirm the installed version.
training_model = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     workers=max(1, cores - 2))

In [None]:
# Scan the corpus once to build the model's vocabulary before training;
# progress_per=10000 is passed through to gensim's build_vocab.
training_model.build_vocab(sentences, progress_per=10000)

In [None]:
# Train for 30 epochs on the same corpus the vocabulary was built from.
# total_examples reuses the model's own corpus_count, and report_delay=1 is
# the progress-reporting delay handed to gensim.
training_model.train(sentences, total_examples=training_model.corpus_count, epochs=30, report_delay=1)

In [None]:
# Precompute normalized vectors for similarity queries, replacing the raw ones.
# NOTE(review): per the gensim docs, replace=True keeps only the normalized
# vectors, so the model presumably cannot be trained further after this call;
# init_sims is also deprecated in gensim 4 — confirm the installed version.
training_model.init_sims(replace=True)

In [None]:
# Sanity check: list the vocabulary entries closest to "software".
# NOTE(review): presumably raises KeyError if "software" did not survive the
# min_count=20 vocabulary cut-off — confirm on the actual corpus.
training_model.wv.most_similar(positive=["software"])

In [None]:
# Sanity check: similarity score between the vectors of two vocabulary words.
training_model.wv.similarity("technology", 'era')