# Modelling Word Vector Embeddings

In [1]:
from gensim.models import Phrases, Word2Vec
import json
import random
import itertools
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from utils import *

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/fabianbeigang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/fabianbeigang/nltk_data...
[nltk_data]   Package words is already up-to-date!


## Load and prepare data

In [2]:
# Load preprocessed data from YouGov/SE
with open('data/preprocessed_sentences_se_and_trusted.json', 'r') as f:
    preprocessed_sentences_se_and_trusted = json.load(f)

# Load p# Load preprocessed data from CAM
with open('data/preprocessed_sentences_pseudoscience.json', 'r') as f:
    preprocessed_sentences_pseudoscience = json.load(f)

In [12]:
# Print the first three sentences for each
print("SE and Trusted Sources:")
for sent in preprocessed_sentences_se_and_trusted[:3]:
    print(sent)
print()
print("Pseudoscience:")
for sent in preprocessed_sentences_pseudoscience[:3]:
    print(sent)


SE and Trusted Sources:
['riley', 'year_token', 'antimicrobial', 'activity', 'major', 'component', 'essential', 'oil', 'rare_token', 'rare_token']
['mode', 'antimicrobial', 'action', 'essential', 'oil', 'rare_token', 'rare_token', 'tea', 'tree', 'oil']
['study', 'minimum', 'inhibitory', 'concentration', 'mode', 'action', 'oregano', 'essential', 'oil', 'transition', 'pore', 'inner', 'mitochondrial', 'membrane', 'operate', 'open', 'state', 'different', 'selectivity']

Pseudoscience:
['disease']
['patient', 'rare_token', 'muscle', 'contract', 'tic', 'rare_token', 'face', 'usually', 'left']
['individual', 'control', 'spasm', 'occur', 'asleep']


In [13]:
# Identify and extract unigram and bigram phrases
bigram_transformer = Phrases(preprocessed_sentences_se_and_trusted+preprocessed_sentences_pseudoscience, min_count=100, threshold=20) 
bigram_sentences_se_trusted = [bigram_transformer[s] for s in preprocessed_sentences_se_and_trusted]
bigram_sentences_pseudoscience = [bigram_transformer[s] for s in preprocessed_sentences_pseudoscience]

In [14]:
# Print example phrases for both corpora
print(f"SE/Trusted: {set([word for sublist in bigram_sentences_se_trusted[:50] for word in sublist if '_' in word])}")
print(f"Pseudoscience: {set([word for sublist in bigram_sentences_pseudoscience[:50] for word in sublist if '_' in word])}")

SE/Trusted: {'tea_tree', 'essential_oil', 'primary_secondary', 'human_papilloma', 'creative_common', 'url_token', 'faculty_member', 'sustainable_development', 'sexually_transmit', 'national_institute', 'broad_range', 'semi_structured', 'inclusion_criterion', 'rare_token', 'islamic_republic', 'year_token', 'maternal_neonatal', 'num_token', 'mode_action'}
Pseudoscience: {'nervous_system', 'calcium_magnesium', 'rare_token', 'john_wort', 'leave_untreated', 'blood_clotting', 'blood_vessel', 'green_leafy', 'regular_basis', 'botulinum_toxin', 'graphene_oxide', 'cranial_nerve', 'pfizer_moderna', 'polyethylene_glycol', 'johnson_johnson', 'num_token', 'middle_aged'}


In [10]:
# Save bigram_transformer vocab
with open('data/bigram_transformer_vocab.json', 'w') as f:
    json.dump(bigram_transformer.vocab, f)

## Train word2vec word vector embedding models

In [15]:
# Set model parameters (parameters based on Rodriguez and Spirling, 2021)
vector_size = 300 #before: 50
window = 6 # before: 5

In [17]:
# Train word2vec model for SE and YouGov corpus
model_se_trusted = Word2Vec(bigram_sentences_se_trusted, vector_size=vector_size, window=window, min_count=5, workers=4)
model_se_trusted.save("word2vec_with_bigrams_se_trusted_300_6.model")

In [18]:
# Train word2vec model for CAM corpus
model_pseudoscience = Word2Vec(bigram_sentences_pseudoscience, vector_size=vector_size, window=window, min_count=5, workers=4)
model_pseudoscience.save("word2vec_with_bigrams_pseudoscience_300_6.model")