
Covid-19 Misinformation Classification: Word2Vec Model
==============

Created by Eric Hsieh and Dongsuk Lim

Based on Gensim's Word2Vec model tutorial: https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#sphx-glr-auto-examples-tutorials-run-word2vec-py

In [1]:
import logging
# Configure root logging at INFO level so library progress messages show up in
# the cell output, each prefixed with a timestamp and the severity level.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
import gensim.downloader as api
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's downloader.
# The captured log for this cell reports a (3000000, 300) float32 matrix, so this
# step is slow and memory-hungry.
wv = api.load('word2vec-google-news-300')

2021-04-19 13:51:07,831 : INFO : loading projection weights from /Users/ehsieh/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz
2021-04-19 13:51:48,585 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from /Users/ehsieh/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2021-04-19T13:51:48.585731', 'gensim': '4.0.1', 'python': '3.9.2 | packaged by conda-forge | (default, Feb 21 2021, 05:02:20) \n[Clang 11.0.1 ]', 'platform': 'macOS-10.14.6-x86_64-i386-64bit', 'event': 'load_word2vec_format'}


In [3]:
from gensim.test.utils import datapath
from gensim import utils
import pandas as pd
import re
import nltk as nl

def normalize_text(s):
    """Lower-case a string and strip punctuation that touches whitespace.

    A single non-word character is removed when it directly follows or
    directly precedes a whitespace character; punctuation between two word
    characters (e.g. the hyphen in "covid-19" or the apostrophe in "it's")
    is kept. Because both patterns require adjacent whitespace, punctuation
    at the very start or end of the string is left untouched, and only one
    character of a run such as "..." is removed.

    Args:
        s: The text to normalize (must be a ``str``).

    Returns:
        The normalized text with runs of whitespace collapsed to one space.
    """
    s = s.lower()
    # Raw strings keep `\s` / `\W` as regex escapes instead of (invalid)
    # Python string escapes, which newer interpreters warn about.
    # Drop one non-word character that directly follows whitespace ...
    s = re.sub(r'\s\W', ' ', s)
    # ... and one that directly precedes whitespace.
    s = re.sub(r'\W\s', ' ', s)
    # Make sure we didn't introduce any double spaces.
    s = re.sub(r'\s+', ' ', s)
    return s

# Read the scraped posts and discard the metadata columns that are not needed
# for training; only the post title is used from here on.
data = pd.read_csv("post_data_full.csv").drop(
    columns=['url', 'subreddit', 'score', 'permalink', 'Source']
)
# Store the normalized title next to the original, then tokenize every
# normalized title with NLTK (one list of tokens per post).
data['text'] = list(map(normalize_text, data['title']))
tokens = list(map(nl.word_tokenize, data['text']))

2021-04-19 13:51:48,633 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-04-19 13:51:48,633 : INFO : built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)
2021-04-19 13:51:48,634 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)", 'datetime': '2021-04-19T13:51:48.634429', 'gensim': '4.0.1', 'python': '3.9.2 | packaged by conda-forge | (default, Feb 21 2021, 05:02:20) \n[Clang 11.0.1 ]', 'platform': 'macOS-10.14.6-x86_64-i386-64bit', 'event': 'created'}


In [4]:
# Sanity check: `tokens` is expected to be a plain Python list (one entry per title).
print(type(tokens))

<class 'list'>


In [5]:
import gensim.models

# Train a Word2Vec model on the tokenized titles using gensim's default
# hyperparameters (the captured log shows effective_min_count=5).
# NOTE: `model` is reassigned by several later cells, so this instance is
# discarded as soon as the next training cell runs.
sentences = tokens
model = gensim.models.Word2Vec(sentences=sentences)

2021-04-19 13:51:49,626 : INFO : collecting all words and their counts
2021-04-19 13:51:49,627 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-04-19 13:51:49,634 : INFO : collected 4920 word types from a corpus of 34715 raw words and 2595 sentences
2021-04-19 13:51:49,635 : INFO : Creating a fresh vocabulary
2021-04-19 13:51:49,640 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 1021 unique words (20.752032520325205%% of original 4920, drops 3899)', 'datetime': '2021-04-19T13:51:49.640263', 'gensim': '4.0.1', 'python': '3.9.2 | packaged by conda-forge | (default, Feb 21 2021, 05:02:20) \n[Clang 11.0.1 ]', 'platform': 'macOS-10.14.6-x86_64-i386-64bit', 'event': 'prepare_vocab'}
2021-04-19 13:51:49,640 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 28635 word corpus (82.48595707907245%% of original 34715, drops 6080)', 'datetime': '2021-04-19T13:51:49.640973', 'gensim': '4.0.1', 'python': '3.9.2 | packaged 

In [6]:
# Peek at the first ten entries of the pretrained vocabulary.
vocab_size = len(wv.index_to_key)
for index, word in enumerate(wv.index_to_key[:10]):
    print(f"word #{index}/{vocab_size} is {word}")

word #0/3000000 is </s>
word #1/3000000 is in
word #2/3000000 is for
word #3/3000000 is that
word #4/3000000 is is
word #5/3000000 is on
word #6/3000000 is ##
word #7/3000000 is The
word #8/3000000 is with
word #9/3000000 is said


In [7]:
# import tempfile

# with tempfile.NamedTemporaryFile(prefix='gensim-model-', delete=False) as tmp:
#     temporary_filepath = tmp.name
#     model.save(temporary_filepath)
#     #
#     # The model is now safely stored in the filepath.
#     # You can copy it to other machines, share it with others, etc.
#     #
#     # To load a saved model:
#     #
#     new_model = gensim.models.Word2Vec.load(temporary_filepath)

In [8]:
# Retrain with a stricter frequency cut-off: min_count=10 (the captured log shows
# 503 of 4920 word types retained). This replaces the previous `model`.
model = gensim.models.Word2Vec(sentences, min_count=10)

2021-04-19 13:51:49,763 : INFO : collecting all words and their counts
2021-04-19 13:51:49,763 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-04-19 13:51:49,770 : INFO : collected 4920 word types from a corpus of 34715 raw words and 2595 sentences
2021-04-19 13:51:49,770 : INFO : Creating a fresh vocabulary
2021-04-19 13:51:49,774 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=10 retains 503 unique words (10.223577235772357%% of original 4920, drops 4417)', 'datetime': '2021-04-19T13:51:49.774847', 'gensim': '4.0.1', 'python': '3.9.2 | packaged by conda-forge | (default, Feb 21 2021, 05:02:20) \n[Clang 11.0.1 ]', 'platform': 'macOS-10.14.6-x86_64-i386-64bit', 'event': 'prepare_vocab'}
2021-04-19 13:51:49,775 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=10 leaves 25248 word corpus (72.72936770848337%% of original 34715, drops 9467)', 'datetime': '2021-04-19T13:51:49.775460', 'gensim': '4.0.1', 'python': '3.9.2 | packaged

In [9]:
# Retrain with 200-dimensional vectors; the default value of vector_size is 100.
# This replaces the previous `model`.
model = gensim.models.Word2Vec(sentences, vector_size=200)

2021-04-19 13:51:49,876 : INFO : collecting all words and their counts
2021-04-19 13:51:49,877 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-04-19 13:51:49,884 : INFO : collected 4920 word types from a corpus of 34715 raw words and 2595 sentences
2021-04-19 13:51:49,885 : INFO : Creating a fresh vocabulary
2021-04-19 13:51:49,890 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 1021 unique words (20.752032520325205%% of original 4920, drops 3899)', 'datetime': '2021-04-19T13:51:49.890653', 'gensim': '4.0.1', 'python': '3.9.2 | packaged by conda-forge | (default, Feb 21 2021, 05:02:20) \n[Clang 11.0.1 ]', 'platform': 'macOS-10.14.6-x86_64-i386-64bit', 'event': 'prepare_vocab'}
2021-04-19 13:51:49,892 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 28635 word corpus (82.48595707907245%% of original 34715, drops 6080)', 'datetime': '2021-04-19T13:51:49.892024', 'gensim': '4.0.1', 'python': '3.9.2 | packaged 

In [10]:
# Retrain with workers=4; gensim's default is workers=3 (the tutorial text says 1).
# This is the last assignment to `model`, so it is the instance that the later
# reduce_dimensions(model) call visualizes.
model = gensim.models.Word2Vec(sentences, workers=4)

2021-04-19 13:51:50,036 : INFO : collecting all words and their counts
2021-04-19 13:51:50,037 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-04-19 13:51:50,044 : INFO : collected 4920 word types from a corpus of 34715 raw words and 2595 sentences
2021-04-19 13:51:50,045 : INFO : Creating a fresh vocabulary
2021-04-19 13:51:50,050 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 1021 unique words (20.752032520325205%% of original 4920, drops 3899)', 'datetime': '2021-04-19T13:51:50.050047', 'gensim': '4.0.1', 'python': '3.9.2 | packaged by conda-forge | (default, Feb 21 2021, 05:02:20) \n[Clang 11.0.1 ]', 'platform': 'macOS-10.14.6-x86_64-i386-64bit', 'event': 'prepare_vocab'}
2021-04-19 13:51:50,051 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 28635 word corpus (82.48595707907245%% of original 34715, drops 6080)', 'datetime': '2021-04-19T13:51:50.051455', 'gensim': '4.0.1', 'python': '3.9.2 | packaged 

In [11]:
# Instantiate and train a separate Word2Vec model that records its training loss.
# Parameter meanings below follow the gensim Word2Vec documentation.
model_with_loss = gensim.models.Word2Vec(
    sentences,
    min_count=1,        # keep every word (the captured log shows all 4920 word types retained)
    compute_loss=True,  # required so that get_latest_training_loss() has a value to report
    hs=0,               # hierarchical softmax off (negative sampling is used instead, per gensim docs)
    sg=1,               # skip-gram rather than CBOW, per gensim docs
    seed=42,            # NOTE(review): gensim documents that full determinism also needs workers=1 - confirm
)

# Get the training loss value recorded by the model.
# NOTE(review): gensim reports this as a running total rather than a per-epoch
# figure - confirm before comparing values across runs.
training_loss = model_with_loss.get_latest_training_loss()
print(training_loss)

2021-04-19 13:51:50,190 : INFO : collecting all words and their counts
2021-04-19 13:51:50,190 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-04-19 13:51:50,196 : INFO : collected 4920 word types from a corpus of 34715 raw words and 2595 sentences
2021-04-19 13:51:50,197 : INFO : Creating a fresh vocabulary
2021-04-19 13:51:50,213 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 4920 unique words (100.0%% of original 4920, drops 0)', 'datetime': '2021-04-19T13:51:50.213456', 'gensim': '4.0.1', 'python': '3.9.2 | packaged by conda-forge | (default, Feb 21 2021, 05:02:20) \n[Clang 11.0.1 ]', 'platform': 'macOS-10.14.6-x86_64-i386-64bit', 'event': 'prepare_vocab'}
2021-04-19 13:51:50,213 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 34715 word corpus (100.0%% of original 34715, drops 0)', 'datetime': '2021-04-19T13:51:50.213917', 'gensim': '4.0.1', 'python': '3.9.2 | packaged by conda-forge | (default, Feb 

855485.375


In [13]:
from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE                   # final reduction
import numpy as np                                  # array handling


def reduce_dimensions(model):
    """Project a model's word vectors onto two dimensions with t-SNE.

    Args:
        model: Object exposing ``model.wv.vectors`` (the embedding matrix)
            and ``model.wv.index_to_key`` (the words, in matching order).

    Returns:
        A tuple ``(x_vals, y_vals, labels)``: two lists holding the first
        and second t-SNE coordinate of every word, and a numpy array of
        the words themselves.
    """
    target_dims = 2  # dimensionality of the projection (2D, 3D, etc)

    # Pull the words and their vectors out of the model as numpy arrays.
    words = np.asarray(model.wv.index_to_key)  # fixed-width numpy strings
    embedding = np.asarray(model.wv.vectors)

    # random_state is pinned so repeated runs use the same seed.
    projected = TSNE(n_components=target_dims, random_state=0).fit_transform(embedding)

    x_vals = list(projected[:, 0])
    y_vals = list(projected[:, 1])
    return x_vals, y_vals, words


# Project the most recently trained `model` (not `model_with_loss`) to 2-D for plotting.
x_vals, y_vals, labels = reduce_dimensions(model)

def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    """Draw the words at their 2-D coordinates as a plotly text scatter.

    Args:
        x_vals: X coordinate of every word.
        y_vals: Y coordinate of every word.
        labels: The words, in the same order as the coordinates.
        plot_in_notebook: When true, render inline via ``iplot``; otherwise
            write the figure to 'word-embedding-plot.html' via ``plot``.
    """
    import plotly.graph_objs as go
    from plotly.offline import init_notebook_mode, iplot, plot

    # mode='text' draws each word itself at its coordinate instead of a marker.
    figure_data = [go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)]

    if not plot_in_notebook:
        plot(figure_data, filename='word-embedding-plot.html')
        return

    init_notebook_mode(connected=True)
    iplot(figure_data, filename='word-embedding-plot')


def plot_with_matplotlib(x_vals, y_vals, labels):
    """Scatter-plot the 2-D word coordinates and label a random subset.

    All points are drawn, but at most 25 randomly chosen points are
    annotated with their word so the figure stays readable.

    Args:
        x_vals: X coordinate of every word.
        y_vals: Y coordinate of every word.
        labels: The words, in the same order as the coordinates.
    """
    import matplotlib.pyplot as plt
    import random

    # Fixed seed so the same points are labelled on every run.
    random.seed(0)

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    #
    # Label up to 25 randomly subsampled data points. random.sample raises
    # ValueError when asked for more items than exist, so the sample size is
    # capped at the number of points available.
    #
    indices = list(range(len(labels)))
    sample_size = min(25, len(indices))
    selected_indices = random.sample(indices, sample_size)
    for i in selected_indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

# Pick the plotting backend: plotly when an IPython shell is available,
# matplotlib otherwise. Outside IPython the name `get_ipython` is undefined,
# so calling it raises and the fallback is chosen.
_in_ipython = True
try:
    get_ipython()
except Exception:
    _in_ipython = False

plot_function = plot_with_plotly if _in_ipython else plot_with_matplotlib

plot_function(x_vals, y_vals, labels)