# Brexit Polarity Tweets - Word Embeddings

In [146]:
from gensim import utils
from gensim.models import Word2Vec
from gensim.test.utils import datapath
from gensim.models.fasttext import FastText

In [149]:
# settings
# Dimensionality of the embedding vectors: passed as `vector_size` to the
# gensim model and as the output size of the Keras Embedding layer.
EMBEDDING_DIMS = 300
# Length of the integer sequences produced by the TextVectorization layer.
SEQ_LENGTH  = 200
# Context window passed as `window` to the gensim model.
WINDOW_SIZE = 5
# Passed as `min_count` to the gensim model.
MIN_COUNT   = 5
# Number of epochs used when fitting the Keras model.
EPOCHS      = 5

# paths to data
# Corpus file used to train the gensim embedding.
PATH_TWEETS = "./data/preprocessed/train/0-clean.txt"

In [156]:
def train_embedding(model_type, corpus_file, vector_size, window, min_count, sg, workers):
    """Train a gensim embedding model on a corpus file.

    Parameters
    ----------
    model_type : str
        "word2vec" or "fasttext" (case-insensitive).
    corpus_file : str
        Path forwarded to the model's `corpus_file` argument.
    vector_size : int
        Dimensionality of the learned vectors.
    window : int
        Context window size.
    min_count : int
        Forwarded to the model's `min_count` argument.
    sg : int
        Training-algorithm flag forwarded to the model (per the gensim API,
        1 selects skip-gram and 0 selects CBOW).
    workers : int
        Number of worker threads.

    Returns
    -------
    tuple
        `(model.wv.index_to_key, model.wv.vectors)` of the trained model.

    Raises
    ------
    ValueError
        If `model_type` is neither "word2vec" nor "fasttext".
    """
    kind = model_type.lower()

    # Both model classes take the same hyper-parameters. `sg` is forwarded to
    # FastText as well, so the caller's choice is honoured for either model.
    params = dict(corpus_file = corpus_file,
                  vector_size = vector_size,
                  window = window,
                  min_count = min_count,
                  sg = sg,
                  workers = workers)

    if kind == "word2vec":
        model = Word2Vec(**params)
    elif kind == "fasttext":
        model = FastText(**params)
    else:
        # Fail loudly: callers unpack the result into two names, so a silent
        # `None` would only surface later as an unrelated TypeError.
        raise ValueError(
            "Unknown model_type {!r}; expected 'word2vec' or 'fasttext'".format(model_type)
        )

    return model.wv.index_to_key, model.wv.vectors

In [158]:
# Train a Word2Vec model (sg=1) on the cleaned training tweets and keep its
# vocabulary and vector matrix.
vocab, vectors = train_embedding(
    model_type="word2vec",
    corpus_file=PATH_TWEETS,
    vector_size=EMBEDDING_DIMS,
    window=WINDOW_SIZE,
    min_count=MIN_COUNT,
    sg=1,
    workers=10,
)

## Neural Network

In [1]:
# Root data directory and the preprocessed train/test sub-directories.
PATH_DATA = "./data/"
PATH_TRAIN = f"{PATH_DATA}preprocessed/train/"
PATH_TEST = f"{PATH_DATA}preprocessed/test/"

In [4]:
# define helper functions
def read_tweet(filepath):
    """Read a text file with one entry per line into a pandas Series.

    Parameters
    ----------
    filepath : str
        Path of the file to read.

    Returns
    -------
    pd.Series
        One element per line of the file, with the trailing newline removed.
    """
    # Decode explicitly as UTF-8 so non-ASCII characters in tweets are read
    # the same way regardless of the platform's default encoding.
    with open(filepath, "r", encoding="utf-8") as f:
        tweets = [line.rstrip("\n") for line in f]
    return pd.Series(tweets)

In [116]:
# Module-level tokenizer; get_vocab() reads this global to split tweets into tokens.
tokenizer = nltk.TweetTokenizer()

In [117]:
# Number of leading rows taken from both files; a single constant keeps the
# tweets and their targets truncated to the same length.
N_SAMPLES = 10000

tweets = read_tweet(PATH_TRAIN + "0-clean.txt")[:N_SAMPLES]
targets = read_tweet(PATH_TRAIN + "0-targets.txt")[:N_SAMPLES]

# Texts and labels are paired by position, so they must line up.
assert len(tweets) == len(targets), "tweets and targets must have the same length"

In [118]:
def get_vocab(tweets, seq_length):
    """Return the unique tokens found in `tweets`, in order of first appearance.

    Each tweet is split with the module-level `tokenizer`.

    Parameters
    ----------
    tweets : iterable of str
        Tweets to tokenize (e.g. a pandas Series or a list).
    seq_length : int
        Unused; kept so existing callers keep working.

    Returns
    -------
    list of str
        Every distinct token, without duplicates.
    """
    # Only the set of tokens is needed (the counts were never used), so a dict
    # is enough to de-duplicate while preserving first-seen order.
    unique_tokens = dict.fromkeys(
        token
        for tweet in tweets
        for token in tokenizer.tokenize(tweet)
    )
    return list(unique_tokens)

# Build the token vocabulary from the loaded tweets.
# Note: this rebinds `vocab`, replacing the value returned by train_embedding earlier.
vocab = get_vocab(tweets, SEQ_LENGTH)

In [119]:
# Layer that turns each tweet into SEQ_LENGTH integer token ids using the
# fixed vocabulary built by get_vocab().
# NOTE(review): `vocab` comes from nltk's TweetTokenizer, while this layer uses
# its own default standardization/splitting — confirm the two agree, otherwise
# some vocabulary entries may never be matched.
vectorizer = TextVectorization(output_sequence_length = SEQ_LENGTH,
                               output_mode = 'int',
                               vocabulary  = vocab)

In [120]:
# Small classifier: embedding lookup -> pooling over the sequence -> one sigmoid unit.
model_neuralnet = Sequential()

# `len(vocab) + 2` rows: presumably two extra indices are reserved by the
# vectorizer (the export cells below skip the first two entries) — TODO confirm.
model_neuralnet.add(Embedding(len(vocab) + 2, EMBEDDING_DIMS, input_shape = (SEQ_LENGTH,)))
model_neuralnet.add(GlobalAveragePooling1D())
model_neuralnet.add(Dense(1, activation = "sigmoid"))

model_neuralnet.compile(
    optimizer = 'adam',
    loss = "BinaryCrossentropy",
    metrics = ['accuracy'],
)

In [121]:
# Encode the target strings as integer labels.
label_encoder = LabelEncoder()
train_label = label_encoder.fit_transform(targets)

# Convert the tweets to fixed-length integer sequences.
train_data = vectorizer(tweets)

# Fit the model for EPOCHS epochs and keep the returned training history.
history = model_neuralnet.fit(
    train_data,
    train_label,
    epochs = EPOCHS,
    verbose = 1,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


## Saving

In [144]:
def save_embeddings(vocab, vectors, filepath):
    """Write word embeddings to a text file, one word per line.

    Each line has the form ``word v1 v2 ... vn``, with fields separated by a
    single space.

    Parameters
    ----------
    vocab : sequence of str
        Words, in the same order as `vectors`.
    vectors : sequence of sequences
        One vector per word. Components may be numbers or strings; each is
        converted with `str()` before writing.
    filepath : str
        Destination file; overwritten if it exists.

    Raises
    ------
    AssertionError
        If `vocab` and `vectors` differ in length.
    """
    assert len(vocab) == len(vectors), (
        "vocab and vectors must have the same length "
        "({} != {})".format(len(vocab), len(vectors))
    )

    # Write as UTF-8 so non-ASCII tokens do not depend on the platform's
    # default encoding.
    with open(filepath, "w", encoding="utf-8") as f:
        for word, vector in zip(vocab, vectors):
            # str() each component so numeric vectors can be written directly;
            # vectors that are already strings are written unchanged.
            f.write(word + " " + " ".join(map(str, vector)) + "\n")

In [136]:
# Take the weight matrix of the model's first layer (the Embedding layer),
# skip its first two rows, and convert every value to a string so that
# save_embeddings() can join them.
# NOTE(review): the two skipped rows presumably correspond to the vectorizer's
# reserved indices — confirm.
weights = model_neuralnet.layers[0].get_weights()[0][2:].astype(str)

In [137]:
# Vocabulary as stored by the vectorizer, skipping its first two entries so it
# lines up with the `[2:]` slice taken from the embedding weights.
# Note: this rebinds `vocab` again.
vocab = vectorizer.get_vocabulary()[2:]

In [138]:
# Number of exported words; should equal the number of rows in `weights`.
len(vocab)

20223

In [143]:
# Preview only the first embedding row: its shape and its first ten components.
for w in weights[:1]:
    print(w.shape)
    print(" ".join(w[:10]))

(300,)
-0.0030258286 -0.033835553 -0.00038227858 0.061044343 0.028527828 -0.0060541765 0.021054296 -0.013385113 0.0007711328 0.026431696


In [139]:
# Shape of the exported weight matrix.
weights.shape

(20223, 300)

In [145]:
# Write the vectorizer vocabulary and the matching embedding rows to "test.txt".
save_embeddings(vocab, weights, "test.txt")