In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt
from sklearn import feature_extraction
from sklearn import manifold

import glove
import text_utils


# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `glove` and `text_utils` helpers) are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Debugging aid, intentionally left disabled. Uncomment to have TensorFlow
# check tensors for invalid numeric values while chasing a diverging loss
# (presumably at a runtime cost — keep it off for normal runs).
# tf.debugging.enable_check_numerics()

# --- Run configuration -------------------------------------------------------
# Switch between training a fresh GloVe model and loading a saved one.
train_new_model = True

# Vocabulary / co-occurrence settings.
min_term_frequency = 50   # tokens rarer than this are dropped from the vocabulary
skip_grams_window = 5     # window size handed to glove.create_tcm

# Embedding / optimisation settings.
vector_dims = 50          # dimensionality of the learned word vectors
batch_size = 2 * 1024     # base batch size; the training cell scales it further

In [None]:
# Read the tweet dataset and normalise its `text` column with the project's
# cleaning routine; `text` is the corpus used by every later cell.
Urk_dataset_Glove = pd.read_csv('data/clusteranglo2.csv')
text = Urk_dataset_Glove['text'].apply(text_utils.clean_text)

In [None]:
# stopwords <- c(tm::stopwords("english"), tm::stopwords("french"), "wont", "reuters", "gt", "marketsnews", "rbssfinancialservicesandrealestatenews", "com", "update", "bondsnews", "breaking", "amp", "says", "see", "well", "needs", "pr", "re", "ukraines", "think", "go", "mt", "cant", "wants", "doesnt", "said", "ht", "will", "f", "b", "e","h", "w", "isnt", "ap", "pm", "st", "g", "r", "n", "ueu", "ucueuuauu", "uuuuuuu", "x", "uufue", "uuua","uduu", "ufua", "u", "uuueu", "uueuu", "uuuueu", "ufuduuaueuuu", "ufuduuaueuuuu", "le", "la", "du", "s", "un", "et", "des",  "im", "ive", "uu", "t.co", "RT", "rt", "http", "httpt", "u0404u0432u0440u043eu043cu0430u0439u0434u0430u043d", "russia", "russian", "russians", "ukrainians", "crimean", "crimea", "httptco", "uuuueucuuuuud","ukraine", "ukrainian", "russia", "sochi2014", "olympics", "euromaidan", "sochi", "good", "president", "obama", "hockey", "even", "many", "olympic", "start", "much", "just", "another", "last", "calls", "way", "join", "next", "way", "going", "still", "back", "people", "russias", "putins", "support", "presid", "uuauuuudu", "udu", "ask", "httpc", "htt", "photo", "move", "year", "must", "tell", "week", "dec", "let", "eas", "meet", "maidan", "gold", "day", "time", "video", "watch", "digitalmaidan", "today", "live", "peopl", "cdnpoli", "militari", "crisi", "protest", "canada", "kiev", "kyiv", "get", "one", "new", "dont", "news","can", "use", "putin", "russia", "ukrain", "call", "show", "need", "via", "updat", "march", "feb", "want", "rt", "rbssfinancialservicesandrealestatenew", "may", "say", "know", "bondsnew", "marketnews", "marketsnew", "usdollarrpt", "stock", "olymp", "look", "give", "make", "talk", "like", "now","tcot", "take")
# it = itoken(Urk_dataset_Glove$text[132730:224885], tolower, tokenizer = word_tokenizer)

# Tokenizer and stopword list come from the project helpers (the R original
# above used tm::stopwords plus a hand-written custom list).
tokenizer = text_utils.create_tokenizer('tweet', preserve_case=False, strip_handles=True)
stopwords = text_utils.load_stopwords('english', 'french', custom_list=True)

# R reference: v = create_vocabulary(it, stopwords = stopwords)
# First pass over the corpus: learn the full vocabulary and count how often
# each token occurs across all documents (column sums of the count matrix).
word_vectorizer = feature_extraction.text.CountVectorizer(stop_words=stopwords, tokenizer=tokenizer)
word_frequencies = word_vectorizer.fit_transform(text).sum(axis=0)

# One row per token, indexed by the vectorizer's column index `ix`.
vocabulary_df = pd.DataFrame(
    word_vectorizer.vocabulary_.items(),
    columns=['token', 'ix'],
).set_index('ix')
# The frequency frame is indexed 0..V-1, so this assignment aligns on `ix`
# rather than on row position (rows are still in dict order at this point).
vocabulary_df['freq'] = pd.DataFrame(word_frequencies.T)
vocabulary_df = vocabulary_df.sort_index()
vocabulary_df

In [None]:
# Drop uncommon tokens: keep only those seen at least `min_term_frequency`
# times. (No upper frequency bound is applied here.)
# R reference: v <- prune_vocabulary(v, term_count_min = 600L)

vocabulary_df: pd.DataFrame = (
    vocabulary_df[vocabulary_df['freq'].ge(min_term_frequency)]
    .reset_index(drop=True)
)
vocab_size = len(vocabulary_df)
# token -> new contiguous id (the freshly reset row index).
vocabulary = {token: ix for ix, token in vocabulary_df['token'].items()}

In [None]:
# Second pass: re-vectorise the corpus against the pruned, fixed vocabulary.
# R reference:
#   dtm = create_dtm(it, vectorizer)
#   tcm <- create_tcm(it, vectorizer, skip_grams_window = 5L)
word_vectorizer = feature_extraction.text.CountVectorizer(vocabulary=vocabulary, tokenizer=tokenizer)

# Document-term matrix: one row per document, one column per vocabulary id.
dtm = word_vectorizer.fit_transform(text)

In [None]:
# glove <- GlobalVectors$new(rank = 50, x_max = 20)
# wv_main = glove$fit_transform(tcm, n_iter = 150, convergence_tol = 0.01)
# Either train a fresh GloVe model on the corpus or load a previously saved one.
# Both branches define `model` and `word_vectors` for the cells below.
if train_new_model:
    # Term co-occurrence matrix over the pruned vocabulary.
    tcm = glove.create_tcm(tokenizer=tokenizer, vocabulary=vocabulary, corpus=text, skip_grams_window=skip_grams_window)
    print(tcm[:10,:10])
    # plt.matshow() opens a *new* figure unless told which one to draw on, so
    # the 20x20 figure used to stay empty while the matrix was rendered at the
    # default size. Passing `fignum` draws into the figure created here.
    tcm_fig = plt.figure(figsize=[20, 20])
    plt.matshow(tcm, fignum=tcm_fig.number)
    epochs = 1
    model, word_vectors = glove.train_embeddings(
        tcm=tcm,
        vector_dims=vector_dims,
        # NOTE(review): the configured batch size is doubled here — confirm this is intended.
        batch_size=batch_size*2,
        x_max=20,
        epochs=epochs,
        callbacks=[
            # With patience=5 this callback cannot trigger while epochs == 1;
            # it only matters once `epochs` is raised.
            tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5),
        ],
        save_model=f'saved_models/mintf={min_term_frequency},skipgram_w={skip_grams_window},epochs={epochs}/glove1.h5',
        save_embedding=f'saved_models/mintf={min_term_frequency},skipgram_w={skip_grams_window},epochs={epochs}/glove1_emb.npy',
    )
else:
    # NOTE(review): these paths are hard-coded to a mintf=20 / 150-epoch run and
    # ignore the configuration above; if that run used a different vocabulary,
    # the loaded vectors will not line up with `vocabulary_df` — verify before use.
    model = glove.load_pretrained_embeddings(model_address='saved_models/mintf=20,skipgram_w=5,epochs=150/glove1.h5')
    word_vectors = glove.load_pretrained_embeddings(model_address='saved_models/mintf=20,skipgram_w=5,epochs=150/glove1_emb.npy')

In [None]:
import wmd
from collections import Counter

# Build the per-document bag-of-words table consumed by wmd.WMD:
#   key -> (document name, token ids, token weights)
# Keys and names are the document's position in `text`, as strings.
wmd_docs = dict()
for dix, doc in enumerate(text):
    # Count only tokens that survived vocabulary pruning.
    words = Counter(t for t in tokenizer(doc) if t in vocabulary)
    sorted_words = sorted(words)
    wmd_docs[str(dix)] = (
        str(dix),
        # NOTE(review): the weights below are ordered by sorted token; this
        # assumes glove.doc_to_ids returns one id per distinct in-vocabulary
        # token in that same order — confirm against its implementation.
        glove.doc_to_ids(doc, tokenizer, vocabulary),
        np.array([words[t] for t in sorted_words], dtype=np.float32)
    )


# Fixed: this previously referenced `word_vectors2`, which is never defined in
# this notebook and raised NameError on a fresh kernel. The embeddings produced
# by the training/loading cell are `word_vectors`.
wmd_calc = wmd.WMD(embeddings=word_vectors, nbow=wmd_docs)

# rwmd_model = RWMD$new(dtm, wv)
# rwmd_dist = dist2(dtm[1:22989, ], dtm[132730:224885, ], method = rwmd_model, norm = 'none')
# head(rwmd_dist)

In [None]:
# library(Rtsne)
# library(plotly)
# count = v$term_count  # vocab_size
# tsne <- Rtsne(word_vectors, perplexity = 20, pca = FALSE)
# Project the word embeddings to 2-D with t-SNE. A fixed random_state makes the
# layout reproducible across Restart & Run All (it was previously unseeded).
tSNE = manifold.TSNE(perplexity=10, random_state=42)
results = tSNE.fit_transform(word_vectors)
# Assumes row i of `word_vectors` is the embedding of vocabulary id i, i.e. the
# same order as `vocabulary_df` — TODO confirm against glove.train_embeddings.
vocabulary_df['TSNE_X'], vocabulary_df['TSNE_Y'] = results[:,0], results[:,1]

# %%

# Scatter every token, sizing each marker by the square root of its frequency
# so that very frequent tokens do not swamp the plot.
plt.figure(figsize=[20,25])
plt.scatter(
    x=vocabulary_df['TSNE_X'],
    y=vocabulary_df['TSNE_Y'],
    s=np.sqrt(vocabulary_df['freq']),
    alpha=.3,
)
# Label only the 100 most frequent tokens to keep the figure readable.
for _, item in vocabulary_df.sort_values('freq', ascending=False).iloc[:100].iterrows():
    plt.text(item['TSNE_X'], item['TSNE_Y'], item['token'])

# tsne_plot <- tsne$Y %>%
#   as.data.frame() %>%
#   mutate(word = row.names(word_vectors)) %>%
#   ggplot(aes(x = V1, y = V2, label = word)) +
#   ggtitle("t-SNE Russian cluster tweets - < 175")+
#   geom_point(aes(V1, V2, size = count, alpha =.1), color = "#FFFF66")+
#   geom_text_repel(size = 4)+
#   scale_size(range = c(.5, 50))+
#   theme(legend.position = "none")
# tsne_plot