In [1]:
import pickle
import re

In [2]:
# Load the preprocessed tweets; later cells read tweet["text"] and tweet["hashtags"].
# NOTE(review): this same file name is written by the last cell of this notebook,
# so a fresh checkout needs that export step to have run first.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open("no_stem_no_hashtag_symbols.pickle", 'rb') as f:
    data = pickle.load(f)

In [3]:
# Vector width used for the Word2Vec model trained below and for the
# tweet/hashtag embedding arrays allocated by the helper functions.
embedding_size = 400

In [4]:
# Join every tweet's text into one string, then collapse each whitespace run
# into a single space and trim the ends.
global_document = " ".join([tweet["text"] for tweet in data])
global_document = re.sub(r"\s+", " ", global_document).strip()

In [5]:
def get_all_hts(tweets):
    """Collect the lower-cased hashtags of all tweets together with their counts.

    Args:
        tweets: iterable of dicts, each with a "hashtags" list of strings.

    Returns:
        A tuple ``(all_hts, count)``: ``all_hts`` lists every hashtag occurrence
        (lower-cased, in encounter order) and ``count`` maps each lower-cased
        hashtag to how many times it occurred.

    NOTE(review): a later cell defines another ``get_all_hts`` with a different
    return value, which shadows this one once it is executed.
    """
    occurrences = [tag.lower() for tweet in tweets for tag in tweet["hashtags"]]
    frequencies = {}
    for tag in occurrences:
        frequencies[tag] = frequencies.get(tag, 0) + 1
    return occurrences, frequencies

In [6]:
# Every lower-cased hashtag occurrence in the corpus plus a hashtag -> frequency dict.
all_hts, count = get_all_hts(data)

In [144]:
def filter_some(count, min_count=200, max_count=500):
    """Return the keys of ``count`` whose value lies strictly between two bounds.

    Args:
        count: dict mapping a key (here: a hashtag) to its frequency.
        min_count: exclusive lower bound; defaults to the previously hard-coded 200.
        max_count: exclusive upper bound; defaults to the previously hard-coded 500.

    Returns:
        List of keys with ``min_count < value < max_count``, in the dict's
        iteration order.
    """
    return [key for key, value in count.items() if min_count < value < max_count]

In [148]:
# Hashtags whose corpus frequency passes filter_some, as a set.
new_hts = set(filter_some(count))

In [149]:
# Number of hashtags kept by the frequency filter.
len(new_hts)

801

In [152]:
# Number of distinct lower-cased hashtags in the corpus.
len(set(all_hts))

86660

In [10]:
def build_vocab(data, all_hts):
    """Assign consecutive integer ids to the words and hashtags found in ``data``.

    Words are the whitespace-separated tokens of each tweet's text; hashtags
    come from each tweet's "hashtags" list. Both are lower-cased before they
    are registered, and ids are handed out in first-seen order.

    Args:
        data: iterable of dicts with a "text" string and a "hashtags" list.
        all_hts: unused; kept only so existing callers keep working.

    Returns:
        ``(vocab, reversed_vocab, hts_vocab, reversed_hts_vocab)`` where the
        first and third dicts map token -> id and the other two map id -> token.
    """
    vocab = {}
    hts_vocab = {}
    for tweet in data:
        for ht in tweet["hashtags"]:
            hts_vocab.setdefault(ht.lower(), len(hts_vocab))
        # str.split() with no argument splits on whitespace runs and never
        # yields "", so empty or whitespace-only text contributes no token
        # (the join + split(" ") approach registered "" for an empty corpus).
        for word in tweet["text"].split():
            vocab.setdefault(word.lower(), len(vocab))
    reversed_vocab = {idx: word for word, idx in vocab.items()}
    reversed_hts_vocab = {idx: ht for ht, idx in hts_vocab.items()}
    return vocab, reversed_vocab, hts_vocab, reversed_hts_vocab

In [11]:
# Build word and hashtag id tables (and their inverses) for the whole corpus.
# NOTE(review): build_vocab does not read its second argument.
vocab, reversed_vocab, ht_vocab, reversed_ht_vocab = build_vocab(data, all_hts)

In [12]:
# Number of distinct hashtags that received an id.
len(ht_vocab)

86666

In [10]:
# Number of distinct lower-cased words that received an id.
len(vocab)

103327

In [155]:
# Size of the id -> hashtag table.
# NOTE(review): the saved output here differs from len(ht_vocab) above, so the
# two cells were presumably run against different states — re-run to confirm.
len(reversed_ht_vocab)

801

In [13]:
def map_tweets(data, vocab, ht_vocab):
    """Encode each tweet's words and hashtags as integer ids.

    Tokens are looked up the same way ``build_vocab`` registers them:
    split on any whitespace run and lower-cased. Previously the word lookup
    used the raw token and a literal ``" "`` split, so any word containing an
    upper-case letter or adjacent to a tab/newline was silently dropped.

    Args:
        data: iterable of dicts with a "text" string and a "hashtags" list.
        vocab: dict mapping lower-cased word -> id.
        ht_vocab: dict mapping lower-cased hashtag -> id.

    Returns:
        List of ``{"text": [word ids], "hashtags": [hashtag ids]}`` dicts.
        Unknown words/hashtags are skipped, and a tweet is omitted entirely if
        it ends up with no known word or no known hashtag.
    """
    tweet_new = []
    for tweet in data:
        ht_ids = [ht_vocab[ht.lower()] for ht in tweet["hashtags"] if ht.lower() in ht_vocab]
        word_ids = [vocab[word.lower()] for word in tweet["text"].split() if word.lower() in vocab]
        if word_ids and ht_ids:
            tweet_new.append({"text": word_ids, "hashtags": ht_ids})
    return tweet_new

In [14]:
# Integer-encoded tweets; tweets without any known word or hashtag are dropped.
tweets_encoded = map_tweets(data, vocab, ht_vocab)

In [15]:
# Spot-check a single encoded tweet (the index is an arbitrary sample).
tweets_encoded[87364]

{'hashtags': [3426, 2235],
 'text': [2927, 2928, 336, 18, 19, 2801, 8664, 593, 2348, 1254, 55, 5507]}

In [16]:
def make_continuous(encoded_ls):
    """Flatten encoded tweets into a list of ``(word_id, hashtag_id)`` pairs.

    For every tweet, each hashtag id is paired with every word id of that
    tweet; pairs are emitted hashtag by hashtag, words in their original order.

    Args:
        encoded_ls: iterable of dicts with "text" and "hashtags" id lists.

    Returns:
        List of ``(word, ht)`` tuples.
    """
    return [
        (word, ht)
        for encoded in encoded_ls
        for ht in encoded["hashtags"]
        for word in encoded["text"]
    ]

In [17]:
# Flat list of (word_id, hashtag_id) pairs built from all encoded tweets.
tweets_continuous = make_continuous(tweets_encoded)

In [18]:
# Total number of (word_id, hashtag_id) pairs.
len(tweets_continuous)

7643678

In [22]:
import numpy as np

In [19]:
def to_categorical(y, n_classes):
    """One-hot encode a sequence of integer class labels.

    Args:
        y: sequence of integer labels, used as column indices.
        n_classes: width of the one-hot encoding.

    Returns:
        ``int32`` array of shape ``(len(y), n_classes)`` with a single 1 per
        row at the column given by the corresponding label.
    """
    n_samples = len(y)
    one_hot = np.zeros(shape=(n_samples, n_classes), dtype=np.int32)
    row_idx = np.arange(n_samples)
    # Fancy indexing pairs row i with column y[i].
    one_hot[row_idx, y] = 1
    return one_hot

In [75]:
# Sanity check: two labels encoded over 15 classes.
to_categorical([10, 11], 15)

(2, 15)


array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]])

In [28]:
import gensim



In [29]:
# Load pre-trained word vectors stored in the binary word2vec format.
# NOTE(review): presumably 400-dimensional, since the batch generator below
# allocates inputs of width 400 — confirm against the model file.
model_twi = gensim.models.KeyedVectors.load_word2vec_format('word2vec_twitter_model.bin', binary=True, unicode_errors="ignore")  

In [74]:
def word_ids_to_embedding(word_ids, reversed_vocab, model):
    """Convert word ids into a matrix of word vectors.

    Args:
        word_ids: sequence of ids present in ``reversed_vocab``.
        reversed_vocab: dict mapping id -> word.
        model: object exposing ``model.wv`` with ``in``, item lookup (by word
            and by list of words) and a ``vocab`` dict, as the original code
            already required.

    Returns:
        ``np.array`` with one row per id. A word missing from ``model.wv`` is
        represented by the average of all vectors in the model's vocabulary.

    Raises:
        KeyError: if an id is not in ``reversed_vocab``.
    """
    words = [reversed_vocab[word_id] for word_id in word_ids]
    keyed_vectors = model.wv
    # The vocabulary-wide mean does not depend on the word, so compute it at
    # most once per call (and only if an out-of-vocabulary word shows up)
    # instead of once per unknown word.
    fallback = None
    word_embeddings = []
    for word in words:
        if word in keyed_vectors:
            word_embeddings.append(keyed_vectors[word])
        else:
            if fallback is None:
                fallback = np.average(keyed_vectors[list(keyed_vectors.vocab.keys())], axis=0)
            word_embeddings.append(fallback)
    return np.array(word_embeddings)

In [65]:
class KerasBatchGenerator(object):
    """Endless batch generator over a flat list of (word_id, hashtag_id) pairs.

    Each sample is a window of ``num_steps`` consecutive pairs: the word ids
    become word vectors (via ``word_ids_to_embedding``) and the hashtag ids
    become one-hot rows (via ``to_categorical``).

    Args:
        data: sequence of ``(word_id, hashtag_id)`` pairs.
        num_steps: number of consecutive pairs per sample.
        batch_size: number of samples per yielded batch.
        n_htags: number of hashtag classes (one-hot width).
        reversed_vocab: dict mapping word id -> word.
        model: word-vector model handed to ``word_ids_to_embedding``.
        skip_step: how far the window start advances after each sample.
        embedding_dim: width of one word vector; defaults to the previously
            hard-coded 400.
    """

    def __init__(self, data, num_steps, batch_size, n_htags, reversed_vocab, model, skip_step=5,
                 embedding_dim=400):
        self.data = data
        self.num_steps = num_steps
        self.batch_size = batch_size
        self.n_htags = n_htags
        # Start of the next window; wraps back to zero near the end of the data.
        self.current_idx = 0
        self.skip_step = skip_step
        self.reversed_vocab = reversed_vocab
        self.model = model
        self.embedding_dim = embedding_dim

    def generate(self):
        """Yield ``(x, y)`` batches forever.

        ``x`` has shape ``(batch_size, num_steps, embedding_dim)`` and ``y``
        has shape ``(batch_size, num_steps, n_htags)``.
        """
        while True:
            # Allocate fresh arrays for every batch. Reusing one pair of
            # buffers means every batch already handed to the consumer is
            # overwritten while the next one is being filled, which corrupts
            # batches that are still waiting in a prefetch queue.
            # NOTE(review): this costs one y-array of
            # batch_size * num_steps * n_htags floats per queued batch.
            x = np.zeros((self.batch_size, self.num_steps, self.embedding_dim))
            y = np.zeros((self.batch_size, self.num_steps, self.n_htags))
            for i in range(self.batch_size):
                if self.current_idx + self.num_steps >= len(self.data):
                    # reset the index back to the start of the data set
                    self.current_idx = 0
                window = self.data[self.current_idx:self.current_idx + self.num_steps]
                word_ids = [pair[0] for pair in window]
                ht_ids = [pair[1] for pair in window]
                x[i, :] = word_ids_to_embedding(word_ids, self.reversed_vocab, self.model)
                # convert the hashtag ids of the window into one-hot rows
                y[i, :, :] = to_categorical(ht_ids, n_classes=self.n_htags)
                self.current_idx += self.skip_step
            yield x, y

In [33]:
# Shuffle the (word_id, hashtag_id) pairs in place, then split them 70% / 30%.
# NOTE(review): no random seed is set, so the split differs on every run.
# NOTE(review): shuffling individual pairs discards the per-tweet ordering
# produced by make_continuous, yet the batch generator slices consecutive
# pairs as LSTM time steps — confirm this is intended.
np.random.shuffle(tweets_continuous)
train_data = tweets_continuous[:int(len(tweets_continuous) * 0.7)]
test_data = tweets_continuous[int(len(tweets_continuous)* 0.7):]

In [34]:
# Training configuration for the LSTM experiment.
num_steps = 10  # consecutive (word, hashtag) pairs per sample
batch_size = 50  # samples per batch
n_htags = len(ht_vocab)  # number of output classes
num_epochs = 10  # passed to fit_generator as the epoch count

In [75]:
# One generator per split; skip_step=num_steps makes consecutive windows
# start num_steps pairs apart, so they do not overlap.
train_data_generator = KerasBatchGenerator(train_data, num_steps, batch_size, n_htags, reversed_vocab, model_twi,
                                           skip_step=num_steps)
valid_data_generator = KerasBatchGenerator(test_data, num_steps, batch_size, n_htags, reversed_vocab, model_twi, 
                                           skip_step=num_steps)

In [36]:
# NOTE(review): only referenced by commented-out code in the model cell; the
# active LSTM layer there is built with a literal 20 units.
hidden_size = 10

In [38]:
from keras.models import Sequential
from keras.layers import Dense, Activation, LSTM, Embedding, Dropout, TimeDistributed, Flatten
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [151]:
# Vocabulary size check (repeats an earlier cell).
len(vocab)

103905

In [169]:
# Re-derive the class count from the current hashtag vocabulary
# (same assignment as in the configuration cell above).
n_htags = len(ht_vocab)

In [67]:
# Sequence model: LSTM over num_steps word vectors of width 400, followed by
# a Dense + softmax over the hashtag classes. return_sequences=True keeps one
# output per time step, matching the (batch, num_steps, n_htags) targets
# produced by KerasBatchGenerator.
# NOTE(review): 400 and 20 are literals here; embedding_size and hidden_size
# are defined elsewhere in the notebook but not used.
model = Sequential()
# model.add(Embedding(len(vocab), 50, input_length=num_steps))
# model.add(LSTM(hidden_size, return_sequences=True))
model.add(LSTM(20, input_shape=(num_steps, 400), return_sequences=True))
# if use_dropout:
# model.add(Dropout(0.5))
model.add(Dense(n_htags))
model.add(Activation('softmax'))

In [49]:
import keras

In [205]:
# NOTE(review): this optimizer object is never used — the compile cell passes
# the string "adam" instead.
optimizer = keras.optimizers.SGD(lr=0.5)

In [68]:

# Multi-class setup: categorical cross-entropy loss, Adam optimizer, and
# categorical accuracy as the reported metric.
model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['categorical_accuracy'])

In [187]:
# Checkpoint callback writing models/model-<epoch>.hdf5.
# NOTE(review): the fit_generator call below does not pass any callbacks, so
# this checkpointer currently has no effect.
checkpointer = ModelCheckpoint(filepath="models" + '/model-{epoch:02d}.hdf5', verbose=1)

In [None]:
# Train from the batch generators. The second and third positional arguments
# are presumably steps_per_epoch and epochs — TODO confirm against the
# installed Keras version. Step counts are the number of non-overlapping
# windows (len // (batch_size * num_steps)) in each split.
model.fit_generator(train_data_generator.generate(), len(train_data)//(batch_size*num_steps), num_epochs,
                        validation_data=valid_data_generator.generate(),
                        validation_steps=len(test_data)//(batch_size*num_steps), verbose=1)



Epoch 1/10


### Get word2vec embeddings

In [27]:
def get_all_hts(tweets):
    """Return every hashtag of every tweet as one flat list (case preserved).

    NOTE(review): this redefines the earlier ``get_all_hts`` in this notebook,
    which returned a ``(list, counts)`` tuple of lower-cased hashtags.
    """
    return [ht for tweet in tweets for ht in tweet["hashtags"]]

In [28]:
def create_document_per_ht(tweets):
    """Build one text document per hashtag.

    Args:
        tweets: iterable of dicts with a "text" string and a "hashtags" list.

    Returns:
        Dict mapping each lower-cased hashtag to the texts of all tweets that
        carry it, joined by single spaces in encounter order.
    """
    # Collect the texts per hashtag and join once at the end; appending to a
    # growing string for every tweet re-copies the whole document each time.
    texts_by_ht = {}
    for tweet in tweets:
        for hts in tweet["hashtags"]:
            texts_by_ht.setdefault(hts.lower(), []).append(tweet["text"])
    return {ht: " ".join(texts) for ht, texts in texts_by_ht.items()}

In [29]:
def remove_whitespaces(hts_to_doc):
    """Return a copy of the mapping with whitespace in every value normalised.

    Each run of whitespace becomes a single space and leading/trailing
    whitespace is removed; keys are left unchanged.
    """
    whitespace_run = re.compile(r"\s+")
    return {ht: whitespace_run.sub(" ", doc).strip() for ht, doc in hts_to_doc.items()}

In [33]:
import gensim



In [31]:
# NOTE(review): despite the name this is one flat list of words for the whole
# corpus; the Word2Vec cells below wrap it as `[sentences]`, i.e. a single
# very long sentence.
sentences = global_document.split(" ")

In [109]:
# Word2Vec with embedding_size-dimensional vectors; words seen fewer than 5
# times are ignored, context window of 5, sg=1 selects skip-gram training.
model_wv = gensim.models.Word2Vec(min_count=5, size=embedding_size, workers=4, window=5, sg=1)

In [110]:
# Build the vocabulary from the corpus, passed as one single "sentence".
model_wv.build_vocab([sentences])  # can be a non-repeatable, 1-pass generator

In [111]:
# NOTE(review): the recorded result (20000000, 2914388000) over 2000 epochs
# works out to only 10,000 trained words per epoch out of ~1.46M raw words.
# This looks like gensim's per-sentence length cap hitting the single giant
# sentence — confirm, and consider passing one token list per tweet instead.
# NOTE(review): total_examples is given the word count, while the corpus
# passed in contains exactly one example.
model_wv.train([sentences], epochs=2000, total_examples=len(sentences)) 

(20000000, 2914388000)

In [37]:
def create_ht_embedding(model, ht_to_doc):
    """Embed each hashtag as the mean vector of the words in its document.

    Args:
        model: object exposing word vectors through ``model.wv``.
        ht_to_doc: dict mapping hashtag -> space-separated document text.

    Returns:
        Dict mapping hashtag -> averaged vector. Hashtags whose document has
        no word known to ``model.wv`` are left out.
    """
    ht_to_embedding = {}
    for ht in ht_to_doc:
        known_words = list(filter(lambda token: token in model.wv, ht_to_doc[ht].split(" ")))
        if not known_words:
            continue
        ht_to_embedding[ht] = np.average(model.wv[known_words], axis=0)
    return ht_to_embedding

In [52]:
# One concatenated text document per lower-cased hashtag.
hts_to_doc = create_document_per_ht(data)

In [54]:
# Normalise whitespace inside every hashtag document (rebinds hts_to_doc).
hts_to_doc = remove_whitespaces(hts_to_doc)

In [55]:
# Spot-check the document built for one hashtag.
hts_to_doc["morale"]

'isforsal'

In [40]:
# Use the Word2Vec model trained above. The bare name `model` refers to the
# Keras Sequential network when the notebook is run top to bottom, and that
# object has no `.wv`; every later embedding cell already uses `model_wv`.
ht_to_embedding = create_ht_embedding(model_wv, hts_to_doc)

In [41]:
def get_embedding_of_tweet(tweet, model):
    """Embed a tweet as the mean vector of its known words.

    Args:
        tweet: dict with a space-separated "text" string.
        model: object exposing word vectors through ``model.wv``.

    Returns:
        The average of the vectors of all words found in ``model.wv``; if no
        word is known, the average over the model's entire vocabulary.
    """
    known_words = [token for token in tweet["text"].split(" ") if token in model.wv]
    # Fall back to the whole vocabulary when the tweet has no known word.
    selection = known_words if known_words else list(model.wv.vocab.keys())
    return np.average(model.wv[selection], axis=0)

In [128]:
def get_set(tweets, ht_to_embeddings, model):
    """Build (tweet embedding, hashtag embedding) training pairs.

    For each tweet, the first hashtag (lower-cased) that has an entry in
    ``ht_to_embeddings`` supplies the target; tweets without such a hashtag
    contribute nothing.

    Args:
        tweets: iterable of dicts with "text" and "hashtags".
        ht_to_embeddings: dict mapping lower-cased hashtag -> vector.
        model: word-vector model passed to ``get_embedding_of_tweet``.

    Returns:
        ``(X, y)``: two parallel lists of vectors.
    """
    X, y = [], []
    for tweet in tweets:
        tweet_embedding = get_embedding_of_tweet(tweet, model)
        for ht in tweet["hashtags"]:
            key = ht.lower()
            # Read from the argument; the body used to ignore it and reach
            # for the notebook-global `ht_to_embedding` instead.
            if key in ht_to_embeddings:
                X.append(tweet_embedding)
                y.append(ht_to_embeddings[key])
                break
    return X, y

In [None]:
def get_set_one_hot():
    """Placeholder for a one-hot variant of ``get_set`` (not implemented yet).

    The original stub had no colon and no body, which is a syntax error and
    stops a top-to-bottom run of the notebook.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("get_set_one_hot is not implemented yet")

In [43]:
# Shuffle a shallow copy of the tweets (leaving `data` in its original order)
# and split it 70% / 30%.
# NOTE(review): this rebinds train_data / test_data, which earlier held
# (word_id, hashtag_id) pairs for the LSTM experiment.
# NOTE(review): no random seed is set, so the split differs on every run.
data_copy = data[:]
np.random.shuffle(data_copy)
train_data = data_copy[:int(len(data_copy) * 0.7)]
test_data = data_copy[int(len(data_copy)* 0.7):]

In [129]:
# (X, y) pairs of tweet embedding -> hashtag embedding for each split.
train_ds = get_set(train_data, ht_to_embedding, model_wv)
test_ds = get_set(test_data, ht_to_embedding, model_wv)

In [131]:
# Unpack inputs and targets of each split.
X_train, y_train = train_ds
X_test, y_test = test_ds

In [107]:
def get_all_tweets_embeds(tweets, model):
    """Stack the embedding of every tweet into one 2-D array.

    Args:
        tweets: sequence of tweet dicts (its ``len`` sets the row count).
        model: word-vector model passed to ``get_embedding_of_tweet``.

    Returns:
        Array of shape ``(len(tweets), embedding_size)``; row ``i`` is the
        embedding of ``tweets[i]``. ``embedding_size`` is read from the
        notebook's global scope.
    """
    out = np.ndarray(shape=(len(tweets), embedding_size))
    for row, tweet in enumerate(tweets):
        out[row] = get_embedding_of_tweet(tweet, model)
    return out

In [112]:
# Embeddings of all test tweets, one row per tweet.
test_data_embeds = get_all_tweets_embeds(test_data, model_wv)

In [113]:
# Expected: (number of test tweets, embedding_size).
# NOTE(review): the saved output shows a width of 100 while embedding_size is
# 400 above, so the output is presumably from an earlier run — re-run to confirm.
test_data_embeds.shape

(64854, 100)

In [66]:
# Peek at the first two training inputs.
np.array(X_train[:2])

array([[ 0.05372949,  0.23699877, -0.72665703,  0.20959198, -0.37921444,
         0.03314155,  0.6936641 , -0.0411706 ,  0.51279944, -0.17726114,
         0.10804163,  0.24417205,  0.32177594,  0.6527193 , -0.00435733,
        -0.43116927,  0.19104266,  0.2960647 ,  0.3152053 , -0.37343585,
         0.01361017, -0.5840937 , -0.14027278,  0.0232118 ,  0.2142771 ,
         0.74627733, -0.24635406,  0.2243491 , -0.1679705 , -0.4126753 ,
         0.78151166, -0.63271   , -0.77929133, -0.57505333, -0.7550303 ,
        -0.26527995, -0.37601975, -0.20180313, -0.44537887, -0.6992404 ,
        -0.47454947,  0.45448953,  0.30768076, -0.05988231, -0.03970312,
        -0.05408294,  0.05286335, -0.04831612,  1.1823    ,  0.42542905,
         0.05556975, -0.3030563 , -0.17094271,  0.0597505 , -0.3027652 ,
        -0.16452956,  0.305381  , -0.3359519 , -1.1972561 ,  0.07526033,
         0.01893089,  0.8890578 , -0.05066783, -0.58174855, -0.66157156,
         0.27091426,  0.6592773 ,  0.25566757,  0.9

In [134]:
# Regression network mapping a tweet embedding to a hashtag embedding:
# three ReLU hidden layers (1000 -> 500 -> 250) and a linear output of width
# embedding_size, trained with mean squared error.
# NOTE(review): this rebinds `model`, replacing the LSTM classifier built earlier.
model = Sequential()
# model.add(LSTM(10, input_shape=(1, embedding_size)))
model.add(Dense(1000, input_dim=embedding_size, kernel_initializer='normal', activation='relu'))
model.add(Dense(500, kernel_initializer='normal', activation='relu'))
model.add(Dense(250, kernel_initializer='normal', activation='relu'))
model.add(Dense(embedding_size))
model.compile(loss='mse', optimizer='adam', metrics=["cosine_proximity"])
# fit network


In [135]:
# Train for a single epoch, validating on the test split; shuffle=False keeps
# the samples in the order produced by get_set.
model.fit(np.array(X_train), np.array(y_train), epochs=1, batch_size=72, validation_data=(np.array(X_test), np.array(y_test)), verbose=1, shuffle=False)

Train on 151323 samples, validate on 64854 samples
Epoch 1/1


<keras.callbacks.History at 0x161658c4470>

In [83]:
# Width of a single test input vector.
X_test[0].shape

(100,)

In [90]:
# Predicted hashtag embeddings for the first two test inputs.
model.predict(np.array(X_test)[:2])

array([[ 0.13959762,  0.01631939, -0.41696516,  0.06313697, -0.12622972,
        -0.03720687,  0.25855145, -0.08734902,  0.4994015 , -0.2005446 ,
        -0.1763522 ,  0.08072495, -0.22234736,  0.11754298, -0.03339034,
        -0.38138884,  0.04073703,  0.43529224,  0.20232275, -0.37518978,
         0.16817363, -0.57410574, -0.11513245, -0.18778026,  0.40004945,
         0.5304459 ,  0.08919945, -0.1270259 ,  0.00998069, -0.16448653,
         0.30753332, -0.41158518, -0.23222166,  0.05929966, -0.14071369,
        -0.27334884, -0.25098664, -0.04289363, -0.17035541, -0.3736592 ,
        -0.28505892, -0.11474848,  0.16708745,  0.27939302, -0.37038994,
        -0.18063855,  0.02629133, -0.19152045,  1.0042369 ,  0.04036198,
         0.28215742,  0.14667591, -0.27827984,  0.13116273,  0.02519442,
        -0.25816816,  0.1905063 , -0.2787228 , -0.5846882 , -0.18519688,
         0.00343838,  0.64238465,  0.01729631, -0.09399755, -0.45759088,
         0.29964268,  0.22073944, -0.0358572 ,  0.6

In [86]:
def create_ht_embedding_table(ht_to_embedding):
    """Pack the hashtag embeddings into a matrix plus a row -> hashtag lookup.

    Args:
        ht_to_embedding: dict mapping hashtag -> vector.

    Returns:
        ``(ht_embedding_table, idx_to_ht)``: a float32 array with one row per
        hashtag (width ``embedding_size``, read from the notebook's global
        scope) and a dict mapping each row index to its hashtag.
    """
    n_rows = len(ht_to_embedding.keys())
    ht_embedding_table = np.ndarray(shape=(n_rows, embedding_size), dtype=np.float32)
    idx_to_ht = {}
    for row, (ht, em) in enumerate(ht_to_embedding.items()):
        ht_embedding_table[row] = em
        idx_to_ht[row] = ht
    return ht_embedding_table, idx_to_ht

In [136]:
# Matrix of all hashtag embeddings and the row index -> hashtag lookup.
ht_embedding_table, idx_to_ht = create_ht_embedding_table(ht_to_embedding)

In [93]:
import scipy

In [114]:
def get_closest_ht_to_prediction(prediction, ht_embedding_table, idx_to_ht):
    """Return the hashtag whose embedding is most cosine-similar to ``prediction``.

    Args:
        prediction: 1-D vector (reshaped to a single row internally).
        ht_embedding_table: 2-D array with one hashtag embedding per row.
        idx_to_ht: dict mapping row index -> hashtag.

    Returns:
        The hashtag stored in ``idx_to_ht`` for the most similar row.
    """
    # Import the submodule explicitly: a bare `import scipy` does not guarantee
    # that `scipy.spatial.distance` is loaded, so the attribute lookup only
    # worked when another library happened to import it first.
    from scipy.spatial.distance import cdist

    cosine_sims = (1 - cdist(ht_embedding_table, prediction.reshape(1, -1), 'cosine')).reshape(-1)
    max_idx = np.argmax(cosine_sims)
    return idx_to_ht[max_idx]

In [117]:
def get_predictions(ht_embedding_table, idx_to_ht, model, test_embeds):
    """Predict an embedding per test row and map it to the nearest hashtag.

    Args:
        ht_embedding_table: 2-D array of hashtag embeddings.
        idx_to_ht: dict mapping row index -> hashtag.
        model: object with a ``predict`` method applied to ``test_embeds``.
        test_embeds: inputs for ``model.predict``.

    Returns:
        List of ``(row_index, hashtag)`` tuples, one per prediction. Progress
        is printed on a single, carriage-return-updated line.
    """
    print("Predicting...")
    preds = model.predict(test_embeds)
    print("finding closest...")
    total = len(test_embeds)
    pred_out = []
    for counter, pred in enumerate(preds):
        print("\r{}/{}".format(counter, total), end="")
        pred_out.append((counter, get_closest_ht_to_prediction(pred, ht_embedding_table, idx_to_ht)))
    return pred_out

In [137]:
# Nearest-hashtag predictions for the first 1000 test tweets.
preds = get_predictions(ht_embedding_table, idx_to_ht, model, test_data_embeds[:1000])

Predicting...
finding closest...
999/1000

In [123]:
def f1_aloc(test_set, test_predictions):
    """Score predictions by whether they match at least one hashtag of the tweet.

    A prediction counts as a hit when the predicted hashtag (lower-cased) is
    contained in any of the tweet's lower-cased hashtags. The hit count is
    printed, then precision and recall are both computed as
    ``hits / len(test_set)`` and combined into an F1 value.

    NOTE(review): the match is a substring test (``"a"`` matches ``"cat"``),
    kept as in the original — confirm whether exact equality was intended.

    Args:
        test_set: sequence of tweet dicts with a "hashtags" list.
        test_predictions: iterable of ``(index_into_test_set, hashtag)``.

    Returns:
        The F1 value, or ``0.0`` when there are no hits or ``test_set`` is
        empty (previously a ZeroDivisionError).
    """
    hits = 0
    for counter, pred_ht in test_predictions:
        pred_lower = pred_ht.lower()
        if any(pred_lower in ht.lower() for ht in test_set[counter]["hashtags"]):
            hits += 1
    print(hits)

    # Without hits precision + recall is 0, and an empty test set has no
    # denominator at all; report 0.0 instead of dividing by zero.
    if hits == 0 or len(test_set) == 0:
        return 0.0
    precision = hits / len(test_set)
    recall = hits / len(test_set)
    return (2 * precision * recall) / (precision + recall)

In [138]:
# Score the 1000 predictions against the matching slice of the test tweets.
f1_aloc(test_data[:1000], preds)

64


0.064

In [None]:
def get_closest_ht_to_tweet(tweet, ht_embedding_table, idx_to_ht, model):
    """Return the hashtag whose embedding is most cosine-similar to the tweet's.

    Args:
        tweet: dict with a "text" string.
        ht_embedding_table: 2-D array with one hashtag embedding per row.
        idx_to_ht: dict mapping row index -> hashtag.
        model: word-vector model passed to ``get_embedding_of_tweet``.

    Returns:
        The hashtag stored in ``idx_to_ht`` for the most similar row.
    """
    # Import the submodule explicitly: a bare `import scipy` does not guarantee
    # that `scipy.spatial.distance` is loaded.
    from scipy.spatial.distance import cdist

    tweet_embedding = get_embedding_of_tweet(tweet, model)
    cosine_sims = (1 - cdist(ht_embedding_table, tweet_embedding.reshape(1, -1), 'cosine')).reshape(-1)
    max_idx = np.argmax(cosine_sims)
    return idx_to_ht[max_idx]

In [208]:
# Load the tweet set whose text still contains the '#hashtag' tokens; the
# cells below derive the two cleaned variants from it.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open("no_stem_expanded_hashtags_preserved.pickle", 'rb') as f:
    data_no_stem = pickle.load(f)

In [222]:
def drop_all_hashtags(tweets):
    """Return copies of the tweets with every ``#word`` token removed from the text.

    After removal, runs of spaces are collapsed and the text is stripped and
    lower-cased. Each tweet's "hashtags" list is carried over as-is (the same
    list object, not a copy).
    """
    def _clean(text):
        without_tags = re.sub(r"#(\w+)", "", text)
        return re.sub(' +', ' ', without_tags).strip().lower()

    return [{"text": _clean(tweet["text"]), "hashtags": tweet["hashtags"]} for tweet in tweets]

In [223]:
# Variant of the corpus with hashtag tokens removed from the text entirely.
tweets_no_hts = drop_all_hashtags(data_no_stem)

In [224]:
# Spot-check one cleaned tweet (the index is an arbitrary sample).
tweets_no_hts[101]

{'hashtags': ["thankgodit'sfriday"],
 'text': 'what fucking absolutely wonderfully delightful friday'}

In [225]:
def drop_all_hashtag_symbols(tweets):
    """Return copies of the tweets with every ``#`` character removed from the text.

    The hashtag words themselves stay in the text. Runs of spaces are then
    collapsed and the text is stripped and lower-cased. Each tweet's
    "hashtags" list is carried over as-is (the same list object, not a copy).
    """
    def _clean(text):
        without_symbols = re.sub(r"#", "", text)
        return re.sub(' +', ' ', without_symbols).strip().lower()

    return [{"text": _clean(tweet["text"]), "hashtags": tweet["hashtags"]} for tweet in tweets]

In [226]:
# Variant of the corpus with only the '#' characters removed from the text.
tweets_no_hts_syms = drop_all_hashtag_symbols(data_no_stem)

In [227]:
# Persist the variant with hashtags removed from the text.
with open("no_stem_no_hashtags.pickle", "wb") as f:
    pickle.dump(tweets_no_hts, f)

In [228]:
# Persist the variant with only the '#' symbols removed.
# NOTE(review): this is the same file the first data-loading cell of this
# notebook reads, so this export has to run before that cell can succeed.
with open("no_stem_no_hashtag_symbols.pickle", "wb") as f:
    pickle.dump(tweets_no_hts_syms, f)