In [None]:
import numpy as np
from preprocess import preprocess_twitter
from standford_twitter_preprocess import tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Fetch the Twitter dataset (requested with preprocess=False) and unpack its
# first four entries, in order: X_train, y_train, X_test, y_test.
data = preprocess_twitter(preprocess=False)
X_train, y_train, X_test, y_test = (data[k] for k in range(4))

In [None]:
# Load glove model
def loadGloveModel(gloveFile):
    """Read a GloVe text file into a ``{token: vector}`` dictionary.

    Every line is expected to hold a token followed by its float components,
    separated by single spaces. Lines that carry no components (e.g. blank
    lines) are skipped.

    Parameters
    ----------
    gloveFile : str
        Path of the UTF-8 encoded GloVe text file.

    Returns
    -------
    dict
        Maps each token (str) to a 1-D ``numpy.ndarray`` of floats.

    Raises
    ------
    ValueError
        If a component cannot be parsed as a float.
    """
    print("Loading Glove Model")
    model = {}
    # The context manager closes the handle even when a line fails to parse.
    with open(gloveFile, 'r', encoding="utf8") as f:
        for line in f:
            # Cut the token at the FIRST space instead of using a bare
            # ``split()``: a token that is itself a Unicode whitespace
            # character (e.g. U+0085) would otherwise be swallowed, leaving the
            # first float as key and a vector that is one component short.
            word, _, rest = line.rstrip("\r\n").partition(" ")
            values = rest.split()
            if not values:
                # Not space-delimited (e.g. tabs): fall back to generic
                # whitespace splitting; give up on blank / token-only lines.
                fields = line.split()
                if len(fields) < 2:
                    continue
                word, values = fields[0], fields[1:]
            model[word] = np.array([float(val) for val in values])
    print("Done.",len(model)," words loaded!")
    return model

# Token -> embedding lookup built from the GloVe Twitter file.
nlp = loadGloveModel(gloveFile="../data/glove.twitter.27B.200d.txt")

In [None]:
# Plain-average GloVe embedding of each training tweet.
N = X_train.shape[0]

# One matrix per tweet, holding one 200-d row per token returned by
# tokenize(); tokens missing from the GloVe lookup contribute an all-zero row.
nlp_objects = []
for i in range(N):
    token_rows = []
    for word in tokenize(X_train[i]).split(' '):
        token_rows.append(nlp[word] if word in nlp else np.zeros(200))
    nlp_objects.append(np.array(token_rows))

# Tweets whose token matrix is entirely zero keep a zero embedding.
nlp_vectors = np.zeros((N, 200))
for i, token_matrix in enumerate(nlp_objects):
    if token_matrix.any():
        nlp_vectors[i, :] = token_matrix.mean(axis=0)

np.savetxt("../data/X_twitter_train_embedded_avg_glove.csv", nlp_vectors, delimiter=",")

In [None]:
# TF-IDF weighted GloVe embedding of each training tweet.
# Relies on N, nlp_objects and nlp_vectors as left by the averaging cell above.
X_train_raw = X_train
tfidf_vec = TfidfVectorizer()
# NOTE: from here on X_train holds the sparse TF-IDF matrix, not the raw text.
X_train = tfidf_vec.fit_transform(X_train_raw)
# column index -> term (inverse of the fitted vocabulary)
index_value={i[1]:i[0] for i in tfidf_vec.vocabulary_.items()}

# computes the weighted average of the embedded vectors to yield the sentence embedding
for i in range(N):
    # Tokenize exactly as was done when nlp_objects was built, so there is one
    # weight per embedded row. Iterating the raw string directly would yield
    # single characters, which never match a TF-IDF term.
    tokens = tokenize(X_train_raw[i]).split(' ')
    # one embedding row per token of the current sentence
    vectors = nlp_objects[i]
    # TF-IDF scores of THIS tweet only: slice row i out of the CSR arrays
    # (a dictionary built over the whole matrix would mix scores across tweets)
    start, stop = X_train.indptr[i], X_train.indptr[i + 1]
    tfidf_scores = {index_value[column]: value
                    for (column, value) in zip(X_train.indices[start:stop], X_train.data[start:stop])}
    # weight of every token of the sentence; 0 for tokens without a TF-IDF term
    weights = np.array([tfidf_scores.get(token.lower(), 0.0) for token in tokens])
    # Tweets without any weighted token keep the value already stored in
    # nlp_vectors (the plain average); np.average would fail on all-zero weights.
    if np.any(weights):
        nlp_vectors[i,:] = np.average(vectors, axis = 0, weights=weights) #normalizes by sum of weights

np.savetxt("../data/X_twitter_train_embedded_tfidf_glove.csv", nlp_vectors, delimiter=",")

In [None]:
# Plain-average GloVe embedding of each test tweet.
N = X_test.shape[0]

# One matrix per tweet, holding one 200-d row per token returned by
# tokenize(); tokens missing from the GloVe lookup contribute an all-zero row.
nlp_objects = []
for i in range(N):
    token_rows = []
    for word in tokenize(X_test[i]).split(' '):
        token_rows.append(nlp[word] if word in nlp else np.zeros(200))
    nlp_objects.append(np.array(token_rows))

# Tweets whose token matrix is entirely zero keep a zero embedding.
nlp_vectors = np.zeros((N, 200))
for i, token_matrix in enumerate(nlp_objects):
    if token_matrix.any():
        nlp_vectors[i, :] = token_matrix.mean(axis=0)

np.savetxt("../data/X_twitter_test_embedded_avg_glove.csv", nlp_vectors, delimiter=",")

In [None]:
# TF-IDF weighted GloVe embedding of each test tweet.
# Relies on N, nlp_objects and nlp_vectors as left by the test averaging cell
# above, and on tfidf_vec having been fitted on the training tweets.
X_test_raw = X_test
# Reuse the vectorizer fitted on the training set (transform, not
# fit_transform): refitting on the test tweets would give them a different
# vocabulary and IDF than the training features.
# NOTE: from here on X_test holds the sparse TF-IDF matrix, not the raw text.
X_test = tfidf_vec.transform(X_test_raw)
# column index -> term (inverse of the fitted vocabulary)
index_value={i[1]:i[0] for i in tfidf_vec.vocabulary_.items()}

# computes the weighted average of the embedded vectors to yield the sentence embedding
for i in range(N):
    # Tokenize exactly as was done when nlp_objects was built, so there is one
    # weight per embedded row. Iterating the raw string directly would yield
    # single characters, which never match a TF-IDF term.
    tokens = tokenize(X_test_raw[i]).split(' ')
    # one embedding row per token of the current sentence
    vectors = nlp_objects[i]
    # TF-IDF scores of THIS tweet only: slice row i out of the CSR arrays
    # (a dictionary built over the whole matrix would mix scores across tweets)
    start, stop = X_test.indptr[i], X_test.indptr[i + 1]
    tfidf_scores = {index_value[column]: value
                    for (column, value) in zip(X_test.indices[start:stop], X_test.data[start:stop])}
    # weight of every token of the sentence; 0 for tokens without a TF-IDF term
    weights = np.array([tfidf_scores.get(token.lower(), 0.0) for token in tokens])
    # Tweets without any weighted token keep the value already stored in
    # nlp_vectors (the plain average); np.average would fail on all-zero weights.
    if np.any(weights):
        nlp_vectors[i,:] = np.average(vectors, axis = 0, weights=weights) #normalizes by sum of weights

np.savetxt("../data/X_twitter_test_embedded_tfidf_glove.csv", nlp_vectors, delimiter=",")