In [1]:
import numpy as np
import spacy
from preprocess import preprocess_treebank
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

In [2]:
# Load the preprocessed treebank and unpack its first four entries:
# training inputs, training targets, test inputs, test targets (in that order).
data = preprocess_treebank()
X_train, y_train, X_test, y_test = (data[position] for position in range(4))

In [3]:
# Load the spaCy pipeline once; every embedding cell below reuses this `nlp`
# object to parse sentences and read their vectors.
nlp = spacy.load("en_core_web_lg")

In [4]:
# Plain (unweighted) sentence embeddings for the training set.
N = X_train.shape[0]

# Parse every training sentence once. The parsed Doc objects are kept because
# the TF-IDF weighting cell re-reads their tokens.
nlp_objects = [nlp(X_train[position]) for position in tqdm(range(N))]

# One 300-dimensional row per sentence, filled from spaCy's Doc.vector
# (by default the mean of the token vectors — see the spaCy Doc API).
nlp_vectors = np.zeros((N, 300))
for row, parsed in enumerate(nlp_objects):
    nlp_vectors[row, :] = parsed.vector

np.savetxt("../data/X_treebank_train_embedded_avg_spacy.csv", nlp_vectors, delimiter=",")

100%|████████████████████████████████████████████████████████████████████████████| 83881/83881 [14:29<00:00, 96.44it/s]


In [5]:
# TF-IDF-weighted sentence embeddings for the training set.
#
# NOTE: X_train is rebound from the raw sentences to their sparse TF-IDF matrix
# (kept so later code that expects this still works). Re-running this cell
# therefore requires re-running the data-loading cell first.
tfidf_vec = TfidfVectorizer()
X_train = tfidf_vec.fit_transform(X_train)

# Inverse vocabulary: TF-IDF column index -> term.
index_value = {column: term for term, column in tfidf_vec.vocabulary_.items()}

# CSR layout lets us slice out the (column, score) pairs of a single sentence
# through indptr without materialising a row matrix per sentence.
tfidf_rows = X_train.tocsr()

# Computes the weighted average of the token vectors to yield each sentence embedding.
for i, doc in enumerate(nlp_objects):
    # TF-IDF scores of *this* sentence only. A single corpus-wide
    # {term: score} dict would keep, for every term, just the score from the
    # last sentence containing it and weight all other sentences wrongly.
    start, stop = tfidf_rows.indptr[i], tfidf_rows.indptr[i + 1]
    tfidf_scores = {
        index_value[column]: value
        for column, value in zip(tfidf_rows.indices[start:stop], tfidf_rows.data[start:stop])
    }

    # Weight of each spaCy token of the sentence; tokens the vectorizer did not
    # keep (punctuation, one-character tokens, ...) get weight 0.
    weights = np.array([tfidf_scores.get(token.text.lower(), 0.0) for token in doc])

    # Only overwrite the row when at least one token carries weight; otherwise
    # the row keeps the plain average computed in the previous cell. Stacking
    # the vectors inside the guard also avoids np.vstack failing on a sentence
    # with no tokens.
    if np.any(weights):
        vectors = np.vstack([token.vector for token in doc])
        nlp_vectors[i, :] = np.average(vectors, axis=0, weights=weights)

np.savetxt("../data/X_treebank_train_embedded_tfidf_spacy.csv", nlp_vectors, delimiter=",")

In [6]:
# Plain (unweighted) sentence embeddings for the test set.
# N, nlp_objects and nlp_vectors are rebound here and now describe the test split.
N = X_test.shape[0]

# Parse every test sentence once; the Doc objects are reused by the TF-IDF
# weighting cell for the test split.
nlp_objects = [nlp(X_test[position]) for position in tqdm(range(N))]

# One 300-dimensional row per sentence, filled from spaCy's Doc.vector
# (by default the mean of the token vectors — see the spaCy Doc API).
nlp_vectors = np.zeros((N, 300))
for row, parsed in enumerate(nlp_objects):
    nlp_vectors[row, :] = parsed.vector

np.savetxt("../data/X_treebank_test_embedded_avg_spacy.csv", nlp_vectors, delimiter=",")

100%|██████████████████████████████████████████████████████████████████████████████| 1821/1821 [00:22<00:00, 81.66it/s]


In [7]:
# TF-IDF-weighted sentence embeddings for the test set.
#
# The vectorizer fitted on the training sentences (`tfidf_vec`, created in the
# training TF-IDF cell) is reused through transform() instead of fitting a new
# one on the test set, so test sentences are weighted with the same vocabulary
# and IDF statistics as the training embeddings. Terms never seen in training
# consequently get weight 0.
#
# NOTE: X_test is rebound from the raw sentences to their sparse TF-IDF matrix
# (kept so later code that expects this still works). Re-running this cell
# therefore requires re-running the data-loading cell first.
X_test = tfidf_vec.transform(X_test)

# Inverse vocabulary: TF-IDF column index -> term.
index_value = {column: term for term, column in tfidf_vec.vocabulary_.items()}

# CSR layout lets us slice out the (column, score) pairs of a single sentence
# through indptr without materialising a row matrix per sentence.
tfidf_rows = X_test.tocsr()

# Computes the weighted average of the token vectors to yield each sentence embedding.
for i, doc in enumerate(nlp_objects):
    # TF-IDF scores of *this* sentence only. A single corpus-wide
    # {term: score} dict would keep, for every term, just the score from the
    # last sentence containing it and weight all other sentences wrongly.
    start, stop = tfidf_rows.indptr[i], tfidf_rows.indptr[i + 1]
    tfidf_scores = {
        index_value[column]: value
        for column, value in zip(tfidf_rows.indices[start:stop], tfidf_rows.data[start:stop])
    }

    # Weight of each spaCy token of the sentence; tokens without a TF-IDF
    # entry (punctuation, one-character tokens, unseen terms, ...) get weight 0.
    weights = np.array([tfidf_scores.get(token.text.lower(), 0.0) for token in doc])

    # Only overwrite the row when at least one token carries weight; otherwise
    # the row keeps the plain average computed in the previous cell. Stacking
    # the vectors inside the guard also avoids np.vstack failing on a sentence
    # with no tokens.
    if np.any(weights):
        vectors = np.vstack([token.vector for token in doc])
        nlp_vectors[i, :] = np.average(vectors, axis=0, weights=weights)

np.savetxt("../data/X_treebank_test_embedded_tfidf_spacy.csv", nlp_vectors, delimiter=",")