In [1]:
import gensim.downloader as api
model = api.load("glove-twitter-25")


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import numpy as np
import spacy
from tqdm import tqdm
from gensim.models import FastText
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Jenny\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [25]:
stwords = stopwords.words('english')
from nltk.stem import WordNetLemmatizer 
  
lemmatizer = WordNetLemmatizer()
def preprocessing(series):
    
    REPLACE_NO_SPACE = re.compile("(\&)|(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])")
    REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
    processed = [REPLACE_NO_SPACE.sub("", text) for text in series]
    processed = [REPLACE_WITH_SPACE.sub(" ", text) for text in processed]
#     processed = [re.sub(r" ?\([^)]+\)", "", text) for text in processed]
    
    url = re.compile(r'http\S+')
    processed = [url.sub("URL.", text) for text in processed]
    processed_sent = []
    for p in processed:
#         processed_sent.append(' '.join(w for w in p.lower().split() if w not in stwords))
        processed_sent.append(' '.join(lemmatizer.lemmatize(w) for w in p.lower().split() if w not in stwords))

    return processed_sent
    
    

# Dataset - Paragraph - Sent

In [26]:
data1 = pd.read_csv("data/training/paragraph2sentence_train_input.tsv", sep='\t', header = None)
data2 = pd.read_csv("data/test/paragraph2sentence.test.input.tsv", sep='\t', header = None)
data3 = pd.read_csv("data/trial/paragraph2sentence.trial.input.tsv", sep = '\t', header = None)

data = pd.concat([data1, data2, data3], ignore_index = True)


key1 = pd.read_csv("keys/training/paragraph2sentence.train.gs.tsv", sep = '\t', header = None)
key2 = pd.read_csv("keys/test/paragraph2sentence.test.gs.tsv", sep = '\t', header = None)
key3 = pd.read_csv("keys/trial/paragraph2sentence.trial.gs.tsv", sep = '\t', header = None)

ground_truth = pd.concat([key1, key2, key3], ignore_index = True)


docs = pd.concat([data[0], data[1]],ignore_index=True)
docs_half = data[0] 
query = data[1] 

processed_docs_half = preprocessing(docs_half)
processed_query = preprocessing(query)
processed_docs = preprocessing(docs)

# Dataset - Sent - Phrase

In [4]:
data1 = pd.read_csv("data/training/sentence2phrase.train.input.tsv", sep='\t', header = None)
data2 = pd.read_csv("data/test/sentence2phrase.test.input.tsv", sep='\t', header = None)
data3 = pd.read_csv("data/trial/sentence2phrase.trial.input.tsv", sep='\t', header = None)


data = pd.concat([data1, data2, data3], ignore_index = True)

key1 = pd.read_csv("keys/training/sentence2phrase.train.gs.tsv", sep='\t', header = None)
key2 = pd.read_csv("keys/test/sentence2phrase.test.gs.tsv", sep='\t', header = None)
key3 = pd.read_csv("keys/trial/sentence2phrase.trial.gs.tsv", sep='\t', header = None)

ground_truth = pd.concat([key1, key2, key3], ignore_index = True)


docs = pd.concat([data[0], data[1]],ignore_index=True)
docs_half = data[0] 
query = data[1] 

processed_docs_half = preprocessing(docs_half)
processed_query = preprocessing(query)
processed_docs = preprocessing(docs)

# Dataset Phrase - Word

In [20]:
data1 = pd.read_csv("data/training/phrase2word.train.input.tsv", sep='\t', header = None)
data2 = pd.read_csv("data/test/phrase2word.test.input.tsv", sep='\t', header = None)
data3 = pd.read_csv("data/trial/phrase2word.trial.input.tsv", sep = '\t', header = None)

data = pd.concat([data1, data2, data3], ignore_index = True)
data = data.drop([84,708])

key1 = pd.read_csv("keys/training/phrase2word.train.gs.tsv", sep = '\t', header = None)
key2 = pd.read_csv("keys/test/phrase2word.test.gs.tsv", sep = '\t', header = None)
key3 = pd.read_csv("keys/trial/phrase2word.trial.gs.tsv", sep = '\t', header = None)

ground_truth = pd.concat([key1, key2, key3], ignore_index = True)
ground_truth = ground_truth.drop([84,708])

docs = pd.concat([data[0], data[1]],ignore_index=True)
docs_half = data[0] 
query = data[1] 

processed_docs_half = preprocessing(docs_half)
processed_query = preprocessing(query)
processed_docs = preprocessing(docs)

# Get Doc/Sentence vector by averaging its word embeddings

In [27]:
wordvecs = []
for doc in processed_docs:
    wordvecs.append([model[word] if word in model else np.zeros(25) for word in doc.split()])


In [28]:
sentvecs = []
for i in range(len(wordvecs)):
    sentvecs.append(np.mean(wordvecs[i], axis = 0))

# Function Shift

In [29]:
# shift the function such at there is no negative values in the vector 
minV = float('inf')
for i in range(len(sentvecs)):
    try:
        minV = min(min(sentvecs[i]), minV)
    except:
        print(i,sentvecs[i])

for i in range(len(sentvecs)):
    sentvecs[i] -= minV

# Extended Jaccard

In [30]:
from numpy import linalg as LA
jaccard = []
for i in range(len(data)):
    numerator = np.transpose(sentvecs[i]).dot(sentvecs[i+len(data)])
    l2norm1sq = (LA.norm(sentvecs[i]))**2
    l2norm2sq = (LA.norm(sentvecs[i+len(data)]))**2
    jaccard_sim = numerator / (l2norm1sq + l2norm2sq - numerator)
    jaccard.append(jaccard_sim)

# Cosine

In [31]:
from scipy import spatial

cos_distance = []
for i in range(len(processed_docs_half)):
    cos_distance.append(spatial.distance.cosine(sentvecs[i], sentvecs[i+len(data)]))


# Word-Mover-Distance Based Similarity

In [32]:
wmd = []
model.init_sims(replace=True)  # Normalizes the vectors in the word2vec class.

for i in range(len(processed_docs_half)):
    wmd.append(model.wmdistance(processed_docs_half[i].split(), processed_query[i].split()))  # Compute WMD as normal.



In [33]:
wmd = np.array(wmd)
wmd_sim = 1./(1.+ wmd)

# Fuzzy-Match Jaccard Similarity

In [34]:
query_token = [q.split() for q in processed_query]

In [35]:
Qset = []
count = 0
# index = 0
# while index < len(query_token):
for i in range(len(query_token)):
    tkn = query_token[i]
    q = []
    for t in query_token[i]:
        try:
            q.extend(model.most_similar(positive = [t], topn=5))
#             q.extend(model.most_similar_cosmul(positive = [t], topn = 5))
        except KeyError as e:
            count += 1
            result = re.search("\'(.*)\'", str(e))
            oov = result.group(1)
#             tkn.remove(oov)
    for t in tkn:
        q.append((t,1))
    Qset.append(q)

In [36]:
import numpy as np
similarity = []
ind = []
for i in range(len(Qset)):
    nn_lst = [tup[0] for tup in Qset[i]]
    
    processed_tokens = processed_docs_half[i].split()
    intersection_len = len(list(set(nn_lst) & set(processed_tokens)))
    if len(processed_tokens) != 0:
        
        similarity.append(intersection_len/ (len(processed_tokens)**0.8))
        
    else:
        ind.append(i)

In [37]:
# Calculate standard jaccard similarity
std_jaccard = []
for i in range(len(processed_docs_half)):
    query_tokens = processed_query[i].split()
    processed_tokens = processed_docs_half[i].split()

    query_tokens.extend(processed_tokens)
    intersection_len = len(list(set(query_tokens) & set(processed_tokens)))
    union_len = len(list(set(query_tokens)))
    
    std_jaccard.append(intersection_len/union_len)
     

In [38]:
ground_truth['ext_jac'] = jaccard
ground_truth['cosine'] = cos_distance
ground_truth['wmd_sim'] = wmd_sim
ground_truth['fuzzy_match'] = similarity
ground_truth['std_jaccard'] = std_jaccard


In [39]:
normalized_df=(ground_truth-ground_truth.min())/(ground_truth.max()-ground_truth.min())

In [40]:
n = normalized_df.shape[1]

In [41]:
sum((normalized_df[0] - normalized_df['fuzzy_match'])**2) / n

20.118994911948334

In [42]:
sum((normalized_df[0] - normalized_df['std_jaccard'])**2) / n

32.257316699081315

In [43]:
sum((normalized_df[0] - normalized_df['ext_jac'])**2) / n

54.726756964711335

In [44]:
sum((normalized_df[0] - normalized_df['cosine'])**2) / n

50.60365117504466

In [45]:
sum((normalized_df[0] - normalized_df['wmd_sim'])**2) / n

16.69676701406039

In [27]:
from gensim.similarities import WmdSimilarity

In [39]:
instance = WmdSimilarity(processed_docs_half[1], model, num_best = 1)


In [40]:
sims = instance[processed_query[1]]

In [41]:
sims


[(7, 0.6883245118726495)]

In [None]:
## import numpy as np
similarity = []
for i in range(len(Qset)):
    sim = []
    for j in range(len(processed_docs_half)):
        nn_lst = [tup[0] for tup in Qset[i]]
        
        processed_tokens = processed_docs_half[j].split()

        intersection_len = len(list(set(nn_lst) & set(processed_tokens)))
        sim.append(intersection_len / (len(nn_lst)))
    idx = np.argmax(sim)
    simscore = sim[idx]
    similarity.append((idx, simscore))

In [4]:
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(processed_docs)]

In [139]:
from numpy import linalg as LA
jaccard = []
for i in range(len(data)):
#     numerator = np.transpose(model.docvecs[i]).dot(test[i])
    numerator = np.transpose(model.docvecs[i]).dot(model.docvecs[i+500])
    l2norm1sq = (LA.norm(model.docvecs[i]))**2
#     l2norm2sq = (LA.norm(test[i]))**2
    l2norm2sq = (LA.norm(model.docvecs[i+500]))**2
    jaccard_sim = numerator / (l2norm1sq + l2norm2sq - numerator)
    jaccard.append(jaccard_sim)

AttributeError: 'Word2VecKeyedVectors' object has no attribute 'docvecs'

In [6]:
model = Doc2Vec.load("d2v.model")

In [5]:
# Doc2Vec

max_epochs = 20
vec_size = 25
alpha = 0.025
    
model = Doc2Vec(size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)

model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("d2v.model")




iteration 0




iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration 77
iteratio

In [235]:
# shift the function such at there is no negative values in the vector 
minV = float('inf')
minTest = float('inf')
for i in range(len(model.docvecs)):
    minV = min(min(model.docvecs[i]), minV)

for i in range(len(model.docvecs)):
    model.docvecs[i] -= minV

AttributeError: 'Word2VecKeyedVectors' object has no attribute 'docvecs'