In [9]:
# Loading the gensim model into a spaCy model didn't work (the gensim model couldn't be converted).

# import spacy
# # Load the spacy model that you have installed
# nlp = spacy.load('./word2vec_model_one_gesim.model/')
# # process a sentence using the model
# doc = nlp("This is some text that I am processing with Spacy")
# # It's that simple - all of the vectors and words are assigned after this point
# # Get the vector for 'text':
# doc[3].vector
# # Get the mean vector for the entire sentence (useful for sentence classification etc.)
# doc.vector

In [1]:
import math
import gensim
import pandas as pd
import numpy as np
from scipy import spatial
import string
import csv
import pickle

In [2]:
def load_model(name: str, path: str = ""):
    """Load a saved gensim Word2Vec model.

    Args:
        name: File name of the saved model.
        path: Prefix prepended to ``name`` verbatim; no separator is
            inserted, so a directory prefix needs its own trailing slash.

    Returns:
        Whatever ``gensim.models.Word2Vec.load`` returns for that location.
    """
    model_location = path + name
    return gensim.models.Word2Vec.load(model_location)
# model = gensim.models.Word2Vec.load("word2vec_model_one_gesim.model")

In [3]:
def read_csv(name: str, path: str = ""):
    """Read a CSV file into a pandas DataFrame.

    Args:
        name: File name of the CSV file.
        path: Prefix prepended to ``name`` verbatim; no separator is
            inserted, so a directory prefix needs its own trailing slash.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that location.
    """
    csv_location = path + name
    return pd.read_csv(csv_location)

In [4]:
def read_question_data(data, is_training):
    """Extract the question columns (and optionally labels) as plain lists.

    Args:
        data: DataFrame with "question1" and "question2" columns, plus an
            "is_duplicate" column when ``is_training`` is truthy.
        is_training: When truthy, the labels are returned as a third list.

    Returns:
        ``[question1, question2, labels]`` when ``is_training`` is truthy,
        otherwise ``[question1, question2]``; every element is a list.
    """
    wanted_columns = ["question1", "question2"]
    if is_training:
        wanted_columns.append("is_duplicate")
    return [data[column].values.tolist() for column in wanted_columns]

In [5]:
# Preprocess the data with gensim (tokenizing and removing punctuation).
def gensim_preprocess(question_data):
    """Tokenize every question with ``gensim.utils.simple_preprocess``.

    Each entry is passed through ``str()`` first, so non-string values are
    tokenized via their string form.

    Args:
        question_data: Iterable of questions.

    Returns:
        A list holding one ``simple_preprocess`` result per question, in
        input order.
    """
    tokenized_questions = []
    for sentence in question_data:
        tokenized_questions.append(gensim.utils.simple_preprocess(str(sentence)))
    return tokenized_questions

In [6]:
def string_preprocess(question_data):
    """Tokenize questions on whitespace, normalizing and filtering each token.

    Every question is converted with ``str()`` and split on whitespace. Each
    token is lower-cased and stripped of ASCII punctuation and curly double
    quotes. Tokens that end up empty, or that match a stop word, are dropped.

    The stop-word check runs on the normalized token, so capitalized or
    punctuated forms such as "The" or "is?" are filtered too; comparing the
    raw token (as before) let those through.

    Args:
        question_data: Iterable of questions.

    Returns:
        A list with one list of cleaned tokens per question, in input order.
    """
    stop_words = {'the', 'a', 'an', 'and', 'is', 'be', 'will'}
    # Build the deletion table once instead of once per word; it removes
    # ASCII punctuation and the curly double quotes in a single pass.
    removal_table = str.maketrans('', '', string.punctuation + '“”')

    new_data = []
    for question in question_data:
        processed_question = []
        for raw_word in str(question).split():
            word = raw_word.lower().translate(removal_table)
            if word and word not in stop_words:
                processed_question.append(word)
        new_data.append(processed_question)

    return new_data

In [7]:
def get_data_embedding(model, data: str):
    """Embed every sentence of a dataset.

    Note: despite the ``str`` annotation, ``data`` is iterated sentence by
    sentence, and the notebook passes lists of token lists.

    Args:
        model: Model forwarded to ``get_sentence_embedding``.
        data: Iterable of sentences.

    Returns:
        A list with one ``get_sentence_embedding`` result per sentence.
    """
    return [get_sentence_embedding(model, sentence) for sentence in data]

In [8]:
def get_sentence_embedding(model, sentence: str):
    """Embed every word of a sentence.

    Note: despite the ``str`` annotation, ``sentence`` is iterated element by
    element, and the notebook passes lists of tokens.

    Args:
        model: Model forwarded to ``get_word_embedding``.
        sentence: Iterable of words.

    Returns:
        A list with one ``get_word_embedding`` result per word.
    """
    return [get_word_embedding(model, word) for word in sentence]

In [9]:
def get_word_embedding(model, word: str):
    """Look up the embedding of ``word``, falling back to the "UNK" vector.

    Args:
        model: Object exposing a ``wv`` mapping from word to vector.
        word: Token to look up.

    Returns:
        ``model.wv[word]`` when the lookup succeeds, otherwise
        ``model.wv["UNK"]``.

    Raises:
        Any exception other than ``KeyError`` raised by the first lookup,
        and any exception raised by the "UNK" lookup, propagates unchanged.
    """
    try:
        return model.wv[word]
    except KeyError:
        # Only a missing key triggers the fallback; a bare ``except`` would
        # also hide unrelated failures (and even KeyboardInterrupt).
        return model.wv["UNK"]

In [10]:
def get_sum_of_embeddings(data_embedding, embedding_size):
    """Collapse each sentence's word vectors into a single vector.

    Despite the name, the result is the average: the word vectors are summed
    and divided by the number of words. A sentence without words is
    represented by a zero vector of length ``embedding_size``.

    Args:
        data_embedding: Iterable of sentences, each a sequence of word vectors.
        embedding_size: Length of the zero vector used for empty sentences.

    Returns:
        A list with one vector per sentence, in input order.
    """
    averaged_sentences = []
    for sentence in data_embedding:
        if len(sentence) > 0:
            averaged_sentences.append(sum(sentence) / len(sentence))
        else:
            averaged_sentences.append(np.zeros(embedding_size))
    return averaged_sentences

In [11]:
def get_embedding_size(data):
    """Return the length of the first word vector found in ``data``.

    Args:
        data: Iterable of sentences, each a sequence of word vectors.

    Returns:
        ``len()`` of the first element of the first non-empty sentence, or
        ``None`` when every sentence is empty (or ``data`` itself is empty).
    """
    return next(
        (len(sentence[0]) for sentence in data if len(sentence) > 0),
        None,
    )

In [12]:
def cosine_similarity(question1_summed_vector, question2_summed_vector): 
    distance = []
    for i in range(len(question1_summed_vector)):
        distance.append(1 - spatial.distance.cosine(question1_summed_vector[i], question2_summed_vector[i]))
        
    for i in range(len(distance)):
        if math.isnan(distance[i]):
            distance[i] = 0
    return distance

In [13]:
# model_gensim = load_model("word2vec_model_gensim_100.model")
# model_manual = load_model("word2vec_model_manual_100.model")

In [81]:
# Load the two saved Word2Vec models (the "_200" files) used in the rest of
# the notebook.
model_gensim, model_manual = map(
    load_model,
    ("word2vec_model_gensim_200.model", "word2vec_model_manual_200.model"),
)

In [82]:
# Training data; read_question_data below pulls the "question1",
# "question2" and "is_duplicate" columns from this frame.
df = read_csv(name='train.csv')

In [84]:
# Split the training frame into the two question lists and the label list.
question1_data, question2_data, labels = read_question_data(df, is_training=True)

In [85]:
# Tokenize both question columns twice: once with gensim's preprocessing
# and once with the hand-written string preprocessing.
gensim_clean_question1_data, gensim_clean_question2_data = map(
    gensim_preprocess, (question1_data, question2_data)
)

manual_clean_question1_data, manual_clean_question2_data = map(
    string_preprocess, (question1_data, question2_data)
)

In [86]:
# Look up word vectors for every token: each model embeds the tokens produced
# by its matching preprocessing step.
gensim_question1_data_vector, gensim_question2_data_vector = (
    get_data_embedding(model_gensim, questions)
    for questions in (gensim_clean_question1_data, gensim_clean_question2_data)
)

manual_question1_data_vector, manual_question2_data_vector = (
    get_data_embedding(model_manual, questions)
    for questions in (manual_clean_question1_data, manual_clean_question2_data)
)

In [87]:
# Vector length of each model, taken from the first embedded word found in
# the question1 data.
embedding_size_gensim, embedding_size_manual = map(
    get_embedding_size,
    (gensim_question1_data_vector, manual_question1_data_vector),
)

print(embedding_size_gensim)

# One averaged vector per question (zero vector for questions without tokens).
gensim_question1_summed_vector, gensim_question2_summed_vector = (
    get_sum_of_embeddings(question_vectors, embedding_size_gensim)
    for question_vectors in (gensim_question1_data_vector, gensim_question2_data_vector)
)

manual_question1_summed_vector, manual_question2_summed_vector = (
    get_sum_of_embeddings(question_vectors, embedding_size_manual)
    for question_vectors in (manual_question1_data_vector, manual_question2_data_vector)
)

200


In [88]:
# Note: despite the "distances" name these hold cosine *similarities*
# (1 - cosine distance), one per question pair.
distances_gensim = cosine_similarity(
    question1_summed_vector=gensim_question1_summed_vector,
    question2_summed_vector=gensim_question2_summed_vector,
)
distances_manual = cosine_similarity(
    question1_summed_vector=manual_question1_summed_vector,
    question2_summed_vector=manual_question2_summed_vector,
)

  dist = 1.0 - uv / np.sqrt(uu * vv)


In [89]:
from sklearn.linear_model import LogisticRegression

# Fit one logistic regression per embedding model. The only feature is the
# similarity score, reshaped into a single-column matrix; the target is the
# training label list.
clf_gensim, clf_manual = (
    LogisticRegression(random_state=0).fit(
        np.array(similarity_scores).reshape(-1, 1), np.array(labels)
    )
    for similarity_scores in (distances_gensim, distances_manual)
)

In [90]:
# The SVM model took too long to train compared to logistic regression, so we opted not to train it for now.

# from sklearn.svm import SVC
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import StandardScaler

# svm_gensim = make_pipeline(StandardScaler(), SVC(gamma='auto')).fit(np.array(distances_gensim).reshape(-1,1), np.array(labels))

# svm_manual = make_pipeline(StandardScaler(), SVC(gamma='auto')).fit(np.array(distances_manual).reshape(-1,1), np.array(labels))

# filename = 'svm_gensim_model.sav'
# pickle.dump(svm_gensim, open(filename, 'wb'))

# filename = 'svm_manual_model.sav'
# pickle.dump(svm_manual, open(filename, 'wb'))

In [91]:
# Test data; the first five rows are displayed as a quick sanity check.
df_test = read_csv(name='test.csv')
df_test.head()

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


In [92]:
# Only the two question lists are requested here (is_training=False returns
# no labels).
question1_test_data, question2_test_data = read_question_data(df_test, is_training=False)

In [94]:
# Apply the same two preprocessing variants to the test questions.
gensim_clean_question1_test_data, gensim_clean_question2_test_data = map(
    gensim_preprocess, (question1_test_data, question2_test_data)
)

manual_clean_question1_test_data, manual_clean_question2_test_data = map(
    string_preprocess, (question1_test_data, question2_test_data)
)

In [95]:
# Embed the test tokens, pairing each model with its matching preprocessing.
gensim_question1_test_data_vector, gensim_question2_test_data_vector = (
    get_data_embedding(model_gensim, questions)
    for questions in (gensim_clean_question1_test_data, gensim_clean_question2_test_data)
)

manual_question1_test_data_vector, manual_question2_test_data_vector = (
    get_data_embedding(model_manual, questions)
    for questions in (manual_clean_question1_test_data, manual_clean_question2_test_data)
)

In [96]:
# Re-derive the vector lengths from the test embeddings; this rebinds the
# embedding_size_* names assigned in the training section.
embedding_size_gensim, embedding_size_manual = map(
    get_embedding_size,
    (gensim_question1_test_data_vector, manual_question1_test_data_vector),
)

# One averaged vector per test question (zero vector when it has no tokens).
gensim_question1_summed_test_vector, gensim_question2_summed_test_vector = (
    get_sum_of_embeddings(question_vectors, embedding_size_gensim)
    for question_vectors in (
        gensim_question1_test_data_vector,
        gensim_question2_test_data_vector,
    )
)

manual_question1_summed_test_vector, manual_question2_summed_test_vector = (
    get_sum_of_embeddings(question_vectors, embedding_size_manual)
    for question_vectors in (
        manual_question1_test_data_vector,
        manual_question2_test_data_vector,
    )
)

In [97]:
# Cosine similarities (1 - cosine distance) for every test question pair.
test_distances_gensim = cosine_similarity(
    question1_summed_vector=gensim_question1_summed_test_vector,
    question2_summed_vector=gensim_question2_summed_test_vector,
)
test_distances_manual = cosine_similarity(
    question1_summed_vector=manual_question1_summed_test_vector,
    question2_summed_vector=manual_question2_summed_test_vector,
)

  dist = 1.0 - uv / np.sqrt(uu * vv)


In [98]:
# Both feature lists should have one entry per test row.
print(len(test_distances_gensim), len(test_distances_manual), sep="\n")

2345796
2345796


In [99]:
# Predict labels for the test pairs; each classifier is paired with the
# similarity scores of its own embedding model.
gensim_model_pred_logistic, manual_model_pred_logistic = (
    classifier.predict(np.array(similarity_scores).reshape(-1, 1))
    for classifier, similarity_scores in (
        (clf_gensim, test_distances_gensim),
        (clf_manual, test_distances_manual),
    )
)

In [100]:
# Sequential ids 0..N-1, one per prediction, used as "test_id" in the
# submission files.
sample_id = list(range(len(gensim_model_pred_logistic)))

In [101]:
# Write the predictions of the gensim-based classifier as a two-column CSV.
submission = pd.DataFrame(
    {
        'test_id': sample_id,
        'is_duplicate': list(gensim_model_pred_logistic),
    }
)
submission.to_csv('gensim_logistic_pred_200.csv', index=False)

In [102]:
# Write the predictions of the manual-model classifier as a two-column CSV.
submission2 = pd.DataFrame(
    {
        'test_id': sample_id,
        'is_duplicate': list(manual_model_pred_logistic),
    }
)
submission2.to_csv('manual_logistic_pred_200.csv', index=False)