In [None]:
# needed libraries: gensim & pyhton-Levenshtein

In [3]:
import pandas as pd
import numpy as np
import gensim
import string

In [17]:
def read_data (path):
    df = pd.read_csv(path)
    return df
#     df = pd.read_csv('train.csv')
    # print(df)
    # print(type(df))

In [5]:
def join_question_data(data):
    data = df["question1"].values.tolist()
    data.extend(df["question2"].values.tolist())
    # print(len(data))
    # print(data[0])
    return data

In [6]:
# preprocessing the data (tokenizing and removing punctutation)
def gensim_preprocess(question_data):
    new_data = [gensim.utils.simple_preprocess(str(sentence)) for sentence in question_data] 
    return new_data

In [7]:
def string_preprocess(question_data):
    stop_words=['the', 'a', 'an', 'and', 'is', 'be', 'will']
    new_data = []
    
    for question in question_data:
        processed_question = []
        question = str(question)
        
        #split the question to words (doesn't include stop words)
        words = [word.lower() for word in question.split() if word not in stop_words]
        for word in words:
            # to remove punctutation from string (imported from string library)
            word = word.translate(str.maketrans('', '', string.punctuation))
            word = word.replace('“', '').replace('”', '')

            if len(word) > 0:
                processed_question.append(word)
                
        new_data.append(processed_question)
    
    return new_data

In [51]:
# default vector_size is 100
def gensim_word2vec(training_data, window_size, min_word_count, vector_size, workers):
    model = gensim.models.Word2Vec(window= window_size, vector_size= vector_size, 
                                   min_count= min_word_count, workers = workers)
#     model.build_vocab(new_data, progress_per= 1000)
    model.build_vocab(training_data, progress_per= 1000)
    model.train(training_data, total_examples= model.corpus_count, epochs=model.epochs)
    
#     for handling unknown vectors
    model.wv["UNK"] = np.random.rand(vector_size)
#     print(model.epochs)
#     print(model.corpus_count)
    return model

In [20]:
# model.save("word2vec_model_one_gesim.model")
def save_gensim_model(model, name):
#     model.save("word2vec_model_one_gensim_vector_300.model")
    model.save(str(name) + ".model")

In [60]:
def find_most_similar(model, word:str):
    try: 
        return model.wv.most_similar(word)
    except:
        return model.wv.most_similar("UNK")

In [52]:
def get_word_embeddings (model ,word:str):
#     model.wv.get_vector[word]
    try:
        return model.wv[word]
    except:
        return model.wv["UNK"]

In [53]:
df = read_data('train.csv')
data = join_question_data(df)

In [54]:
processed_data_manual = string_preprocess(data)
processed_data_gensim = gensim_preprocess(data)

In [55]:
model_1 = gensim_word2vec(processed_data_manual, 5, 2, 100, 12)
model_2 = gensim_word2vec(processed_data_gensim, 5, 2, 100, 12)

In [56]:
save_gensim_model(model_1, "word2vec_model_manual_100")
save_gensim_model(model_2, "word2vec_model_gensim_100")

In [57]:
print(find_most_similar(model_1, "what"))

[('which', 0.5807263851165771), ('whats', 0.5686727166175842), ('youin', 0.45047006011009216), ('groundattack', 0.4348257780075073), ('some', 0.4311452805995941), ('what’s', 0.4174785315990448), ('materialsvideosresources', 0.41740575432777405), ('dedications', 0.39461904764175415), ('weavers', 0.3931557536125183), ('masturbationporn', 0.3927534222602844)]


In [58]:
print(find_most_similar(model_2, "what"))

[('which', 0.6183813810348511), ('the', 0.4920017421245575), ('caesium', 0.3833654224872589), ('brigg', 0.37125155329704285), ('some', 0.37123170495033264), ('walkway', 0.36221301555633545), ('horror', 0.3496111333370209), ('halloween', 0.34662020206451416), ('stoping', 0.3436439335346222), ('podcasts', 0.3425482213497162)]


In [61]:
print(find_most_similar(model_1, "vnfjnvd"))
print(find_most_similar(model_1, "hsudhf"))

[('schumers', 0.3628362715244293), ('hammerbacher', 0.3415323495864868), ('needs', 0.3413182199001312), ('obvious', 0.3345271050930023), ('errands', 0.3241003751754761), ('understands', 0.316386342048645), ('orchestrated', 0.3087090849876404), ('realization', 0.30723509192466736), ('adjuster', 0.30591443181037903), ('musks', 0.3045320212841034)]
[('schumers', 0.3628362715244293), ('hammerbacher', 0.3415323495864868), ('needs', 0.3413182199001312), ('obvious', 0.3345271050930023), ('errands', 0.3241003751754761), ('understands', 0.316386342048645), ('orchestrated', 0.3087090849876404), ('realization', 0.30723509192466736), ('adjuster', 0.30591443181037903), ('musks', 0.3045320212841034)]


In [37]:
print(processed_data_manual[0:5])

[['what', 'step', 'by', 'step', 'guide', 'to', 'invest', 'in', 'share', 'market', 'in', 'india'], ['what', 'story', 'of', 'kohinoor', 'kohinoor', 'diamond'], ['how', 'can', 'i', 'increase', 'speed', 'of', 'my', 'internet', 'connection', 'while', 'using', 'vpn'], ['why', 'am', 'i', 'mentally', 'very', 'lonely', 'how', 'can', 'i', 'solve', 'it'], ['which', 'one', 'dissolve', 'in', 'water', 'quikly', 'sugar', 'salt', 'methane', 'carbon', 'di', 'oxide']]


In [38]:
print(processed_data_gensim[0:5])

[['what', 'is', 'the', 'step', 'by', 'step', 'guide', 'to', 'invest', 'in', 'share', 'market', 'in', 'india'], ['what', 'is', 'the', 'story', 'of', 'kohinoor', 'koh', 'noor', 'diamond'], ['how', 'can', 'increase', 'the', 'speed', 'of', 'my', 'internet', 'connection', 'while', 'using', 'vpn'], ['why', 'am', 'mentally', 'very', 'lonely', 'how', 'can', 'solve', 'it'], ['which', 'one', 'dissolve', 'in', 'water', 'quikly', 'sugar', 'salt', 'methane', 'and', 'carbon', 'di', 'oxide']]
