In [None]:
# needed libraries: gensim & python-Levenshtein

In [1]:
import pandas as pd
import numpy as np
from gensim.models import keyedvectors
import gensim
import string

In [2]:
def read_data(path):
    """Load a CSV file into a pandas DataFrame.

    Args:
        path: Location of the CSV; passed straight through to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    return pd.read_csv(path)

In [3]:
def join_question_data(data):
    """Concatenate the two question columns of a DataFrame into one list.

    The previous implementation ignored ``data`` and read the notebook-global
    ``df`` instead, so it only worked when a global of that name happened to
    exist. It now uses the argument it is given.

    Args:
        data: DataFrame with ``question1`` and ``question2`` columns.

    Returns:
        A list holding every ``question1`` value followed by every
        ``question2`` value, in row order.
    """
    questions = data["question1"].values.tolist()
    questions.extend(data["question2"].values.tolist())
    return questions

In [4]:
# preprocessing the data (tokenizing and removing punctuation)
def gensim_preprocess(question_data):
    """Tokenise every question with ``gensim.utils.simple_preprocess``.

    Args:
        question_data: Iterable of questions; each item is converted with
            ``str()`` before tokenising, so non-string entries are accepted.

    Returns:
        A list with one token list per question, followed by a final
        ``["UNK", "UNK"]`` entry.
    """
    tokenized = []
    for sentence in question_data:
        tokenized.append(gensim.utils.simple_preprocess(str(sentence)))

    # Sentinel sentence so an "UNK" token exists for unknown words
    # (it could not be added after training the model).
    tokenized.append(["UNK", "UNK"])
    return tokenized

In [5]:
def string_preprocess(question_data):
    """Tokenise questions by hand: lowercase, strip punctuation, drop stop words.

    Stop words are matched against the *normalised* token (lowercased, with
    punctuation removed). The previous version compared the raw token, so
    capitalised or punctuated stop words such as "The" or "is," were kept.

    Args:
        question_data: Iterable of questions; each item is converted with
            ``str()`` first, so non-string entries are accepted.

    Returns:
        A list with one token list per question (possibly empty), followed by
        a final ``["UNK", "UNK"]`` entry.
    """
    stop_words = frozenset(['the', 'a', 'an', 'and', 'is', 'be', 'will'])

    # Built once instead of once per word: deletes ASCII punctuation plus the
    # curly double quotes that string.punctuation does not cover.
    strip_table = str.maketrans('', '', string.punctuation + '“”')

    new_data = []
    for question in question_data:
        processed_question = []
        for raw_word in str(question).split():
            word = raw_word.lower().translate(strip_table)
            # Skip tokens that were pure punctuation and normalised stop words.
            if word and word not in stop_words:
                processed_question.append(word)
        new_data.append(processed_question)

    # Sentinel sentence so an "UNK" token exists for unknown words
    # (it could not be added after training the model).
    new_data.append(["UNK", "UNK"])
    return new_data

In [6]:
# default vector_size is 100
def gensim_word2vec(training_data, window_size, min_word_count, vector_size, workers):
    """Build and train a gensim Word2Vec model on a tokenised corpus.

    Args:
        training_data: Corpus passed to both ``build_vocab`` and ``train``
            (a list of token lists in this notebook).
        window_size: Forwarded to Word2Vec as ``window``.
        min_word_count: Forwarded to Word2Vec as ``min_count``.
        vector_size: Forwarded to Word2Vec as ``vector_size``.
        workers: Forwarded to Word2Vec as ``workers``.

    Returns:
        The trained model.
    """
    hyperparams = dict(
        window=window_size,
        vector_size=vector_size,
        min_count=min_word_count,
        workers=workers,
    )
    w2v = gensim.models.Word2Vec(**hyperparams)

    # The model is created without a corpus, so the vocabulary is built
    # explicitly before training.
    w2v.build_vocab(training_data, progress_per=1000)
    w2v.train(training_data, total_examples=w2v.corpus_count, epochs=w2v.epochs)
    return w2v

In [7]:
# model.save("word2vec_model_one_gesim.model")
def save_gensim_model(model, name):
    """Save ``model`` to ``<name>.model``.

    Args:
        model: Object exposing a ``save(path)`` method.
        name: Base file name; converted with ``str()`` and suffixed with
            ``.model``.
    """
    target_path = "{}.model".format(str(name))
    model.save(target_path)

In [8]:
def find_most_similar(model, word: str):
    """Return the nearest neighbours of ``word``, falling back to "UNK".

    Only ``KeyError`` (the word is not in the vocabulary) triggers the
    fallback. The previous bare ``except:`` also swallowed unrelated failures
    and ``KeyboardInterrupt``, hiding real errors behind "UNK" results.

    Args:
        model: Trained model exposing ``wv.most_similar``.
        word: Token to look up.

    Returns:
        Whatever ``model.wv.most_similar`` returns for ``word``, or for
        ``"UNK"`` when ``word`` is unknown.
    """
    try:
        return model.wv.most_similar(word)
    except KeyError:
        # Out-of-vocabulary word: use the sentinel token instead.
        return model.wv.most_similar("UNK")

In [9]:
def get_word_embeddings(model, word: str):
    """Return the embedding of ``word``, falling back to the "UNK" vector.

    Only ``KeyError`` (the word is not in the vocabulary) triggers the
    fallback. The previous bare ``except:`` also swallowed unrelated failures
    and ``KeyboardInterrupt``.

    Args:
        model: Trained model whose ``wv`` attribute supports ``wv[key]`` lookup.
        word: Token to look up.

    Returns:
        ``model.wv[word]`` when the word is known, otherwise ``model.wv["UNK"]``.
    """
    try:
        return model.wv[word]
    except KeyError:
        # Out-of-vocabulary word: use the sentinel token's vector instead.
        return model.wv["UNK"]

In [10]:
# Load the training CSV, then gather its question1 and question2 columns
# into a single list of raw questions.
df = read_data('train.csv')
data = join_question_data(df)

In [11]:
# Tokenise the same raw questions two ways so the resulting embeddings can be
# compared: the hand-written string pipeline versus gensim's simple_preprocess.
processed_data_manual = string_preprocess(data)
processed_data_gensim = gensim_preprocess(data)

In [12]:
# 100-dimensional embeddings trained on the manually preprocessed corpus.
model_1 = gensim_word2vec(
    processed_data_manual,
    window_size=5,
    min_word_count=2,
    vector_size=100,
    workers=12,
)
save_gensim_model(model_1, "word2vec_model_manual_100")

KeyboardInterrupt: 

In [None]:
# 100-dimensional embeddings trained on the gensim-preprocessed corpus.
model_2 = gensim_word2vec(
    processed_data_gensim,
    window_size=5,
    min_word_count=2,
    vector_size=100,
    workers=12,
)
save_gensim_model(model_2, "word2vec_model_gensim_100")


In [13]:
# 200-dimensional embeddings trained on the manually preprocessed corpus.
model_3 = gensim_word2vec(
    processed_data_manual,
    window_size=5,
    min_word_count=2,
    vector_size=200,
    workers=12,
)
save_gensim_model(model_3, "word2vec_model_manual_200")

In [14]:
# 200-dimensional embeddings trained on the gensim-preprocessed corpus.
model_4 = gensim_word2vec(
    processed_data_gensim,
    window_size=5,
    min_word_count=2,
    vector_size=200,
    workers=12,
)
save_gensim_model(model_4, "word2vec_model_gensim_200")

In [110]:
# Nearest neighbours of "what" in the model trained on the manually preprocessed corpus.
print(find_most_similar(model_1, "what"))

[('which', 0.5725353956222534), ('whats', 0.5608700513839722), ('what’s', 0.43915098905563354), ('efforttoresult', 0.42689210176467896), ('some', 0.4265798032283783), ('materialsvideosresources', 0.41167664527893066), ('deliverycouriers', 0.4001527726650238), ('psychological', 0.39648371934890747), ('stoping', 0.3740164339542389), ('presentations', 0.3732001781463623)]


In [111]:
# Same query against the model trained on the gensim-preprocessed corpus, for comparison.
print(find_most_similar(model_2, "what"))

[('which', 0.6236398816108704), ('the', 0.46475470066070557), ('some', 0.37875428795814514), ('proverbs', 0.36545121669769287), ('graduations', 0.3605636656284332), ('basic', 0.3572506606578827), ('angkor', 0.3516005277633667), ('outlandish', 0.3498055040836334), ('crusades', 0.3423321843147278), ('horror', 0.3412609398365021)]


In [112]:
# Fallback check: a nonsense word should produce the same neighbours as the
# "UNK" sentinel, because find_most_similar falls back to "UNK" on failure.
print(find_most_similar(model_1, "UNK"))
print(find_most_similar(model_1, "hsudhf"))

[('é', 0.9334388375282288), ('è', 0.9314623475074768), ('reverent', 0.9307461977005005), ('ê', 0.9292716979980469), ('1114', 0.9288990497589111), ('parleg', 0.9283602237701416), ('‘life', 0.9269662499427795), ('leveltriggered', 0.9268814921379089), ('advice’', 0.9262845516204834), ('athiests', 0.9257882833480835)]
[('é', 0.9334388375282288), ('è', 0.9314623475074768), ('reverent', 0.9307461977005005), ('ê', 0.9292716979980469), ('1114', 0.9288990497589111), ('parleg', 0.9283602237701416), ('‘life', 0.9269662499427795), ('leveltriggered', 0.9268814921379089), ('advice’', 0.9262845516204834), ('athiests', 0.9257882833480835)]


In [113]:
# Peek at the first five manually tokenised questions, then the final entry
# (the ["UNK", "UNK"] sentinel appended by string_preprocess).
print(processed_data_manual[:5])
print(processed_data_manual[-1])

[['what', 'step', 'by', 'step', 'guide', 'to', 'invest', 'in', 'share', 'market', 'in', 'india'], ['what', 'story', 'of', 'kohinoor', 'kohinoor', 'diamond'], ['how', 'can', 'i', 'increase', 'speed', 'of', 'my', 'internet', 'connection', 'while', 'using', 'vpn'], ['why', 'am', 'i', 'mentally', 'very', 'lonely', 'how', 'can', 'i', 'solve', 'it'], ['which', 'one', 'dissolve', 'in', 'water', 'quikly', 'sugar', 'salt', 'methane', 'carbon', 'di', 'oxide']]
['UNK', 'UNK']


In [114]:
# First five questions as tokenised by gensim's simple_preprocess.
print(processed_data_gensim[:5])

[['what', 'is', 'the', 'step', 'by', 'step', 'guide', 'to', 'invest', 'in', 'share', 'market', 'in', 'india'], ['what', 'is', 'the', 'story', 'of', 'kohinoor', 'koh', 'noor', 'diamond'], ['how', 'can', 'increase', 'the', 'speed', 'of', 'my', 'internet', 'connection', 'while', 'using', 'vpn'], ['why', 'am', 'mentally', 'very', 'lonely', 'how', 'can', 'solve', 'it'], ['which', 'one', 'dissolve', 'in', 'water', 'quikly', 'sugar', 'salt', 'methane', 'and', 'carbon', 'di', 'oxide']]
