In [1]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np
from scipy import spatial
import string
import csv
import pickle

In [2]:
def read_csv(name: str, path:str=""):
    """Load a CSV file into a pandas DataFrame.

    Args:
        name: File name of the CSV (e.g. ``'train.csv'``).
        path: Optional directory that contains the file. A trailing
            separator is not required; the default empty string reads
            ``name`` exactly as given.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    import os  # local import: the notebook's import cell does not bring in os

    # os.path.join adds a separator only when one is missing, so "data" and
    # "data/" both resolve to "data/<name>" rather than "data<name>".
    return pd.read_csv(os.path.join(path, name))

In [3]:
def read_question_data(data, is_training):
    """Extract the question columns (and optionally labels) as Python lists.

    Args:
        data: DataFrame with ``question1`` and ``question2`` columns, plus
            ``is_duplicate`` when ``is_training`` is true.
        is_training: When truthy, the ``is_duplicate`` column is appended
            to the result as a third list.

    Returns:
        ``[question1, question2]`` or ``[question1, question2, labels]``,
        each element being a plain list of the column's values.
    """
    columns = ["question1", "question2"]
    if is_training:
        columns.append("is_duplicate")
    return [data[column].values.tolist() for column in columns]

In [4]:
def string_preprocess(question_data):
    """Normalise questions: lowercase, strip punctuation, drop stop words.

    Each entry is converted with ``str()`` (so non-string values such as
    ``None`` or numbers are processed as their string form), split on
    whitespace, lowercased and stripped of ASCII punctuation and curly
    double quotes. Tokens that end up empty or that match a stop word are
    discarded; the rest are re-joined with single spaces.

    Args:
        question_data: Iterable of questions.

    Returns:
        A list of cleaned question strings, one per input entry.
    """
    stop_words = {'the', 'a', 'an', 'and', 'is', 'be', 'will'}
    # Build the deletion table once instead of once per word.
    strip_table = str.maketrans('', '', string.punctuation + '“”')
    new_data = []

    for question in question_data:
        # Normalise BEFORE the stop-word test so that "The" and "is?" are
        # recognised as stop words; the filter was previously applied to the
        # raw, case-sensitive token and missed them.
        normalized_words = (
            word.lower().translate(strip_table) for word in str(question).split()
        )
        kept_words = [
            word for word in normalized_words if word and word not in stop_words
        ]
        new_data.append(' '.join(kept_words))

    return new_data

In [5]:
def get_data_embedding(model, data: str):
    """Encode ``data`` with ``model`` and return the result.

    Args:
        model: Any object exposing an ``encode`` method.
        data: Input handed straight to ``model.encode``. The annotation says
            ``str``, but the calls in this notebook pass lists of cleaned
            question strings.

    Returns:
        Whatever ``model.encode(data)`` returns.
    """
    embeddings = model.encode(data)
    return embeddings

In [6]:
def similarity(question1_vectors, question2_vectors):
    """Score every question pair with ``util.dot_score``.

    Args:
        question1_vectors: Indexable collection of vectors for question 1.
        question2_vectors: Indexable collection of vectors for question 2;
            indexed with the same positions as ``question1_vectors``.

    Returns:
        A list holding one ``util.dot_score`` result per index of
        ``question1_vectors``.
    """
    return [
        util.dot_score(question1_vectors[idx], question2_vectors[idx])
        for idx in range(len(question1_vectors))
    ]

In [7]:
# Load the 'all-MiniLM-L6-v2' sentence-transformer; every embedding in this
# notebook is produced by this model's encode() via get_data_embedding.
model = SentenceTransformer('all-MiniLM-L6-v2') 

In [24]:
# Load the training set; no path is given, so the file is read relative to
# the current working directory.
df = read_csv('train.csv')

In [25]:
# is_training=True, so the is_duplicate labels come back as a third list
# alongside the two question columns.
[question1_data, question2_data, labels] = read_question_data(df, True)

In [26]:
# Number of rows read from the training set.
len(question1_data)

404290

In [27]:
# Normalise both training question columns with string_preprocess before
# they are embedded.
clean_question1_data = string_preprocess(question1_data)
clean_question2_data = string_preprocess(question2_data)

In [32]:
# Encode each cleaned question list with the sentence-transformer model.
# Both calls cover the full training set (404,290 rows per the len() output
# above).
question1_data_vector = get_data_embedding(model, clean_question1_data)
question2_data_vector = get_data_embedding(model, clean_question2_data)

In [None]:
# Sanity check: number of embeddings produced for the question1 column.
print(len(question1_data_vector))

In [None]:
# One util.dot_score result per (question1, question2) embedding pair.
similarity_score = similarity(question1_data_vector, question2_data_vector)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on a single feature: the similarity score,
# reshaped to one column (-1, 1), against the is_duplicate labels.
# NOTE(review): similarity() returns the raw util.dot_score results; confirm
# np.array() converts them to plain numbers as intended before reshaping.
clf = LogisticRegression(random_state=0).fit(np.array(similarity_score).reshape(-1,1), np.array(labels))

In [None]:
# Persist the trained logistic-regression classifier (`clf`). The previous
# code pickled `model`, which is the SentenceTransformer, into the file named
# for the LR model.
# NOTE(review): this writes under /content/, while the load cell reads the
# relative path 'LR_built_in_model.sav' — confirm the file is moved/copied.
filename = '/content/LR_built_in_model.sav'
# The context manager guarantees the file is flushed and closed.
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)

In [8]:
# to load the model
# WARNING: pickle.load can execute arbitrary code while unpickling; only load
# files that were produced by this notebook or come from a trusted source.
filename = 'LR_built_in_model.sav'
# The context manager closes the file handle once the model is loaded.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [34]:
# loaded_model.predict()

ValueError: Expected 2D array, got scalar array instead:
array=what is my name.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [10]:
# Load the test set from the working directory and preview its first rows.
df_test = read_csv('test.csv')
df_test.head(5)

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


In [13]:
# is_training=False: only the two question columns are returned (no labels).
[question1_test_data, question2_test_data] = read_question_data(df_test, False)

In [14]:
# Apply the same string_preprocess normalisation to the test questions that
# was applied to the training questions.
clean_question1_test_data = string_preprocess(question1_test_data)
clean_question2_test_data = string_preprocess(question2_test_data)

In [17]:
# Encode only the first 100,000 cleaned question1 entries of the test set,
# not the whole column; question2 is not encoded in this cell.
question1_test_data_vector = get_data_embedding(model, clean_question1_test_data[0:100000])

In [18]:
# Total number of rows in the test set (the full column, not the encoded
# slice).
len(question1_test_data)

2345796

In [23]:
# Check what type a slice of the embedding result has.
type(question1_test_data_vector[0:100])

numpy.ndarray

In [28]:
# Scratch list used by the following cells to try out slice bounds.
mylist = [0,1,2,3,4]

In [31]:
# Only the last expression of a cell is displayed, so the slice on the first
# line is evaluated and discarded; the cell shows len(mylist).
mylist[0:2]
len(mylist)

5

In [30]:
# The slice end index is exclusive: [2:5] yields the elements at 2, 3 and 4.
mylist[2:5]

[2, 3, 4]