In [35]:
!pip install sentence_transformers
!pip install pandas==1.3.4



In [1]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np
from scipy import spatial
import string
import csv
import pickle

In [10]:
def read_csv(name: str, path: str = ""):
    """Load a UTF-8 encoded CSV file into a pandas DataFrame.

    Parameters
    ----------
    name : str
        File name, or the complete location when ``path`` is left empty.
    path : str, optional
        Prefix glued in front of ``name`` by plain string concatenation.
        No separator is inserted, so include a trailing slash when needed.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.
    """
    csv_location = path + name
    frame = pd.read_csv(csv_location, encoding='utf-8')
    return frame

In [5]:
def read_question_data(data, is_training):
    """Pull the question columns (and optionally the labels) out of a frame.

    Parameters
    ----------
    data
        Table indexable by column name that holds ``"question1"`` and
        ``"question2"`` (plus ``"is_duplicate"`` when ``is_training`` is true).
    is_training : bool
        When truthy, the ``"is_duplicate"`` column is appended to the result.

    Returns
    -------
    list
        ``[question1, question2]`` or ``[question1, question2, labels]``,
        each entry being a plain Python list.
    """
    extracted = [data[column].values.tolist() for column in ("question1", "question2")]
    if not is_training:
        return extracted
    extracted.append(data["is_duplicate"].values.tolist())
    return extracted

In [6]:
def string_preprocess(question_data):
    """Normalise raw question texts for embedding.

    Each entry is converted to ``str``, split on whitespace, lower-cased and
    stripped of ASCII punctuation and curly double quotes. Empty tokens and
    stop words are then dropped and the rest is re-joined with single spaces.

    The stop-word check runs on the normalised token, so ``"The"``, ``"IS"``
    and ``"the,"`` are removed just like ``"the"`` / ``"is"``.

    Parameters
    ----------
    question_data : iterable
        Question texts. Non-string entries are coerced with ``str()``; note
        that a float NaN therefore becomes the literal text ``"nan"``.

    Returns
    -------
    list of str
        One cleaned sentence per input entry, in the same order. An entry that
        contains only punctuation or stop words yields ``""``.
    """
    stop_words = {'the', 'a', 'an', 'and', 'is', 'be', 'will'}
    # Built once instead of per word: deletes ASCII punctuation and curly quotes.
    strip_table = str.maketrans('', '', string.punctuation + '“”')

    new_data = []
    for question in question_data:
        normalised = (
            word.lower().translate(strip_table) for word in str(question).split()
        )
        # Filter after normalisation so case and attached punctuation cannot
        # smuggle a stop word through.
        kept = [word for word in normalised if word and word not in stop_words]
        new_data.append(' '.join(kept))

    return new_data

In [7]:
def get_data_embedding(model, data: "str | list[str]"):
    """Encode text with ``model`` and return the result of ``model.encode``.

    Parameters
    ----------
    model
        Any object exposing an ``encode(data)`` method.
    data : str or list of str
        Text to encode. The callers in this notebook pass lists of cleaned
        question strings, which is why the annotation is not plain ``str``.

    Returns
    -------
    Whatever ``model.encode(data)`` returns; it is passed through unchanged.
    """
    return model.encode(data)

In [8]:
def similarity(question1_vectors, question2_vectors):
    """Score each pair of vectors found at the same index in both inputs.

    Parameters
    ----------
    question1_vectors, question2_vectors
        Indexable collections of vectors. Iteration is driven by
        ``len(question1_vectors)``, so ``question2_vectors`` must be at least
        that long.

    Returns
    -------
    list
        One ``util.dot_score`` result per index of ``question1_vectors``.

    Notes
    -----
    NOTE(review): a dot product equals cosine similarity only for unit-length
    vectors; confirm the embedding model normalises its output.
    """
    return [
        util.dot_score(question1_vectors[idx], question2_vectors[idx])
        for idx in range(len(question1_vectors))
    ]

In [20]:
# Instantiate the sentence-embedding model identified by the name 'all-MiniLM-L6-v2'.
model = SentenceTransformer('all-MiniLM-L6-v2') 

In [None]:
# Read the training set; the full location is passed as `name`, so `path` keeps its default "".
df = read_csv('/content/train.csv')

In [None]:
# is_training=True makes read_question_data return the labels as a third list.
[question1_data, question2_data, labels] = read_question_data(df, True)

In [None]:
# Apply the same text normalisation to both question lists before embedding.
clean_question1_data = string_preprocess(question1_data)
clean_question2_data = string_preprocess(question2_data)

In [None]:
# Encode each cleaned question list with the shared `model`.
question1_data_vector = get_data_embedding(model, clean_question1_data)
question2_data_vector = get_data_embedding(model, clean_question2_data)

In [None]:
# Sanity check: how many question-1 embeddings were produced.
print(len(question1_data_vector))

404290


In [None]:
# One score per (question1, question2) pair that shares an index.
similarity_score = similarity(question1_data_vector, question2_data_vector)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit on a single feature: the similarity scores reshaped into one column,
# with `labels` as the target.
clf = LogisticRegression(random_state=0).fit(np.array(similarity_score).reshape(-1,1), np.array(labels))

In [None]:
# Disabled on purpose: uncomment the two lines below to pickle the fitted
# classifier `clf` to disk.
# filename = '/content/LR_built_in_model.sav'
# pickle.dump(clf, open(filename, 'wb'))

In [21]:
# Load a previously pickled classifier; this rebinds `clf`, replacing any
# classifier fitted earlier in the session.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
filename = '/content/LR_built_in_model.sav'
with open(filename, 'rb') as model_file:  # closes the handle once loading is done
    clf = pickle.load(model_file)

In [12]:
# Read the test set and preview its first five rows.
df_test = read_csv('/content/test.csv')
df_test.head(5)

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


In [13]:
# is_training=False: only the two question lists are returned, no labels.
[question1_test_data, question2_test_data] = read_question_data(df_test, False)

In [14]:
# Same preprocessing as the training questions.
clean_question1_test_data = string_preprocess(question1_test_data)
clean_question2_test_data = string_preprocess(question2_test_data)

In [22]:
# Encode the cleaned question-1 test texts.
question1_test_data_vector = get_data_embedding(model, clean_question1_test_data)

In [None]:
# Encode the cleaned question-2 test texts.
question2_test_data_vector = get_data_embedding(model, clean_question2_test_data)

In [None]:
# One score per test pair, computed exactly as for the training pairs.
test_similarity_score = similarity(question1_test_data_vector, question2_test_data_vector)

In [None]:
# Predict from the single-column feature matrix of test similarity scores.
model_pred_logistic = clf.predict(np.array(test_similarity_score).reshape(-1,1))

In [None]:
# Sequential ids 0..n-1, one per prediction.
test_id = list(range(len(model_pred_logistic)))

In [None]:
# Build the submission table: one row per prediction, keyed by `test_id`.
# The original referenced `sample_id`, which is never defined in this notebook
# and raised NameError; the ids prepared for this table live in `test_id`.
submission = pd.DataFrame({'test_id': test_id, 'is_duplicate': list(model_pred_logistic)})
submission.to_csv('/content/huggingface_logistic_pred.csv', index=False)