# A Siamese Network-Based for similarity calculateing between two text


## 1. Environment prepare

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import pandas as pd
import numpy as np
import random as rnd
import torch
import nltk
from nltk.data import find
import ssl
from collections import defaultdict
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

In [7]:
# check if punkt tokenizer models are downloaded
try:
    find('tokenizers/punkt')
    print("Punkt Tokenizer Models are already downloaded.")
except LookupError:
    print("Punkt Tokenizer Models not found. Downloading them...")
    nltk.download('punkt')

Punkt Tokenizer Models not found. Downloading them...


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## 2. Model define

In [2]:
import torch
import torch.nn as nn
class SiameseNetwork(nn.Module):
    def __init__(self, vocab_size=41699, d_model=128):
        super(SiameseNetwork, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.lstm = nn.LSTM(d_model, d_model, batch_first=True)
        # No need to explicitly define mean and normalization, can be done in forward

    def forward(self, x1, x2):
        # Assuming x1 and x2 are the input sequences for the two siamese branches
        x1 = self.embedding(x1)
        x2 = self.embedding(x2)

        # LSTM layer
        x1, _ = self.lstm(x1)
        x2, _ = self.lstm(x2)

        # Mean over sequence
        x1 = torch.mean(x1, dim=1)
        x2 = torch.mean(x2, dim=1)

        # Normalization (L2)
        x1 = F.normalize(x1, p=2, dim=1)
        x2 = F.normalize(x2, p=2, dim=1)

        return x1, x2

def triplet_loss_fn(v1, v2, margin=0.25):
    scores = torch.matmul(v1, v2.T)
    batch_size = scores.size(0)

    positive = torch.diag(scores)
    negative_without_positive = scores - 2.0 * torch.eye(batch_size, device=scores.device)
    closest_negative = negative_without_positive.max(dim=1).values

    negative_zero_on_duplicate = scores * (1.0 - torch.eye(batch_size, device=scores.device))
    mean_negative = torch.sum(negative_zero_on_duplicate, dim=1) / (batch_size - 1)

    triplet_loss1 = torch.maximum(torch.zeros_like(positive), margin - positive + closest_negative)
    triplet_loss2 = torch.maximum(torch.zeros_like(positive), margin - positive + mean_negative)

    triplet_loss = torch.mean(triplet_loss1 + triplet_loss2)
    return triplet_loss

def train_model(model, train_loader, val_loader, learning_rate=0.01, epochs=10):
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    for epoch in range(epochs):
        model.train()
        for q1_q2 in train_loader:
            q1, q2 = q1_q2[0].to(device), q1_q2[1].to(device)
            optimizer.zero_grad()
            v1, v2 = model(q1, q2)
            loss = triplet_loss_fn(v1, v2)
            loss.backward()
            optimizer.step()

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for q1_q2 in val_loader:
                q1, q2 = q1_q2[0].to(device), q1_q2[1].to(device)
                v1, v2 = model(q1, q2)
                val_loss += triplet_loss_fn(v1, v2).item()

        val_loss /= len(val_loader)
        print(f"Epoch {epoch + 1}, Val Loss: {val_loss}")

## 3. data preprocessing

In [8]:
import pandas as pd
import numpy as np
import nltk
import random as rnd

# Load the data
data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Cyber_Physical_System/questions.csv")
N_train = 300000
N_test  = 10 * 1024
data_train = data[:N_train]
data_test  = data[N_train:N_train+N_test]
del data

# Extracting duplicate and non-duplicate question pairs
td_index = (data_train['is_duplicate'] == 1).to_numpy()
td_index = [i for i, x in enumerate(td_index) if x]

Q1_train_words = np.array(data_train['question1'][td_index])
Q2_train_words = np.array(data_train['question2'][td_index])

Q1_test_words = np.array(data_test['question1'])
Q2_test_words = np.array(data_test['question2'])
y_test  = np.array(data_test['is_duplicate'])

# Building the vocabulary
from collections import defaultdict
Q1_train = np.empty_like(Q1_train_words)
Q2_train = np.empty_like(Q2_train_words)

Q1_test = np.empty_like(Q1_test_words)
Q2_test = np.empty_like(Q2_test_words)

vocab = defaultdict(lambda: 0)
vocab['<PAD>'] = 1

for idx in range(len(Q1_train_words)):
    Q1_train[idx] = nltk.word_tokenize(Q1_train_words[idx])
    Q2_train[idx] = nltk.word_tokenize(Q2_train_words[idx])
    q = Q1_train[idx] + Q2_train[idx]
    for word in q:
        if word not in vocab:
            vocab[word] = len(vocab) + 1


# Tokenizing and numerically encoding the questions
for idx in range(len(Q1_train)):
    Q1_train[idx] = [vocab[word] for word in Q1_train[idx]]
    Q2_train[idx] = [vocab[word] for word in Q2_train[idx]]

for idx in range(len(Q1_test_words)):
    Q1_test[idx] = [vocab[word] for word in nltk.word_tokenize(Q1_test_words[idx])]
    Q2_test[idx] = [vocab[word] for word in nltk.word_tokenize(Q2_test_words[idx])]

def split_data(Q1, Q2, split_ratio=0.8):
    """ Split the data into training and validation sets. """
    cut_off = int(len(Q1) * split_ratio)
    train_Q1, train_Q2 = Q1[:cut_off], Q2[:cut_off]
    val_Q1, val_Q2 = Q1[cut_off:], Q2[cut_off:]
    return train_Q1, train_Q2, val_Q1, val_Q2

def data_generator(Q1, Q2, batch_size, pad=1):
    while True:
        for i in range(0, len(Q1), batch_size):
            yield (np.array([q + [pad] * (max(map(len, Q1[i:i+batch_size])) - len(q)) for q in Q1[i:i+batch_size]]),
                   np.array([q + [pad] * (max(map(len, Q2[i:i+batch_size])) - len(q)) for q in Q2[i:i+batch_size]]))

In [44]:
max_length = max(max(len(q) for q in Q1_train), max(len(q) for q in Q2_train))

Q1_train = [q + [vocab['<PAD>']] * (max_length - len(q)) for q in Q1_train]
Q2_train = [q + [vocab['<PAD>']] * (max_length - len(q)) for q in Q2_train]


train_Q1, train_Q2, val_Q1, val_Q2 = split_data(Q1_train, Q2_train)

from torch.utils.data import DataLoader, TensorDataset, Dataset

In [10]:
# Assuming train_Q1 and train_Q2 are lists of tokenized questions
print(train_Q1[0])  # Print the first question in Q1
print(train_Q2[0])  # Print the first question in Q2

[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[4, 22, 6, 23, 7, 24, 8, 25, 26, 11, 27, 28, 7, 29, 30, 16, 31, 18, 19, 20, 21, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [46]:
# Convert to tensors
train_Q1_tensor = torch.tensor(train_Q1, dtype=torch.long)
train_Q2_tensor = torch.tensor(train_Q2, dtype=torch.long)

# Check shapes and types
print("Shape of train_Q1_tensor:", train_Q1_tensor.shape)
print("Shape of train_Q2_tensor:", train_Q2_tensor.shape)

Shape of train_Q1_tensor: torch.Size([89188, 81])
Shape of train_Q2_tensor: torch.Size([89188, 81])


In [47]:
# Convert to tensors
val_Q1_tensor = torch.tensor(val_Q1, dtype=torch.long)
val_Q2_tensor = torch.tensor(val_Q2, dtype=torch.long)

# Check shapes and types
print("Shape of val_Q1_tensor:", val_Q1_tensor.shape)
print("Shape of val_Q2_tensor:", val_Q2_tensor.shape)

Shape of val_Q1_tensor: torch.Size([22298, 81])
Shape of val_Q2_tensor: torch.Size([22298, 81])


### 3.1 DataLoader

In [24]:
# Create a simple dataset
simple_dataset = TensorDataset(train_Q1_tensor, train_Q2_tensor)

# Create a simple DataLoader
simple_loader = DataLoader(simple_dataset, batch_size=256, shuffle=True)

# Try to get one batch
try:
    sample_data = next(iter(simple_loader))
    print("DataLoader works! Sample data shapes:", [d.shape for d in sample_data])
except Exception as e:
    print("Error with DataLoader:", e)

DataLoader works! Sample data shapes: [torch.Size([256, 81]), torch.Size([256, 81])]


In [28]:
inverse_vocab = {v: k for k, v in vocab.items()}

In [34]:
def tokens_to_sentence(tokenized_sentence, inverse_vocab):
    return ' '.join([inverse_vocab[token] for token in tokenized_sentence if token in inverse_vocab])

# Example usage
tokenized_example = np.array(sample_data[0][0])
original_sentence = tokens_to_sentence(tokenized_example, inverse_vocab)
print(original_sentence)

Would social media have helped Steve Bartman ? <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [33]:
tokenized_example = np.array(sample_data[1][0])
original_sentence = tokens_to_sentence(tokenized_example, inverse_vocab)
print(original_sentence)

If there had been social media in 2003 , how would the Steve Bartman story been different ? <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


### 3.2 create DataLoader formally

In [48]:
# Assuming train_Q1 and train_Q2 are lists of tokenized questions
train_dataset = TensorDataset(train_Q1_tensor, train_Q2_tensor)
val_dataset = TensorDataset(val_Q1_tensor, val_Q2_tensor)

train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False)


# Check the first batch in the train_dataset
train_features = next(iter(train_loader))

print("Shape of train_features Q1:", train_features[0].shape)
print("Data type of train_features Q1:", train_features[0].dtype)
print("Shape of train_features Q2:", train_features[1].shape)
print("Data type of train_features Q2:", train_features[1].dtype)

Shape of train_features Q1: torch.Size([256, 81])
Data type of train_features Q1: torch.int64
Shape of train_features Q2: torch.Size([256, 81])
Data type of train_features Q2: torch.int64


In [43]:
print(len(train_loader))

349


## 4. Instantiate the model and Trainning

In [49]:
# Instantiate the model
model = SiameseNetwork(vocab_size=len(vocab), d_model=128)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(device)

# Train the model
train_model(model, train_loader, val_loader, learning_rate=0.01, epochs=5)

Epoch 1, Val Loss: 0.06327838808382777
Epoch 2, Val Loss: 0.055659063493790614
Epoch 3, Val Loss: 0.05284677206707949
Epoch 4, Val Loss: 0.05149382970888506
Epoch 5, Val Loss: 0.05156079766509885


In [52]:
# Save the entire model
torch.save(model, '/content/model/model_entire.pth')

# To load the model later
loaded_model = torch.load('/content/model/model_entire.pth')
loaded_model.eval()  # Set the model to evaluation mode

SiameseNetwork(
  (embedding): Embedding(41708, 128)
  (lstm): LSTM(128, 128, batch_first=True)
)

## 5. Model Testing

In [58]:
print(Q1_test)

[list([32, 38, 4, 107, 65, 1015, 65, 11509, 21])
 list([30, 156, 78, 216, 8908, 39, 716, 286, 8317, 21])
 list([32, 38, 4, 521, 1340, 735, 0, 65, 47, 1476, 1341, 735, 21]) ...
 list([30, 87, 116, 2932, 142, 131, 1747, 2324, 97, 482, 483, 1145, 33, 4075, 21])
 list([30, 156, 78, 1342, 352, 131, 18010, 21])
 list([32, 16, 111, 521, 6, 1215, 131, 6170, 21])]


In [60]:
import torch
import numpy as np

def classify(test_Q1, test_Q2, y, threshold, model, vocab, batch_size=64):
    """Function to test the accuracy of the model in PyTorch.

    Args:
        test_Q1 (numpy.ndarray): Array of Q1 questions.
        test_Q2 (numpy.ndarray): Array of Q2 questions.
        y (numpy.ndarray): Array of actual target.
        threshold (float): Desired threshold.
        model (torch.nn.Module): The Siamese model.
        vocab (collections.defaultdict): The vocabulary used.
        batch_size (int, optional): Size of the batches. Defaults to 64.

    Returns:
        float: Accuracy of the model.
    """
    model.eval()  # Set the model to evaluation mode
    accuracy = 0
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    with torch.no_grad():
        for i in range(0, len(test_Q1), batch_size):
            q1 = torch.tensor(test_Q1[i:i + batch_size], dtype=torch.long).to(device)
            q2 = torch.tensor(test_Q2[i:i + batch_size], dtype=torch.long).to(device)
            y_test = y[i:i + batch_size]

            # Get model predictions
            v1, v2 = model(q1, q2)

            # Calculate cosine similarity
            cos_sim = torch.nn.functional.cosine_similarity(v1, v2)

            # Make predictions based on the threshold
            predictions = cos_sim > threshold

            # Update accuracy
            accuracy += torch.sum(predictions == torch.tensor(y_test, dtype=torch.bool)).item()

    # Compute overall accuracy
    accuracy = accuracy / len(test_Q1)

    return accuracy

def pad_sequences(sequences, max_len, pad_value=0):
    return np.array([seq + [pad_value] * (max_len - len(seq)) for seq in sequences])

# Determine the maximum length
max_length = max(max(len(seq) for seq in Q1_test), max(len(seq) for seq in Q2_test))

# Pad the sequences
test_Q1_padded = pad_sequences(Q1_test, max_length, vocab['<PAD>'])
test_Q2_padded = pad_sequences(Q2_test, max_length, vocab['<PAD>'])

# Example usage:
accuracy = classify(test_Q1_padded, test_Q2_padded, y_test, 0.7, model, vocab, batch_size=64)
print("accuracy ", accuracy)

accuracy  0.70751953125


## 6. Custom testing based on casual text

In [68]:
import torch
import nltk

def predict(question1, question2, threshold, model, vocab, verbose=False):
    """Function for predicting if two questions are duplicates in PyTorch.

    Args:
        question1 (str): First question.
        question2 (str): Second question.
        threshold (float): Desired threshold.
        model (torch.nn.Module): The Siamese model.
        vocab (collections.defaultdict): The vocabulary used.
        verbose (bool, optional): If the results should be printed out. Defaults to False.

    Returns:
        bool: True if the questions are duplicates, False otherwise.
    """
    model.eval()  # Set the model to evaluation mode
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Tokenize and numericalize questions
    q1 = [vocab.get(word, vocab['<UNK>']) for word in nltk.word_tokenize(question1)]
    q2 = [vocab.get(word, vocab['<UNK>']) for word in nltk.word_tokenize(question2)]

    # Pad questions
    max_len = max(len(q1), len(q2))
    q1 += [vocab['<PAD>']] * (max_len - len(q1))
    q2 += [vocab['<PAD>']] * (max_len - len(q2))

    # Convert to PyTorch tensors
    q1_tensor = torch.tensor([q1], dtype=torch.long).to(device)
    q2_tensor = torch.tensor([q2], dtype=torch.long).to(device)

    with torch.no_grad():
        v1, v2 = model(q1_tensor, q2_tensor)
        cos_sim = torch.nn.functional.cosine_similarity(v1, v2)
        res = cos_sim.item() > threshold

    if verbose:
        print("Q1 = ", question1)
        print("Q2 = ", question2)
        print("Cosine Similarity = ", cos_sim.item())
        print("Result = ", res)

    return res

# Example usage
question1 = "When will I see you?"
question2 = "When can I see you again?"
predict(question1, question2, 0.7, loaded_model, vocab, verbose=True)

Q1 =  When will I see you?
Q2 =  When can I see you again?
Cosine Similarity =  0.724124550819397
Result =  True


True

In [76]:
question1 = "When will I see you?"
question2 = "When will I see you?"
predict(question1, question2, 0.7, loaded_model, vocab, verbose=True)

Q1 =  When will I see you?
Q2 =  When will I see you!
Cosine Similarity =  0.8320255279541016
Result =  True


True

In [69]:
speechOne ='''Halifax is a historic port city and the capital of the Canadian province of Nova Scotia. Nestled on the country's east coast, Halifax is known for its rich maritime heritage and vibrant cultural scene. The city is home to the iconic Halifax Citadel, a star-shaped fortress that offers panoramic views of the harbor. Boasting a diverse population, Halifax is celebrated for its friendly locals and welcoming atmosphere. Visitors can explore its charming waterfront, enjoy fresh seafood, and immerse themselves in the city's lively arts and music scene'''
print(speechOne)
speechTwo ='''Halifax is a historic port city located in the Canadian province of Nova Scotia. Nestled on the eastern coast of the country, Halifax is renowned for its rich maritime heritage, dating back to its founding in 1749. The city is home to the iconic Citadel Hill, a National Historic Site that offers panoramic views of the harbor and serves as a reminder of Halifax's military past. Boasting a vibrant cultural scene, Halifax hosts numerous festivals, museums, and galleries that celebrate its diverse history and artistic achievements. With its friendly locals, picturesque waterfront, and a thriving culinary scene, Halifax offers a welcoming atmosphere that attracts both residents and visitors alike'''
print(speechTwo)
speechThree = '''
Toronto, the largest city in Canada, is a dynamic and cosmopolitan metropolis situated in the province of Ontario. Known for its striking skyline dominated by the iconic CN Tower, Toronto is a global financial and cultural hub. The city is celebrated for its cultural diversity, reflected in vibrant neighborhoods like Kensington Market and Chinatown, where various ethnic communities thrive. Toronto is also home to world-class attractions such as the Royal Ontario Museum and the Art Gallery of Ontario, contributing to its reputation as a cultural powerhouse. With a bustling downtown core, diverse culinary offerings, and a robust public transportation system, Toronto exemplifies a modern, inclusive urban experience. '''
print(speechThree)

Halifax is a historic port city and the capital of the Canadian province of Nova Scotia. Nestled on the country's east coast, Halifax is known for its rich maritime heritage and vibrant cultural scene. The city is home to the iconic Halifax Citadel, a star-shaped fortress that offers panoramic views of the harbor. Boasting a diverse population, Halifax is celebrated for its friendly locals and welcoming atmosphere. Visitors can explore its charming waterfront, enjoy fresh seafood, and immerse themselves in the city's lively arts and music scene
Halifax is a historic port city located in the Canadian province of Nova Scotia. Nestled on the eastern coast of the country, Halifax is renowned for its rich maritime heritage, dating back to its founding in 1749. The city is home to the iconic Citadel Hill, a National Historic Site that offers panoramic views of the harbor and serves as a reminder of Halifax's military past. Boasting a vibrant cultural scene, Halifax hosts numerous festivals, 

In [70]:
import nltk.data
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
print('\n-----\n'.join(sent_detector.tokenize(speechOne.strip())))

Halifax is a historic port city and the capital of the Canadian province of Nova Scotia.
-----
Nestled on the country's east coast, Halifax is known for its rich maritime heritage and vibrant cultural scene.
-----
The city is home to the iconic Halifax Citadel, a star-shaped fortress that offers panoramic views of the harbor.
-----
Boasting a diverse population, Halifax is celebrated for its friendly locals and welcoming atmosphere.
-----
Visitors can explore its charming waterfront, enjoy fresh seafood, and immerse themselves in the city's lively arts and music scene


In [71]:
print('\n-----\n'.join(sent_detector.tokenize(speechTwo.strip())))

Halifax is a historic port city located in the Canadian province of Nova Scotia.
-----
Nestled on the eastern coast of the country, Halifax is renowned for its rich maritime heritage, dating back to its founding in 1749.
-----
The city is home to the iconic Citadel Hill, a National Historic Site that offers panoramic views of the harbor and serves as a reminder of Halifax's military past.
-----
Boasting a vibrant cultural scene, Halifax hosts numerous festivals, museums, and galleries that celebrate its diverse history and artistic achievements.
-----
With its friendly locals, picturesque waterfront, and a thriving culinary scene, Halifax offers a welcoming atmosphere that attracts both residents and visitors alike


In [72]:
speechOneSplit = sent_detector.tokenize(speechOne.strip())
speechTwoSplit = sent_detector.tokenize(speechTwo.strip())

for i in range(0, len(speechOneSplit)):
  question1 = speechOneSplit[i]
  question2 = speechTwoSplit[i]
  # 1/True means it is duplicated, 0/False otherwise
  predict(question1 , question2, 0.7, model, vocab, verbose=True)

Q1 =  Halifax is a historic port city and the capital of the Canadian province of Nova Scotia.
Q2 =  Halifax is a historic port city located in the Canadian province of Nova Scotia.
Cosine Similarity =  0.882203221321106
Result =  True
Q1 =  Nestled on the country's east coast, Halifax is known for its rich maritime heritage and vibrant cultural scene.
Q2 =  Nestled on the eastern coast of the country, Halifax is renowned for its rich maritime heritage, dating back to its founding in 1749.
Cosine Similarity =  0.7190892696380615
Result =  True
Q1 =  The city is home to the iconic Halifax Citadel, a star-shaped fortress that offers panoramic views of the harbor.
Q2 =  The city is home to the iconic Citadel Hill, a National Historic Site that offers panoramic views of the harbor and serves as a reminder of Halifax's military past.
Cosine Similarity =  0.8000532984733582
Result =  True
Q1 =  Boasting a diverse population, Halifax is celebrated for its friendly locals and welcoming atmosph

In [73]:
speechOneSplit = sent_detector.tokenize(speechOne.strip())
speechThreeSplit = sent_detector.tokenize(speechThree.strip())

for i in range(0, len(speechOneSplit)):
  question1 = speechOneSplit[i]
  question2 = speechThreeSplit[i]
  # 1/True means it is duplicated, 0/False otherwise
  predict(question1 , question2, 0.7, model, vocab, verbose=True)

Q1 =  Halifax is a historic port city and the capital of the Canadian province of Nova Scotia.
Q2 =  Toronto, the largest city in Canada, is a dynamic and cosmopolitan metropolis situated in the province of Ontario.
Cosine Similarity =  0.42908966541290283
Result =  False
Q1 =  Nestled on the country's east coast, Halifax is known for its rich maritime heritage and vibrant cultural scene.
Q2 =  Known for its striking skyline dominated by the iconic CN Tower, Toronto is a global financial and cultural hub.
Cosine Similarity =  0.4907105267047882
Result =  False
Q1 =  The city is home to the iconic Halifax Citadel, a star-shaped fortress that offers panoramic views of the harbor.
Q2 =  The city is celebrated for its cultural diversity, reflected in vibrant neighborhoods like Kensington Market and Chinatown, where various ethnic communities thrive.
Cosine Similarity =  0.5867418646812439
Result =  False
Q1 =  Boasting a diverse population, Halifax is celebrated for its friendly locals and

In [74]:
question3 = "The Great Wall of China, a marvel of engineering stretching over 13,000 miles. is a series of fortifications made of stone, brick, tamped earth, wood, and other materials."
question4 = "a The great or China marvel of engineering stretching over 13,000 miles is"
predict(question3, question4, 0.7, loaded_model, vocab, verbose=True)

Q1 =  The Great Wall of China, a marvel of engineering stretching over 13,000 miles. is a series of fortifications made of stone, brick, tamped earth, wood, and other materials.
Q2 =  a The great or China marvel of engineering stretching over 13,000 miles is
Cosine Similarity =  0.7929869890213013
Result =  True


True