## Task 1

In [1]:
import numpy as np

def qwerty_distance(char1, char2):
    qwerty_keyboard = {
        'q': [(0, 0), (0, 1)],
        'w': [(0, 1), (0, 2)],
        'e': [(0, 2), (0, 3)],
        'r': [(0, 3), (0, 4)],
        't': [(0, 4), (0, 5)],
        'y': [(0, 5), (0, 6)],
        'u': [(0, 6), (0, 7)],
        'i': [(0, 7), (0, 8)],
        'o': [(0, 8), (0, 9)],
        'p': [(0, 9), (0, 10)],
        'a': [(1, 0), (1, 1)],
        's': [(1, 1), (1, 2)],
        'd': [(1, 2), (1, 3)],
        'f': [(1, 3), (1, 4)],
        'g': [(1, 4), (1, 5)],
        'h': [(1, 5), (1, 6)],
        'j': [(1, 6), (1, 7)],
        'k': [(1, 7), (1, 8)],
        'l': [(1, 8), (1, 9)],
        'z': [(2, 0), (2, 1)],
        'x': [(2, 1), (2, 2)],
        'c': [(2, 2), (2, 3)],
        'v': [(2, 3), (2, 4)],
        'b': [(2, 4), (2, 5)],
        'n': [(2, 5), (2, 6)],
        'm': [(2, 6), (2, 7)]
    }
    
    if char1 == char2:
        return 0
    elif char1 not in qwerty_keyboard or char2 not in qwerty_keyboard:
        return 1
    else:
        positions1 = qwerty_keyboard[char1]
        positions2 = qwerty_keyboard[char2]
        min_distance = float('inf')
        for pos1 in positions1:
            for pos2 in positions2:
                distance = abs(pos1[0] - pos2[0]) + abs(pos1[1] - pos2[1])
                min_distance = min(min_distance, distance)
        return min_distance

def wagner_fischer_distance(s1, s2):
    n = len(s1)
    m = len(s2)
    distance_matrix = np.zeros((n+1, m+1), dtype=int)
    
    for i in range(1, n+1):
        distance_matrix[i][0] = i
    for j in range(1, m+1):
        distance_matrix[0][j] = j
    
    for j in range(1, m+1):
        for i in range(1, n+1):
            if s1[i-1] == s2[j-1]:
                cost = 0
            else:
                cost = qwerty_distance(s1[i-1], s2[j-1])
            distance_matrix[i][j] = min(distance_matrix[i-1][j] + 1, # deletion
                                         distance_matrix[i][j-1] + 1, # insertion
                                         distance_matrix[i-1][j-1] + cost) # substitution
    
    return distance_matrix[n][m]

# Example usage:
s1 = "kitten"
s2 = "sitting"
distance = wagner_fischer_distance(s1, s2)
print("Wagner-Fischer distance between '{}' and '{}' is: {}".format(s1, s2, distance))


Wagner-Fischer distance between 'kitten' and 'sitting' is: 5


In [2]:
def qwerty_distance(char1, char2):
    qwerty_keyboard = {
        'q': [(0, 0), (0, 1)],
        'w': [(0, 1), (0, 2)],
        'e': [(0, 2), (0, 3)],
        'r': [(0, 3), (0, 4)],
        't': [(0, 4), (0, 5)],
        'y': [(0, 5), (0, 6)],
        'u': [(0, 6), (0, 7)],
        'i': [(0, 7), (0, 8)],
        'o': [(0, 8), (0, 9)],
        'p': [(0, 9), (0, 10)],
        'a': [(1, 0), (1, 1)],
        's': [(1, 1), (1, 2)],
        'd': [(1, 2), (1, 3)],
        'f': [(1, 3), (1, 4)],
        'g': [(1, 4), (1, 5)],
        'h': [(1, 5), (1, 6)],
        'j': [(1, 6), (1, 7)],
        'k': [(1, 7), (1, 8)],
        'l': [(1, 8), (1, 9)],
        'z': [(2, 0), (2, 1)],
        'x': [(2, 1), (2, 2)],
        'c': [(2, 2), (2, 3)],
        'v': [(2, 3), (2, 4)],
        'b': [(2, 4), (2, 5)],
        'n': [(2, 5), (2, 6)],
        'm': [(2, 6), (2, 7)]
    }
    
    if char1 == char2:
        return 0
    elif char1 not in qwerty_keyboard or char2 not in qwerty_keyboard:
        return 1
    else:
        positions1 = qwerty_keyboard[char1]
        positions2 = qwerty_keyboard[char2]
        min_distance = float('inf')
        for pos1 in positions1:
            for pos2 in positions2:
                distance = abs(pos1[0] - pos2[0]) + abs(pos1[1] - pos2[1])
                min_distance = min(min_distance, distance)
        return min_distance

def damerau_levenshtein_distance(s1, s2):
    n = len(s1)
    m = len(s2)
    distance_matrix = np.zeros((n+1, m+1), dtype=int)
    
    for i in range(1, n+1):
        distance_matrix[i][0] = i
    for j in range(1, m+1):
        distance_matrix[0][j] = j
    
    for j in range(1, m+1):
        for i in range(1, n+1):
            if s1[i-1] == s2[j-1]:
                cost = 0
            else:
                cost = qwerty_distance(s1[i-1], s2[j-1])
            distance_matrix[i][j] = min(distance_matrix[i-1][j] + 1,
                                         distance_matrix[i][j-1] + 1,
                                         distance_matrix[i-1][j-1] + cost)
            
            if i > 1 and j > 1 and s1[i-1] == s2[j-2] and s1[i-2] == s2[j-1]:
                distance_matrix[i][j] = min(distance_matrix[i][j], distance_matrix[i-2][j-2] + cost) # transposition
    
    return distance_matrix[n][m]

# Example usage:
s1 = "kitten"
s2 = "sitting"
distance = damerau_levenshtein_distance(s1, s2)
print("Damerau-Levenshtein distance between '{}' and '{}' is: {}".format(s1, s2, distance))


Damerau-Levenshtein distance between 'kitten' and 'sitting' is: 5


## Task 2

In [4]:
import nltk
from nltk.corpus import gutenberg
from collections import Counter

text = gutenberg.raw('bible-kjv.txt')

# Tokenize the text into sentences
sentences = nltk.sent_tokenize(text)

# Tokenize each sentence into words
tokenized_sentences = [nltk.word_tokenize(sentence.lower()) for sentence in sentences]

# Create a vocabulary
vocabulary = set(word for sentence in tokenized_sentences for word in sentence)
word_to_index = {word: i+1 for i, word in enumerate(vocabulary)} # Assign indices to words starting from 1

# Create Bag-of-Words representation for each sentence
bow_tagged_sentences = []
for sentence in tokenized_sentences:
    bow_vector = Counter(sentence)
    bow_tagged_sentence = [(word_to_index[word], count) for word, count in bow_vector.items()]
    bow_tagged_sentences.append(bow_tagged_sentence)

print("Vocabulary:")
print(word_to_index)

print("\nExample of Bag-of-Words representation for the first sentence:")
print(bow_tagged_sentences[0])


Vocabulary:

Example of Bag-of-Words representation for the first sentence:
[(8371, 1), (10124, 7), (2831, 2), (15055, 2), (16759, 2), (8618, 1), (14022, 1), (5050, 1), (14257, 2), (10717, 1), (10246, 1), (16595, 1), (5395, 1), (5599, 1), (1136, 1), (7771, 1), (324, 1), (10144, 1), (9125, 1), (8327, 1), (3356, 1), (4608, 1), (15160, 1), (6240, 1)]


In [5]:
from collections import defaultdict

# Sample document
d = "This is a sample document. We will count the occurrences of each word in this document."

tokenized_document = nltk.word_tokenize(d.lower())

# Count the occurrences of each word in the document
word_counts = defaultdict(int)
for word in tokenized_document:
    if word in word_to_index:
        word_index = word_to_index[word] # Retrieve the index of the word from the vocabulary
        word_counts[word_index] += 1

print("Word counts in document:")
for word_index, count in word_counts.items():
    print(f"({word_index}, d): {count}")


Word counts in document:
(3535, d): 2
(2487, d): 1
(849, d): 1
(6240, d): 2
(10993, d): 1
(1748, d): 1
(10439, d): 1
(10124, d): 1
(14257, d): 1
(10497, d): 1
(9221, d): 1
(324, d): 1


In [6]:
# Map the document to an N-dimensional vector
N = len(word_to_index)
document_vector = [0] * N

# Update the vector with the counts of each word in the document
for word_index, count in word_counts.items():
    document_vector[word_index - 1] = count  # Adjust index since word indices start from 1

print("Document vector:")
print(document_vector)


Document vector:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0

In [8]:
import numpy as np

d1 = "This is a sample document."
d2 = "Another document with some different words."

# Tokenize the documents into words
tokenized_d1 = nltk.word_tokenize(d1.lower())
tokenized_d2 = nltk.word_tokenize(d2.lower())

# Count the occurrences of each word in the documents
word_counts_d1 = defaultdict(int)
word_counts_d2 = defaultdict(int)

for word in tokenized_d1:
    if word in word_to_index:
        word_index = word_to_index[word]
        word_counts_d1[word_index] += 1

for word in tokenized_d2:
    if word in word_to_index:
        word_index = word_to_index[word]
        word_counts_d2[word_index] += 1

# Map the documents to N-dimensional vectors
document_vector_d1 = np.array([word_counts_d1.get(i, 0) for i in range(1, N+1)])
document_vector_d2 = np.array([word_counts_d2.get(i, 0) for i in range(1, N+1)])

# Compute the dot product
dot_product = np.dot(document_vector_d1, document_vector_d2)

print("Dot product between the two documents:", dot_product)


Dot product between the two documents: 1
