# Assignment 1.2
## Practical Deep Learning for Language Processing

11/27/2022


### 1

In [1]:
with open("brown.txt", "r") as file:
    brown_corpus = file.read()

### 2

In [2]:
import numpy as np
import pandas as pd

def get_word_frequencies(corpus):
    # get words and their frequencies for the split corpus
    words, frequencies = np.unique(corpus.split(), return_counts = True)
    # return a series with the words as indices
    return pd.Series(frequencies, index = words)

In [3]:
# print an example
get_word_frequencies(brown_corpus[0:43])

County    1
Friday    1
Fulton    1
Grand     1
Jury      1
The       1
an        1
said      1
dtype: int64

### 3

In [4]:
def create_vocab(frequencies, vocab_size = 20000):
    # sort all values and select the top |v| entries
    selection = frequencies.sort_values(ascending = False).iloc[0:vocab_size]
    # replace the values with their indices
    selection.iloc[:] = range(selection.shape[0])
    return selection  

In [5]:
# print an example
create_vocab(get_word_frequencies(brown_corpus))[0:10]

the     0
of      1
and     2
to      3
a       4
in      5
that    6
is      7
was     8
for     9
dtype: int64

### 4

In [6]:
def windowizer(corpus):
    # split the string into its tokens and turn into iterator
    split = iter(corpus.split())
    # return the pairs as list
    return list(zip(split, split))

In [7]:
# print an example
windowizer(brown_corpus[0:43])

[('The', 'Fulton'), ('County', 'Grand'), ('Jury', 'said'), ('Friday', 'an')]

### 5
For this task, I choose not to use `sklearn.feature_extraction.text.CountVectorizer` but my own algorithm instead. I also implement some optional preprocessing that turns all words to lower case, and removes specials and stopwords.

In [8]:
def perform_preprocessing(corpus):
    import re
    # turn the corpus to lower case
    corpus = corpus.lower()
    # remove all specials
    corpus = re.sub(r"[^A-Za-z0-9\s\n]", "", corpus)
    # replace single line breaks but not section breaks
    corpus = re.sub(r"(?<!\n)\n", "", corpus)
    # remove stopwords
    from nltk.corpus import stopwords
    corpus = re.sub(re.compile("|".join(["\s" + x + "(?=\s)" for x in stopwords.words("english")])), "", corpus)
    return corpus

In [9]:
def create_embedding_matrix(corpus):
    
    # get the vocab mapping from string to integer
    vocabulary = create_vocab(get_word_frequencies(corpus))
    
    # get all grams
    grams = windowizer(corpus)

    # create a blueprint embedding matrix of all zeros
    embedding_matrix = np.zeros((vocabulary.shape[0], vocabulary.shape[0]))
    
    # iterate over all grams and add to count if both are in vocab
    for i in range(len(grams)):
        # if one or both are not in vocab a KeyError will be thrown: ignore this error
        try:
            embedding_matrix[ vocabulary[ grams[i][0] ] ][ vocabulary[ grams[i][1] ] ] += 1
            embedding_matrix[ vocabulary[ grams[i][1] ] ][ vocabulary[ grams[i][0] ] ] += 1
        except KeyError:
            pass
        
    return embedding_matrix

In [10]:
# an example not using preprocessing
create_embedding_matrix(brown_corpus[0:43])

array([[0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0.]])

### 6

In [11]:
# define a function for the cosine similarity
def cosine_sim(x, y):
    return((x @ y) / ((sum(x ** 2) ** .5) * (sum(y ** 2) ** .5)))

In [12]:
def most_similar_words(embedding_matrix, vocabulary, test_word):
    
    # get the index of the desired word
    test_word_index = vocabulary[ test_word ]
    
    # create a blueprint series for the similarities
    similarities = pd.Series(0, index = vocabulary.index)
    
    # iterate over the vocabulary and calculate similarties between the esired and all other words
    for i in range(vocabulary.shape[0]):
        similarities[i] = cosine_sim(embedding_matrix[test_word_index], embedding_matrix[i])
      
    # return the words with highest similarity scores  
    return similarities.drop(test_word).sort_values(ascending = False).head(5)

In [13]:
# create embeddings and vocabulary w/o preprocessing
embedding_matrix = create_embedding_matrix(brown_corpus)
vocabulary = create_vocab(get_word_frequencies(brown_corpus))

In [14]:
# get the most similar words to "County"
most_similar_words(embedding_matrix, vocabulary, "County")

  return((x @ y) / ((sum(x ** 2) ** .5) * (sum(y ** 2) ** .5)))


selects    0.272772
"same      0.267261
invaded    0.239046
touring    0.231455
glue       0.222718
dtype: float64

In [15]:
# create embeddings and vocabulary with preprocessing
embedding_matrix = create_embedding_matrix(perform_preprocessing(brown_corpus))
vocabulary = create_vocab(get_word_frequencies(perform_preprocessing(brown_corpus)))

In [16]:
# again get the most similar words to "county" (in lower case this time)
most_similar_words(embedding_matrix, vocabulary, "county")

  return((x @ y) / ((sum(x ** 2) ** .5) * (sum(y ** 2) ** .5)))


tex              0.228218
superior         0.225000
supreme          0.222159
parkhouse        0.218218
desegregation    0.217597
dtype: float64