In [None]:
"""
Question 1
Consider the following corpus of three documents.
•      d1 = “information information data train”
•      d2 = “computer information cpu computer”
•      d3 = “computer retrieval information”
Use the TF-IDF approach to represent the documents with vectors. Calculate the values yourself. (Of course, you may use Python to do the math, but do the calculations for each word/document yourself; do not simply feed them into a tool/library.) Show your work (i.e., if you do the calculations manually, show the math; if you do them in Python, include your code). As discussed in class, there are variations on the TF-IDF calculation; please use the formulas provided in the slides.
Be sure that your answer makes clear each of the values in each vector, and the word to which each value applies.
"""
import pandas as pd
import math, json

# The three-document toy corpus from the question.
d1 = "information information data train"
d2 = "computer information cpu computer"
d3 = "computer retrieval information"
for label, text in (("d1", d1), ("d2", d2), ("d3", d3)):
    print(f"{label}:", text)

# Whitespace tokenisation: one list of words per document.
docs = [text.split() for text in (d1, d2, d3)]
print("\nDocuments split into words:")
print(json.dumps(docs, indent=2))

# Vocabulary = every distinct word, sorted so the vector columns have a
# fixed, reproducible order.
vocab = sorted({term for tokens in docs for term in tokens})
print("\nVocabulary:", vocab)

# Row labels shared by the TF and TF-IDF tables.
doc_labels = [f'd{n}' for n in range(1, len(docs) + 1)]

# TF: raw count of each vocabulary word in each document.
tf = [[tokens.count(term) for term in vocab] for tokens in docs]
df__tf = pd.DataFrame(tf, columns=vocab, index=doc_labels)
print("\nTF Table:")
print(df__tf)

# IDF: log2(N / document frequency), where the document frequency is the
# number of documents that contain the word at least once.
N = len(docs)
idf = [
    math.log2(N / sum(1 for tokens in docs if term in tokens))
    for term in vocab
]
df__idf = pd.DataFrame([idf], columns=vocab, index=['values'])
print("\nIDF:")
print(df__idf)

# TF-IDF: element-wise product of each document's TF row with the IDF row.
tfidf = [
    [count * weight for count, weight in zip(counts, idf)]
    for counts in tf
]
df__tfidf = pd.DataFrame(tfidf, columns=vocab, index=doc_labels)
print("\nTF-IDF Table:")
print(df__tfidf)

d1: information information data train
d2: computer information cpu computer
d3: computer retrieval information

Documents split into words:
[
  [
    "information",
    "information",
    "data",
    "train"
  ],
  [
    "computer",
    "information",
    "cpu",
    "computer"
  ],
  [
    "computer",
    "retrieval",
    "information"
  ]
]

Vocabulary: ['computer', 'cpu', 'data', 'information', 'retrieval', 'train']

TF Table:
    computer  cpu  data  information  retrieval  train
d1         0    0     1            2          0      1
d2         2    1     0            1          0      0
d3         1    0     0            1          1      0

IDF:
        computer       cpu      data  information  retrieval     train
values  0.584963  1.584963  1.584963          0.0   1.584963  1.584963

TF-IDF Table:
    computer       cpu      data  information  retrieval     train
d1  0.000000  0.000000  1.584963          0.0   0.000000  1.584963
d2  1.169925  1.584963  0.000000          0.0   0

In [None]:
"""
Question 2
Given your TF-IDF vectors calculated above, calculate the following similarities between the documents. Use cosine similarity measure. Show your work, as above.
σ(d1,d2) =?
σ(d1,d3) =?
σ(d1,d1) =?
Comment on your findings.
"""
# import numpy as np

# Reuse the Question 1 TF-IDF rows when that cell has already run in this
# kernel; otherwise fall back to the values copied from its printed output.
# Only the two expected failure modes trigger the fallback, so any other
# error surfaces instead of being silently replaced by hard-coded numbers.
# NOTE(review): the Question 3 cell reassigns `df__tfidf` to the corpus5
# table, so this cell must be run before that one (as in Run All order).
try:
    d1_tfidf = df__tfidf.loc['d1'].tolist()
    d2_tfidf = df__tfidf.loc['d2'].tolist()
    d3_tfidf = df__tfidf.loc['d3'].tolist()
except (NameError, KeyError):
    # NameError: `df__tfidf` is not defined (Question 1 has not been run).
    # KeyError: `df__tfidf` exists but has no 'd1'/'d2'/'d3' row.
    # Column order: computer, cpu, data, information, retrieval, train.
    d1_tfidf = [0.0, 0.0, 1.58496, 0.0, 0.0, 1.58496]
    d2_tfidf = [1.16993, 1.58496, 0.0, 0.0, 0.0, 0.0]
    d3_tfidf = [0.58496, 0.0, 0.0, 0.0, 1.58496, 0.0]

# d1_tfidf = np.array(d1_tfidf)  # Convert to numpy array for easier calculations
# d2_tfidf = np.array(d2_tfidf)  # Convert to numpy array for easier calculations
# d3_tfidf = np.array(d3_tfidf)  # Convert to numpy array for easier calculations

def cosine_similarity(a, b):
    """Return the cosine similarity σ(a, b) = (a · b) / (||a|| * ||b||).

    Parameters
    ----------
    a, b : sequence of numbers
        The two vectors to compare. They must have the same length, with
        position i of each referring to the same dimension.

    Returns
    -------
    float
        The cosine of the angle between ``a`` and ``b``. Returns 0.0 when
        either vector has zero magnitude, where the angle is undefined and
        the division would otherwise fail.

    Raises
    ------
    ValueError
        If ``a`` and ``b`` differ in length. Without this check ``zip``
        would silently drop the extra components of the longer vector.
    """
    if len(a) != len(b):
        raise ValueError(
            f"vectors must have the same length, got {len(a)} and {len(b)}"
        )

    # Dot product: a1*b1 + a2*b2 + ... + an*bn
    dot_product = sum(ai * bi for ai, bi in zip(a, b))

    # Euclidean norms: ||x|| = sqrt(x1^2 + x2^2 + ... + xn^2)
    norm_a = sum(ai * ai for ai in a) ** 0.5
    norm_b = sum(bi * bi for bi in b) ** 0.5

    if norm_a == 0 or norm_b == 0:  # avoid division by zero
        return 0.0
    return dot_product / (norm_a * norm_b)

# Report the three requested similarities.
# Reading the results: d1 shares only "information" with d2 and d3, and that
# word occurs in every document, so its TF-IDF weight is 0 and both
# cross-document similarities come out as 0. A vector compared with itself
# gives 1.
for pair_label, left_vec, right_vec in (
    ("d1, d2", d1_tfidf, d2_tfidf),
    ("d1, d3", d1_tfidf, d3_tfidf),
    ("d1, d1", d1_tfidf, d1_tfidf),
):
    print(f"σ({pair_label}) =", cosine_similarity(left_vec, right_vec))

σ(d1, d2) = 0.0
σ(d1, d3) = 0.0
σ(d1, d1) = 1.0


In [25]:
"""
Question 3
Download the attached corpus (corpus5.txt). Each line of the corpus represents one document. Thus, there are 6 documents in this corpus. Write a python program to represent each document with a vector, using the following two approaches: frequency and TF-IDF. (The code covered in class/text should be directly applicable here.)
"""
import pandas as pd
import math, re

# Print DataFrames in full: setting each of these display options to None
# turns off the corresponding row/column/width truncation.
for option_name in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(option_name, None)

# Read the corpus file: each non-blank line is one document.
# The encoding is given explicitly so the result does not depend on the
# platform's default codec. Undecodable bytes are replaced rather than raising,
# which is harmless here because every non-ASCII-letter character is removed
# by the regex below anyway.
docs = []
with open('corpus5.txt', 'r', encoding='utf-8', errors='replace') as f:
    for line in f:
        # Lowercase and remove all non-letters (keep whitespace)
        text_clean = re.sub(r"[^a-zA-Z\s]", "", line.lower())
        tokens = text_clean.split()
        # Skip blank lines: counting them as empty documents would inflate
        # N and therefore skew every IDF value.
        if tokens:
            docs.append(tokens)
print("\nDocuments split into words:")
print(docs)

# Build vocabulary (sorted for a fixed column order)
vocab = sorted(set(word for doc in docs for word in doc))
print("\nVocabulary:", vocab)

# Row labels shared by the TF and TF-IDF tables
doc_labels = [f'd{i+1}' for i in range(len(docs))]

# Compute TF (term frequency): raw count of each vocabulary word per document
tf = [[doc.count(word) for word in vocab] for doc in docs]
df__tf = pd.DataFrame(tf, columns=vocab, index=doc_labels)
print("\nTF Table:")
print(df__tf)

# Compute IDF = log2(N / document frequency).
# Every vocabulary word comes from some document, so doc_freq >= 1 and the
# division is always defined.
N = len(docs)
doc_word_sets = [set(doc) for doc in docs]  # set membership instead of list scans
idf = []
for word in vocab:
    doc_freq = sum(1 for words in doc_word_sets if word in words)
    idf.append(math.log2(N / doc_freq))
df__idf = pd.DataFrame([idf], columns=vocab, index=['values'])
print("\nIDF:")
print(df__idf)

# Compute TF-IDF: each TF row multiplied element-wise by the IDF row
tfidf = [
    [tf_val * idf_val for tf_val, idf_val in zip(vec, idf)]
    for vec in tf
]
df__tfidf = pd.DataFrame(tfidf, columns=vocab, index=doc_labels)
print("\nTF-IDF Table:")
print(df__tfidf)


Documents split into words:
[['in', 'the', 'shadows', 'of', 'the', 'forest', 'that', 'flanks', 'the', 'crimson', 'plain', 'by', 'the', 'side', 'of', 'the', 'lost', 'sea', 'of', 'korus', 'in', 'the', 'valley', 'dor', 'beneath', 'the', 'hurtling', 'moons', 'of', 'mars', 'speeding', 'their', 'meteoric', 'way', 'close', 'above', 'the', 'bosom', 'of', 'the', 'dying', 'planet', 'i', 'crept', 'stealthily', 'along', 'the', 'trail', 'of', 'a', 'shadowy', 'form', 'that', 'hugged', 'the', 'darker', 'places', 'with', 'a', 'persistency', 'that', 'proclaimed', 'the', 'sinister', 'nature', 'of', 'its', 'errand'], ['had', 'phaidors', 'slim', 'blade', 'found', 'that', 'beloved', 'heart'], ['time', 'only', 'would', 'reveal', 'the', 'truth'], ['half', 'of', 'them', 'had', 'passed', 'or', 'would', 'on', 'the', 'morrow', 'yet', 'vivid', 'in', 'my', 'memory', 'obliterating', 'every', 'event', 'that', 'had', 'come', 'before', 'or', 'after', 'there', 'remained', 'the', 'last', 'scene', 'before', 'the', 'gust

In [None]:
"""
Question 4
Now, we will use a distributed model to generate word vectors for words from the corpus.
Important steps:
Ensure gensim has been installed in your Python environment.
GloVe vectors are available for download from the Stanford NLP Group website: https://nlp.stanford.edu/projects/glove/ You can choose the desired GloVe vectors based on the corpus size and vector dimensionality. For this exercise, download the glove.6B.zip file, containing vectors trained on the 6 billion token corpus with various dimensions (50, 100, 200, or 300 dimensions). (If the file is taking too long to download, you might obtain a copy locally from another student or the instructor.)
Extract the downloaded GloVe vector files to a directory in your project folder.
We will use the glove.6B.100d.txt file. For gensim to be able to load the file, a line must be added at the top of the file indicating the vocabulary size and number of dimensions. Add the following line at the top of the file:
400000 100
Load GloVe word vectors using Gensim, using code like the following:
```
from gensim.models import KeyedVectors 
# Path to the pre-trained GloVe word vectors file
glove_file = 'path/to/glove.6B.100d.txt'
# Load GloVe word vectors into Gensim KeyedVectors format
word_vectors = KeyedVectors.load_word2vec_format(glove_file, binary=False)
```
Tokenize the words from the corpus loaded from the corpus5.txt file. (You can use the same tokenization as above; put text in lowercase form and drop punctuation.)
For the first document in the corpus, print each word, followed by its vector. (The KeyedVectors object created above can be used like a dictionary--provide a word as the key, and it will return the corresponding vector.)
"""
# NOTE: As instructed, I manually added the header line `400000 100` to the top of the glove.6B.100d.txt file.
from gensim.models import KeyedVectors # pip install gensim
import re

# Path to the pre-trained GloVe word vectors file.
# A forward slash is used as the separator because it works on Windows as
# well as POSIX systems; a literal backslash is only a separator on Windows.
glove_file = 'ignore/glove.6B.100d.txt'

# Load GloVe word vectors into Gensim KeyedVectors format
# (text format; the file must start with the `<vocab size> <dimensions>` header line).
word_vectors = KeyedVectors.load_word2vec_format(glove_file, binary=False)

# Tokenize the first document (first line) of corpus5.txt.
# The encoding is explicit so decoding does not depend on the platform
# default; undecodable bytes are replaced and then removed by the regex.
with open('corpus5.txt', 'r', encoding='utf-8', errors='replace') as f:
    first_line = f.readline()

# Lowercase and remove non-letter characters (keep whitespace)
text_clean = re.sub(r"[^a-zA-Z\s]", "", first_line.lower())
tokens = text_clean.split()
print("First document tokens:", tokens)

# Print each word followed by its vector, in document order (repeated words
# are printed each time they occur).
for word in tokens:
    if word in word_vectors:
        print(f"\nWord: `{word}`\nVector: {word_vectors[word]}")
    else:
        print(f"\nWord: `{word}` (not found in GloVe vocabulary)")

First document tokens: ['in', 'the', 'shadows', 'of', 'the', 'forest', 'that', 'flanks', 'the', 'crimson', 'plain', 'by', 'the', 'side', 'of', 'the', 'lost', 'sea', 'of', 'korus', 'in', 'the', 'valley', 'dor', 'beneath', 'the', 'hurtling', 'moons', 'of', 'mars', 'speeding', 'their', 'meteoric', 'way', 'close', 'above', 'the', 'bosom', 'of', 'the', 'dying', 'planet', 'i', 'crept', 'stealthily', 'along', 'the', 'trail', 'of', 'a', 'shadowy', 'form', 'that', 'hugged', 'the', 'darker', 'places', 'with', 'a', 'persistency', 'that', 'proclaimed', 'the', 'sinister', 'nature', 'of', 'its', 'errand']

Word: `in`
Vector: [ 0.085703 -0.22201   0.16569   0.13373   0.38239   0.35401   0.01287
  0.22461  -0.43817   0.50164  -0.35874  -0.34983   0.055156  0.69648
 -0.17958   0.067926  0.39101   0.16039  -0.26635  -0.21138   0.53698
  0.49379   0.9366    0.66902   0.21793  -0.46642   0.22383  -0.36204
 -0.17656   0.1748   -0.20367   0.13931   0.019832 -0.10413  -0.20244
  0.55003  -0.1546    0.98655  