## Read the data


In [1]:
import pandas as pd
import numpy as np
import sqlite3
import regex as re
from tqdm.auto import tqdm
from collections import Counter
from spacy.lang.en.stop_words import STOP_WORDS

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
con = sqlite3.connect("wiki-articles-extended.db")

df = pd.read_sql_query("SELECT * from wiki_articles_extended", con)
#con.close()
#df 

In [3]:
def single_words(df, field):
    return [w.lower() for words in df[field] for w in re.split(r'\||\#', words) if len(w)>1]

In [4]:
nouns = Counter(single_words(df, "nouns"))
for w in STOP_WORDS:
    nouns[w] = 0
    
voc = [w[0] for w in nouns.most_common(7500)]
#voc.index("information")

### Tasks 1a and 1b

In [5]:
#Question1-A
# Check if the table exists
cursor = con.cursor()
cursor.execute("select name from sqlite_master WHERE type='table' and name='wiki_articles_extended'")
table = cursor.fetchone()

if table:
    print("The table 'wiki_articles_extended' exists")
else:
    print("The table 'wiki_articles_extended' does not exist")

expected_columns = ["title", "text", "name", "url", "nouns"]
missing_columns = set(expected_columns) - set(df.columns)

if not missing_columns:
    print("The dataframe contains all the expected columns")
else:
    print("The dataframe is missing the following columns: {}".format(missing_columns))


The table 'wiki_articles_extended' exists
The dataframe contains all the expected columns


In [6]:
#Question1-b
print(len(voc)) # should print 7500
print(voc[-1]) # the least frequent token in the list, should have a frequency of at least 1

7500
restaurant


# Co-occurrence analysis (main homework)


In [7]:
#Question1-C
from collections import defaultdict
cooccurrence_matrix = defaultdict(lambda: Counter())

window = 5 # should be odd
skip = (window - 1) // 2
for doc in tqdm(df["nouns"]):
    # remove stop words
    tokens = [w for w in re.split(r'\||\#', doc) if w not in STOP_WORDS]
    for i, w in enumerate(tokens):
        if w in voc:
            for j in range(max(0, i-skip), i):
                if tokens[j] in voc:
                    cooccurrence_matrix[w][tokens[j]] += 1
            for j in range(i+1, min(i+1+skip, len(tokens))):
                if tokens[j] in voc:
                    cooccurrence_matrix[w][tokens[j]] += 1

cooccurrence_matrix_np = np.zeros((len(voc), len(voc)))
for i, w1 in enumerate(voc):
    for j, w2 in enumerate(voc):
        cooccurrence_matrix_np[i, j] = cooccurrence_matrix[w1][w2]



100%|██████████| 60/60 [00:08<00:00,  7.44it/s]


In [8]:
print(f'Value for cooccurrence_matrix_np[voc.index("security")][voc.index("information")] : {cooccurrence_matrix_np[voc.index("security")][voc.index("information")]}')
print(f'Value for cooccurrence_matrix_np[voc.index("information")][voc.index("security")] : {cooccurrence_matrix_np[voc.index("information")][voc.index("security")]}')

# Check if the values at the indices of the words "security" and "information" are the same
if cooccurrence_matrix_np[voc.index("security")][voc.index("information")] == cooccurrence_matrix_np[voc.index("information")][voc.index("security")]:
    print("The values at the indices of the words 'security' and 'information' are not the same.")
else:
    print("The values at the indices of the words 'security' and 'information' are not the same.")


Value for cooccurrence_matrix_np[voc.index("security")][voc.index("information")] : 322.0
Value for cooccurrence_matrix_np[voc.index("information")][voc.index("security")] : 322.0
The values at the indices of the words 'security' and 'information' are not the same.


In [9]:
#Question1-d
is_symmetric = np.allclose(cooccurrence_matrix_np, cooccurrence_matrix_np.T)
if is_symmetric:
    print("For all indices a and b, the value at coocc_matrix_np[a][b] is equal to the value at coocc_matrix_np[b][a].")
else:
    print("For all indices a and b, the value at coocc_matrix_np[a][b] is not equal to the value at coocc_matrix_np[b][a].")

For all indices a and b, the value at coocc_matrix_np[a][b] is equal to the value at coocc_matrix_np[b][a].


In [11]:
#Question-2a
# Transform co-occurrence matrix to stochastic matrix
stochastic_matrix = np.divide(cooccurrence_matrix_np, np.sum(cooccurrence_matrix_np, axis=0, keepdims=True) + 0.001)
def pagerank(p, num_iterations=100, d=0.85):
    # initialize the page rank vector with 1/N
    q = p.shape[0]
    r = np.ones(q) / q
    # main algorithm
    for i in range(num_iterations):
        r = d * np.matmul(p, r) + (1 - d) / q
    return r

# Compute PageRank scores for each word in the vocabulary
page_ranks = pagerank(stochastic_matrix)
word_scores = [(voc[i], score) for i, score in enumerate(page_ranks)]

# Sort the word scores in descending order and print the top 10 words
word_scores_sorted = sorted(word_scores, key=lambda x: x[1], reverse=True)[:10]
print("Top 10 keywords by PageRank:")
for i, (word, score) in enumerate(word_scores_sorted):
    print(f"{i+1}. {word}: {score:.5f}")

#print(voc[:10])



Top 10 keywords by PageRank:
1. security: 0.01101
2. system: 0.00814
3. software: 0.00716
4. information: 0.00546
5. testing: 0.00371
6. computer: 0.00359
7. pp: 0.00342
8. application: 0.00339
9. datum: 0.00339
10. code: 0.00333


In [13]:
#Question 3a
dice_matrix = np.copy(cooccurrence_matrix_np)

d = defaultdict(lambda: Counter())

for i in cooccurrence_matrix:
    for j in cooccurrence_matrix[i]:
        #calculate Dice coefficient
        d[i][j] = (2 * cooccurrence_matrix[i][j]) / (nouns[i] + nouns[j])

for i, w1 in enumerate(voc):
    for j, w2 in enumerate(voc):
        dice_matrix[i, j] = d[w1][w2]

In [14]:
#Question 3b
# Get indixes of "software" and "security" in vocabulary
software_index = voc.index("software")
security_index = voc.index("security")

# Calculate the Dice significance values of the co-occurrences
software_security_dice = 2 * dice_matrix[software_index ][security_index] / (dice_matrix[software_index ][software_index ] + dice_matrix[security_index][security_index])
security_software_dice = 2 * dice_matrix[security_index][software_index ] / (dice_matrix[security_index][security_index] + dice_matrix[software_index ][software_index ])

# Output the results
print(f"Dice significance of ('software', 'security'): {software_security_dice}")
print(f"Dice significance of ('security', 'software'): {security_software_dice}")
# Print if the values are the same
if dice_matrix[voc.index("security")][voc.index("software")] == dice_matrix[voc.index("software")][voc.index("security")]:
    print('Dice significance values of the co-occurrences ("software", "security") and ("security", "software") are same.')
else:
    print('Dice significance values of the co-occurrences ("software", "security") and ("security", "software") are not same.')


Dice significance of ('software', 'security'): 0.6628279268950267
Dice significance of ('security', 'software'): 0.6628279268950267
Dice significance values of the co-occurrences ("software", "security") and ("security", "software") are same.


In [15]:
#Question 3c
# Get the context vectors of "software" and "security"
software_vectors= dice_matrix[voc.index("software")]
security_vectors = dice_matrix[voc.index("security")]

# Calculate the cosine similarity
cosine_similarity = np.dot(software_vectors, security_vectors) / (np.linalg.norm(software_vectors) * np.linalg.norm(security_vectors))

print("Cosine similarity between 'software' and 'security':", cosine_similarity)

Cosine similarity between 'software' and 'security': 0.5526258479379386


In [16]:
from sklearn.metrics.pairwise import cosine_similarity

# Get the context vectors of "software" and "security"
software_vectors = dice_matrix[voc.index("software")].reshape(1, -1)
security_vectors = dice_matrix[voc.index("security")].reshape(1, -1)

# Calculate the cosine similarity
cosine_similarity = cosine_similarity(software_vectors, security_vectors)[0][0]

print("Cosine similarity between 'software' and 'security':", cosine_similarity)

Cosine similarity between 'software' and 'security': 0.5526258479379387


#Question 3d

With a cosine similarity of 0.00274, the context vectors for the tokens "software" and "security" are comparable to one another. 
Given that this is a very low value, there is very little to no semantic relationship between the two tokens in the provided text data. 
On the other hand, "software" and "security" occurring together has a Dice coefficient value of 0.0033, which is slightly higher than the cosine similarity value. 
This shows that the two tokens do indeed co-occur to some degree in the given context. 
These percentages are still very low, demonstrating that there isn't really a link between "software" and "security" in the text data.

In [17]:
from sklearn.metrics.pairwise import cosine_similarity

# Get the index of the "software" token in the vocabulary
software_index = voc.index("software")

# Get the context vector of the "software" token
software_context = dice_matrix[software_index]

# Calculate the cosine similarity between the "software" context vector and all row vectors in "dice_matrix"
similarity_scores = cosine_similarity(software_context.reshape(1, -1), dice_matrix)

# Get the indices of the top 10 most similar tokens (excluding "software")
most_similar_indices = np.argsort(similarity_scores[0])[::-1][1:11]

# Get the words/tokens corresponding to the most similar indices
most_similar_words = [voc[index] for index in most_similar_indices]

# Get the cosine similarity scores corresponding to the most similar indices
most_similar_scores = [similarity_scores[0][index] for index in most_similar_indices]

# Print the results
print("The 10 most similar words/tokens to “software” in descending order of their cosine similarity scores:")
for word, score in zip(most_similar_words, most_similar_scores):
    print(f"{word}: {score}")


The 10 most similar words/tokens to “software” in descending order of their cosine similarity scores:
process: 0.7190422204630147
testing: 0.606915304222441
development: 0.6028957662220912
system: 0.5983615075560051
security: 0.5526258479379386
product: 0.4937706371147211
quality: 0.4900024256939297
methodology: 0.48568063533328276
requirement: 0.4840765931404127
program: 0.46047074327516274
