# Word2Vec Vector Manipulations
- the following code explores word2vec and spaCy's data processing and manipulation options

In [28]:

# # !pip install gensim
from gensim.models import KeyedVectors
# import gensim.downloader as api
# path = api.load("word2vec-google-news-300", return_path=True)

# Load pre-trained word2vec model (Google News vectors)
# word_vectors = KeyedVectors.load_word2vec_format(path, binary=True)

# observing how you can manipulate words regarding weather/temperatures
weather_changes_results_1 = word_vectors.most_similar(positive=["cold", "low"], negative=["high"], topn=1)[0][0]
weather_changes_results_2 = word_vectors.most_similar(positive=["cold", "high"], negative=["low"], topn=1)[0][0]
weather_changes_results_3 = word_vectors.most_similar(positive=["hot", "winter"], negative=["summer"], topn=1)[0][0]

# manipulation in terms of emotional shifts
emotional_changes_1 = word_vectors.most_similar(positive=["happy", "negative"], negative=["positive"], topn=1)[0][0]
emotional_changes_2 = word_vectors.most_similar(positive=["happy", "positive"], negative=["negative"], topn=1)[0][0]
emotional_changes_3 = word_vectors.most_similar(positive=["sad", "negative"], negative=["positive"], topn=1)[0][0]
print("The Good (manipulation in emotion): ", emotional_changes_1, emotional_changes_2, emotional_changes_3)
print("The Good (manipulation in weather): ", weather_changes_results_1, weather_changes_results_2, weather_changes_results_3)

# THE BAD:
# it can't interpret irony/sarcasm
sarcasm_ability_1 = word_vectors.most_similar(positive=["fun", "car", "accident"], topn=1)[0][0] # ignores the word 'fun' and takes it literally
sarcasm_ability_2 = word_vectors.most_similar(positive=["love", "paying", "taxes"], topn=1)[0][0] # ignores that 'taxes' is negative and 'love' is positive; takes it literally
sarcasm_ability_3 = word_vectors.most_similar(positive=["failed", "good", "job"], topn=1)[0][0] # ignores that there is praise of something that 'failed'; takes it positively
print("\n\nThe Bad (detecting sarcasm and irony): ", sarcasm_ability_1,sarcasm_ability_2, sarcasm_ability_3)



# THE UGLY: word2vec suggests a bias against athiests, as the word honest minus christian plus athiest derives the word "idiot"
# versus, the other way around word2vec results in "brutally_honest". Athiest receives the more negative, insulting word
religious_bias_1 = word_vectors.most_similar(positive=["honest", "athiest"], negative=["Christian"], topn=1)[0][0]
religious_bias_2 = word_vectors.most_similar(positive=["honest", "Christian"], negative=["athiest"], topn=1)[0][0]
print("\n\nThe Ugly (religious bias): ", religious_bias_1, religious_bias_2)


The Good (manipulation in emotion):  unhappy pleased saddening
The Good (manipulation in weather):  chilly frigid cold


The Bad (detecting sarcasm and irony):  crash pay decent


The Ugly (religious bias):  idiot brutally_honest


# spaCy Word Processing

In [1]:
import spacy

nlp = spacy.load("en_core_web_sm")


I PRON nsubj
know VERB ROOT
it PRON nsubj
's AUX ccomp
over ADV advmod
, PUNCT punct
still ADV advmod
I PRON nsubj
cling VERB ccomp
. PUNCT punct


In [18]:
# Sentence
doc = nlp("Steven Morrissey wrote my favorite song, which is \"Cemetry Gates\".") # this is a song by The Smiths; Morrissey is one of the members of the band

print("Tokenization: ");
# Tokenization
for token in doc:
    print(token.text)

print("\n\nPOS Tagging");
# POS Tagging
for token in doc:
    print(token.text, "-->", token.pos_, "-->", token.tag_)

print("\n\nNamed Entity Recognition (NER): ");
# Named Entity Recognition
for word in doc.ents:
    print(word.text,"-->", word.label_)

print("\n\nDependency Parsing: ");
# Dependency Parsing
for token in doc:
    print(token.text, "-->", token.pos_, "-->", token.dep_)

# Rules based NER
print("\n\nRules based NER: ");

from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

# Pattern to match words inside quotes (assuming song titles are quoted)
pattern = [
    {"TEXT": "\""},  # Opening quote
    {"IS_ALPHA": True, "OP": "+"},  # One or more words
    {"TEXT": "\""}  # Closing quote
]

matcher.add("SONG_TITLE", [pattern])

matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]  # The matched phrase
    print(span.text, "--> SONG_TITLE")


# Visualizer
print("\n\nDependency Visualization")
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
displacy.serve(doc, style="dep")



Tokenization: 
Steven
Morrissey
wrote
my
favorite
song
,
which
is
"
Cemetry
Gates
"
.


POS Tagging
Steven --> PROPN --> NNP
Morrissey --> PROPN --> NNP
wrote --> VERB --> VBD
my --> PRON --> PRP$
favorite --> ADJ --> JJ
song --> NOUN --> NN
, --> PUNCT --> ,
which --> PRON --> WDT
is --> AUX --> VBZ
" --> PUNCT --> ``
Cemetry --> PROPN --> NNP
Gates --> PROPN --> NNP
" --> PUNCT --> ''
. --> PUNCT --> .


Named Entity Recognition (NER): 
Steven Morrissey --> PERSON


Dependency Parsing: 
Steven --> PROPN --> compound
Morrissey --> PROPN --> nsubj
wrote --> VERB --> ROOT
my --> PRON --> poss
favorite --> ADJ --> amod
song --> NOUN --> dobj
, --> PUNCT --> punct
which --> PRON --> nsubj
is --> AUX --> relcl
" --> PUNCT --> punct
Cemetry --> PROPN --> compound
Gates --> PROPN --> attr
" --> PUNCT --> punct
. --> PUNCT --> punct


Rules based NER: 
"Cemetry Gates" --> SONG_TITLE


Visualization



Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [32]:
result_dict = {
"Name": "Haniyyah Hamid", # Enter your university ID (e.g., "kxa190001"), ensure it's a string
'Q1': '''
# observing how you can manipulate words regarding weather/temperatures
weather_changes_results_1 = word_vectors.most_similar(positive=["cold", "low"], negative=["high"], topn=1)[0][0]
weather_changes_results_2 = word_vectors.most_similar(positive=["cold", "high"], negative=["low"], topn=1)[0][0]
weather_changes_results_3 = word_vectors.most_similar(positive=["hot", "winter"], negative=["summer"], topn=1)[0][0]

# manipulation in terms of emotional shifts
emotional_changes_1 = word_vectors.most_similar(positive=["happy", "negative"], negative=["positive"], topn=1)[0][0]
emotional_changes_2 = word_vectors.most_similar(positive=["happy", "positive"], negative=["negative"], topn=1)[0][0]
emotional_changes_3 = word_vectors.most_similar(positive=["sad", "negative"], negative=["positive"], topn=1)[0][0]
print("The Good (manipulation in emotion): ", emotional_changes_1, emotional_changes_2, emotional_changes_3)
print("The Good (manipulation in weather): ", weather_changes_results_1, weather_changes_results_2, weather_changes_results_3)
''',
'Q2': '''
# THE BAD:
# it can't interpret irony/sarcasm
sarcasm_ability_1 = word_vectors.most_similar(positive=["fun", "car", "accident"], topn=1)[0][0] # ignores the word 'fun' and takes it literally
sarcasm_ability_2 = word_vectors.most_similar(positive=["love", "paying", "taxes"], topn=1)[0][0] # ignores that 'taxes' is negative and 'love' is positive; takes it literally
sarcasm_ability_3 = word_vectors.most_similar(positive=["failed", "good", "job"], topn=1)[0][0] # ignores that there is praise of something that 'failed'; takes it positively
print("\n\nThe Bad (detecting sarcasm and irony): ", sarcasm_ability_1,sarcasm_ability_2, sarcasm_ability_3)
''',
'Q3': '''
# THE UGLY: word2vec suggests a bias against athiests, as the word honest minus christian plus athiest derives the word "idiot"
# versus, the other way around word2vec results in "brutally_honest". Athiest receives the more negative, insulting word
religious_bias_1 = word_vectors.most_similar(positive=["honest", "athiest"], negative=["Christian"], topn=1)[0][0]
religious_bias_2 = word_vectors.most_similar(positive=["honest", "Christian"], negative=["athiest"], topn=1)[0][0]
print("\n\nThe Ugly (religious bias): ", religious_bias_1, religious_bias_2)
''',
'Q4': '''
# Sentence
doc = nlp("Steven Morrissey wrote my favorite song, which is \"Cemetry Gates\".") # this is a song by The Smiths; Morrissey is one of the members of the band

print("Tokenization: ");
# Tokenization
for token in doc:
    print(token.text)

print("\n\nPOS Tagging");
# POS Tagging
for token in doc:
    print(token.text, "-->", token.pos_, "-->", token.tag_)

print("\n\nNamed Entity Recognition (NER): ");
# Named Entity Recognition
for word in doc.ents:
    print(word.text,"-->", word.label_)

print("\n\nDependency Parsing: ");
# Dependency Parsing
for token in doc:
    print(token.text, "-->", token.pos_, "-->", token.dep_)

# Rules based NER
print("\n\nRules based NER: ");

from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

# Pattern to match words inside quotes (assuming song titles are quoted)
pattern = [
    {"TEXT": "\""},  # Opening quote
    {"IS_ALPHA": True, "OP": "+"},  # One or more words
    {"TEXT": "\""}  # Closing quote
]

matcher.add("SONG_TITLE", [pattern])

matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]  # The matched phrase
    print(span.text, "--> SONG_TITLE")


# Visualizer
print("\n\nDependency Visualization")
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
displacy.serve(doc, style="dep")
''',
'Bonus': 'None'
}
print(result_dict)

{'Name': 'Haniyyah Hamid', 'Q1': '\n# observing how you can manipulate words regarding weather/temperatures\nweather_changes_results_1 = word_vectors.most_similar(positive=["cold", "low"], negative=["high"], topn=1)[0][0]\nweather_changes_results_2 = word_vectors.most_similar(positive=["cold", "high"], negative=["low"], topn=1)[0][0]\nweather_changes_results_3 = word_vectors.most_similar(positive=["hot", "winter"], negative=["summer"], topn=1)[0][0]\n\n# manipulation in terms of emotional shifts\nemotional_changes_1 = word_vectors.most_similar(positive=["happy", "negative"], negative=["positive"], topn=1)[0][0]\nemotional_changes_2 = word_vectors.most_similar(positive=["happy", "positive"], negative=["negative"], topn=1)[0][0]\nemotional_changes_3 = word_vectors.most_similar(positive=["sad", "negative"], negative=["positive"], topn=1)[0][0]\nprint("The Good (manipulation in emotion): ", emotional_changes_1, emotional_changes_2, emotional_changes_3) \nprint("The Good (manipulation in we

In [34]:
# Save and submit your result
import json
from google.colab import files
# Convert dictionary to JSON string
json_data = json.dumps(result_dict, indent=4) # Use indent for pretty-printing
# Save JSON to a file
with open("assignment3_answer.json", "w") as json_file: # Don't change the name here

  json.dump(result_dict, json_file, indent=4, ensure_ascii=False) # Use indent for pretty-printing

files.download("assignment3_answer.json")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>