In [14]:
# Load Necessary Libraries
import conllu
import random
random.seed(123)
import math
import gensim.downloader as gensim_api
import numpy as np
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from gensim.models import Word2Vec
from scipy import stats
from functools import reduce

# Task 1.2: English Syntax Generalization Practice
We used the English GUM corpus with 233k entries maintained by Georgetown University Linguistics students. The text comes from multiple media formats. Our goal was to examine a generalization on verbs and their direct object pronouns within the corpus. The generalization accurately described the relevant sentences in the corpus.

## Our Generalization
**Expected Generalization:** Pronoun direct objects immediately follow their linked verbs.

An example that illustrates this generalization is provided below.

“First, our experimental subjects lived in a large enclosure under conditions that **allowed them** to exercise all day long.”

## Results
We used the function `englishGen` (defined below) to search the corpus for pronoun direct objects linked to verbs. If the pronoun immediately followed its verb, the instance fit our generalization. Otherwise, it was flagged as a possible exception.

About 86% of the matching instances (995 of 1,151) fit our generalization; direct object pronouns do tend to immediately follow their linked verbs. Exceptions arise when an adverb modifies the verb, when an adjective modifies the pronoun, in wh-questions, and when a passivized verb requires argument movement that brings the direct object pronoun *forward* in the sentence, among others. Examples are provided below.

## Examples of Exceptions
### Wh-Question
"If your professor comes into an early morning class holding a mug of liquid, **what** do you assume she is **drinking**?"

### Stammer
“You know, **documenting**, uh, uh, **whatever**.”

### Modifying Preposition
"And I said ‘Ben, **pick** me out **something**, you've got fifty bucks to spend.'"

### Dialogue
"Whitmore told LA Weekly that on October 6 after traveling back from Florida, Montalvo ‘walked into lobby of the East L.A. station and turned himself in’, and **told** the police, ‘**everything** he did’.”

### Wh-Movement
“Malaysia and Indonesia have maintained a policy of turning away boats of migrants **which**, according to AFP, the United Nations and United States have both **criticised**.”

### Movement
"**This** I would have **asked** him had he not been so far away, but he was very far, and could not be seen at all when he drew nigh that gigantic reef."

In [20]:
def englishGen(sentences, sample_size):
    """Test whether pronoun direct objects immediately follow their governing verb.

    Every token with ``upos == "PRON"`` and ``deprel == "obj"`` whose head token
    has ``upos == "VERB"`` is sorted into one of two groups:

    * fits the generalization: the pronoun's id is exactly ``head + 1``;
    * possible exception: the pronoun is anywhere else in the sentence.

    The size of each group, up to ``sample_size`` random examples per group,
    and the combined total are printed.

    Args:
        sentences: Iterable of parsed sentences. Each sentence is iterated for
            token mappings (keys "id", "upos", "deprel", "head") and must
            expose ``metadata['text']``.
        sample_size: Maximum number of examples to print per group. Groups
            smaller than this are printed in full instead of raising.

    Returns:
        None. All results are printed.
    """
    fit_generalization = []
    possible_exceptions = []

    for sentence in sentences:
        # Resolve heads by token id, not by list position: extra rows such as
        # multiword-token ranges shift positions, and a head of 0 (root) or
        # None must not wrap around to the last token via a negative index.
        tokens_by_id = {token["id"]: token for token in sentence}

        for word in sentence:
            if word["upos"] != "PRON" or word["deprel"] != "obj":
                continue
            verb = tokens_by_id.get(word["head"])
            if verb is None or verb["upos"] != "VERB":
                continue

            entry = {
                "sentence": sentence.metadata['text'],
                "PRON": word,
                "VERB": verb,
            }
            # Adjacent means the pronoun's id is the verb's id plus one.
            if word["id"] - 1 == word["head"]:
                group = fit_generalization
            else:
                group = possible_exceptions
            if entry not in group:
                group.append(entry)

    print(f"Sentences that fit the generalization: {len(fit_generalization)}\n")

    # Clamp the sample size so small corpora do not raise ValueError.
    fitted_samples = random.sample(
        fit_generalization, min(sample_size, len(fit_generalization))
    )
    for entry in fitted_samples:
        print(f'PRON: {entry["PRON"]}, VERB: {entry["VERB"]}\n Sentence: {entry["sentence"]}\n')

    print(f"\nSentences that may (or may not) be exceptions: {len(possible_exceptions)}\n")

    exception_samples = random.sample(
        possible_exceptions, min(sample_size, len(possible_exceptions))
    )
    for entry in exception_samples:
        print(f'PRON: {entry["PRON"]}, VERB: {entry["VERB"]}\n Sentence: {entry["sentence"]}\n')

    print(f"\nTotal sentences with Pronouns linked to Verbs: {len(fit_generalization)+len(possible_exceptions)}")

In [21]:
# Read the English GUM training treebank into memory, parse it into
# sentences, then print the generalization report with 5 examples per group.
with open("en_gum-ud-train.conllu", "r", encoding="utf8") as f:
    data = f.read()

sentences_english = conllu.parse(data)

englishGen(sentences_english, sample_size=5)

Sentences that fit the generalization: 995

PRON: something, VERB: putting
 Sentence: Putting, putting something out there, getting it smashed down in front of everyone, and then immediately, like having to jump back up and like, do it again.

PRON: it, VERB: started
 Sentence: "They started it."

PRON: herself, VERB: picking
 Sentence: Jenna was picking herself up off the floor of the bathroom.

PRON: it, VERB: making
 Sentence: Birkholm Is only 1km² and home of a stubborn but loving 8 people, making it one of the smallest populated islands in the country, how long the island can sustain a full year population remains to be seen, but permanent life on the island will probably be a thing of the past in a not too distant future.

PRON: it, VERB: like
 Sentence: And I really like it.


Sentences that may (or may not) be exceptions: 156

PRON: what, VERB: do
 Sentence: How were the plaintiffs supposed to -- what were they supposed to do when the notice gave no notice whatsoever, as to how

# Tasks 1.1 & 1.3: Swedish Verb Negation Generalization
We used the Swedish LinES corpus (from the Parallel Treebank of the same name) that includes just over 100k Swedish translations from English text. Our goal was to examine a generalization on Swedish verb negation. Our results conflicted with our expectations, and so we performed further examination to search for a different possible generalization.

## Our Generalization
**Expected Generalization:** Negation words immediately follow the verb they negate.

The below example illustrates this generalization. 

“Hon **svarade inte**.”
(She didn't answer.)
('inte' = 'not', linked words are bolded)

## Results & Discussion
We used the function `swedishGen` (defined below) to search the corpus for negation words linked to verbs. We then separated out the instances where the negation came immediately after the verb; those examples fit our generalization. The other sentences were collected as possible exceptions.

Only about 21% of the negated verbs in our corpus fit the expected generalization. There were many exceptions, including embedded clauses, questions, auxiliaries, and verb-object switches. Some examples of these are provided further below.


## Secondary Corpus Test
We also tested the generalization on a slightly smaller corpus (96k entries), Talbanken, from Lund University. Its sentences were taken from various text genres such as textbooks, brochures, and newspaper articles. We found results similar to the above: 22% of the sentences with negated verbs fit our generalization. This leads us to believe that translation bias in our first corpus may not be the reason that our generalization fits so poorly.

In [26]:
def swedishGen(sentences, sample_size):
    """Test whether Swedish negation words immediately follow the verb they negate.

    Every token with ``xpos == "NEG"`` whose head token has ``upos == "VERB"``
    is sorted into one of two groups:

    * fits the generalization: the negation's id is exactly ``head + 1``;
    * possible exception: the negation is anywhere else in the sentence.

    The size of each group and up to ``sample_size`` random examples per group
    are printed, with the English translation when the sentence metadata has one.

    Args:
        sentences: Iterable of parsed sentences. Each sentence is iterated for
            token mappings (keys "id", "xpos", "upos", "head") and must expose
            a ``metadata`` mapping with a 'text' entry.
        sample_size: Maximum number of examples to print per group.

    Returns:
        list: The sentence object of every recorded possible exception, in
        corpus order. A sentence appears once per distinct exception in it.
    """
    fit_generalization = []
    possible_exceptions = []
    exceptions = []

    for sentence in sentences:
        # Resolve heads by token id, not by list position: extra rows such as
        # multiword-token ranges shift positions, and a head of 0 (root) or
        # None must not wrap around to the last token via a negative index.
        tokens_by_id = {token["id"]: token for token in sentence}

        for word in sentence:
            if word["xpos"] != "NEG":
                continue
            verb = tokens_by_id.get(word["head"])
            if verb is None or verb["upos"] != "VERB":
                continue

            metadata = sentence.metadata
            entry = {
                "sentence": metadata['text'],
                # Explicit lookup instead of a bare except. The capitalised
                # 'Text_en' key is also accepted; None when neither is present.
                "sentence-E": metadata.get('text_en', metadata.get('Text_en')),
                "NEG": word,
                "VERB": verb,
            }
            # Adjacent means the negation's id is the verb's id plus one.
            if word["id"] - 1 == word["head"]:
                if entry not in fit_generalization:
                    fit_generalization.append(entry)
            elif entry not in possible_exceptions:
                possible_exceptions.append(entry)
                exceptions.append(sentence)

    print(f"Sentences that fit the generalization: {len(fit_generalization)}\n")
    fitted_samples = random.sample(fit_generalization, min(sample_size, len(fit_generalization)))
    for entry in fitted_samples:
        print(f'NEG: {entry["NEG"]}, VERB: {entry["VERB"]}\n'
              f'Sentence: {entry["sentence"]}\n'
              f'English Translation: {entry["sentence-E"]}\n')

    print(f"\nSentences that may (or may not) be exceptions: {len(possible_exceptions)}\n")
    exception_samples = random.sample(possible_exceptions, min(sample_size, len(possible_exceptions)))
    for entry in exception_samples:
        print(f'NEG: {entry["NEG"]}, VERB: {entry["VERB"]}\n'
              f'Sentence: {entry["sentence"]}\n'
              f'English Translation: {entry["sentence-E"]}\n')

    return exceptions

In [28]:
# Read the Swedish LinES training treebank into memory, parse it into
# sentences, then print the negation report with 5 examples per group.
# The returned exception sentences are kept in `exceptions`.
with open("sv_lines-ud-train.conllu", "r", encoding="utf8") as f:
    data = f.read()

sentences_swedish = conllu.parse(data)

exceptions = swedishGen(sentences_swedish, sample_size=5)

Sentences that fit the generalization: 99

NEG: inte, VERB: ansågs
Sentence: Detta ansågs inte speciellt egendomligt: andra amerikanska ambassadörer och ministrar i arabvärlden stödde helhjärtat "äkta" revolutioner för att störta gamla jordägare, rika skurkar och politiker.
English Translation: This was not considered particularly bizarre; other American ambassadors and ministers in the Arab world were entirely in favor of "genuine" revolution to overthrow old landowners, rich crooks, and politicians.

NEG: inte, VERB: finns
Sentence: Man kan på datorn få ut en piratversion, men den finns inte på franska.
English Translation: A pirate electronic version is available but not in French.

NEG: inte, VERB: gick
Sentence: Den gick inte att äta.
English Translation: This was not for eating.

NEG: inte, VERB: uppfattade
Sentence: Vem var det? Jag vet inte... en av passagerarna i planet... en lätt flintskallig, blond karl med främmande brytning, jag uppfattade inte namnet.
English Translation:

# Generalization Exceptions
Our generalization seems to hold on simple sentences with little to no nuance.

 Examples include:

“Hon svarade inte.” -“She didn't answer.”

"Hon talar inte jiddisch?" - “She doesn't speak Yiddish?”

These sentences relay straightforward information and do not contain many flourishes in speech. If we only consider such sentences, our generalization accounts for 21% of verb negations.

However, if we take into account more detailed sentences, we see a different result. In examining our initial results, we identified two main exceptions that change the location of the negation. If we include this nuance, accuracy increases to 44%.

- **Auxiliary verbs:** if auxiliary verbs are present, the negation is placed between the auxiliary and the main (head) verb.

“Hans sekreterare hade inte ringt det samtal hon hade fått instruktioner om.”

“His secretary had not made the instructed call.”
	
- **Embedded clauses:** if there is an embedded clause, the negation follows the subject of the clause and precedes the main (head) verb. This seems to happen because of rules regarding VPs in Swedish.

“Jag har suttit här tålmodigt och jag finner det anmärkningsvärt att ni inte ropar upp mig.”

“I have sat here patiently and I find it quite extraordinary that you are not calling me.”

While 44% accuracy might seem low, this result can be explained by a feature of Swedish that lets words be reordered to put emphasis on certain aspects of the sentence. For example, Object Shift allows the object of a verb to swap places with the negation while still producing a grammatical sentence.

“Jag förstår det inte alls.”

“I do not understand it at all.”


In [8]:
# Substitute into swedishGen for Aux checking
# This is a loop-body fragment, not a standalone cell: it relies on `sentence`, `word`,
# `current`, `fit_generalization`, `possible_exceptions` and `exceptions` from the
# enclosing function, none of which are defined here.
            # Case 1: an auxiliary whose head is a VERB and which is directly followed by a negation.
            if word["upos"] == "AUX" and word["head"] is not None:
                head_idx = word["head"] - 1
                if sentence[head_idx]["upos"] == "VERB":

                    # sentence[word["id"]] is a positional lookup; with 1-based ids it is the token
                    # right after the AUX — assumes list positions line up with ids, TODO confirm.
                    if word["id"] < len(sentence) and sentence[word["id"]]["xpos"] == "NEG":
                        current = {}
                        current["sentence"] = sentence.metadata.get("text")
                        # Prefer the 'text_en' translation; fall back to 'Text_en', else None.
                        try:
                            current["sentence-E"] = sentence.metadata["text_en"]
                        except KeyError:
                            current["sentence-E"] = sentence.metadata.get("Text_en", None)

                        current["AUX"] = word
                        current["NEG"] = sentence[word["id"]] 
                        current["VERB"] = sentence[head_idx]

                        # True whenever the AUX id is at most head + 1.
                        # NOTE(review): the elif below is therefore reached only when the AUX id
                        # exceeds head + 1, where its `!=` test is always true — confirm that
                        # `<=` (rather than a stricter ordering check) is what was intended.
                        if word["id"] - 1 <= word["head"]:
                            if current not in fit_generalization:
                                fit_generalization.append(current.copy())
                        elif word["id"] - 1 != word["head"]:
                            if current not in possible_exceptions:
                                possible_exceptions.append(current.copy())
                                exceptions.append(sentence)
            # Case 2: a negation whose head is a VERB, recorded with AUX set to None.
            # NOTE(review): a NEG already captured by case 1 (on the AUX's iteration) appears to be
            # recorded again here when its own head is a VERB; the two entries differ in "AUX",
            # so the `not in` de-duplication does not merge them — confirm this is acceptable.
            if word["xpos"] == "NEG":
                if word["head"] != None and sentence[word["head"]-1]["upos"] == "VERB":
                    current["sentence"] = sentence.metadata['text']
                    # Try 'text_en', then 'Text_en'; report the metadata and store None if both fail.
                    # NOTE(review): both handlers are bare `except:` and so catch every exception,
                    # not only a missing key.
                    try:
                        current["sentence-E"] = sentence.metadata['text_en']
                    except:
                        try:
                            current["sentence-E"] = sentence.metadata['Text_en']
                        except:
                            print(f"Error for sentence: {sentence.metadata}")
                            current["sentence-E"] = None
                    current["NEG"] = word
                    current["VERB"] = sentence[word["head"]-1]
                    current["AUX"] = None
                    # `>=` accepts a NEG at any position after its head verb (id > head), not only
                    # the immediately following one.
                    # NOTE(review): with this test the elif branch only sees NEGs located before
                    # the verb — confirm the relaxation from `==` is deliberate.
                    if word["id"] != 0 and word["id"]-1 >= word["head"]:
                        if current not in fit_generalization:
                            fit_generalization.append(current.copy())
                    elif word["id"]-1 != word["head"]:
                        if current not in possible_exceptions:
                            possible_exceptions.append(current.copy())
                            exceptions.append(sentence)

# Task 2
## Verb Frequency

In [9]:
# verb_frequencies method

In [None]:
# NOTE(review): verb_frequencies is not defined in any visible cell (the preceding cell holds
# only a placeholder comment) — confirm the definition is present before this call is run.
verb_frequencies(sentences_swedish)

The verbs chosen are randomly sampled from the top 20% of verbs when sorted by frequency and the next 20% of most frequently used verbs -- five from each category. Frequency was determined by tallying the lemmas of each verb.

## Verb Sets

In [10]:
def gen_sets(sentences, verb):
    """Collect the words that co-occur with a verb lemma across a corpus.

    For every token whose lemma equals ``verb`` (all occurrences in a
    sentence, not just the first), the following lemmas are gathered:

    * "before" / "after": the lemma directly preceding / following the token
      in the sentence. Nothing is added at a sentence boundary.
    * "subjects": dependents of the token with deprel ``nsubj``.
    * "objects": dependents with deprel ``obj`` or ``iobj``.
    * "modifiers": dependents with deprel ``advmod``.

    Args:
        sentences: Iterable of parsed sentences. Each is a sequence of token
            mappings with keys "id", "lemma", "head" and "deprel".
        verb: Lemma to look for.

    Returns:
        dict: ``{"verb": verb, "subjects": set, "objects": set,
        "modifiers": set, "before": set, "after": set}``.
    """
    # Dependency relation -> name of the result set it feeds.
    relation_to_set = {
        "obj": "objects",
        "iobj": "objects",
        "nsubj": "subjects",
        "advmod": "modifiers",
    }
    sets = {"verb": verb, "subjects": set(), "objects": set(), "modifiers": set(), "before": set(), "after": set()}

    for sentence in sentences:
        lemmas = [token["lemma"] for token in sentence]
        for position, token in enumerate(sentence):
            if lemmas[position] != verb:
                continue

            # Guard both edges: index -1 would wrap to the last word, and
            # position + 1 is out of range for a sentence-final verb.
            if position > 0:
                sets["before"].add(lemmas[position - 1])
            if position + 1 < len(lemmas):
                sets["after"].add(lemmas[position + 1])

            # Match dependents on the token's own id, not its list position.
            verb_id = token["id"]
            for dependent in sentence:
                target = relation_to_set.get(dependent["deprel"])
                if target is not None and dependent["head"] == verb_id:
                    sets[target].add(dependent["lemma"])

    return sets

For any verb's *lemma* in the set of Swedish sentences, this method generates a dictionary containing sets of the subjects, objects, modifiers, preceding words, and following words corresponding to the given verb. The set of modifiers for the verb only contains adverbs (that is, words with the dependency relation "advmod" to the verb), but not other modifiers like negation, prepositions, or auxiliaries. Including them would likely have skewed our results by adding more noise, since adverbs might be more significant semantically. However, it may have been beneficial to have included other modifiers as well.

## Word 2 Vector Model

After we have the sets of words that we need, we need to make the Word2Vec model. This is what the following function does, returning it in the form of a space so that we can use gensim library functions on it:

In [12]:
# load the word2vec model
def make_W2V(conllu_corpus, seed=1, workers=1):
    """Train a skip-gram Word2Vec model on the corpus lemmas and return its vectors.

    Each sentence is reduced to its list of token lemmas. The model is trained
    for 10 epochs with 300-dimensional vectors, and lemmas seen fewer than 10
    times are dropped from the vocabulary (``min_count=10``), so rare lemmas
    have no vector in the returned space.

    Args:
        conllu_corpus: Iterable of parsed sentences. Each sentence is iterated
            for token mappings with a "lemma" key.
        seed: Seed passed to Word2Vec.
        workers: Number of training threads. Defaults to 1 because
            multi-threaded training makes results vary between runs.
            NOTE(review): gensim documents that fully reproducible runs also
            require a fixed PYTHONHASHSEED — confirm for this environment.

    Returns:
        The trained model's ``wv`` attribute (its word-vector space).
    """
    # Iterating a sentence yields token mappings, so the former
    # `token != "metadata"` filter was always true and has been dropped.
    sentences = [[token["lemma"] for token in tokList] for tokList in conllu_corpus]

    space = Word2Vec(
        sentences,
        epochs=10,
        min_count=10,
        vector_size=300,
        sg=1,
        seed=seed,
        workers=workers,
    )
    return space.wv

Now that we have the Word2Vec model, we can find the k semantically nearest words to any word using its vectors. We can also find the centroid of each set by summing all of the word vectors in the set and finding the vector most similar to the sum. Note that similarity is measured with cosine similarity, which ignores vector length, so we do not need to divide the sum by the number of words (which would give the average vector).

In [13]:
def k_nearest(k, space, vector):
    """Return the ``k`` entries of ``space`` most similar to ``vector``.

    Args:
        k: Number of neighbours to return.
        space: Word-vector space exposing ``most_similar(query, topn=...)``.
        vector: Query forwarded to ``space.most_similar`` unchanged.

    Returns:
        list: Whatever ``most_similar`` yields for the top ``k`` matches,
        truncated to at most ``k`` entries.
    """
    # Request k results explicitly: most_similar returns only 10 by default,
    # so slicing its default output could never give more than 10 neighbours.
    return space.most_similar(vector, topn=k)[:k]

def find_centroid(set: set, space):
    """Find the vocabulary entry closest to the summed vector of a word set.

    The vectors of all words in ``set`` that exist in ``space`` are added
    together and the first result of ``space.similar_by_vector`` for that sum
    is returned. Words without a vector in ``space`` are skipped instead of
    raising KeyError.

    Args:
        set: Collection of words (keys of ``space``).
        space: Word-vector space supporting ``in``, indexing by word, and
            ``similar_by_vector``.

    Returns:
        The first element of ``space.similar_by_vector(total)``.

    Raises:
        ValueError: If ``set`` is empty or none of its words are in ``space``.
    """
    vectors = [space[token] for token in set if token in space]
    if not vectors:
        raise ValueError("find_centroid needs at least one word that is present in the vector space")

    # Sequential sum of the word vectors, without shadowing the builtin sum.
    total = reduce(lambda x, y: x + y, vectors)
    return space.similar_by_vector(total)[0]

# Demo: build the word sets for the lemma "heta", train the vector space,
# locate the centroid of its subjects, and list that centroid's 5 nearest words.
sets = gen_sets(sentences_swedish, verb="heta")
space = make_W2V(conllu_corpus=sentences_swedish)
centroid = find_centroid(sets["subjects"], space=space)

# Only the first element of `centroid` is used as the query.
k_nearest(k=5, space=space, vector=centroid[0])

NameError: name 'gen_sets' is not defined