<a name="POS-1">Solution for POS Exercise 1</a>

In [None]:
from gensim.models import Word2Vec
import re

def return_documents():
    """
    Load the 20 Newsgroups dataset and hand back its documents.

    Returns the ``data`` attribute of the object produced by
    ``fetch_20newsgroups()``: a list of documents, each one a string.
    """
    # Imported locally so scikit-learn is only needed when this is called.
    from sklearn.datasets import fetch_20newsgroups

    return fetch_20newsgroups().data

def tokenize_and_tag_documents(documents, nlp, sep_char="||||"):
    """
    Lazily yield one list of tagged tokens per document.

    Each document is run through ``nlp.pipe`` (with ``parse=False`` and
    ``entity=False``), and every token is rendered as
    ``<lower-cased text><sep_char><part of speech>``, e.g. ``real||||ADJ``.

    This is a generator: nothing is processed until it is iterated.
    """
    parsed_docs = nlp.pipe(documents, parse=False, entity=False)
    for parsed in parsed_docs:
        tagged_tokens = []
        for token in parsed:
            tagged_tokens.append(token.lower_ + sep_char + token.pos_)
        yield tagged_tokens
        

def build_model(tokenized_docs):
    """
    Train and return a gensim Word2Vec model on the tokenized corpus.

    ``tokenized_docs`` may be any iterable of token lists (including a
    one-shot generator); it is materialised into a list before being handed
    to ``Word2Vec``. The model is built with ``size=128``.
    """
    sentences = list(tokenized_docs)
    return Word2Vec(sentences, size=128)


# Run the full pipeline at module level: load the corpus, tag every token
# with its part of speech, then train the Word2Vec model on the result.
# NOTE(review): `nlp` is not defined in this section — presumably a spaCy
# pipeline loaded earlier in the notebook; confirm.
documents = return_documents()
# This is a generator; it is consumed exactly once, inside build_model().
tokenized_and_tagged_documents = tokenize_and_tag_documents(documents, nlp)
model = build_model(tokenized_and_tagged_documents)


def return_alts(word, sep_char="||||"):
    """
    Return every vocabulary entry of ``model`` that is ``word`` with a tag.

    Vocabulary entries have the form ``<word><sep_char><POS>``; this returns
    the entries that start with ``word + sep_char``, i.e. all part-of-speech
    variants of ``word`` that the model knows.

    ``word`` is matched literally, so tokens containing regex
    metacharacters (``c++``, ``.``, ``(``) are handled correctly.

    Args:
        word: the lower-cased token text to look up.
        sep_char: separator placed between the token and its tag; the
            default matches the default of ``tokenize_and_tag_documents``.

    Returns:
        A list of matching vocabulary keys (empty if there are none).
    """
    prefix = word + sep_char
    return [entry for entry in model.wv.vocab if entry.startswith(prefix)]

if model:
    # Compare the neighbours of "real" as an adjective vs. as an adverb.
    # Queries go through `model.wv` (the same accessor return_alts uses for
    # the vocabulary) rather than the deprecated `model.most_similar`.
    print(model.wv.most_similar('real||||ADJ'))
    print()
    print(model.wv.most_similar('real||||ADV'))

<a name="POS-2">Solution for POS Exercise 2</a>

In [None]:
from sklearn.utils import shuffle
class PerceptronClassifier(object):
    """Binary perceptron trained with the mistake-driven update rule.

    Labels are expected to be encoded as -1 / +1: ``predict`` emits those two
    values, and ``update`` treats ``prediction * label <= 0`` as a mistake.
    A label of 0 always counts as a mistake but yields a zero update, so
    0/1-encoded labels will not train correctly.
    """

    def __init__(self):
        self.weights = None  # 1-D array, one weight per feature (set by initialize)
        self.bias = None     # scalar offset (set by initialize)
        self.iter = 0        # samples visited so far; not reset by fit()

    def fit(self, X, y, epochs=100):
        """Fit ``self.weights`` and ``self.bias`` on the training data.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of n_samples labels in {-1, +1}.
            epochs: number of full passes over the data.

        Returns:
            self, so calls can be chained.
        """
        # Coerce once so plain Python lists are accepted as well as arrays.
        X = np.asarray(X)
        y = np.asarray(y)
        self.initialize(X)

        for _ in range(epochs):
            for row, label in zip(X, y):
                prediction = self.predict(row)
                self.update(prediction, label, row)
                self.iter += 1
            # Reshuffle between passes so the visiting order varies and
            # training does not get trapped by one fixed ordering.
            X, y = shuffle(X, y)
        return self

    def initialize(self, X):
        """Reset the bias to 0 and the weights to zeros, one per column of X."""
        self.bias = 0
        self.weights = np.zeros(np.shape(X)[1])

    def update(self, prediction, label, row):
        """Apply the perceptron rule for one sample.

        When the sample is misclassified (``prediction * label <= 0``) the
        weights move by ``label * row`` and the bias by ``label``; otherwise
        nothing changes.
        """
        if prediction * label <= 0:
            self.weights += label * np.asarray(row)
            self.bias += label

    def predict_score(self, x):
        """Return the raw score ``x . weights + bias`` for one row or a batch."""
        return np.dot(x, self.weights) + self.bias

    def predict(self, x):
        """Convert scores to class labels: +1 where score > 0, otherwise -1."""
        return np.where(self.predict_score(x) > 0, 1, -1)

<a name="DEP-2">Solution to Dependency Parsing Exercise 2</a>

In [None]:
from spacy.matcher import Matcher
from spacy import attrs

def merge_matches(matcher, doc, i, matches):
    '''
    Matcher callback that collapses every matched phrase into a single token.

    Merging changes the token indices of the document, so nothing is done
    until the callback fires for the last match; at that point all matches
    are merged in one go.
    '''
    is_last_match = (i == len(matches) - 1)
    if not is_last_match:
        return None

    # Slice out every span first, while the start/end offsets are still valid.
    pending = []
    for _ent_id, label, start, end in matches:
        pending.append((label, doc[start : end]))

    for label, span in pending:
        # Labelled matches are tagged as proper nouns; unlabelled ones keep
        # the tag of the span's root token.
        merged_tag = 'NNP' if label else span.root.tag_
        span.merge(label=label, tag=merged_tag)


# Define my_python_pattern.
# A pattern is a list of dictionaries, one per token, where each dictionary
# has the form {TOKEN PROPERTY: TOKEN VALUE}.
# Properties are found in spacy.attrs, e.g. POS.
# Here the pattern is a single token whose LOWER attribute equals 'python'.
my_python_pattern = [{attrs.LOWER:'python'}]


# Register a "PYTHON" entity whose matches are handed to merge_matches, then
# attach the pattern to it under the label 'Python'.
matcher = Matcher(nlp.vocab)
matcher.add_entity("PYTHON", on_match = merge_matches)
matcher.add_pattern("PYTHON", my_python_pattern, label='Python')
# Replace the pipeline so the matcher runs after the tagger and parser and
# before the entity recogniser (presumably because merge_matches reads
# span.root.tag_ — confirm).
nlp.pipeline = [nlp.tagger, nlp.parser, matcher, nlp.entity]

# Verb lemmas that are treated as expressing equivalence ("X is Y").
equivalence_verbs = ['be']

def get_all_properties_of_python(text):
    """Parse ``text`` and collect the relations that define python equivalences.

    Exercise scaffold: one entry is appended per relation produced by
    ``extract_relations``; until ``my_relation`` is filled in, every entry
    is ``None``.
    """
    parsed = nlp(text)
    properties = []
    for relation in extract_relations(parsed):
        ### Add your code here to define my_relation.
        ### We want to keep relations that:
        ###   A) have python as a subject
        ###   B) relate python with an equivalence verb (e.g. "be")
        ### Note: to get the normalized "be" from is, are, etc., use token.lemma_

        my_relation = None

        properties.append(my_relation)
    return properties


# Run the extraction on `page`, which is not defined in this section
# (presumably text loaded earlier in the notebook — TODO confirm).
get_all_properties_of_python(page)