In [6]:
import csv
import numpy as np
import time

In [7]:
def read_data(file_name):
    """ Reads a vocabulary from a .txt file into a list of strings
    
    Parameters
    ----------
    file_name : str
        a path to a vocabulary file
    
    Returns
    -------
    review : a list of raw strings
        a list of raw strings from a .csv file
    sentiment: a list of 1 and 0, where 1 is positive and 0 is negative sentiment
    """
    review = []
    sentiment = []
    
    # YOUR CODE HERE
    with open(file_name, newline='') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            review.append(row[0].replace('<br /><br />', ' '))
            # review.append(row[0])
            if row[-1] == "positive":
                sentiment.append(1)
            else:
                sentiment.append(0)
                
    return review[1:], sentiment[1:]

In [8]:
file_name = "toy_dataset.csv"
review, sentiment = read_data(file_name)
# print(review[0])
# print(sentiment[0])

## Basic Tokenization (NOT USED -- skip to next section)

Different tokenization schemes can lead to different results later on. Here we try two types of tokenization:

(1) Regular expression tokenization where

- all alpha strings with one hyphen or apostrophe inside (i.e should be able to find "a-ha" or "it's", but not "hi--ppo" or "44.44")

(2) Penn Treebank tokenizer

### Stop words

We also remove stop words from our dataset.

In [4]:


# import re #regular expression usage

# # #Example of a regular expression tokenizer:
# # regex_tokenizer = re.compile("\w\w*[-'.]*\w\w*|\S\w*")

# def tokenize(data, tokenizer):
#     """ Reads a vocabulary from a .txt file into a list of strings
    
#     Parameters
#     ----------
#     data : str
#         a list of raw strings
#     tokenizer:
#         a regular expression re.compile() object
        
#     Returns
#     -------
#     tokenized_review : a list of lists - each raw string is tokenized 
#         and lowercased into a list using the tokenizer

#     """
#     tokenized_review = []
    
#     for review in data:
#         tokenized_review.append(re.findall(tokenizer, review.lower()))
    
#     return tokenized_review



In [5]:
# regex_tokenizer = re.compile("[a-z][a-z]*[-']?[a-z][a-z]*|[a-z]")

# tokenized_reviews = tokenize(review, regex_tokenizer)


## Normalization: Tokenization and Lemmatization

Here we tokenize and lemmatize our words into its stems with Stanza.

Here are the steps given the name of a file:

- read it
- tokenize and lemmatize it
- lowercase lemmas
- remove lemmas that are present in a stop word list



In [6]:
!pip install -q stanza
import stanza
print("Downloading English model...")
stanza.download('en', verbose=False)

: 

: 

In [1]:
!pip install -U -q spacy
import spacy

: 

: 

In [9]:
print("Getting stopwords...")
import nltk
# import ssl
# try:
#     _create_unverified_https_context = ssl._create_unverified_context
# except AttributeError:
#     pass
# else:
#     ssl._create_default_https_context = _create_unverified_https_context

nltk.download("stopwords", quiet=True)
from nltk.corpus import stopwords
stop_words_english = stopwords.words('english')

Getting stopwords...


In [10]:
def tokenize_and_normalize(reviews, stopwords=None, tokenize_with_spacy=True):
    """Tokenizes, lemmatizes, lowercases and removes stop words if specified
    
    this function takes in a path to a song, reads the song file,
    tokenizes it into words, then lemmatizes and lowercases these words.
    finally, stopwords given to the function are removed from the list of song lemmas
    
    Parameters
    ----------
    file_name : str
        a path to a text file
    stopwords : (optional) list of strings
        stopwords that should be removed
    
    Returns
    -------
    normalized_song : list of strings
        a song represented as a list of its lemmas
    """
    # create stanza Pipeline object: first tokenize, then lemmatize
#     nlp = stanza.Pipeline(lang='en', processors='tokenize, lemma',  verbose=False)
    
#     song = open(file_name, 'r').read();
#     song_doc = nlp(song);
#     normalized_song = []
    
#     for i, sent in enumerate(song_doc.sentences):
#         for word in sent.words:
#             lem = word.lemma.lower()
#             if lem not in stopwords:
#                 normalized_song.append(lem)

    stopword_filter = lambda word : True if stopwords==None else word in stopwords 
    nlp = stanza.Pipeline(lang='en', processors='tokenize,lemma',  verbose=False, tokenize_no_ssplit=True, tokenize_with_spacy=tokenize_with_spacy)

    #nlp = stanza.Pipeline(lang='en', processors={'tokenize': '', 'lemma': ''}, verbose=False, tokenize_no_ssplit=True)
    print('Pipeline made')
    start_time = time.time()
    reviews_lemmatized = []
    for review in reviews:
        review = nlp(review)
        reviews_lemmatized.append([word.lemma.lower() for sent in review.sentences for word in sent.words if stopword_filter(word.lemma.lower())])
    print(f'Time elapsed: {time.time()-start_time}')

    return reviews_lemmatized

In [11]:
reviews_lemmatized_nostopwords = tokenize_and_normalize(reviews)

NameError: name 'reviews' is not defined

In [None]:
reviews_lemmatized_withstopwords = tokenize_and_normalize(reviews, stop_words_english)