# Data processing stage


In [66]:
import csv
import numpy as np

In [36]:
def read_data(file_name):
    """ Reads reviews and their sentiment labels from a .csv file
    
    The first non-empty row is treated as a header and skipped. Completely
    empty rows (blank lines in the file) are ignored.
    
    Parameters
    ----------
    file_name : str
        a path to a .csv file; the first column of each row holds the
        review text and the last column holds the sentiment label
    
    Returns
    -------
    review : a list of raw strings
        the review text of every data row, with each '<br /><br />'
        sequence replaced by a single space
    sentiment: a list of 1 and 0
        1 where the label is exactly "positive", 0 for any other label
    """
    review = []
    sentiment = []
    
    # newline='' lets the csv module handle line endings itself, so quoted
    # fields that contain embedded newlines are parsed correctly.
    with open(file_name, newline='') as csvfile:
        # csv.reader yields [] for blank lines; drop them so row[0] is safe.
        rows = (row for row in csv.reader(csvfile) if row)
        # Skip the header row up front instead of slicing it off afterwards.
        next(rows, None)
        for row in rows:
            review.append(row[0].replace('<br /><br />', ' '))
            sentiment.append(1 if row[-1] == "positive" else 0)
                
    return review, sentiment

In [38]:
# Path to the CSV file with the reviews (relative to the working directory).
file_name = "toy_dataset.csv"

# Load the raw review texts and their 0/1 sentiment labels.
review, sentiment = read_data(file_name)

## Tokenization

Different tokenization schemes can lead to different results later on. Here we try two types of tokenization: 

(1) Regular-expression tokenization, where a token is
- an alphabetic string with at most one hyphen or apostrophe inside (i.e., it should find "a-ha" or "it's", but not "hi--ppo" or "44.44")

(2) Penn Treebank tokenizer

### Stop words
We also remove stop words from our dataset.

In [37]:
import re #regular expression usage

# #Example of a regular expression tokenizer:
# regex_tokenizer = re.compile("\w\w*[-'.]*\w\w*|\S\w*")

def tokenize(data, tokenizer):
    """ Splits every raw string into a list of lowercased tokens
    
    Parameters
    ----------
    data : a list of str
        the raw strings to tokenize
    tokenizer:
        a regular expression re.compile() object describing one token
        
    Returns
    -------
    tokenized_review : a list of lists - one inner list per raw string,
        holding all non-overlapping matches of the tokenizer in the
        lowercased string, in order of appearance

    """
    # Lowercase first, so a pattern written for [a-z] also covers capitals.
    return [re.findall(tokenizer, text.lower()) for text in data]

In [64]:
# Token pattern: either a run of lowercase letters that may contain one hyphen
# or apostrophe with at least one letter on each side (e.g. "a-ha", "it's"),
# or a single lowercase letter. Digits and other punctuation never match.
regex_tokenizer = re.compile("[a-z][a-z]*[-']?[a-z][a-z]*|[a-z]")

# Tokenize every review; tokenize() lowercases the text before matching.
tokenized_reviews = tokenize(review, regex_tokenizer)

In [65]:
# Inspect the tokens produced for the first review.
tokenized_reviews[0]

['one',
 'of',
 'the',
 'other',
 'reviewers',
 'has',
 'mentioned',
 'that',
 'after',
 'watching',
 'just',
 'oz',
 'episode',
 "you'll",
 'be',
 'hooked',
 'they',
 'are',
 'right',
 'as',
 'this',
 'is',
 'exactly',
 'what',
 'happened',
 'with',
 'me',
 'the',
 'first',
 'thing',
 'that',
 'struck',
 'me',
 'about',
 'oz',
 'was',
 'its',
 'brutality',
 'and',
 'unflinching',
 'scenes',
 'of',
 'violence',
 'which',
 'set',
 'in',
 'right',
 'from',
 'the',
 'word',
 'go',
 'trust',
 'me',
 'this',
 'is',
 'not',
 'a',
 'show',
 'for',
 'the',
 'faint',
 'hearted',
 'or',
 'timid',
 'this',
 'show',
 'pulls',
 'no',
 'punches',
 'with',
 'regards',
 'to',
 'drugs',
 'sex',
 'or',
 'violence',
 'its',
 'is',
 'hardcore',
 'in',
 'the',
 'classic',
 'use',
 'of',
 'the',
 'word',
 'it',
 'is',
 'called',
 'oz',
 'as',
 'that',
 'is',
 'the',
 'nickname',
 'given',
 'to',
 'the',
 'oswald',
 'maximum',
 'security',
 'state',
 'penitentary',
 'it',
 'focuses',
 'mainly',
 'on',
 'emer