# Data processing stage


In [1]:
import csv
import numpy as np
import time
import json

In [6]:
def read_data(file_name):
    """Read the review texts from the first column of a .csv file.

    The first row of the file is treated as a header and skipped. Only the
    first column of each remaining row is used; any other columns (e.g. a
    sentiment label) are ignored.

    Parameters
    ----------
    file_name : str
        a path to a UTF-8 encoded .csv file whose first column holds the
        review text

    Returns
    -------
    review : a list of raw strings
        one string per non-empty data row, in file order, with every
        '<br /><br />' marker replaced by a single space
    """
    with open(file_name, newline='', encoding="utf8") as csvfile:
        reader = csv.reader(csvfile)
        # Drop the header row up front; the default keeps an empty file from
        # raising StopIteration.
        next(reader, None)
        # csv.reader yields [] for blank lines, so `if row` guards row[0]
        # against an IndexError.
        return [row[0].replace('<br /><br />', ' ') for row in reader if row]

In [7]:
# review, sentiment = read_data("toy_dataset.csv")
# Load the review texts from the IMDB CSV into a list of strings.
review = read_data("IMDB Dataset.csv")
# Sanity check: show the last review that was loaded.
print(review[-1])
# print(sentiment[0])

No one expects the Star Trek movies to be high art, but the fans do expect a movie that is as good as some of the best episodes. Unfortunately, this movie had a muddled, implausible plot that just left me cringing - this is by far the worst of the nine (so far) movies. Even the chance to watch the well known characters interact in another movie can't save this movie - including the goofy scenes with Kirk, Spock and McCoy at Yosemite. I would say this movie is not worth a rental, and hardly worth watching, however for the True Fan who needs to see all the movies, renting this movie is about the only way you'll see it - even the cable channels avoid this movie.


## Basic Tokenization (NOT USED -- skip to next section)

Different tokenization schemes can lead to different results later on. Here we try two types of tokenization: 

(1) Regular expression tokenization where
- a token is any alphabetic string with at most one hyphen or apostrophe inside it (i.e., it should match "a-ha" or "it's", but not "hi--ppo" or "44.44")

(2) Penn Treebank tokenizer

### Stop words
We also remove stop words from our dataset.

In [8]:
# import re #regular expression usage

# # #Example of a regular expression tokenizer:
# # regex_tokenizer = re.compile("\w\w*[-'.]*\w\w*|\S\w*")

# def tokenize(data, tokenizer):
#     """ Reads a vocabulary from a .txt file into a list of strings
    
#     Parameters
#     ----------
#     data : str
#         a list of raw strings
#     tokenizer:
#         a regular expression re.compile() object
        
#     Returns
#     -------
#     tokenized_review : a list of lists - each raw string is tokenized 
#         and lowercased into a list using the tokenizer

#     """
#     tokenized_review = []
    
#     for review in data:
#         tokenized_review.append(re.findall(tokenizer, review.lower()))
    
#     return tokenized_review

In [9]:
# regex_tokenizer = re.compile("[a-z][a-z]*[-']?[a-z][a-z]*|[a-z]")

# tokenized_reviews = tokenize(review, regex_tokenizer)

## Normalization: Tokenization and Lemmatization

Here we tokenize our reviews and reduce each word to its lemma with [Stanza](https://stanfordnlp.github.io/stanza/index.html).

Here are the steps given the name of a file:
1. read it
2. tokenize and lemmatize it
3. lowercase lemmas
4. remove lemmas that are present in a stop word list

In [8]:
# Install Stanza quietly. The leading "!" is an IPython shell escape, so this
# line only works when run inside a notebook.
!pip install -q stanza
import stanza
print("Downloading English model...")
# Fetch Stanza's English resources without progress output.
stanza.download('en', verbose=False)

Downloading English model...


In [9]:
!pip install -U --user -q spacy
import spacy

In [10]:
import nltk
# Disabled workaround: the lines below would replace the default HTTPS
# context with an unverified one before downloading.
# import ssl
# try:
#     _create_unverified_https_context = ssl._create_unverified_context
# except AttributeError:
#     pass
# else:
#     ssl._create_default_https_context = _create_unverified_https_context

# Download the NLTK stop-word corpus without progress output.
nltk.download("stopwords", quiet=True)
from nltk.corpus import stopwords
# English stop-word list, available to pass as the `stopwords` argument of
# tokenize_and_normalize.
stop_words_english = stopwords.words('english')
print("Got stopwords")

Got stopwords


In [13]:
def tokenize_and_normalize(reviews, stopwords=None, tokenize_with_spacy=True, *,
                           start_index=10000, checkpoint_size=1000,
                           output_dir="dataset_lemmatized"):
    """Tokenize, lemmatize and lowercase reviews, writing the result as JSON.

    Each review is run through a Stanza 'tokenize,lemma' pipeline. Every
    lemma is lowercased and, when a stop-word collection is given, lemmas
    found in it are dropped. Progress is saved in checkpoint files so that a
    long run can be partially recovered.

    Parameters
    ----------
    reviews : iterable of str
        raw review texts to process
    stopwords : (optional) collection of strings
        lemmas that should be removed; None (the default) keeps every lemma
    tokenize_with_spacy : bool
        forwarded unchanged to stanza.Pipeline
    start_index : int
        number of reviews that precede `reviews` in the full dataset; the
        first review processed is numbered start_index + 1 when naming
        checkpoint files (default 10000, i.e. numbering starts at 10001)
    checkpoint_size : int
        a checkpoint file is written whenever the running review number is
        a multiple of this value (default 1000)
    output_dir : str
        directory receiving the checkpoint files and the final
        dataset_lemmatized.json; created if it does not exist

    Returns
    -------
    None
        Results are written to disk only. Each checkpoint file holds the
        reviews processed since the previous checkpoint, and
        <output_dir>/dataset_lemmatized.json holds all processed reviews as
        a list of lists of lemmas.

    Raises
    ------
    ValueError
        if checkpoint_size is smaller than 1
    """
    # Function-scope import: the notebook's import cell does not bring in os.
    import os

    if checkpoint_size < 1:
        raise ValueError("checkpoint_size must be a positive integer")

    # Build the set once so each membership test is O(1); None means that no
    # stop-word filtering is applied.
    stopword_set = None if stopwords is None else set(stopwords)

    # Create the output directory up front so a missing folder cannot abort
    # the run at the first checkpoint, after many reviews were processed.
    os.makedirs(output_dir, exist_ok=True)

    nlp = stanza.Pipeline(lang='en', processors='tokenize,lemma', verbose=False,
                          tokenize_no_ssplit=True,
                          tokenize_with_spacy=tokenize_with_spacy)
    print('Pipeline made')
    start_time = time.time()

    reviews_lemmatized = []
    checkpoint_reviews = []  # reviews processed since the last checkpoint
    for count, review in enumerate(reviews, start=start_index + 1):
        doc = nlp(review)
        lemmas = []
        for sent in doc.sentences:
            for word in sent.words:
                try:
                    lemma_lowered = word.lemma.lower()
                except AttributeError:
                    # The lemma has no .lower() (presumably Stanza left it
                    # as None); skip the word instead of failing the run.
                    print(f"Couldn't add {word.text} to lemmatized review. Skip!")
                    continue
                # Keep the lemma unless it is a stop word.
                if stopword_set is None or lemma_lowered not in stopword_set:
                    lemmas.append(lemma_lowered)
        checkpoint_reviews.append(lemmas)

        if count % checkpoint_size == 0:
            pathname = f"{output_dir}/dataset_lemmatized_checkpoint_{str(count).zfill(5)}.json"
            print("writing checkpoint into: " + pathname)
            with open(pathname, 'w') as f:
                json.dump(checkpoint_reviews, f)
            reviews_lemmatized += checkpoint_reviews
            checkpoint_reviews = []

    # Reviews after the last full checkpoint are only part of the final file.
    reviews_lemmatized += checkpoint_reviews
    print(f'Done. Time elapsed: {time.time()-start_time}')

    with open(f"{output_dir}/dataset_lemmatized.json", 'w') as f:
        json.dump(reviews_lemmatized, f)

In [14]:
# tokenize_and_normalize(reviews, stop_words_english)
# Process only the reviews from index 10000 onwards. No stop-word list is
# passed, so `stopwords` stays None. The function's default checkpoint
# numbering begins at 10001, which lines up with this slice.
tokenize_and_normalize(review[10000:])

Pipeline made
Couldn't add wookie to lemmatized review. Skip!
writing checkpoint into: dataset_lemmatized/dataset_lemmatized_checkpoint_11000.json
writing checkpoint into: dataset_lemmatized/dataset_lemmatized_checkpoint_12000.json
writing checkpoint into: dataset_lemmatized/dataset_lemmatized_checkpoint_13000.json
writing checkpoint into: dataset_lemmatized/dataset_lemmatized_checkpoint_14000.json
writing checkpoint into: dataset_lemmatized/dataset_lemmatized_checkpoint_15000.json
writing checkpoint into: dataset_lemmatized/dataset_lemmatized_checkpoint_16000.json
writing checkpoint into: dataset_lemmatized/dataset_lemmatized_checkpoint_17000.json
writing checkpoint into: dataset_lemmatized/dataset_lemmatized_checkpoint_18000.json
writing checkpoint into: dataset_lemmatized/dataset_lemmatized_checkpoint_19000.json
writing checkpoint into: dataset_lemmatized/dataset_lemmatized_checkpoint_20000.json
writing checkpoint into: dataset_lemmatized/dataset_lemmatized_checkpoint_21000.json
wri

In [3]:
# # DO NOT USE!!! Reading the .json dataset
# reviews_lemmatized = []
# for count in range(1000, 11000, 1000):
#     pathname = "dataset_lemmatized/dataset_lemmatized_checkpoint_" + str(count).zfill(5) + ".json"
#     with open(pathname, 'r') as f:
#         reviews_lemmatized += json.load(f)
        
# with open("dataset_lemmatized/dataset_lemmatized.json", 'r') as f:
#     reviews_lemmatized += json.load(f)
    
# np.size(reviews_lemmatized)
# with open("dataset_lemmatized/dataset_lemmatized.json", 'w') as f:
#     json.dump(reviews_lemmatized, f)

50000

### Normalization - removing non-alphabetical lemmas using regular expressions

In [31]:
# Keep only lemmas made up entirely of lowercase ASCII letters (a-z); any
# lemma containing digits, punctuation or other characters is removed.
# NOTE(review): reviews_lemmatized is not assigned by any active cell shown
# here; it must already be loaded in the notebook session - confirm.
import re
# Raw string so that \A and \Z reach the regex engine as written instead of
# being treated as (invalid) string escape sequences.
regex_alpha = re.compile(r'\A[a-z]+\Z')
reviews_normalized = [[word for word in review if regex_alpha.match(word) is not None] for review in reviews_lemmatized]

In [60]:
# Save the regex-filtered reviews (reviews_normalized) to disk as JSON.
with open("dataset_lemmatized/dataset_normalized.json", 'w') as f:
     json.dump(reviews_normalized, f)

#### TFI - singleton words removal for decreasing sparsity
We apply TFI stop-word removal to the normalized reviews from the cell above: words that appear only once in the corpus are removed. This reduces sparsity and improves accuracy.

Saif et al. "On Stopwords, Filtering and Data Sparsity for Sentiment Analysis of Twitter" http://www.lrec-conf.org/proceedings/lrec2014/pdf/292_Paper.pdf

In [55]:
def slim_dictionary_TFI(counter):
    """Return a copy of a word counter without its singleton words.

    Parameters
    ----------
    counter : collections.Counter
        word frequencies; it is left unmodified

    Returns
    -------
    a new Counter containing only the words whose count in `counter` is
    greater than one; each returned count is the original count minus one
    """
    # Work on a copy so the caller's counts are not decremented in place.
    slimmed = counter.copy()
    # Subtract one occurrence of every word; list() snapshots the keys so
    # the counter is not iterated while its values are being updated.
    slimmed.subtract(list(slimmed))
    # Unary plus drops every entry whose count is zero or negative.
    return +slimmed

# Vocabulary restricted to words that occur more than once in reviews_vocab.
# NOTE(review): reviews_vocab is not defined in the cells shown here;
# presumably a collections.Counter of lemma frequencies - confirm.
reviews_vocab_TFI = slim_dictionary_TFI(reviews_vocab)

In [66]:
# Compare the number of distinct words before and after singleton removal.
print(f'Original vocabulary size: {len(reviews_vocab.keys())} \nVocabulary size with singletons removed: {len(reviews_vocab_TFI.keys())}') 

Original vocabulary size: 82764 
Vocabulary size with singletons removed: 50429


In [56]:
# Drop from every review the words that are not in the slimmed vocabulary,
# i.e. the words that were removed as singletons.
reviews_normalized_TFI = [[word for word in review if word in reviews_vocab_TFI] for review in reviews_normalized]

In [68]:
# Save the singleton-filtered reviews to disk as JSON.
with open("dataset_lemmatized/dataset_normalized_TFIstopword.json", 'w') as f:
     json.dump(reviews_normalized_TFI, f)