In [1]:
import pandas as pd

# Load the three tab-separated review files. The first row of each file is
# the header; quoting=3 is csv.QUOTE_NONE, so quote characters inside the
# review text are kept as ordinary characters.
train = pd.read_csv(
    './data/labeledTrainData.tsv',
    sep='\t',
    header=0,
    quoting=3,
)
unlabeled_train = pd.read_csv(
    './data/unlabeledTrainData.tsv',
    sep='\t',
    header=0,
    quoting=3,
)
test = pd.read_csv(
    './data/testData.tsv',
    sep='\t',
    header=0,
    quoting=3,
)

In [14]:
# Define helper functions that (1) turn a sentence into a list of words, and
# (2) turn a review paragraph into a list of word lists (one per sentence).

from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
import nltk.data
# nltk.download('punkt')

def review_to_wordlist(review, remove_stopwords=False):
    """Convert one raw review (or sentence) into a list of lower-case words.

    Args:
        review: Text that may contain HTML tags and markup.
        remove_stopwords: When True, drop every word that appears in NLTK's
            English stop-word list.

    Returns:
        list[str]: The remaining words, in their original order.
    """
    # 1. Remove html tags and markup. The parser is named explicitly so the
    #    result does not depend on which optional parsers happen to be
    #    installed, and bs4 does not have to guess (and warn about) one.
    text_only = BeautifulSoup(review, 'html.parser').get_text()

    # 2. Replace every non-letter character with a space
    letters_only = re.sub('[^a-zA-Z]', ' ', text_only)

    # 3. Lower-case and split on whitespace. `split` must be *called*:
    #    without the parentheses `words` would be the bound method itself,
    #    not a list of words.
    words = letters_only.lower().split()

    # 4. Optionally remove stopwords (a set gives O(1) membership tests)
    if remove_stopwords:
        stops = set(stopwords.words('english'))
        words = [word for word in words if word not in stops]

    return words

# Load NLTK's pre-trained Punkt sentence tokenizer for English. A trained
# tokenizer is used because punctuation, capital letters, etc. are not
# reliable indicators of sentence boundaries on their own.
# NOTE(review): this needs the 'punkt' NLTK resource to be available locally
# (see the commented-out nltk.download('punkt') above); newer NLTK releases
# may no longer provide this pickle path -- confirm against the installed
# version.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def review_to_sentences(review, tokenizer, remove_stopwords=False):
    """Split a review into sentences and turn each one into a word list.

    Args:
        review: The review text as a single string.
        tokenizer: Object whose ``tokenize(text)`` method returns the
            sentences found in ``text``.
        remove_stopwords: Passed through to ``review_to_wordlist``.

    Returns:
        list: One ``review_to_wordlist`` result per non-empty sentence, in
        the order the tokenizer produced them.
    """
    # Trim leading/trailing whitespace before handing the text to the
    # sentence tokenizer.
    pieces = tokenizer.tokenize(review.strip())

    # Skip empty sentences; convert every other one to its word list.
    return [
        review_to_wordlist(piece, remove_stopwords)
        for piece in pieces
        if len(piece) > 0
    ]

In [17]:
# Collect the word lists of every sentence found in the labeled training
# reviews first, then in the unlabeled ones.
sentences = []

for frame in (train, unlabeled_train):
    for review in frame['review']:
        sentences.extend(review_to_sentences(review, tokenizer))



In [18]:
# `sentences` holds every parsed sentence (as a word list) from the labeled
# and unlabeled training reviews; report how many were collected.
print(len(sentences))

795538
