# Reading in the Data

In [15]:
import pandas as pd
from bs4 import BeautifulSoup
import re

In [16]:
# Labeled training reviews: tab-separated, with the column names in the first row.
# quoting=3 is csv.QUOTE_NONE, so double quotes inside a review stay as literal text.
train = pd.read_csv(
    '../data/labeledTrainData.tsv',
    sep='\t',
    header=0,
    quoting=3,
)

In [17]:
# Unlabeled reviews: tab-separated, with the column names in the first row.
# quoting=3 is csv.QUOTE_NONE, so double quotes inside a review stay as literal text.
unlabeled_train = pd.read_csv(
    '../data/unlabeledTrainData.tsv',
    sep='\t',
    header=0,
    quoting=3,
)

In [18]:
# Test reviews: tab-separated, with the column names in the first row.
# quoting=3 is csv.QUOTE_NONE, so double quotes inside a review stay as literal text.
test = pd.read_csv(
    '../data/testData.tsv',
    sep='\t',
    header=0,
    quoting=3,
)

In [19]:
# Sanity check: (number of rows, number of columns) of the labeled training frame.
train.shape

(25000, 3)

In [20]:
# Column names of the labeled training frame, as a NumPy array.
train.columns.values

array(['id', 'sentiment', 'review'], dtype=object)

In [21]:
# Look at the first raw review: it still contains HTML line breaks (<br />)
# and the surrounding double quote kept by quoting=3.
train.review[0]

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally

Trying out BeautifulSoup's HTML cleaning on the first review

In [22]:
# Parse the first review as HTML. The parser is named explicitly so every
# environment uses the same one; when it is omitted BeautifulSoup picks
# whichever parser happens to be installed and emits a warning about it.
example1 = BeautifulSoup(train.review[0], "html.parser")

In [23]:
# get_text() keeps only the text nodes. The <br /> tags are dropped with nothing
# put in their place, which is why the output shows "m'kay.Visually" run together.
example1.get_text()

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.The actual feature film bit when it finally starts is only on for 2

Adding to BeautifulSoup's automatic cleaning by using regular expressions to keep only letters

In [24]:
letters_only = re.sub(
    pattern="[^a-zA-Z]",           # any single character that is not an ASCII letter
    repl=" ",                      # replacement text: one space per removed character
    string=example1.get_text(),    # the review text with the HTML tags stripped
)

In [25]:
# Inspect the result: digits and punctuation are now spaces, so runs of several
# spaces mark where they used to be.
letters_only

' With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him The actual feature film bit when it finally starts is only on for    m

Transform all the characters into lowercase

In [26]:
# Lowercase everything so that e.g. "The" and "the" end up as the same token.
lower_case = letters_only.lower()

Now split the lowercased sample review into separate words.

In [27]:
# split() with no argument splits on runs of whitespace, so the repeated spaces
# left behind by the regex substitution do not produce empty tokens.
words = lower_case.split()

---

Importing stopwords from NLTK

In [28]:
from nltk.corpus import stopwords

Just a sample of the first 10 words in the stopword list

In [29]:
# Peek at the first ten entries of NLTK's English stop-word list.
stopwords.words('english')[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

Removing stopwords from the sample review's word list

In [30]:
# Build the stop-word set once, outside the comprehension, so the list is not
# re-read for every word and each membership test is a hash lookup.
stops = set(stopwords.words('english'))
words = [w for w in words if w not in stops]

Removing URLs from the review data

In [46]:
# Replace every http:// or https:// URL in the training reviews with a space.
# The pattern is a raw string so the regex is not parsed as Python string
# escapes, and `s?` makes only the trailing "s" optional (a pattern such as
# `http?` would instead make the "p" optional and match "htt://").
# NOTE(review): unlabeled_train is also parsed into sentences further down but
# is not URL-cleaned here — confirm whether that is intended.
train["review"] = train["review"].map(
    lambda text: re.sub(r"https?://\S*", " ", text)
)

Creating a custom function to convert reviews to a sequence of words, with the option of removing stopwords (because in some later analyses looking at entire sentences, I might not want to remove stops). 

In [38]:
# Stop-word sets already loaded, keyed by language name.
_STOPWORD_CACHE = {}


def _stopword_set(language="english"):
    """Return NLTK's stop words for `language` as a frozenset, loading them only once."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def review_to_wordlist( review, remove_stopwords=False ):
    """Convert one raw review (or a single sentence of it) into a list of words.

    Parameters
    ----------
    review : str
        Raw text, possibly containing HTML markup such as ``<br />``.
    remove_stopwords : bool, default False
        When True, drop every word that appears in NLTK's English stop-word
        list. Left off by default because sentence-level analyses may want
        the stop words kept.

    Returns
    -------
    list of str
        Lowercase tokens consisting only of the letters a-z, in their
        original order.
    """
    # 1. Remove HTML. The parser is named explicitly so results do not depend
    #    on which parsers are installed. A space separator keeps the text on
    #    either side of a tag apart: without it "great<br />movie" would be
    #    returned as the single token "greatmovie".
    review_text = BeautifulSoup(review, "html.parser").get_text(separator=" ")
    #
    # 2. Replace every non-letter (digits, punctuation, ...) with a space
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    #
    # 3. Convert to lower case and split on whitespace
    words = review_text.lower().split()
    #
    # 4. Optionally remove stop words (off by default). The set is cached
    #    because this function is called once per sentence.
    if remove_stopwords:
        stops = _stopword_set()
        words = [w for w in words if w not in stops]
    #
    # 5. Return a list of words
    return words

Loading NLTK's punkt tokenizer for sentence splitting

In [39]:
import nltk.data

In [40]:
# Load the English Punkt sentence tokenizer from NLTK's data files.
# NOTE(review): this presumably requires the "punkt" resource to have been
# downloaded beforehand (e.g. with nltk.download) — confirm on a fresh environment.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

Creating a custom function similar to the one above, but this time for entire sentences

In [41]:
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    """Split a review into sentences and turn each sentence into a word list.

    Parameters
    ----------
    review : str
        Raw review text.
    tokenizer : object
        Sentence splitter; its ``tokenize(text)`` method is called with the
        stripped review and must return the sentences as an iterable.
    remove_stopwords : bool, default False
        Passed through unchanged to ``review_to_wordlist``.

    Returns
    -------
    list
        One ``review_to_wordlist`` result (a list of words) per non-empty
        sentence, in the order the tokenizer produced them.
    """
    # Trim surrounding whitespace, then let the tokenizer find the sentence boundaries.
    pieces = tokenizer.tokenize(review.strip())
    # Empty sentences are skipped; every other one is cleaned and split into words.
    return [
        review_to_wordlist(piece, remove_stopwords)
        for piece in pieces
        if len(piece) > 0
    ]

In [51]:
# Flat list of sentences, each one a list of words. The labeled reviews are
# parsed first and the unlabeled ones are appended after them.
sentences = []

print("Parsing sentences from training set")
for review in train["review"]:
    sentences.extend(review_to_sentences(review, tokenizer))

print("Parsing sentences from unlabeled set")
for review in unlabeled_train["review"]:
    sentences.extend(review_to_sentences(review, tokenizer))


Parsing sentences from training set
Parsing sentences from unlabeled set


In [52]:
# Total number of sentences collected from the labeled and unlabeled reviews.
len(sentences)

795534

In [58]:
# Spot-check the first parsed sentence: a list of lowercase words, punctuation removed.
sentences[0]

['with',
 'all',
 'this',
 'stuff',
 'going',
 'down',
 'at',
 'the',
 'moment',
 'with',
 'mj',
 'i',
 've',
 'started',
 'listening',
 'to',
 'his',
 'music',
 'watching',
 'the',
 'odd',
 'documentary',
 'here',
 'and',
 'there',
 'watched',
 'the',
 'wiz',
 'and',
 'watched',
 'moonwalker',
 'again']

Saving the parsed sentences so they can be used for EDA and modeling in the subsequent notebooks

In [76]:
import pickle

In [79]:
# Persist the parsed sentences with pickle. The file holds binary pickle data
# despite its .txt extension, so it must be read back in 'rb' mode with
# pickle.load — and only from a trusted location, because unpickling can
# execute arbitrary code.
with open('../data/sentences.txt', mode='wb') as sentence_file:
    pickle.dump(sentences, sentence_file)