1 file changed: +2 -11 lines changed
Original file line number | Diff line number | Diff line change
34 34
35 35   # load the reviews
36 36   # data courtesy of http://www.cs.jhu.edu/~mdredze/datasets/sentiment/index2.html
37    - positive_reviews = BeautifulSoup(open('electronics/positive.review').read())
   37 + positive_reviews = BeautifulSoup(open('electronics/positive.review').read(), features="html5lib")
38 38   positive_reviews = positive_reviews.findAll('review_text')
39 39
40    - negative_reviews = BeautifulSoup(open('electronics/negative.review').read())
   40 + negative_reviews = BeautifulSoup(open('electronics/negative.review').read(), features="html5lib")
41 41   negative_reviews = negative_reviews.findAll('review_text')
42 42
43    - # there are more positive reviews than negative reviews
44    - # so let's take a random sample so we have balanced classes
45    - # np.random.shuffle(positive_reviews)
46    - # positive_reviews = positive_reviews[:len(negative_reviews)]
47 43
48    - # we can also oversample the negative reviews
49    - diff = len(positive_reviews) - len(negative_reviews)
50    - idxs = np.random.choice(len(negative_reviews), size=diff)
51    - extra = [negative_reviews[i] for i in idxs]
52    - negative_reviews += extra
53 44
54 45   # first let's just try to tokenize the text using nltk's tokenizer
55 46   # let's take the first review for example:
You can’t perform that action at this time.
0 commit comments