In [1]:
import csv
import numpy as np
import pandas as pd

In [2]:
def remove_enclosing_quotes(s):
    """Strip one pair of matching enclosing quotes from ``s``.

    The value is returned without its first and last character only when it
    is at least two characters long and starts and ends with the same quote
    character (``"`` or ``'``). Anything else is returned unchanged, so
    unquoted values no longer lose their first and last characters.

    Parameters
    ----------
    s : str
        Field value, possibly wrapped in quotes.

    Returns
    -------
    str
        ``s`` without its enclosing quote pair, or ``s`` itself.
    """
    # Slices (rather than s[0] / s[-1]) keep the empty string safe.
    first, last = s[:1], s[-1:]
    if len(s) >= 2 and first == last and first in ('"', "'"):
        return s[1:-1]
    return s

In [3]:
from dplython import DplyFrame, X, mutate, select, rename

# Read the TSV with CSV quote handling disabled, so quote characters stay in
# the field values and are stripped explicitly in the next step.
dataset = DplyFrame(
    pd.read_csv('../datasets/imdb_reviews.tsv', sep='\t', quoting=csv.QUOTE_NONE)
)
dataset = dataset >> mutate(
    id=X.id.map(remove_enclosing_quotes),
    review=X.review.map(remove_enclosing_quotes)
)

# Split the id on '_' into integer-labelled columns (0 and 1) and append them
# to the frame.
id_rating = dataset['id'].str.split('_', expand=True)
dataset = pd.concat((dataset, id_rating), axis=1)

# Keep the two split columns plus sentiment and review; column 0 becomes the
# new `id` and column 1 becomes `rating`.
dataset = (
    dataset
    >> select(X[0], X[1], X.sentiment, X.review)
    >> rename(id=X[0], rating=X[1])
)

In [4]:
def clean_review(review_text):
    """Turn one raw review into a list of sentences, each a list of words.

    The review has its HTML markup removed with BeautifulSoup, is lower-cased,
    has backslash escape sequences decoded, and is split into sentences with
    NLTK's English punkt tokenizer. Every sentence is then split on
    whitespace and English stop words are dropped.

    Parameters
    ----------
    review_text : str
        Raw review text, possibly containing HTML and backslash escapes.

    Returns
    -------
    list of list of str
        One inner list of remaining words per non-empty sentence.
    """
    from bs4 import BeautifulSoup
    import nltk.data
    from nltk.corpus import stopwords
    import codecs

    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    # Fetch the stop words once per call instead of once per word, and keep
    # them in a set so each membership test is a hash lookup.
    stop_words = set(stopwords.words('english'))

    lowered = BeautifulSoup(review_text, "html.parser").get_text().lower()
    # codecs.escape_decode is the function behind Python 2's 'string_escape'
    # codec; unlike bytes.decode('string_escape') it also exists on Python 3.
    unescaped = codecs.escape_decode(lowered.encode('utf-8'))[0]
    # Strip on the byte string first, then decode, as the original did.
    sentences = tokenizer.tokenize(unescaped.strip().decode('utf-8'))

    return [[w for w in s.split() if w not in stop_words]
            for s in sentences if len(s) > 0]

In [5]:
from tqdm import tqdm, tqdm_notebook

# Register `progress_map` on pandas objects. `tqdm.pandas` takes only the
# keyword options for the progress bar; the `tqdm_notebook` positional
# argument passed here before is not a valid option, so it is dropped.
tqdm.pandas(desc='Cleaning reviews')

# NOTE: this overwrites the `review` column in place, so re-running the cell
# would pass already-cleaned lists to `clean_review`.
dataset['review'] = dataset['review'].progress_map(clean_review)

Cleaning reviews: 100%|██████████| 25000/25000 [12:38<00:00, 32.97it/s]


In [6]:
import pickle

# Wrap `dataset` in a plain pandas DataFrame before pickling (presumably so
# that loading the file does not depend on dplython -- TODO confirm).
with open('../datasets/imdb_reviews.pcl2', 'wb') as pcl_file:
    pickle.dump(pd.DataFrame(dataset), pcl_file)