In [1]:
import collections
import numpy as np
import pandas as pd
import re

from argparse import Namespace

Our data preparation is based on examples from *Natural Language Processing with pytorch* (in particular example of [*YELP review dataset*](https://github.com/joosthub/PyTorchNLPBook/blob/master/chapters/chapter_3/3_5_yelp_dataset_preprocessing_LITE.ipynb). 

- We start from simple ratings (only 2 classes: 0-1). As in this example we create small subset of training examples to tune our model and then retrain it on the full set. We have to preserve number of training examples of each class.
- Next step - split data on train/dev/test sets.

In [26]:
args = Namespace(
    raw_dataset_csv="../data/employee_reviews_to_explore_simple.csv",
    output_munged_csv="../data/reviews_with_splits_simple_5K.csv",
    proportion_subset=.1,
    train_proportion=0.9,
    val_proportion=0.05,
    test_proportion=0.05,
    seed=42
)

In [15]:
reviews = pd.read_csv(args.raw_dataset_csv)
reviews_summary_columns = ['simple_ratings', 'summary']
reviews_summary = reviews[reviews_summary_columns]
reviews_summary = reviews_summary.rename(index=str, 
                                         columns={"simple_ratings": "rating", 
                                                  "summary": "review"})
reviews_summary.head()

Unnamed: 0,rating,review
0,1,Best Company to work for
1,1,"Moving at the speed of light, burn out is inev..."
2,1,Great balance between big-company security and...
3,1,The best place I've worked and also the most d...
4,1,"Unique, one of a kind dream job"


In [86]:
reviews_summary = reviews_summary.dropna()

In [88]:
by_rating = collections.defaultdict(list)
for _, row in reviews_summary.iterrows():
    by_rating[row.rating].append(row.to_dict())

review_subset = []

for _, item_list in sorted(by_rating.items()):

    n_total = len(item_list)
    n_subset = int(args.proportion_subset * n_total)
    review_subset.extend(item_list[:n_subset])
    
review_subset = pd.DataFrame(review_subset)

In [89]:
review_subset.shape

(5476, 2)

In [90]:
reviews_summary.rating.value_counts()

1    45613
0     9156
Name: rating, dtype: int64

In [91]:
review_subset.rating.value_counts()

1    4561
0     915
Name: rating, dtype: int64

In [93]:
# splitting the subset by rating to create our new train, val, and test splits
by_rating = collections.defaultdict(list)
for _, row in review_subset.iterrows():
    by_rating[row.rating].append(row.to_dict())
    
# create split data
final_list = []
np.random.seed(args.seed)

for _, item_list in sorted(by_rating.items()):

    np.random.shuffle(item_list)
    
    n_total = len(item_list)
    n_train = int(args.train_proportion * n_total)
    n_val = int(args.val_proportion * n_total)
    n_test = int(args.test_proportion * n_total)
    
    # give data point a split attribute
    for item in item_list[:n_train]:
        item['split'] = 'train'
    
    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'
        
    for item in item_list[n_train+n_val:n_train+n_val+n_test]:
        item['split'] = 'test'

    # add to final list
    final_list.extend(item_list)

In [94]:
# write split data to file
final_reviews = pd.DataFrame(final_list)

In [95]:
final_reviews.split.value_counts()

train    4927
test      273
val       273
Name: split, dtype: int64

In [98]:
# preprocess the reviews
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text
    
final_reviews.review = final_reviews.review.apply(preprocess_text)

In [100]:
final_reviews['rating'] = final_reviews.rating.apply({0: 'negative', 1: 'positive'}.get)

In [101]:
final_reviews.head()

Unnamed: 0,rating,review,split
0,negative,good place to start work and transit to other ...,train
1,negative,year tier review about amazon,train
2,negative,corporate victims,train
3,negative,"life changing experience , fabulous products ,...",train
4,negative,"too big , full of politics !",train


In [102]:
final_reviews.to_csv(args.output_munged_csv, index=False)