In [4]:
import collections
import numpy as np
import pandas as pd
import re

from argparse import Namespace

Our data preparation is based on examples from *Natural Language Processing with PyTorch* (in particular the [*YELP review dataset*](https://github.com/joosthub/PyTorchNLPBook/blob/master/chapters/chapter_3/3_5_yelp_dataset_preprocessing_LITE.ipynb) example).

- In the first part of the preparation we get the data ready for the processing mentioned above. We also have a *serious problem*: the `summary` field is quite different from the standard reviews used in sentiment analysis. For example, in many cases it contains a job title rather than a review. So we *remove all reviews that are shorter than 3 words*, and additionally drop reviews shorter than 5 words that contain typical job-title words.
- We start with simplified ratings (only 2 classes: negative/positive). As in the example above, we create a small subset of the training examples to tune our model and then retrain it on the full set. The subset has to preserve the proportion of examples in each class.
- Next step: split the data into train/dev/test sets.

In [91]:
# Central configuration: every path and tunable used by this notebook.
args = Namespace(
    # raw input read at the start of part 1
    initial_dataset_csv="../data/employee_reviews.csv",
    # intended target for the cleaned (>= 3 words) reviews produced by part 1
    raw_dataset_csv_3w="../data/employee_reviews_raw_3w.csv",
    # NOTE(review): not referenced by any visible cell - presumably the
    # 5-word variant of the file above; confirm before relying on it
    raw_dataset_csv_5w="../data/employee_reviews_raw_5w.csv",
    # final CSV carrying the train/val/test `split` column (part 2)
    output_munged_csv="../data/reviews_with_splits_simple_5K.csv",
    # fraction of each rating class kept for the small tuning subset
    proportion_subset=.1,
    # per-class fractions used when assigning the splits
    train_proportion=0.9,
    val_proportion=0.05,
    test_proportion=0.05,
    # passed to np.random.seed before the per-class shuffle
    seed=42
)

### data preparation: part 1

So the main steps in the part 1 are:

- remove all columns except `overall-ratings`, `summary`;
- change `overall-ratings` to simple ratings: `negative` `(1-2)`, `positive` `(4-5)`, remove rows with rating `3.0`;
- preprocess reviews: remove `NaN`; remove punctuation except `!`; make lowercase; **remove all reviews with length less than `3` words**;

In [34]:
df = pd.read_csv(args.initial_dataset_csv)

In [35]:
# remove all columns except 'overall-ratings', 'summary'
columns_to_explore = ['overall-ratings', 'summary']
df = df[columns_to_explore]
# give both columns shorter names; index=str additionally maps every
# row label through str()
df = df.rename(index=str,columns={"overall-ratings": "rating", 
                                  "summary": "review"})
df.head()

Unnamed: 0,rating,review
0,5.0,Best Company to work for
1,4.0,"Moving at the speed of light, burn out is inev..."
2,5.0,Great balance between big-company security and...
3,5.0,The best place I've worked and also the most d...
4,5.0,"Unique, one of a kind dream job"


In [36]:
print(df.shape)

# remove NaN - we may see that not too many of them
df = df.dropna()

print(df.shape)

(67529, 2)
(67409, 2)


In [37]:
# we may see that we have 18% of 3.0 reviews; 
# but we have to remove them to get simplified rating
df['rating'].value_counts() / df.shape[0]

5.0    0.343515
4.0    0.333145
3.0    0.187512
2.0    0.078120
1.0    0.057707
Name: rating, dtype: float64

In [38]:
df = df[df['rating'] != 3.0]
df.shape

(54769, 2)

In [39]:
df['rating'].value_counts()

5.0    23156
4.0    22457
2.0     5266
1.0     3890
Name: rating, dtype: int64

In [40]:
# simplify ratings as specified above: 1-2 -> negative, 4-5 -> positive
# (the ratings are floats such as 5.0; the int keys still match because
# 1 == 1.0 and both hash to the same value)
simple_dict = {1: 'negative', 2: 'negative', 4: 'positive', 5: 'positive'}
# dict.get returns None for unknown keys, so any rating missing from
# simple_dict would end up as None here
df['rating'] = df.rating.apply(simple_dict.get)
df['rating'].value_counts()

positive    45613
negative     9156
Name: rating, dtype: int64

In [41]:
# preprocess the reviews
def preprocess_text(text):
    """Normalise a review: lowercase it and keep only letters and `!`.

    Every `!` is padded with spaces so it becomes its own token, and every
    run of other non-letter characters collapses into a single space.
    Leading/trailing spaces are not stripped.
    """
    lowered = text.lower()
    # isolate exclamation marks as separate tokens
    spaced = re.sub(r"([!])", r" \1 ", lowered)
    # squash everything that is neither a letter nor `!` into one space
    return re.sub(r"[^a-zA-Z!]+", r" ", spaced)
    
df.review = df.review.apply(preprocess_text)

In [42]:
df.head()

Unnamed: 0,rating,review
0,positive,best company to work for
1,positive,moving at the speed of light burn out is inevi...
2,positive,great balance between big company security and...
3,positive,the best place i ve worked and also the most d...
4,positive,unique one of a kind dream job


In [46]:
# count the words in each review so we can see how many very short reviews we have
def count_words(sentence):
    """Return the number of whitespace-separated tokens in `sentence`."""
    words = sentence.split()
    return len(words)
df['length'] = df.review.apply(count_words)

In [60]:
# let's look at what we are going to remove
# as we can see - probably 5 or 6 review out of 20 
# have some sense; it's quite complicated (if possible)
# to distinguish those cases; also we're probably not able to use
# LSTMs with such short reviews
df[df.length < 3].head(20)

Unnamed: 0,rating,review,length
6,positive,software engineer,2
11,positive,software engineer,2
14,positive,mba intern,2
15,positive,review,1
17,positive,analysts,1
23,positive,software engineer,2
24,positive,accountant,1
26,positive,great culture,2
29,positive,great,1
30,positive,it employee,2


In [57]:
# roughly 37% of the reviews are shorter than 3 words, but we have to drop them
df.length[df.length < 3].count() / df.shape[0]

0.36800014606803116

In [61]:
df = df[df.length >= 3]

In [63]:
# basically we reduce our set 2 times
df.shape

(34614, 3)

In [65]:
# we may try to work with reviews of length 10, 15 or 20
# we may also experiment with only long reviews - say 5 or more words;
df.length.value_counts()

3     8001
4     6902
5     5098
6     3632
7     2644
8     1982
9     1433
10    1102
11     859
12     636
13     527
14     376
15     321
16     260
17     215
18     161
19     140
20     109
21      91
22      60
23      26
24      23
25      13
26       2
69       1
Name: length, dtype: int64

In [69]:
# still 3 out of 10 reviews are not relevant
df[df.length < 5].head(10)

Unnamed: 0,rating,review,length
8,positive,google surpasses realistic expectations,4
9,positive,execellent for engineers,3
10,positive,nice place to work,4
16,positive,review of google,3
19,positive,senior account manager,3
28,positive,great place to work,4
32,positive,is very good,3
35,positive,good place to work,4
36,positive,senior software engineer,3
39,positive,best of the best,4


In [71]:
# those reviews are much better
df[df.length >= 5].head(10)

Unnamed: 0,rating,review,length
0,positive,best company to work for,5
1,positive,moving at the speed of light burn out is inevi...,10
2,positive,great balance between big company security and...,11
3,positive,the best place i ve worked and also the most d...,11
4,positive,unique one of a kind dream job,7
5,positive,nice working in google as an intern,7
7,positive,great place to work and progress,6
12,positive,best company to work for !,6
13,positive,still the best place to work !,7
18,positive,great company with no down sides,6


In [76]:
# let's remove some more job titles
def is_job_title(sentence):
    """Heuristically flag short reviews that look like a job title.

    Returns True when the sentence has fewer than 5 whitespace-separated
    words and at least one of them is a known job-title keyword;
    otherwise returns False.
    """
    job_words = {'senior', 'manager', 'engineer', 'intern',
                 'internship', 'program', 'account'}
    tokens = sentence.split()
    # longer sentences are never treated as job titles
    if len(tokens) >= 5:
        return False
    return any(token in job_words for token in tokens)
df['is_job_title'] = df.review.apply(is_job_title)

In [77]:
# about 7% of the remaining reviews (2334 of 34614) are flagged as job titles
df['is_job_title'].value_counts()

False    32280
True      2334
Name: is_job_title, dtype: int64

In [79]:
# as we can see this approach works pretty fine
df[df['is_job_title'] == True].head(20)

Unnamed: 0,rating,review,length,is_job_title
19,positive,senior account manager,3,True
36,positive,senior software engineer,3,True
56,positive,senior software engineer,3,True
57,positive,google summer internship,3,True
69,positive,associate account strategist,3,True
85,positive,senior software engineer,3,True
108,positive,customer engineer google cloud,4,True
115,positive,group product manager,3,True
134,positive,software engineering manager,3,True
140,positive,software engineering intern,3,True


In [80]:
df = df[df['is_job_title'] == False]

In [82]:
df.shape

(32280, 4)

In [86]:
df = df.drop(columns=['length', 'is_job_title'])

In [87]:
df.head()

Unnamed: 0,rating,review
0,positive,best company to work for
1,positive,moving at the speed of light burn out is inevi...
2,positive,great balance between big company security and...
3,positive,the best place i ve worked and also the most d...
4,positive,unique one of a kind dream job


In [90]:
# persist the cleaned reviews (>= 3 words, job-title rows removed);
# the attribute name must match the field defined on `args`
# (`raw_dataset_csv_3w`) - `raw_dataset_csv_3words` does not exist
df.to_csv(args.raw_dataset_csv_3w, index=False)

What else can we do to preprocess this dataset? Well, we may run a spellcheck, remove company names and remove even more job titles. We even have reviews in foreign languages. We may also think about balancing positive and negative reviews.

### data preparation: part 2

In [None]:
# Load the cleaned reviews. `args` defines no `raw_dataset_csv` field, so
# read the file that part 1 is meant to produce (`raw_dataset_csv_3w`).
reviews = pd.read_csv(args.raw_dataset_csv_3w)
# Part 1 stores the columns as `rating`/`review`. The rename only matters
# for an older export that still uses `simple_ratings`/`summary`; rename
# silently skips labels that are not present, so both layouts work.
reviews_summary_columns = ['rating', 'review']
reviews_summary = reviews.rename(index=str,
                                 columns={"simple_ratings": "rating",
                                          "summary": "review"})
reviews_summary = reviews_summary[reviews_summary_columns]
reviews_summary.head()

In [None]:
reviews_summary = reviews_summary.dropna()

In [None]:
# group the review records by their rating
by_rating = collections.defaultdict(list)
for _, row in reviews_summary.iterrows():
    by_rating[row.rating].append(row.to_dict())

# keep the same fraction of every rating class, visiting classes in sorted order
# NOTE(review): the slice takes the *first* rows of each class in file order;
# if the CSV is grouped (e.g. by company) the subset will be biased - confirm
subset_records = []
for rating in sorted(by_rating):
    records = by_rating[rating]
    keep = int(args.proportion_subset * len(records))
    subset_records += records[:keep]

review_subset = pd.DataFrame(subset_records)

In [None]:
review_subset.shape

In [None]:
reviews_summary.rating.value_counts()

In [None]:
review_subset.rating.value_counts()

In [None]:
# splitting the subset by rating to create our new train, val, and test splits
by_rating = collections.defaultdict(list)
for _, row in review_subset.iterrows():
    by_rating[row.rating].append(row.to_dict())

# create split data
final_list = []
np.random.seed(args.seed)

# int() truncates each split size, so a few items per class could end up
# without any `split` value (NaN in the final frame). When the configured
# proportions cover the whole set, those leftovers are assigned to `test`.
proportions_cover_all = np.isclose(
    args.train_proportion + args.val_proportion + args.test_proportion, 1.0
)

for _, item_list in sorted(by_rating.items()):

    # shuffle in place so the split is random but reproducible via the seed
    np.random.shuffle(item_list)

    n_total = len(item_list)
    n_train = int(args.train_proportion * n_total)
    n_val = int(args.val_proportion * n_total)
    if proportions_cover_all:
        n_test = n_total - n_train - n_val
    else:
        # proportions intentionally leave part of the data unassigned
        n_test = int(args.test_proportion * n_total)

    # give data point a split attribute
    for item in item_list[:n_train]:
        item['split'] = 'train'

    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'

    for item in item_list[n_train+n_val:n_train+n_val+n_test]:
        item['split'] = 'test'

    # add to final list
    final_list.extend(item_list)

In [None]:
# gather the labelled records into a DataFrame (the CSV itself is written in the last cell)
final_reviews = pd.DataFrame(final_list)

In [None]:
final_reviews.split.value_counts()

In [None]:
# preprocess the reviews
def preprocess_text(text):
    """Lowercase a review and keep only letters plus `. , ! ?`.

    Each of the four punctuation marks is padded with spaces so it becomes
    its own token; every run of other characters collapses into one space.
    Note: this redefines (shadows) the `preprocess_text` from part 1, which
    keeps only `!`.
    """
    lowered = text.lower()
    # isolate sentence punctuation as separate tokens
    spaced = re.sub(r"([.,!?])", r" \1 ", lowered)
    # squash everything else that is not a letter into a single space
    return re.sub(r"[^a-zA-Z.,!?]+", r" ", spaced)
    
final_reviews.review = final_reviews.review.apply(preprocess_text)

In [None]:
# Map numeric labels (0/1) to their names. Ratings that are already strings
# ('negative'/'positive', as part 1 writes them) are kept unchanged instead
# of being turned into None by a failed dictionary lookup.
rating_names = {0: 'negative', 1: 'positive'}
final_reviews['rating'] = final_reviews.rating.apply(
    lambda rating: rating_names.get(rating, rating)
)

In [None]:
final_reviews.head()

In [None]:
final_reviews.to_csv(args.output_munged_csv, index=False)