In [1]:
import collections
import numpy as np
import pandas as pd
import re
from spellchecker import SpellChecker

from argparse import Namespace

We prepare data based on its analysis. There are a lot of steps that we are doing and this will reduce training set significantly. But this is the only way to get meaningful results. Here's the list of main steps:

- remove all columns except 2;
- remove entries with `NaN`;
- simplify reviews;
- normalize text of reviews;
- remove company names;
- remove reviews with `len < 3` or with `len < 5`;
- remove entries with job titles;

- remove spelling mistakes and non-English reviews;

In [2]:
args = Namespace(
    initial_dataset_csv="../data/employee_reviews.csv",
    prep_dataset_csv_3w="../data/employee_reviews_prep_3w.csv",
    prep_dataset_csv_5w="../data/employee_reviews_prep_5w.csv",
    seed=42
)

In [3]:
df = pd.read_csv(args.initial_dataset_csv)
df.shape

(67529, 17)

### remove all columns except 2

In [4]:
# remove all columns except 2
columns_to_explore = ['summary', 'overall-ratings']
df = df[columns_to_explore]
df = df.rename(index=str,columns={"summary": "review",
                                  "overall-ratings": "rating" 
                                  })
df.head()

Unnamed: 0,review,rating
0,Best Company to work for,5.0
1,"Moving at the speed of light, burn out is inev...",4.0
2,Great balance between big-company security and...,5.0
3,The best place I've worked and also the most d...,5.0
4,"Unique, one of a kind dream job",5.0


### remove entries with `NaN`

In [5]:
# remove NaN - we may see that not too many of them
df = df.dropna()
df.shape

(67409, 2)

### simplify reviews

In [6]:
# we may see that we have 18% of 3.0 reviews; 
# but we have to remove them to get simplified rating
df['rating'].value_counts() / df.shape[0]

5.0    0.343515
4.0    0.333145
3.0    0.187512
2.0    0.078120
1.0    0.057707
Name: rating, dtype: float64

In [7]:
df = df[df['rating'] != 3.0]
df.shape

(54769, 2)

In [8]:
df['rating'].value_counts()

5.0    23156
4.0    22457
2.0     5266
1.0     3890
Name: rating, dtype: int64

In [9]:
# simplify ratings as specified above
simple_dict = {1: 0, 2: 0, 4: 1, 5: 1}
df['rating'] = df.rating.apply(simple_dict.get)
df['rating'].value_counts()

1    45613
0     9156
Name: rating, dtype: int64

### preprocess text

In [10]:
# preprocess the reviews
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?']+", r" ", text)
    return text
    
df.review = df.review.apply(preprocess_text)

In [11]:
df.head()

Unnamed: 0,review,rating
0,best company to work for,1
1,"moving at the speed of light , burn out is ine...",1
2,great balance between big company security and...,1
3,the best place i've worked and also the most d...,1
4,"unique , one of a kind dream job",1


### remove company names

In [12]:
company_names = ['google', 'amazon', 'facebook', 'netflix', 'apple', 'microsoft']

In [13]:
def remove_names(sentence):
    new_sentence = []
    for word in sentence.split():
        if word in company_names:
            new_sentence.append('company')
        else:
            new_sentence.append(word)
    return ' '.join(new_sentence)

In [14]:
df.review = df.review.apply(remove_names)

### remove short reviews

In [15]:
# analyze how many reviews shorter than 5 words we have
def count_words(sentence):
    return len(sentence.split())
df['length'] = df.review.apply(count_words)

In [16]:
df3 = df[df.length >= 3].copy()

In [17]:
df3.shape

(34884, 3)

### remove job titles

In [18]:
job_list = ['senior', 'manager', 'engineer', 'intern', 
            'internship', 'program', 'account', 
            'specialist', 'consultant', 'representative',
            'head', 'director', 'marketing', 'sales',
            'position', 'product', 'management',
            'lead', 'sr', 'associate']
misc_list = ['review']

In [19]:
def is_job_title(sentence):
    for word in sentence.split():
        count = 0
        if word in job_list:
            count += 1
        if word in misc_list:
            return True
    if count > 0 and len(sentence.split()) <= 5:
        return True
    if count > 2:
        return True
    return False

In [20]:
df3['is_job_title'] = df3.review.apply(is_job_title)

In [21]:
df3 = df3[df3['is_job_title'] == False]

In [22]:
df3['is_job_title'].value_counts()

False    31828
Name: is_job_title, dtype: int64

### spell correction

We use `pyspellchecker` (that is based on Norvig's spell corrector) to remove all rows with multiple mistakes. This includes non-English reviews. We don't correct those mistakes. Doing proper spell checking for this project is too complicated for our purposes.

In [23]:
spell = SpellChecker()
spell.word_frequency.load_words(['google', 'amazon', 'facebook', 
                                 'netflix', 'apple', 'microsoft'])

In [24]:
def is_spell_correct(text):
    misspelled = spell.unknown(text.split())
    count = len(misspelled)
    for word in misspelled:
        if "'" in word:
            count -= 1
    return count

In [25]:
df3['is_spell_correct'] = df3.review.apply(is_spell_correct)

In [26]:
df3['is_spell_correct'].value_counts()

0    29463
1     2082
2      196
3       50
4       25
5        7
6        3
8        1
7        1
Name: is_spell_correct, dtype: int64

In [27]:
df3 = df3[df3['is_spell_correct'] <= 1]

In [28]:
df3['is_spell_correct'].value_counts()

0    29463
1     2082
Name: is_spell_correct, dtype: int64

### writing in the files

In [29]:
df5 = df3[df3.length >= 5].copy()

In [30]:
df3 = df3[['review', 'rating']]

In [31]:
df3.head()

Unnamed: 0,review,rating
0,best company to work for,1
1,"moving at the speed of light , burn out is ine...",1
2,great balance between big company security and...,1
3,the best place i've worked and also the most d...,1
4,"unique , one of a kind dream job",1


In [32]:
df5 = df5[['review', 'rating']]

In [33]:
df5.head()

Unnamed: 0,review,rating
0,best company to work for,1
1,"moving at the speed of light , burn out is ine...",1
2,great balance between big company security and...,1
3,the best place i've worked and also the most d...,1
4,"unique , one of a kind dream job",1


In [34]:
df3.to_csv(args.prep_dataset_csv_3w, index=False, header=True)

In [35]:
df5.to_csv(args.prep_dataset_csv_5w, index=False, header=True)

In [36]:
!head -5 "../data/employee_reviews_prep_3w.csv"

review,rating
best company to work for,1
"moving at the speed of light , burn out is inevitable",1
"great balance between big company security and fun , fast moving projects",1
the best place i've worked and also the most demanding .,1


In [37]:
!head -5 "../data/employee_reviews_prep_3w.csv"

review,rating
best company to work for,1
"moving at the speed of light , burn out is inevitable",1
"great balance between big company security and fun , fast moving projects",1
the best place i've worked and also the most demanding .,1
