In [1]:
# 3rd party imports
import pandas as pd
import numpy as np
import spacy
# sklearn imports
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Create file path
FILE_PATH = r"C:\Users\Nocx\Downloads\reviews_Tools_and_Home_Improvement_5.json"

In [3]:
# Load into DataFrame
DF = pd.read_json(FILE_PATH, lines=True)

In [4]:
# view head
DF.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,104800001X,"[0, 1]",5,"I hate it when my shirt collars, not otherwise...","01 29, 2014",A4IL0CLL27Q33,D. Brennan,Perfect for collar stay management,1390953600
1,104800001X,"[0, 0]",5,These little magnets are really powerful for t...,"05 31, 2013",A3Q5W5E7TDVLJF,funnyc130,Neat,1369958400
2,104800001X,"[0, 0]",5,I wanted something this small to mount on the ...,"03 13, 2013",A37KNOJXE2FU6,Joseph Yonke II,Very small and thin,1363132800
3,104800001X,"[0, 0]",5,I use these to magnetize my Warhammer 40K mini...,"06 16, 2013",A3U4AFML9SZPWK,"Leith Tussing ""I like tacos.""",Excellent hobby magnets,1371340800
4,104800001X,"[23, 25]",5,They are soo freaking annoying!! Why?! You sp...,"08 9, 2013",A36Y7X194VWVKA,Mark D.,They're annoying... which is why they get five...,1376006400


In [5]:
# Set columns to all lowercase for ease
DF.columns = [x.lower() for x in DF.columns.values]

In [6]:
# Check out one of the reviews
import random

DF.iloc[random.randint(0,137000)]['reviewtext']

'I am one guy who refuses to buy inferior drill bits. These Bosch bits are not the best but this set available on Amazon is one of the better deals you will find. Use these bits properly without abusing them they will last a long time.'

In [7]:
# Don't need all of these columns. Drop the unimportant ones.
DF = DF[['overall', 'reviewtext']].copy()

In [8]:
# Check class balance
DF['overall'].value_counts()

5    85266
4    28336
3    10769
1     5143
2     4962
Name: overall, dtype: int64

In [9]:
# Seems pretty imbalanced...
print("5/5:", round((85266/134476), 3)*100)
print("4/5:", round((28336/134476), 3)*100)
print("3/5:", round((10769/134476), 3)*100)
print("2/5:", round((5143/134476), 3)*100)
print("1/5:", round((4962/134476), 3)*100)

5/5: 63.4
4/5: 21.099999999999998
3/5: 8.0
2/5: 3.8
1/5: 3.6999999999999997


In [10]:
# To fix, let's change the problem
print("5/5:", round((85266/134476), 3)*100,"%")
print("Not a 5:", round(((28336+10769+5143+4962)/134476), 3)*100,"%")

5/5: 63.4 %
Not a 5: 36.6 %


In [11]:
# Create target column
DF['is_5'] = [1 if x == 5 else 0 for x in DF['overall']]

## Setting up a test case

In [12]:
# Get a random document
T1 = DF.iloc[5823:5825]

In [13]:
T1['reviewtext']

5823    Well, now I remember why I don't like this 'cl...
5824    I bought this and returned it. It jammed every...
Name: reviewtext, dtype: object

### Tokenize this text

In [14]:
nlp = spacy.load('en_core_web_lg')

In [16]:
I = 5823
STOPWORDS = [' ', '  ']

for doc in nlp.pipe(T1['reviewtext']):
    print(f"Doc {I}: Is 5? {DF.iloc[I]['is_5']}")
    I += 1
    
    for token in doc:
        if (not token.is_punct) & \
        (not token.is_stop) & \
        (not token.is_digit) & \
        (str(token) not in SPACES):
            print(str(token.lemma_).lower())
print('\n')

Doc 5823: Is 5? 0
remember
like
classic
stapler
look
feel
like
heavy
stapler
grow
heavy
take
lot
pressure
squeeze
handle
staple
fabric
batting
wood
headboard
misfire
staple
need
tap
hammer
good
old
stapler
previous
project
stapler
squeeze
handle
like
better
maybe
misfire
couple
time
stapler
stick
past
heavy
cumbersome
use
like
load
staple
slide
work
yes
fun
use
price
picky
probably
fine
Doc 5824: Is 5? 0
buy
return
jam
staple
christmas
light
minute
job
turn
hour
clearing
jam
staple
desirable
go
great
review
money
quality
product
thing
cheap
usa
reason
company
hire
low
bidder




In [None]:
# # Create new documents from processed tokens
# DOCS = list()

# for doc in nlp.pipe(DF['reviewtext']):
#     # make an empty list for tokens
#     t = list()
#     for token in doc:
#         # Filter the tokens
#         if (not token.is_punct) &\
#         (not token.is_stop) &\
#         (not token.is_digit) &\
#         (str(token) not in STOPWORDS):
#             t.append(str(token.lemma_).lower())
#     DOCS.append(' '.join(t))
    
# ######################
# #    INTERRUPTED     #
# ######################

In [137]:
# Finished 1999 in the time it ran. Let's just use the first 1000
DFS = DF.loc[:999, :].copy()

In [138]:
# Create cleaned column
DFS['cleaned'] = docs[:1000]

In [139]:
DFS

Unnamed: 0,overall,reviewtext,is_5,cleaned
0,5,"I hate it when my shirt collars, not otherwise...",1,hate shirt collar secure place button end weir...
1,5,These little magnets are really powerful for t...,1,little magnet powerful size secret compartment...
2,5,I wanted something this small to mount on the ...,1,want small mount filagree wood piece cut mount...
3,5,I use these to magnetize my Warhammer 40K mini...,1,use magnetize warhammer k miniature allow swap...
4,5,They are soo freaking annoying!! Why?! You sp...,1,soo freaking annoying spend time da*n near bre...
5,5,"am using for 40k models, they are a great size...",1,40k model great size add jet pack use ork spac...
6,5,The color pictures and exploded diagrams are a...,1,color picture explode diagram outstanding intr...
7,3,Good simple projects to start you using the Kr...,0,good simple project start kreg tool joint book...
8,5,These are projects that people can do with pre...,1,project people pretty basic wood tool nice tab...
9,5,If you have a pocket holl jig then this book i...,1,pocket holl jig book nice project book simple


In [140]:
# Feature / target split
X = DFS['cleaned']
y = DFS['is_5']

In [153]:
# Do a train / test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [160]:
X_train.shape,y_train.shape, X_test.shape,y_test.shape

((750,), (750,), (250,), (250,))

In [174]:
# Vectorize
def embed(docs):
    return [nlp(doc).vector for doc in docs]

# Instantiate model
classifier = RandomForestClassifier(n_estimators=100, random_state=107,
                                   max_depth=7)
# Fit the model on training data
classifier.fit(embed(X_train), y_train)

# Get predictions
pred = classifier.predict(embed(X_test))

In [175]:
from collections import Counter
Counter(pred)[1] / 250

0.82

In [176]:
classifier.score(embed(X_test), y_test)

0.688