In [2]:
#includes downloads for necessary nltk components
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


In [3]:
import gzip
import pandas as pd
import numpy as np

In [5]:
with gzip.open('/content/Pet_Supplies_5.json.gz','rb') as f:
  review_frame = pd.read_json(f, lines=True)

In [6]:
review_frame = review_frame.head(210000)

<gzip on 0x7f933ce7a320>


In [7]:
# separate all reviews by rating
reviews_1 = review_frame[review_frame['overall'] == 1][["reviewText","overall"]]
reviews_2 = review_frame[review_frame['overall'] == 2][["reviewText","overall"]]
reviews_3 = review_frame[review_frame['overall'] == 3][["reviewText","overall"]]
reviews_4 = review_frame[review_frame['overall'] == 4][["reviewText","overall"]]
reviews_5 = review_frame[review_frame['overall'] == 5][["reviewText","overall"]]

#now that we're down to 2 columns, clean the data set
reviews_1 = reviews_1.dropna()
reviews_2 = reviews_2.dropna()
reviews_3 = reviews_3.dropna()
reviews_4 = reviews_4.dropna()
reviews_5 = reviews_5.dropna()

#split into training and test samples:
split1 = train_test_split(reviews_1["reviewText"], reviews_1["overall"])
split2 = train_test_split(reviews_2["reviewText"], reviews_2["overall"])
split3 = train_test_split(reviews_3["reviewText"], reviews_3["overall"])
split4 = train_test_split(reviews_4["reviewText"], reviews_4["overall"])
split5 = train_test_split(reviews_5["reviewText"], reviews_5["overall"])
#consolidate training and test examples into two arrays each
X_train = pd.concat([split1[0],split2[0], split3[0], split4[0],split5[0]])
X_test = pd.concat([split1[1],split2[1], split3[1], split4[1],split5[1]])
y_train = pd.concat([split1[2],split2[2], split3[2], split4[2],split5[2]])
y_test = pd.concat([split1[3],split2[3], split3[3], split4[3],split5[3]])


#same as above; shorter dataset
#X_train = pd.concat([split1[0][0:1000],split2[0][0:1000], split3[0][0:1000], split4[0][0:1000],split5[0][0:1000]])
#X_test = pd.concat([split1[1][0:1000],split2[1][0:1000], split3[1][0:1000], split4[1][0:1000],split5[1][0:1000]])
#y_train = pd.concat([split1[2][0:1000],split2[2][0:1000], split3[2][0:1000], split4[2][0:1000],split5[2][0:1000]])
#y_test = pd.concat([split1[3][0:1000],split2[3][0:1000], split3[3][0:1000], split4[3][0:1000],split5[3][0:1000]])


#featurizing input text
def extract_features(text):
    #if word_tokenize is given bad input, the error message is fairly cryptic
    #This was written while troubleshooting; the error doesn't occur for the original dataset
    if not isinstance(text, str):
        assert False, text
    else:
        tokenized = nltk.word_tokenize(text) #tokenizing review text
        if text=="" or len(tokenized)==0: 
            return [0,0,0,0,0,0,0,0,0,0,0,0,0,0]
        classified = nltk.pos_tag(tokenized, tagset='universal')
        _, tags = zip(*classified) #pos_tag returns a list of tuples (word,tag); unzipping this list to count

        #refer to section 2.3 of http://www.nltk.org/book/ch05.html for information on tags
        
        punc_ct = tags.count(".")
        word_ct = len(tokenized) - punc_ct
        if word_ct==0:
            return [0,0,0,0,0,0,0,0,0,0, punc_ct,0, word_ct, 0]
        verb_ct = tags.count("VERB")/word_ct
        noun_ct = tags.count("NOUN")/word_ct
        adj_ct = tags.count("ADJ")/word_ct
        pron_ct = tags.count("PRON")/word_ct
        adv_ct = tags.count("ADV")/word_ct
        adp_ct = tags.count("ADP")/word_ct
        conj_ct = tags.count("CONJ")/word_ct
        det_ct = tags.count("DET")/word_ct
        num_ct = tags.count("NUM")/word_ct
        prt_ct = tags.count("PRT")/word_ct
        badwd_ct = tags.count("X")/word_ct

        if word_ct == 0:
            av_word_len = 0 #prevents divide by 0 errors
        else:
            av_word_len = (len(text)-word_ct-punc_ct+1)/word_ct
        
        
        #verb_ct = tags.count("VERB")
        #noun_ct = tags.count("NOUN")
        #adj_ct = tags.count("ADJ")
        #punc_ct = tags.count(".")
        #badwd_ct = tags.count("X")
        #word_ct = len(tokenized) - punc_ct
        #if word_ct == 0:
        #    av_word_len = 0 #prevents divide by 0 errors
        #else:
        #    av_word_len = (len(text)-word_ct-punc_ct+1)/word_ct
        #the av_word_len calculation is an approximation

        return [verb_ct, noun_ct, adj_ct, pron_ct, adv_ct, adp_ct, conj_ct, det_ct, num_ct, prt_ct, punc_ct, badwd_ct, word_ct, av_word_len]
print(len(X_train))
print(len(X_test))


157460
52490


In [8]:
#featurizing training data
#(this may take 30-45 seconds per 10,000 reviews)

X_train_featurized = []
ctr = -1
broken_review = ""
for train_text in X_train:
    broken_review = train_text
    X_train_featurized.append(extract_features(train_text))
    ctr+=1
    if ctr%10000 == 0:
        print(ctr,"training examples featurized")
print(len(X_train_featurized), "total training examples complete")


#featurizing test data
broken_review = "initial value"
X_test_featurized = []
ctr = -1
for test_ex in X_test:
  broken_review = test_ex
  X_test_featurized.append(extract_features(test_ex))
  ctr+=1
  if ctr%10000 == 0:
    print(ctr,"test examples featurized")

print(len(X_test_featurized), "total test examples complete")

0 training examples featurized
10000 training examples featurized
20000 training examples featurized
30000 training examples featurized
40000 training examples featurized
50000 training examples featurized
60000 training examples featurized
70000 training examples featurized
80000 training examples featurized
90000 training examples featurized
100000 training examples featurized
110000 training examples featurized
120000 training examples featurized
130000 training examples featurized
140000 training examples featurized
150000 training examples featurized
157460 total training examples complete
0 test examples featurized
10000 test examples featurized
20000 test examples featurized
30000 test examples featurized
40000 test examples featurized
50000 test examples featurized
52490 total test examples complete


In [9]:
print("[",broken_review,"]")

[ I was skeptical when a PetSmart employee was trying to get me to buy this for $90 or more.  I'm totally sold now, and would never buy this from PetSmart as it is just throwing away another $50-$60.

I have a great little dog - Minature Schnauzer/Yorkshire mix and she loves to bark as much as any dog.  My problem is I work from home and need to have a quiet house.  I had tried lots of walks and exercise, timeout, a spray bottle and other things, but nothing had worked until this.

This device is fairly simple - it sprays the Citronella when its microphone senses a loud noise.  I know this because my kids shouted into the microphone for fun - and Citronella smells bad, lol.  Citronella smells a lot like Pine Sol floor cleaner - not anything fruity or nice.

This is such a lifesaver.  The minute I put it on my dog she stops barking - whether inside or outside.  My only negative comment would be that when this is off it seems she barks more than she otherwise would, but that is understan

[]


In [10]:
#sanity check--should have equal numbers of data points and classes
print(len(X_train_featurized), len(y_train))
print(len(X_test_featurized), len(y_test))
print(len(X_test_featurized[1]))

157460 157460
52490 52490
14


In [11]:
#regenerates data for binary classifier
y_train_bin = y_train.apply(lambda x: 1 if x>3 else  0)
y_test_bin = y_test.apply(lambda x: 1 if x>3 else 0)
y_train_bin = np.array(y_train_bin)
y_test_bin = np.array(y_test_bin)

In [12]:
X_train_featurized = np.array(X_train_featurized)
X_test_featurized = np.array(X_test_featurized)

In [None]:
print(type(X_train_featurized))

<class 'numpy.ndarray'>


In [13]:
print(X_train_featurized[0:2])

[[1.37931034e-01 2.41379310e-01 1.03448276e-01 1.72413793e-01
  3.44827586e-02 6.89655172e-02 6.89655172e-02 1.03448276e-01
  6.89655172e-02 0.00000000e+00 3.00000000e+00 0.00000000e+00
  2.90000000e+01 3.93103448e+00]
 [2.18750000e-01 1.87500000e-01 9.37500000e-02 1.25000000e-01
  1.25000000e-01 1.25000000e-01 3.12500000e-02 6.25000000e-02
  0.00000000e+00 3.12500000e-02 3.00000000e+00 0.00000000e+00
  3.20000000e+01 4.18750000e+00]]


In [14]:
#fitting linear model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1e5, max_iter=10000)
logreg.fit(X_train_featurized, y_train_bin)


mean_test_score = logreg.score(X_test_featurized, y_test_bin)
print("test accuracy: {:.3f}".format(mean_test_score))

test accuracy: 0.806


In [16]:
print(list(logreg.predict(X_test_featurized)).count(1))
print(list(y_test).count(1))
print(len(X_test_featurized))

from sklearn.metrics import confusion_matrix
y_test_predicted = logreg.predict(X_test_featurized)
confusion_matrix(y_test_bin, y_test_predicted)

52131
3195
52490


array([[  133,  9955],
       [  226, 42176]])

In [None]:
perm = np.random.permutation(len(y_train))
perm = np.array(perm[0:1000],dtype=int)
xt_short = np.array(X_train_featurized)[perm]
yt_short = np.array(y_train)[perm]
yt_short_bin = np.array(y_train_bin)[perm]

In [None]:
from sklearn.svm import SVC
#running SVC on binary class data
nlsvm = SVC(kernel='rbf', gamma=.001, C = 1e7, max_iter=5000000).fit(xt_short, yt_short)
y_test_predicted = nlsvm.predict(X_test_featurized)
a = confusion_matrix(y_test, y_test_predicted)
print(a)
print(np.trace(a)/np.sum(a))



[[  441   278   437   297  1742]
 [  298   207   374   255  1376]
 [  489   351   669   439  2435]
 [  690   480   887   812  4385]
 [ 3048  2199  3593  3801 22507]]
0.469346542198514


In [None]:
#running SVC for binary classification
nlsvm2 = SVC(kernel='rbf', gamma=.1, C = 1e4, max_iter=500).fit(X_train_featurized, y_train_bin)
y_test_predicted = nlsvm2.predict(X_test_featurized)
a = confusion_matrix(y_test_bin, y_test_predicted)
print(a)
print(np.trace(a)/np.sum(a))



[[ 9908   180]
 [41855   547]]
0.1991807963421604


In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_recall_fscore_support

new_classifier = MLPClassifier(hidden_layer_sizes=(50, 50, 50), max_iter=200)
#new_classifier.fit(X_train_featurized, y_train)
new_classifier.fit(xt_short, yt_short_bin)
y_test_predicted = new_classifier.predict(X_test_featurized)

a = confusion_matrix(y_test_bin, y_test_predicted)
print(a)
print(np.trace(a)/np.sum(a))

#print(precision_recall_fscore_support(y_test_bin, y_test_predicted))

[[    0 10088]
 [    4 42398]]
0.8077348066298342


In [None]:
import pickle
filehandler = open("Local/save_wt.obj", "wb")
pickle.dump((X_train_featurized, X_test_featurized,y_train,y_test), filehandler )

In [None]:
##a bit of data analysis...
xtarray = np.array(X_train_featurized)
print("average number of words:", np.mean(xtarray[:,-2]))
print("median number of words:",np.median(xtarray[:,-2]))

average number of words: 46.85034929505906
median number of words: 27.0


Below is an attempt to see if higher word counts mean better classification

In [None]:
#saving featurized data...
import pickle
filehandler = open("save_wt.obj", "wb")
pickle.dump((X_train_featurized, X_test_featurized,y_train,y_test), filehandler)

In [None]:
xtf = np.array(X_train_featurized)
order1 = np.argsort(xtf[:,-2]) #sort training data by increasing word count
perm = order1[70000:]

In [None]:
xts = xtf[perm]
yts = np.array(y_train)[perm]
ytsb = y_train_bin[perm]

In [None]:
perm = np.random.permutation(len(y_train))
perm = np.array(perm[0:1000],dtype=int)
xt_short = np.array(X_train_featurized)[perm]
yt_short = np.array(y_train)[perm]
yt_short_bin = np.array(y_train_bin)[perm]

In [None]:
xtestf = np.array(X_test_featurized)
order2 = np.argsort(xtestf[:,-2])
perm2 = order2[:25000]

In [None]:
xtests = xtestf[perm2]
ytests = np.array(y_test)[perm2]
ytestsb = y_test_bin[perm2]

In [None]:
from sklearn.svm import SVC
#running SVC on binary class data
nlsvm = SVC(kernel='rbf', gamma=.01, C = 1e1, max_iter=200000000).fit(X_train_featurized,y_train_bin)

y_test_predicted = nlsvm.predict(xtests)
a = confusion_matrix(ytestsb, y_test_predicted)
print(a)
print(np.trace(a)/np.sum(a))