In [24]:
import numpy as np
import pandas as pd
import xgboost as xgb
import spacy

In [42]:
colnames = ["product/productId",
            "review/userId",
            "review/helpfulness",
            "review/score",
            "review/time",
            "review/summary",
            "review/text"]

df = pd.read_csv("data/finemuged.csv", encoding="latin1", header=None,
                 names=colnames, quotechar = "\"").sample(10000)

In [43]:
df.head(15)

Unnamed: 0,product/productId,review/userId,review/helpfulness,review/score,review/time,review/summary,review/text
328139,B00473OFXE,AEZM5BJYRGGC7,6/6,5.0,1309132800,SuperDelicious,The best honey ever!!! I have hated honey all...
16169,B005EL6VOY,A2JO2N97YNCD5N,0/0,5.0,1323993600,Great Taste,Bought this in the Maui Costco and thought it ...
343196,B000WFEN74,A2YG3SRC8AMQ6G,3/4,5.0,1247356800,The best in the U.S. market today.,About the only thing wrong I can find with Wel...
373324,B001ELL4ZY,A1BSG175XRZFUI,0/0,5.0,1298851200,Great decaf coffee,I have never cared for decaf coffee except for...
541624,B001BM4RC8,AY8VHSO8K2D2R,0/0,3.0,1254614400,Nature goodness,Really great tasting chips. I really did not e...
135428,B005VOOM4A,AQH1SWR5HA413,2/6,5.0,1337731200,This is a coffee lovers coffee,When we got this coffee my wife promptly told ...
251696,B001PMDYT6,A30QYHMGSDGI0J,33/34,5.0,1181433600,PET FOOD RECALL WORRIES? NO PROBLEM WITH NEWM...,"Upon the initial pet food recall news, I immed..."
18406,B001XVW3DC,A3IO2U3J119OO1,1/1,5.0,1339459200,Finest,Maldon salt is acknowledged to be the most ele...
433979,B003CG1698,A146TIEX466GXH,0/0,5.0,1299196800,My son LOVES this!,I have trouble getting my 7 month old to eat a...
409931,B0046IAJFI,A1TQENMOGWUSR1,3/3,5.0,1313539200,Healthy for my dogs,My dogs eat nothing but Halo brand dog food be...


In [44]:
df = df[['review/score', 'review/helpfulness', 'review/text']]
df.head()

Unnamed: 0,review/score,review/helpfulness,review/text
328139,5.0,6/6,The best honey ever!!! I have hated honey all...
16169,5.0,0/0,Bought this in the Maui Costco and thought it ...
343196,5.0,3/4,About the only thing wrong I can find with Wel...
373324,5.0,0/0,I have never cared for decaf coffee except for...
541624,3.0,0/0,Really great tasting chips. I really did not e...


In [45]:
df['review/helpfulness'] = df['review/helpfulness'].apply(lambda x: eval(x) if x.split('/')[1] != '0' else 0)
df['review/text'] = df['review/text'].apply(lambda x: x.lower())

df.head()

Unnamed: 0,review/score,review/helpfulness,review/text
328139,5.0,1.0,the best honey ever!!! i have hated honey all...
16169,5.0,0.0,bought this in the maui costco and thought it ...
343196,5.0,0.75,about the only thing wrong i can find with wel...
373324,5.0,0.0,i have never cared for decaf coffee except for...
541624,3.0,0.0,really great tasting chips. i really did not e...


In [46]:
import string
nlp = spacy.load('en')
def lemmatize(stringr):
    nopunct = stringr.translate(str.maketrans('','',string.punctuation))
    return' '.join([x.lemma_ for x in nlp(nopunct)])
    
lemmatize(df['review/text'].values[0])

'the good honey ever   i have hat honey all -PRON- lifeus to gag on -PRON- as a child when -PRON- mother would try to give -PRON- to -PRON- for a cough or sore throat   this stuff be different totally   first of all -PRON- be not a syrup   -PRON- be a creamy pale yellowish mass of ecstasyfinely crystalize goop almost custardlike that i have find -PRON- eat spoonful by spoonful right out of the jar   -PRON- be that good   and -PRON- be the pure of the pure and most nutritious   so -PRON- please the sens as well as the mind   delicious stuff   try -PRON-   buy -PRON- before all the bee die due to monsanto genetically modify crop which be kill -PRON- off world wide   hurry buy case of -PRON-   -PRON- will not regret -PRON-   can even be use on woundsbr br i have some that i let sit in a storage bin where -PRON- get too hot and must have cook -PRON-   -PRON- be now melt and dark and somewhat separate between a lite syrup and crispy crystal   -PRON- be still the good though with a little mo

In [47]:
df['review/text'] = df['review/text'].apply(lambda x: lemmatize(x))

KeyboardInterrupt: 

In [3]:
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import classification_report

In [4]:
svd = TruncatedSVD(150)

In [5]:
xtrain = svd.fit_transform(load_npz('data/xtrain.npz'))
xtest = svd.transform(load_npz('data/xtest.npz'))

In [None]:
xtrain

In [6]:
labels = np.load('data/label.npz')

In [7]:
ytrain = labels['train'].reshape(-1, 1)
ytest = labels['test'].reshape(-1, 1)

In [8]:
dtrain = xgb.DMatrix(xtrain, label=ytrain)
dtest = xgb.DMatrix(xtest, label=ytest)

In [None]:
param = {'silent':1, 'objective':'binary:hinge', 'eval_metric':'auc'}
num_round = 100

In [8]:
bst = xgb.train(param, dtrain, num_round)
pred = bst.predict(dtest)
print(classification_report(ytest, pred))

             precision    recall  f1-score   support

          0       0.65      0.42      0.51      4707
          1       0.91      0.96      0.94     28288

avg / total       0.87      0.89      0.88     32995

