In [1]:
import numpy as np
import urllib
import scipy.optimize
import random
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
import string
from nltk.stem.porter import *
from sklearn import linear_model
from sklearn import svm
import re
import math
import json

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/shuo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def parseData(fname):
    for l in urllib.urlopen(fname):
        yield eval(l)

In [4]:
### all reviews

print "Reading data..."
data = list(parseData("Digital_Music_5.json"))
print "done"

Reading data...
done


In [5]:
print data[0]

{'reviewerID': 'A3EBHHCZO6V2A4', 'asin': '5555991584', 'reviewerName': 'Amaranth "music fan"', 'helpful': [3, 3], 'unixReviewTime': 1158019200, 'reviewText': 'It\'s hard to believe "Memory of Trees" came out 11 years ago;it has held up well over the passage of time.It\'s Enya\'s last great album before the New Age/pop of "Amarantine" and "Day without rain." Back in 1995,Enya still had her creative spark,her own voice.I agree with the reviewer who said that this is her saddest album;it is melancholy,bittersweet,from the opening title song."Memory of Trees" is elegaic&majestic.;"Pax Deorum" sounds like it is from a Requiem Mass,it is a dark threnody.Unlike the reviewer who said that this has a "disconcerting" blend of spirituality&sensuality;,I don\'t find it disconcerting at all."Anywhere is" is a hopeful song,looking to possibilities."Hope has a place" is about love,but it is up to the listener to decide if it is romantic,platonic,etc.I\'ve always had a soft spot for this song."On my w

In [6]:
random.shuffle(data)
train_set = data[:20000]
val_set = data[20000:40000]
test_set = data[40000:]

In [7]:
# baseline: global average
g_sum = 0
count = 0
for d in train_set+val_set:
    g_sum += d['overall']
    count += 1
g_avg = g_sum / count
print g_avg

4.2213


In [8]:
MSE_base = 0
count = 0
for d in test_set:
    MSE_base += (g_avg-d['overall'])**2
    count += 1
MSE_base /= count
print MSE_base

1.18980346414


In [9]:
def process(d, n='1', punc=False):
    review = d['reviewText']
    wl = None
    if punc:
        wl = re.findall(r"[\w]+|[^\s\w]", review.lower())
    else:
        r = ''.join([c for c in review.lower() if not c in set(string.punctuation)])
        wl = r.split()
    
    stop_words = set(stopwords.words('english'))
    wl_nostop = []
    for w in wl:
        if not w in stop_words:
            wl_nostop.append(w)
            
    wl = wl_nostop

    if n == '1':
        return wl
    elif n == '2':
        return list(nltk.bigrams(wl))
    else:
        return list(nltk.trigrams(wl))

In [10]:
"Only select top 1000 common ones"
def gen_count_dict(raw_data, n='1', punc=False):
    wordCount = defaultdict(int)
    for d in raw_data:
        ngram_l = process(d, n, punc)
        for w in ngram_l:
            wordCount[w] += 1
            
    counts = [(wordCount[w], w) for w in wordCount]
    counts.sort()
    counts.reverse()
    
    count_dict = defaultdict(int)
    for (c, w) in counts[:1000]:
        count_dict[w] = c
    return count_dict

def gen_idf_dict(raw_data, count_dict, n='1', punc=False):
    reviewset_l = []
    idfDict = defaultdict(float)
    for d in raw_data:
        ngram_l = process(d, n, punc)
        ngram_s = set(ngram_l)
        for w in ngram_s:
            if w in count_dict:
                idfDict[w] += 1.0
    N = len(raw_data)
    for w in idfDict:
        idfDict[w] = math.log10(N/idfDict[w])
    return idfDict

In [11]:
def gen_vector(ngram_l, is_tfidf, dictId, idfdict=None):
    N = len(ngram_l)
    feat = [0.0]*len(dictId)
    for w in ngram_l:
        if w in dictId:
            feat[dictId[w]] += 1.0
    if is_tfidf:
        for w in set(ngram_l):
            if w in dictId:
                feat[dictId[w]] /= N
                feat[dictId[w]] *= idfdict[w]
    feat.append(1)
    return feat


In [21]:
# Pipeline with different lambdas
def validation(trainX, trainy, valX, valy, clfname): #or 'logistic', 'svm'
    opt_lam = 1
    opt_MSE = float('Inf')
    for lam in [0.005, 0.01, 0.1, 1, 10, 100, 500]:
        clf = None
        if clfname == 'ridge':
            clf = linear_model.Ridge(lam, fit_intercept=False)
        elif clfname == 'lasso':
            clf = linear_model.Lasso(lam, fit_intercept=False)
        elif clfname == 'logistic':
            clf = linear_model.LogisticRegression(C=(1.0/lam), max_iter=500, fit_intercept=False)
        else:
            clf = svm.SVC(C=(1.0/lam), kernel='linear', max_iter=500)
        
        print lam
        clf.fit(trainX, trainy)
        theta = clf.coef_
        
        predictions_val = clf.predict(valX)
        MSE_val = sum(np.square(np.subtract(predictions_val, valy)))/len(valy)
        
        print MSE_val
        
        if MSE_val < opt_MSE:
            opt_lam = lam
            opt_MSE = MSE_val
            
    print 'Best lambda: {}'.format(opt_lam)
    return opt_lam

def test_model(trainX, trainy, testX, testy, opt_lam, clfname): #or 'logistic', 'svm'
    clf = None
    if clfname == 'ridge':
        clf = linear_model.Ridge(opt_lam, fit_intercept=False)
    elif clfname == 'lasso':
        clf = linear_model.Lasso(opt_lam, fit_intercept=False)
    elif clfname == 'logistic':
        clf = linear_model.LogisticRegression(C=(1.0/opt_lam), max_iter=500, fit_intercept=False)
    else:
        clf = svm.SVC(C=(1.0/opt_lam), kernel='linear', max_iter=500)
    
    clf.fit(trainX, trainy)
    
    predictions_test = clf.predict(testX)
    MSE_test = sum(np.square(np.subtract(predictions_test, testy)))/len(testy)
    
    print '{}\'s MSE on test set with lambda {}: {}'.format(clfname, opt_lam, MSE_test)
    
    return clf

# Use word count

## without punctuation

### unigram

In [37]:
count_dict = gen_count_dict(train_set, '1')
dictId = dict(zip(count_dict.keys(), range(len(count_dict.keys()))))

In [38]:
X_train = [gen_vector(process(d, '1'), False, dictId) for d in train_set]
y_train = [d['overall'] for d in train_set]
X_val = [gen_vector(process(d, '1'), False, dictId) for d in val_set]
y_val = [d['overall'] for d in val_set]
X_test = [gen_vector(process(d, '1'), False, dictId) for d in test_set]
y_test = [d['overall'] for d in test_set]

In [39]:
count_dict1 = gen_count_dict(train_set+val_set, '1')
dictId1 = dict(zip(count_dict1.keys(), range(len(count_dict1.keys()))))
X_train1 = [gen_vector(process(d, '1'), False, dictId1) for d in train_set+val_set]
y_train1 = [d['overall'] for d in train_set+val_set]
X_test1 = [gen_vector(process(d, '1'), False, dictId1) for d in test_set]
y_test1 = [d['overall'] for d in test_set]

In [40]:
# Ridge
opt_lam = validation(X_train, y_train, X_val, y_val, 'ridge')
clf = test_model(X_train1, y_train1, X_test1, y_test1, opt_lam, 'ridge')

0.005
0.9492711886831418
0.01
0.9492702823961201
0.1
0.9492539773524599
1
0.9490917707055463
10
0.9475514910231398
100
0.9384069928816849
500
0.9569689626860954
Best lambda: 100
ridge's MSE on test set with lambda 100: 0.918352527249


In [41]:
coeff = list(clf.coef_)
id_l =  [(c, coeff.index(c)) for c in coeff]
top_coef = sorted(id_l, key=lambda x: x[0], reverse=True)[:100]
low_coef = sorted(id_l, key=lambda x: x[0])[:100]

top_coef_word = []
for (coef, wordid) in top_coef:
    for k in dictId:
        if wordid == dictId[k]:
            top_coef_word.append((coef, k))
            break
print top_coef_word

low_coef_word = []
for (coef, wordid) in low_coef:
    for k in dictId:
        if wordid == dictId[k]:
            low_coef_word.append((coef, k))
            break
print low_coef_word

[(0.1302169498622831, 'totally'), (0.12396101183208154, 'range'), (0.11587623548074352, 'none'), (0.11567164796668233, 'pieces'), (0.11426809639954347, 'near'), (0.1053192146849416, 'right'), (0.10469499878714929, 'awesome'), (0.10400092109315374, 'isnt'), (0.10030345227107718, 'able'), (0.10024059262780323, 'dont'), (0.09960628624560872, 'beauty'), (0.09742731570800171, 'j'), (0.09714526888498676, 'singing'), (0.09700108697739698, 'drums'), (0.09559349738036184, 'thats'), (0.09420579664345526, 'inspired'), (0.091992772198558, '40'), (0.09145183880266858, 'peace'), (0.09115174910306555, 'tori'), (0.08832218561559789, 'keep'), (0.08759850237439433, 'club'), (0.0870000614546181, 'decided'), (0.08532756107184884, 'hiphop'), (0.08392964496581241, 'effort'), (0.08359541969921047, 'contemporary'), (0.0822570709575344, 'listening'), (0.0815823658490935, 'roll'), (0.08133828300681442, 'cds'), (0.08131469534078077, 'always'), (0.0807995196378618, 'moody'), (0.07986003076744472, 'felt'), (0.0793

In [71]:
# Logistic
opt_lam = validation(X_train, y_train, X_val, y_val, 'logistic')
test_model(X_train1, y_train1, X_test1, y_test1, opt_lam, 'logistic')

0.005
1.3137
0.01
1.3144
0.1
1.3004
1
1.2807
10
1.2791
100
1.42165
500
1.58475
Best lambda: 10
logistic's MSE on test set with lambda 10: 1.20630616045


In [74]:
# SVM
opt_lam = validation(X_train, y_train, X_val, y_val, 'svm')
test_model(X_train1, y_train1, X_test1, y_test1, opt_lam, 'svm')

0.005
2.85675
0.01
2.85675
0.1
2.85675
1
2.6451
10
2.1944
100
2.04695
500
1.86715
Best lambda: 500
svm's MSE on test set with lambda 500: 1.78248198818


### bigram

In [43]:
count_dict = gen_count_dict(train_set, '2')
dictId = dict(zip(count_dict.keys(), range(len(count_dict.keys()))))

In [44]:
X_train = [gen_vector(process(d, '2'), False, dictId) for d in train_set]
y_train = [d['overall'] for d in train_set]
X_val = [gen_vector(process(d, '2'), False, dictId) for d in val_set]
y_val = [d['overall'] for d in val_set]
X_test = [gen_vector(process(d, '2'), False, dictId) for d in test_set]
y_test = [d['overall'] for d in test_set]

In [45]:
count_dict1 = gen_count_dict(train_set+val_set, '2')
dictId1 = dict(zip(count_dict1.keys(), range(len(count_dict1.keys()))))
X_train1 = [gen_vector(process(d, '2'), False, dictId1) for d in train_set+val_set]
y_train1 = [d['overall'] for d in train_set+val_set]
X_test1 = [gen_vector(process(d, '2'), False, dictId1) for d in test_set]
y_test1 = [d['overall'] for d in test_set]

KeyboardInterrupt: 

In [78]:
# Ridge
opt_lam = validation(X_train, y_train, X_val, y_val, 'ridge')
test_model(X_train1, y_train1, X_test1, y_test1, opt_lam, 'ridge')

0.005
1.072414964072224
0.01
1.0724098390465409
0.1
1.0723178967900966
1
1.0714290363964005
10
1.0647343376013882
100
1.0580392194042043
500
1.1050404841076331
Best lambda: 100
ridge's MSE on test set with lambda 100: 1.00940212094


In [79]:
# Logistic
opt_lam = validation(X_train, y_train, X_val, y_val, 'logistic')
test_model(X_train1, y_train1, X_test1, y_test1, opt_lam, 'logistic')

0.005
1.613
0.01
1.60705
0.1
1.58425
1
1.5709
10
1.6354
100
1.77155
500
1.8311
Best lambda: 1
logistic's MSE on test set with lambda 1: 1.50489759573


In [80]:
# SVM
opt_lam = validation(X_train, y_train, X_val, y_val, 'svm')
test_model(X_train1, y_train1, X_test1, y_test1, opt_lam, 'svm')

0.005
3.3607
0.01
3.3427
0.1
3.3172
1
3.02985
10
2.0908
100
1.84535
500
1.84415
Best lambda: 500
svm's MSE on test set with lambda 500: 1.78871529183


# Use TF-IDF

## without punctuation

### unigram

In [46]:
count_dict = gen_count_dict(train_set, '1')
idf_dict = gen_idf_dict(train_set, count_dict, '1')
dictId = dict(zip(count_dict.keys(), range(len(count_dict.keys()))))

In [47]:
X_train = [gen_vector(process(d, '1'), True, dictId, idf_dict) for d in train_set]
y_train = [d['overall'] for d in train_set]
X_val = [gen_vector(process(d, '1'), True, dictId, idf_dict) for d in val_set]
y_val = [d['overall'] for d in val_set]
X_test = [gen_vector(process(d, '1'), True, dictId, idf_dict) for d in test_set]
y_test = [d['overall'] for d in test_set]

In [48]:
count_dict1 = gen_count_dict(train_set+val_set, '1')
idf_dict1 = gen_idf_dict(train_set, count_dict1, '1')
dictId1 = dict(zip(count_dict1.keys(), range(len(count_dict1.keys()))))
X_train1 = [gen_vector(process(d, '1'), True, dictId1, idf_dict1) for d in train_set+val_set]
y_train1 = [d['overall'] for d in train_set+val_set]
X_test1 = [gen_vector(process(d, '1'), True, dictId1, idf_dict1) for d in test_set]
y_test1 = [d['overall'] for d in test_set]

In [51]:
print X_train1[0]

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.011623979668720988, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.04607870247324972, 0.0, 0.0, 0.011068336778252398, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.04151371396733899, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

In [49]:
print len(X_train[0])

1001


In [50]:
# Ridge
opt_lam = validation(X_train, y_train, X_val, y_val, 'ridge')
clf = test_model(X_train1, y_train1, X_test1, y_test1, opt_lam, 'ridge')

0.005
0.8802483244257694
0.01
0.8783215106287362
0.1
0.8627919981218706
1
0.9209473217834379
10
1.0948129740386394
100
1.1569599314382473
500
1.1750609457655696
Best lambda: 0.1
ridge's MSE on test set with lambda 0.1: 0.85444190165


In [36]:
coeff = list(clf.coef_)
id_l =  [(c, coeff.index(c)) for c in coeff]
top_coef = sorted(id_l, key=lambda x: x[0], reverse=True)[:100]
low_coef = sorted(id_l, key=lambda x: x[0])[:100]

top_coef_word = []
for (coef, wordid) in top_coef:
    for k in dictId:
        if wordid == dictId[k]:
            top_coef_word.append((coef, k))
            break
print top_coef_word

low_coef_word = []
for (coef, wordid) in low_coef:
    for k in dictId:
        if wordid == dictId[k]:
            low_coef_word.append((coef, k))
            break
print low_coef_word

[(8.953773125508844, 'decided'), (7.887107336402777, 'range'), (7.653750607619889, 'talent'), (6.607062995092129, 'taste'), (6.599718541502019, '45'), (6.57546923644013, 'peace'), (6.57309815071944, 'inspired'), (6.485733652438709, 'dont'), (6.387091925221731, 'la'), (6.233712145214604, 'dre'), (6.207245595282777, 'beauty'), (6.090263045588114, 'tori'), (5.983020978009756, 'contemporary'), (5.783594083295869, 'roll'), (5.780064675601929, 'quite'), (5.726373670026841, 'beautifully'), (5.4709887691245385, 'rain'), (5.396976529430144, 'upon'), (5.2455206043533265, 'isnt'), (5.237427764650378, 'lady'), (5.2134491599351, 'mellow'), (5.1822529098840375, 'special'), (5.011389491870838, 'wave'), (4.957415229225871, 'crazy'), (4.916765851452452, 'perfectly'), (4.890855412454081, 'keep'), (4.717585929981646, 'listening'), (4.714210157442378, 'near'), (4.70624085004104, 'drums'), (4.6790105752345355, 'started'), (4.643641610645095, 'features'), (4.620085986381194, 'awesome'), (4.5445036485571055,

In [86]:
# Logistic
opt_lam = validation(X_train, y_train, X_val, y_val, 'logistic')
test_model(X_train1, y_train1, X_test1, y_test1, opt_lam, 'logistic')

0.005
1.24275
0.01
1.28065
0.1
1.56595
1
1.8339
10
1.83685
100
1.83685
500
1.83685
Best lambda: 0.005
logistic's MSE on test set with lambda 0.005: 1.16016352303


In [87]:
# SVM
opt_lam = validation(X_train, y_train, X_val, y_val, 'svm')
test_model(X_train1, y_train1, X_test1, y_test1, opt_lam, 'svm')

0.005
1.7719
0.01
1.7006
0.1
1.7291
1
1.8109
10
1.8071
100
1.8012
500
1.801
Best lambda: 0.01
svm's MSE on test set with lambda 0.01: 1.71132518417


### bigram

In [88]:
count_dict = gen_count_dict(train_set, '2')
idf_dict = gen_idf_dict(train_set, count_dict, '2')
dictId = dict(zip(count_dict.keys(), range(len(count_dict.keys()))))

In [89]:
X_train = [gen_vector(process(d, '2'), True, dictId, idf_dict) for d in train_set]
y_train = [d['overall'] for d in train_set]
X_val = [gen_vector(process(d, '2'), True, dictId, idf_dict) for d in val_set]
y_val = [d['overall'] for d in val_set]
X_test = [gen_vector(process(d, '2'), True, dictId, idf_dict) for d in test_set]
y_test = [d['overall'] for d in test_set]

In [90]:
count_dict1 = gen_count_dict(train_set+val_set, '2')
idf_dict1 = gen_idf_dict(train_set, count_dict1, '2')
dictId1 = dict(zip(count_dict1.keys(), range(len(count_dict1.keys()))))
X_train1 = [gen_vector(process(d, '2'), True, dictId1, idf_dict1) for d in train_set+val_set]
y_train1 = [d['overall'] for d in train_set+val_set]
X_test1 = [gen_vector(process(d, '2'), True, dictId1, idf_dict1) for d in test_set]
y_test1 = [d['overall'] for d in test_set]

In [91]:
print X_train1[:5]

[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.015341398783329461, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.016445034888245235, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.011739971837046755, 0.0, 0.0, 0.0, 0.0, 0.0, 0.014479587932067882, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 

In [92]:
# Ridge
opt_lam = validation(X_train, y_train, X_val, y_val, 'ridge')
test_model(X_train1, y_train1, X_test1, y_test1, opt_lam, 'ridge')

0.005
1.112701644930789
0.01
1.1069830094543596
0.1
1.082797514744754
1
1.1337082008782686
10
1.1967235153933302
100
1.2102262481405592
500
1.217879589765983
Best lambda: 0.1
ridge's MSE on test set with lambda 0.1: 1.03385621488


In [93]:
# Logistic
opt_lam = validation(X_train, y_train, X_val, y_val, 'logistic')
test_model(X_train1, y_train1, X_test1, y_test1, opt_lam, 'logistic')

0.005
1.6289
0.01
1.6644
0.1
1.8061
1
1.8365
10
1.83685
100
1.83685
500
1.83685
Best lambda: 0.005
logistic's MSE on test set with lambda 0.005: 1.57475916781


In [94]:
# SVM
opt_lam = validation(X_train, y_train, X_val, y_val, 'svm')
test_model(X_train1, y_train1, X_test1, y_test1, opt_lam, 'svm')

0.005
2.02145
0.01
1.9299
0.1
1.83855
1
1.8405
10
1.84135
100
1.84175
500
1.84175
Best lambda: 0.1
svm's MSE on test set with lambda 0.1: 1.7696106209
