In [1]:
#includes downloads for necessary nltk components
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Anton\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Anton\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\Anton\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [2]:
import gzip
import pandas as pd
import numpy as np

In [3]:
with gzip.open('Digital_Music_5.json.gz','rb') as f:
    review_frame = pd.read_json(f, lines=True)

In [63]:
# separate all reviews by rating
reviews_1 = review_frame[review_frame['overall'] == 1][["reviewText","overall"]]
reviews_2 = review_frame[review_frame['overall'] == 2][["reviewText","overall"]]
reviews_3 = review_frame[review_frame['overall'] == 3][["reviewText","overall"]]
reviews_4 = review_frame[review_frame['overall'] == 4][["reviewText","overall"]]
reviews_5 = review_frame[review_frame['overall'] == 5][["reviewText","overall"]]

#now that we're down to 2 columns, clean the data set
reviews_1 = reviews_1.dropna()
reviews_2 = reviews_2.dropna()
reviews_3 = reviews_3.dropna()
reviews_4 = reviews_4.dropna()
reviews_5 = reviews_5.dropna()

#split into training and test samples:
split1 = train_test_split(reviews_1["reviewText"], reviews_1["overall"])
split2 = train_test_split(reviews_2["reviewText"], reviews_2["overall"])
split3 = train_test_split(reviews_3["reviewText"], reviews_3["overall"])
split4 = train_test_split(reviews_4["reviewText"], reviews_4["overall"])
split5 = train_test_split(reviews_5["reviewText"], reviews_5["overall"])
#consolidate training and test examples into two arrays each
#X_train = pd.concat([split1[0],split2[0], split3[0], split4[0],split5[0]])
#X_test = pd.concat([split1[1],split2[1], split3[1], split4[1],split5[1]])
#y_train = pd.concat([split1[2],split2[2], split3[2], split4[2],split5[2]])
#y_test = pd.concat([split1[3],split2[3], split3[3], split4[3],split5[3]])


#same as above; shorter dataset
X_train = pd.concat([split1[0][0:1000],split2[0][0:1000], split3[0][0:1000], split4[0][0:1000],split5[0][0:1000]])
X_test = pd.concat([split1[1][0:1000],split2[1][0:1000], split3[1][0:1000], split4[1][0:1000],split5[1][0:1000]])
y_train = pd.concat([split1[2][0:1000],split2[2][0:1000], split3[2][0:1000], split4[2][0:1000],split5[2][0:1000]])
y_test = pd.concat([split1[3][0:1000],split2[3][0:1000], split3[3][0:1000], split4[3][0:1000],split5[3][0:1000]])


#featurizing input text
def extract_features(text):
    #if word_tokenize is given bad input, the error message is fairly cryptic
    #This was written while troubleshooting; the error doesn't occur for the original dataset
    if not isinstance(text, str):
        assert False, text
    else:
        tokenized = nltk.word_tokenize(text) #tokenizing review text
        classified = nltk.pos_tag(tokenized, tagset='universal')
        _, tags = zip(*classified) #pos_tag returns a list of tuples (word,tag); unzipping this list to count

        #refer to section 2.3 of http://www.nltk.org/book/ch05.html for information on tags

        verb_ct = tags.count("VERB")
        noun_ct = tags.count("NOUN")
        adj_ct = tags.count("ADJ")
        punc_ct = tags.count(".")
        badwd_ct = tags.count("X")
        word_ct = len(tokenized) - punc_ct
        if word_ct == 0:
            av_word_len = 0 #prevents divide by 0 errors
        else:
            av_word_len = (len(text)-word_ct-punc_ct+1)/word_ct
        #the av_word_len calculation is an approximation

        return [verb_ct, noun_ct, adj_ct, punc_ct, badwd_ct, word_ct, av_word_len]
print(len(X_train))
print(len(X_test))


5000
4001
548


In [10]:
#featurizing training data
#(this may take 30-45 seconds per 10,000 reviews)

X_train_featurized = []
ctr = -1
broken_review = ""
for train_text in X_train:
    broken_review = train_text
    X_train_featurized.append(extract_features(train_text))
    ctr+=1
    if ctr%10000 == 0:
        print(ctr,"training examples featurized")

#featurizing test data

print(len(X_train_featurized), "total training examples complete")

X_test_featurized = []
ctr = -1
for test_ex in X_test:
    X_test_featurized.append(extract_features(test_ex))
    ctr+=1
    if ctr%10000 == 0:
        print(ctr,"test examples featurized")

print(len(X_test_featurized), "total test examples complete")

0 training examples featurized
5000 total training examples complete
0 test examples featurized
4001 total test examples complete


In [11]:
#sanity check--should have equal numbers of data points and classes
print(len(X_train_featurized), len(y_train))
print(len(X_test_featurized), len(y_test))
print(len(X_test_featurized[1]))

5000 5000
4001 4001
7


In [14]:
#regenerates data for binary classifier
y_train_bin = y_train.apply(lambda x: 1 if x>3 else  0)
y_test_bin = y_test.apply(lambda x: 1 if x>3 else 0)

In [79]:
#fitting linear model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1e5)
logreg.fit(X_train_featurized, y_train_bin)


mean_test_score = logreg.score(X_test_featurized, y_test_bin)
print("test accuracy: {:.3f}".format(mean_test_score))

test accuracy: 0.531




In [80]:
print(list(logreg.predict(X_test_featurized)).count(1))
print(list(y_test).count(1))
print(len(X_test_featurized))

from sklearn.metrics import confusion_matrix
y_test_predicted = logreg.predict(X_test_featurized)
confusion_matrix(y_test_bin, y_test_predicted)

416
548
4001


array([[1855,  146],
       [1730,  270]], dtype=int64)

In [108]:
from sklearn.svm import SVC
#running SVC on 5 class data
nlsvm = SVC(kernel='rbf', gamma=.1, C = 1e4, max_iter=1000000).fit(X_train_featurized, y_train)
y_test_predicted = nlsvm.predict(X_test_featurized)
a = confusion_matrix(y_test, y_test_predicted)
print(a)
print(np.trace(a)/np.sum(a))

[[230 135  72  51  60]
 [ 72 215  71  38  57]
 [126 272 351 128 123]
 [147 234 196 211 212]
 [159 176 181 168 316]]
0.3306673331667083


In [111]:
#running SVC for binary classification
nlsvm2 = SVC(kernel='rbf', gamma=.1, C = 1e4, max_iter=10000000).fit(X_train_featurized, y_train_bin)
y_test_predicted = nlsvm2.predict(X_test_featurized)
a = confusion_matrix(y_test_bin, y_test_predicted)
print(a)
print(np.trace(a)/np.sum(a))

[[1441  560]
 [ 942 1058]]
0.6245938515371158
