In [157]:
%matplotlib inline

import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import numpy as np
import nltk

# Read the raw review file; it has no header row, so the column names are
# assigned by hand right after loading.
df_all = pd.read_csv(
    'amazon_review_full_csv/train.csv',
    header=None,
)
df_all.columns = ['rating', 'title', 'body']

# Working subsets: the first 100,000 rows and the last 1,000 rows.
# Both are copies, so changing them leaves df_all untouched.
df = df_all.iloc[:100000].copy()
df_test = df_all.iloc[-1000:].copy()

In [18]:
# Preview the first ten rows of the working subset.
df.head(10)

Unnamed: 0,rating,title,body
0,3,more like funchuck,Gave this to my dad for a gag gift after direc...
1,5,Inspiring,I hope a lot of people hear this cd. We need m...
2,5,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
3,4,Chrono Cross OST,The music of Yasunori Misuda is without questi...
4,5,Too good to be true,Probably the greatest soundtrack in history! U...
5,5,There's a reason for the price,"There's a reason this CD is so expensive, even..."
6,1,Buyer beware,"This is a self-published book, and if you want..."
7,4,"Errors, but great story",I was a dissapointed to see errors on the back...
8,1,The Worst!,A complete waste of time. Typographical errors...
9,1,Oh please,I guess you have to be a romance novel lover f...


## Data processing and exploration


In [166]:
from nltk.corpus import stopwords
import string
# Tokens to discard during preprocessing: NLTK's English stopword list,
# every single punctuation character, and a handful of multi-character
# tokens (ellipsis, contraction pieces, quote marks).
stop_words = set(stopwords.words('english'))
stop_words.update(string.punctuation)
stop_words.update(['...', "n't", "''", '``', "'s"])

# One lemmatizer instance, reused for every token.
wnl = nltk.WordNetLemmatizer()

def process_string(s):
    """Tokenize, stopword-filter and lemmatize one piece of review text.

    Parameters
    ----------
    s : str, or any value found in a text column
        Raw text. ``None`` and float NaN (how pandas represents a missing
        cell) are treated as empty text; any other non-string value is
        converted with ``str()`` before tokenizing.

    Returns
    -------
    list of str
        Lemmatized, lower-cased tokens that are not in ``stop_words``.
        Empty when there is no text.
    """
    # Missing cells arrive as None/NaN; calling .lower() on them would raise
    # AttributeError. `s != s` is true only for NaN.
    if s is None or (isinstance(s, float) and s != s):
        return []
    text = s if isinstance(s, str) else str(s)
    if not text:
        return []
    tokens = nltk.word_tokenize(text.lower())
    # The stopword test is applied to the raw token, before lemmatization.
    return [wnl.lemmatize(w) for w in tokens if w not in stop_words]
    

# Replace missing text with '' by assigning the filled column back to the
# frame. Calling fillna(inplace=True) on `df.title` operates on an
# intermediate Series: recent pandas warns about that pattern, and under
# Copy-on-Write it leaves `df` unchanged. `body` is filled as well so that
# process_string never receives NaN.
df['title'] = df['title'].fillna('')
df['body'] = df['body'].fillna('')

# Titles: one token list per document, then a single flat token list that
# feeds the frequency distribution. A plain Python flatten keeps tokens as
# ordinary str objects and avoids building a padded fixed-width array.
all_doc_titles = [process_string(x) for x in df['title'].values]
all_titles = [token for doc in all_doc_titles for token in doc]
fd_titles = nltk.FreqDist(all_titles)

# Bodies: same pipeline as for titles.
all_doc_body = [process_string(x) for x in df['body'].values]
all_body = [token for doc in all_doc_body for token in doc]
fd_body = nltk.FreqDist(all_body)

In [167]:
# Print the 20 most frequent tokens, titles first and bodies second.
for freq_dist in (fd_titles, fd_body):
    print(freq_dist.most_common(20))

[('good', 7367), ('great', 6932), ('book', 5353), ('movie', 3227), ('best', 2585), ('one', 2162), ('read', 2104), ('bad', 1905), ('work', 1688), ('better', 1552), ('love', 1453), ('time', 1253), ('like', 1195), ('buy', 1174), ('story', 1173), ('excellent', 1148), ('review', 1139), ('classic', 1128), ('product', 1090), ('ever', 1084)]
[('book', 66762), ('one', 38334), ('movie', 31527), ('like', 31046), ('good', 28108), ('would', 25988), ('read', 23314), ('time', 22498), ('great', 22044), ('get', 20940), ('really', 17331), ('story', 16295), ('much', 15406), ('first', 14420), ('well', 14172), ('make', 13396), ('love', 13134), ('even', 13097), ('could', 13056), ('work', 12273)]


In [58]:
#titles = [nltk.Text(tokens) for tokens in all_doc_titles]
#bodies = [nltk.Text(tokens) for tokens in all_doc_body]

In [168]:
from gensim import corpora, models, similarities

# Vocabulary built from the tokenized review bodies.
dictionary = corpora.Dictionary(all_doc_body)
# Bag-of-words encoding of every body, in the same order as all_doc_body.
corpus = list(map(dictionary.doc2bow, all_doc_body))

In [151]:
#tfidf = models.TfidfModel(corpus) 
#corpus_tfidf = tfidf[corpus]
#print(corpus_tfidf[0])

In [169]:
#lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
#corpus_lsi = lsi[corpus_tfidf]

# Fit a 5-topic LDA model on the bag-of-words corpus. Training is randomised,
# so the seed is pinned: without it every run produces different topics and
# the printed topic listing cannot be reproduced.
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=5, random_state=0)

In [170]:
# Display each fitted topic as a weighted list of its top terms.
lda.print_topics()

[(0,
  '0.054*"book" + 0.019*"read" + 0.009*"story" + 0.008*"one" + 0.007*"character" + 0.007*"would" + 0.007*"reading" + 0.006*"like" + 0.006*"time" + 0.006*"good"'),
 (1,
  '0.016*"album" + 0.015*"song" + 0.014*"cd" + 0.013*"like" + 0.011*"music" + 0.010*"one" + 0.009*"sound" + 0.009*"dvd" + 0.008*"great" + 0.008*"good"'),
 (2,
  '0.016*"game" + 0.007*"like" + 0.005*"use" + 0.005*"would" + 0.005*"also" + 0.005*"one" + 0.004*"get" + 0.004*"make" + 0.004*"look" + 0.004*"good"'),
 (3,
  '0.035*"movie" + 0.012*"film" + 0.011*"one" + 0.009*"good" + 0.009*"like" + 0.006*"really" + 0.006*"time" + 0.006*"see" + 0.006*"great" + 0.006*"bad"'),
 (4,
  '0.011*"one" + 0.010*"would" + 0.009*"product" + 0.008*"work" + 0.008*"get" + 0.007*"time" + 0.007*"bought" + 0.006*"use" + 0.006*"good" + 0.006*"year"')]

In [163]:
# test set
# Tokenize the held-out review bodies. Missing bodies are replaced with ''
# first so that process_string is never handed NaN.
test_bodies = [process_string(x) for x in df_test.body.fillna('').values]
# Encode with the dictionary built on the training bodies.
test_corpus = [dictionary.doc2bow(text) for text in test_bodies]

# Index the test documents in LDA topic space. num_features is passed
# explicitly: the sparse topic vectors may omit low-probability topics, so
# the width inferred from the data could fall short of the real topic count,
# and inferring it costs an extra pass over the transformed corpus.
index = similarities.MatrixSimilarity(lda[test_corpus], num_features=lda.num_topics)

In [164]:
# Query the index with the same test documents it was built from;
# row i of the result is read later as the scores for test document i.
sims = index[lda[test_corpus]]

In [165]:
# test a document and look at similar documents:
ix = 10
test_text = test_bodies[ix]
print(test_text)

# Rank every *other* test document by similarity to document `ix`.
# The query document is removed by position rather than assumed to sort
# first: a duplicate or near-duplicate review can tie (or, through float
# rounding, exceed) its self-similarity and take the top slot.
sorted_sims = sorted(
    ((doc_id, score) for doc_id, score in enumerate(sims[ix]) if doc_id != ix),
    key=lambda pair: pair[1],
    reverse=True,
)

# Show the nine closest documents; slicing copes with shorter rankings
# instead of raising IndexError.
for doc_id, score in sorted_sims[:9]:
    print(doc_id)
    print((doc_id, score), test_bodies[doc_id])

['expansion', '...', 'cheaper', 'would', 'go', 'star', 'fix', 'original', 'engine', 'outstanding', "n't", 'already', 'played', 'icewind', 'dale', 'playing', 'first', 'time', 'expansion', "'ll", 'get', 'lot', 'money', 'already', 'completed', 'i.d', 'hungry', "n't", 'think', 'calm', 'hunger', "'s", 'like', 'snack', 'meal', 'bonus', 'mission', 'lot', 'fun', 'brief', 'simple', 'least', 'one', 'extra', 'long', 'term', 'quest', 'would', 'appreciated', 'shelling', '``', 'money', "''", 'advice', 'wait', 'price', 'go']
912
(912, 0.9999986) ['purchase', 'described', 'fast', 'shipped', 'great', 'condition', 'son', 'act', 'favorite', 'thomas', 'episode', 'rest', 'train', 'look', 'good', 'constructed', 'well', 'changing', 'battery', 'little', 'cumbersome', 'protects', 'motor', 'working', '``', 'bit', "''", 'petty', 'well']
672
(672, 0.9999926) ['love', 'ipaq', 'lack', 'service', 'provided', 'compaq', 'make', 'project', 'almost', 'worthless', 'finally', 'get', 'finished', 'transferred', 'one', 'line

(3000000, 3)