# Imports

In [81]:
import pandas as pd
import json
from nltk.classify.naivebayes import NaiveBayesClassifier
import nltk
import collections as c

In [18]:
# Load the reviews: 'part1.json' is parsed as one JSON document per line.
# The `with` block guarantees the file handle is closed once every line has
# been read (the bare open() in a for-loop header never closed it).
with open('part1.json', 'r') as review_file:
    reviews = [json.loads(line) for line in review_file]

In [35]:
# Peek at the first parsed record to check which fields each review carries.
reviews[:1]

[{'asin': '000100039X',
  'helpful': [0, 0],
  'overall': 5.0,
  'reviewText': 'Spiritually and mentally inspiring! A book that allows you to question your morals and will help you discover who you really are!',
  'reviewTime': '12 16, 2012',
  'reviewerID': 'A10000012B7CGYKOMPQ4L',
  'reviewerName': 'Adam',
  'summary': 'Wonderful!',
  'unixReviewTime': 1355616000}]

In [50]:
# Keep only the rating, the short summary and the full review text; copy so
# later column assignments do not touch the frame built from the raw records.
df = pd.DataFrame(reviews)[['overall', 'summary', 'reviewText']].copy()
# Ratings are converted to strings (e.g. 5.0 -> "5.0") because the later
# conversion step compares them against string literals.
df = df.astype({'overall': str})
df.head(10)

Unnamed: 0,overall,summary,reviewText
0,5.0,Wonderful!,Spiritually and mentally inspiring! A book tha...
1,5.0,close to god,This is one my must have books. It is a master...
2,5.0,Must Read for Life Afficianados,This book provides a reflection that you can a...
3,5.0,Timeless for every good and bad time in your l...,I first read THE PROPHET in college back in th...
4,5.0,A Modern Rumi,A timeless classic. It is a very demanding an...
5,5.0,This book will bring you peace,Reading this made my mind feel like a still po...
6,5.0,Graet Work,"As you read, Gibran's poetry brings spiritual ..."
7,5.0,Such Beauty,"Deep, moving dramatic verses of the heart and ..."
8,5.0,The Prophet,This is a timeless classic. Over the years I'...
9,5.0,A Modern Classic,An amazing work. Realizing extensive use of Bi...


## Split into train and test sets

In [60]:
# Randomly sample 80% of the rows for training; random_state is fixed so the
# same rows are drawn on every run.
train = df.sample(frac=0.8, random_state=200)
# The test set is every row whose index label was not sampled into train.
test = df.drop(train.index)

## Convert numeric rating to word rating (5.0 -> "Five")

In [67]:
def convert_numeric_to_word(temp_df):
    """Replace the string ratings in ``temp_df['overall']`` with word labels.

    "1.0" -> "One", "2.0" -> "Two", "3.0" -> "Three", "4.0" -> "Four".
    Every other value (including "5.0") becomes "Five", which preserves the
    fall-through behaviour of the original if/elif chain.

    Note: because unrecognised values map to "Five", calling this twice on
    the same frame turns every rating into "Five".

    Args:
        temp_df: DataFrame with an 'overall' column holding ratings as
            strings such as "5.0". The column is rewritten in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    rating_words = {"1.0": "One", "2.0": "Two", "3.0": "Three", "4.0": "Four"}
    # DataFrame.set_value was removed in pandas 1.0, so the per-cell writes
    # inside an iterrows() loop no longer run. Assigning the whole column at
    # once works on old and new pandas alike and avoids the row-by-row loop.
    temp_df['overall'] = [
        rating_words.get(value, "Five") for value in temp_df['overall']
    ]
    return temp_df
        
# Relabel both splits with word ratings. convert_numeric_to_word rewrites the
# 'overall' column of the frame it is given and returns that same frame.
train = convert_numeric_to_word(train)
test = convert_numeric_to_word(test)
# Preview the first five training rows to check the relabelling.
train[:5]

Unnamed: 0,overall,summary,reviewText
71488,Five,Read it even if you don't think you'd be inter...,"A sledgehammer of a book. Well-written, compa..."
18462,Five,I Am 16/f and I loved this book!,"HI! aLRIGHT, LET ME GIVE IT STRAIGHT HAHA I re..."
5190,Five,ah.. have to wait for 2 more sequals!,love this book too. arya's new found skills. d...
49745,Five,A fabulous book of recipes!,This book is a great product. Lots of informa...
51643,Four,"Well done, good research","If you're looking into things like Angels, occ..."


In [68]:
# Preview the first five rows of the test split.
# NOTE(review): the recorded output includes a 'feats' column, so this cell
# was presumably re-run after build_baseline_feature(test) - confirm.
test[:5]

Unnamed: 0,overall,summary,reviewText,feats
3,Five,Timeless for every good and bad time in your l...,I first read THE PROPHET in college back in th...,{'I first read THE PROPHET in college back in ...
4,Five,A Modern Rumi,A timeless classic. It is a very demanding an...,{'A timeless classic. It is a very demanding ...
14,Five,A book everyone &#34;should&#34; read,I discovered The Prophet fifty years ago in co...,{'I discovered The Prophet fifty years ago in ...
17,Five,Flawless,"Anything I've read by Gibran is, in my mind, f...","{'Anything I've read by Gibran is, in my mind,..."
23,Two,Eloquent,I read this about a year ago and can't recall ...,{'I read this about a year ago and can't recal...


## Build baseline features (bag-of-words counts)

In [77]:
def build_baseline_feature(temp_df):
    """Build bag-of-words training pairs from a review DataFrame.

    Each review text is split on single space characters and the resulting
    tokens are counted. The counts are stored in a new 'feats' column on
    ``temp_df`` (the frame is modified in place).

    Args:
        temp_df: DataFrame with a 'reviewText' column of strings and an
            'overall' column of labels.

    Returns:
        A list of ``(Counter, label)`` tuples, one per row, in row order.
    """
    def _token_counts(text):
        # Splitting on a literal " " keeps punctuation attached to words and
        # produces empty-string tokens for runs of consecutive spaces.
        return c.Counter(text.split(" "))

    temp_df['feats'] = temp_df['reviewText'].map(_token_counts)
    return [
        (counts, label)
        for counts, label in zip(temp_df['feats'], temp_df['overall'])
    ]

# Build the (token Counter, label) pairs for both splits. As a side effect,
# build_baseline_feature also adds a 'feats' column to each DataFrame.
train_data = build_baseline_feature(train)
test_data = build_baseline_feature(test)

In [101]:
# Inspect the first five (Counter, label) pairs produced for the test split.
test_data[:5]

[(Counter({'(Liberia)': 1,
           "60's.": 2,
           'AND': 1,
           'After': 1,
           'Corps': 1,
           'I': 8,
           'It': 2,
           'PROPHET': 1,
           'Peace': 1,
           'THE': 1,
           'The': 1,
           'a': 6,
           'after': 2,
           'again': 2,
           'always': 2,
           'am': 1,
           'amazed': 1,
           'and': 9,
           'anything': 1,
           'as': 2,
           'at': 1,
           'back': 1,
           'became': 1,
           'before': 3,
           'book': 4,
           'born': 1,
           'both': 1,
           'chapter': 1,
           'children': 1,
           'college': 1,
           'comfort': 1,
           'comfort.': 1,
           'could': 1,
           'country': 1,
           'definitely': 1,
           'did': 1,
           'during': 1,
           'each': 1,
           'effect': 1,
           'fatal': 1,
           'first': 1,
           'for': 2,
           'future.Gibran': 1,
      

In [79]:
# Fit an NLTK Naive Bayes classifier on the (feature dict, label) training pairs.
classifier = NaiveBayesClassifier.train(train_data)

In [82]:
# Score the classifier on the held-out pairs: the fraction of test examples
# whose predicted label equals the recorded label.
nltk.classify.util.accuracy(classifier, test_data)

0.20905

In [102]:
# Spot check with a hand-built token-count dict for a positive-sounding review.
classifier.classify({"what": 1, "a": 1, "wonderful": 1, "book": 1})

'Five'

In [107]:
# Spot check with a hand-built token-count dict for a negative-sounding review
# (the counts are deliberately exaggerated).
classifier.classify({"Not": 12, "worth": 1, "reading": 1, "very": 1, "bad": 15, "don't": 20, "read": 1})

'One'