In [11]:
import pandas as pd

data_chunks = pd.read_json("review_dataset.json", orient="columns",
                       lines=True, chunksize=10000)

for chunk in data_chunks:
    print(chunk[["text", "stars"]])

                                                   text  stars
0     Total bill for this horrible service? Over $8G...      1
1     I *adore* Travis at the Hard Rock's new Kelly ...      5
2     I have to say that this office really has it t...      5
3     Went in for a lunch. Steak sandwich was delici...      5
4     Today was my second out of three sessions I ha...      1
...                                                 ...    ...
9995  Quite sci if airport. Kiosks to check in and g...      4
9996  My family went here on new years eve for my da...      4
9997  I'm here to comment on the conduct of one of t...      1
9998  It, like many buffets started out some promisi...      2
9999  This place was really great!  I know all Teppa...      5

[10000 rows x 2 columns]
                                                    text  stars
10000  WOW!! This company is amazing!!!!! As a full t...      5
10001  We had an appointment for 6:50 and we are stil...      1
10002  Review is purely on

KeyboardInterrupt: 

In [28]:
from sklearn.naive_bayes import MultinomialNB

#https://medium.com/@awantikdas/a-comprehensive-naive-bayes-tutorial-using-scikit-learn-f6b71ae84431
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'[A-Za-z]+')
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
from sklearn.feature_extraction.text import HashingVectorizer
vectorizer = HashingVectorizer(decode_error='ignore', n_features=2 ** 18,
                               alternate_sign=False)

for chunk in data_chunks:
    #get the fields we want
    chunk = chunk[["text", "stars"]]
    
    #pre-process our chunk
    chunk.text = chunk.text.map(lambda x:tokenizer.tokenize(x))
    chunk.text = chunk.text.map(lambda l: [stemmer.stem(word) for word in l])
    chunk.text = chunk.text.str.join(sep=' ')
    
    #take a random sample from the chunk
    test = chunk.sample(2000)
    
    #vectorize the chunk and the sample
    chunk_text_tf = vectorizer.transform(chunk.text)
    test_tf = vectorizer.transform(test.text)
    
    #train on the chunk
    mnb = MultinomialNB()
    mnb.partial_fit(chunk_text_tf, chunk.stars, classes=[1,2,3,4,5])
    
    #predict
    predictions = mnb.predict(test_tf)
    print(predictions)

[5 5 5 ..., 5 5 5]
[5 5 5 ..., 5 5 5]
[5 5 5 ..., 5 5 5]
[5 5 5 ..., 5 5 5]


KeyboardInterrupt: 