In [1]:
import sqlite3
import pandas as pd
import lib_v2

In [2]:
# Open the local SQLite copy of the dataset and pull the Id, Score and Text
# columns of every Reviews row whose Id is greater than 500000 into a DataFrame.
connection_sqlobject = sqlite3.connect('amazon-fine-food-reviews/database.sqlite')
filtered_data = pd.read_sql_query(
    sql=""" SELECT Id, Score, Text FROM Reviews WHERE Id > 500000 """,
    con=connection_sqlobject,
)

In [3]:
# Report how many rows the query returned, then preview the first five of them.
print(filtered_data.shape[0])
filtered_data.head(n=5)

68454


Unnamed: 0,Id,Score,Text
0,500001,5,I was looking for an easy and convenient way t...
1,500002,5,DO NOT be freaked out by the ostrich! This tas...
2,500003,5,I bought the Ostrim with a little apprehension...
3,500004,5,At only 80 calories these are a great bang for...
4,500005,5,"These are quite tasty and by far the leanest, ..."


In [4]:
# Bucket the reviews by score so that every score class can be sampled equally.
# cat_tweets[k] collects [Id, Score, Text] triples for reviews whose Score equals
# int(categories[k]); rows with any other Score value are left out.
categories = ['1','2','3','4','5']
cat_tweets = [[] for _ in categories]
bucket_of = {int(cat): index for index, cat in enumerate(categories)}

# One pass over the three columns instead of a full iterrows() scan per category:
# same buckets in the same row order, without building a Series for every row
# five times over.
for review_id, score, text in zip(filtered_data['Id'], filtered_data['Score'], filtered_data['Text']):
    index = bucket_of.get(score)
    if index is not None:
        cat_tweets[index].append([review_id, score, text])

# Print the size of each bucket, in category order.
for bucket in cat_tweets:
    print(len(bucket))
# Use the smallest bucket size as the split criterion:
# 3500 reviews in total for each category, split into 2500 (approx. 70%) for training and 1000 for testing.

5981
3509
5031
9611
44322


In [5]:
# Build class-balanced splits: from every score bucket take the first
# TRAIN_PER_CATEGORY reviews for training and the next TEST_PER_CATEGORY for testing.
# NOTE(review): the slices are positional, so the split follows the order in which
# rows were appended to each bucket — nothing is shuffled here.
TRAIN_PER_CATEGORY = 2500
TEST_PER_CATEGORY = 1000

train_set = []
test_set = []
for bucket in cat_tweets:
    # extend() appends in place instead of rebuilding the whole list each iteration.
    train_set.extend(bucket[:TRAIN_PER_CATEGORY])
    test_set.extend(bucket[TRAIN_PER_CATEGORY:TRAIN_PER_CATEGORY + TEST_PER_CATEGORY])

print(len(train_set))
print(len(test_set))

12500
5000


In [6]:
# Run the project's lib_v2.tok helper over the training split first, then the
# test split; the results feed the learning and classification cells below.
train_data, test_data = [lib_v2.tok(split) for split in (train_set, test_set)]

In [7]:
#lib_v2.show_tweets(train_data)

In [8]:
#lib_v2.show_tweets(test_data)

In [7]:
# Learn the model parameters from the training data; lib_v2.learn_nb returns the
# two values kept here as prior_probs and token_probs.
prior_probs, token_probs = lib_v2.learn_nb(train_data)

# Pair every held-out item with the label lib_v2.classify_nb assigns to it.
predictions = []
for tweet in test_data:
    predicted = lib_v2.classify_nb(tweet, prior_probs, token_probs)
    predictions.append((tweet, predicted))

# Report the per-class and average scores for the test split.
lib_v2.evaluate(predictions)

1
Precision:  67.31301939058172
Recall:  24.3
F1:  35.70903747244673

2
Precision:  30.971128608923884
Recall:  59.0
F1:  40.61962134251291

3
Precision:  25.979772439949432
Recall:  41.1
F1:  31.835786212238574

4
Precision:  33.601841196777904
Recall:  29.2
F1:  31.24665596575709

5
Precision:  69.6113074204947
Recall:  19.7
F1:  30.70927513639906

Average F1:  34.024075225870874


In [8]:
# Classify the training items themselves with the already-learned parameters,
# so the scores can be compared against the test-split scores.
predictions_train = []
for tweet in train_data:
    predicted = lib_v2.classify_nb(tweet, prior_probs, token_probs)
    predictions_train.append((tweet, predicted))

lib_v2.evaluate(predictions_train)

1
Precision:  74.29352780309937
Recall:  32.6
F1:  45.31554072838476

2
Precision:  36.05080831408776
Recall:  62.44
F1:  45.710102489019036

3
Precision:  36.51188501934771
Recall:  52.84
F1:  43.18404707420726

4
Precision:  48.69179600886918
Recall:  43.92
F1:  46.18296529968455

5
Precision:  79.33333333333333
Recall:  38.08
F1:  51.45945945945945

Average F1:  46.370423010151015


In [9]:
# Confusion matrix for the training-set predictions of the unigram run.
# In the recorded output every row sums to 2500 (the per-category training size),
# so rows appear to be the actual score and columns the predicted score.
lib_v2.show_confusion_matrix(predictions_train) #unigram

Unnamed: 0,1,2,3,4,5
1,815,1181,397,78,29
2,130,1561,679,111,19
3,76,783,1321,298,22
4,38,421,765,1098,178
5,38,384,456,670,952


In [15]:
# Confusion matrix for the training-set predictions of the bigram run.
# NOTE(review): this call is textually identical to the unigram cell, yet the
# recorded matrix differs and the execution count jumps to 15. The bigram result
# presumably came from re-running earlier cells with a bigram setting that is not
# captured in this notebook. TODO: make the n-gram choice explicit in code so that
# Restart & Run All reproduces both matrices.
lib_v2.show_confusion_matrix(predictions_train) #bigram

Unnamed: 0,1,2,3,4,5
1,288,1574,481,113,44
2,375,377,1480,251,17
3,78,1328,382,684,28
4,42,518,1247,344,349
5,72,441,464,1369,154


So we can group the reviews into good and bad ones based on their scores: scores 1 and 2 as dissatisfied, and scores 4 and 5 as satisfied.