In [1]:
import math
import random
import pandas as pd

In [2]:
# display the full text of each entry
# (None disables truncation; the old sentinel -1 is rejected by current pandas releases)
pd.set_option('display.max_colwidth', None)

In [3]:
# load the review data; the relative path is resolved against the current working directory
df = pd.read_csv('winemag-data_first150k.csv')

In [4]:
# keep only the two columns used below: the review text and its point score
df = df[['description', 'points']]

In [5]:
# order the reviews from lowest to highest score (sort_values is ascending by default)
df = df.sort_values('points')

In [6]:
# number of reviews to take from each end of the score ranking
# (lowest-scoring -> negative, highest-scoring -> positive), counted before de-duplication
size = 10000

In [7]:
# descriptions of the `size` lowest-scoring reviews (df is sorted by points, ascending)
negative = df['description'].head(size).tolist()

In [8]:
# descriptions of the `size` highest-scoring reviews (df is sorted by points, ascending);
# tail() yields an empty list for size == 0, whereas the slice [-0:] would return every row
positive = df['description'].tail(size).tolist()

In [9]:
# attach the class label: 0 marks a negative (low-scoring) review
# NOTE: not idempotent - re-running this cell wraps the existing pairs in another tuple
negative = [(doc, 0) for doc in negative]

In [10]:
# drop duplicate pairs; dict.fromkeys keeps first-occurrence order, whereas list(set(...))
# yields an order that changes between interpreter runs because str hashing is randomized
negative = list(dict.fromkeys(negative))

In [11]:
# number of distinct negative examples left after de-duplication
len(negative)

6650

In [12]:
# attach the class label: 1 marks a positive (high-scoring) review
# NOTE: not idempotent - re-running this cell wraps the existing pairs in another tuple
positive = [(doc, 1) for doc in positive]

In [13]:
# drop duplicate pairs; dict.fromkeys keeps first-occurrence order, whereas list(set(...))
# yields an order that changes between interpreter runs because str hashing is randomized
positive = list(dict.fromkeys(positive))

In [14]:
# number of distinct positive examples left after de-duplication
len(positive)

6904

In [15]:
# one combined list of (document, label) pairs: all negatives followed by all positives
dataset = [*negative, *positive]

In [16]:
# cut-off index for a 90% training / 10% validation split (int() truncates the fraction)
split = int(len(dataset) * .9)

In [17]:
# index of the first validation example
split

12198

In [18]:
# Shuffle reproducibly: put the examples in a canonical order first, so the result does not
# hinge on the order the list was built in, then shuffle with a dedicated fixed-seed generator.
# Every fresh run therefore produces the same training/validation split.
dataset.sort(key=lambda pair: (str(pair[0]), pair[1]))
random.Random(0).shuffle(dataset)

In [19]:
# everything before the cut-off index (90% of the shuffled examples)
training = dataset[:split]

In [20]:
# the remaining examples, held out for validation
validation = dataset[split:]

In [21]:
# sanity check: should equal `split`
len(training)

12198

In [22]:
# sanity check: should equal len(dataset) - split
len(validation)

1356

In [23]:
# make sure there are no shared documents between training and validation
# (compare the texts only: intersecting the (doc, label) pairs would miss a text
# that occurs once with each label)
{doc for doc, label in training} & {doc for doc, label in validation}

set()

In [24]:
import nltk
from nltk.tokenize import TreebankWordTokenizer
# one shared tokenizer instance, used both to build the token counts and inside evaluate()
tokenizer = TreebankWordTokenizer()

In [25]:
from collections import Counter

In [26]:
# negative documents of the training split only, so validation data never influences the weights
# NOTE: rebinds `negative` - it now holds plain strings, not the earlier (doc, label) pairs
negative = [doc for doc, label in training if label == 0]

In [27]:
# positive documents of the training split only, so validation data never influences the weights
# NOTE: rebinds `positive` - it now holds plain strings, not the earlier (doc, label) pairs
positive = [doc for doc, label in training if label == 1]

In [28]:
# tokenize every negative training document, then tally how often each token occurs overall
neg_tokenized = list(map(tokenizer.tokenize, negative))
neg_counts = Counter(token for tokens in neg_tokenized for token in tokens)

In [29]:
# tokenize every positive training document, then tally how often each token occurs overall
pos_tokenized = list(map(tokenizer.tokenize, positive))
pos_counts = Counter(token for tokens in pos_tokenized for token in tokens)

In [30]:
# every distinct token seen in the training documents of either class
vocab = {
    token
    for tokens in neg_tokenized + pos_tokenized
    for token in tokens
}

In [31]:
# vocabulary size
len(vocab)

21774

In [32]:
# Per-token weight: log of the ratio between its positive and negative occurrence counts.
# Adding 1 to both counts keeps the ratio finite and non-zero for tokens seen in one class only.
# A weight above 0 means the token is more frequent in positive documents, below 0 in negative.
weights = {
    word: math.log((pos_counts[word] + 1) / (neg_counts[word] + 1))
    for word in vocab
}

In [33]:
# the ten tokens with the highest weight, i.e. most strongly tied to positive reviews
sorted(weights.items(), key=lambda kv: kv[1], reverse=True)[:10]

[('sample.', 5.69035945432406),
 ('delicious', 4.894101477840304),
 ('beautifully', 4.890349128221754),
 ('2018', 4.875197323201151),
 ('decade', 4.844187086458591),
 ('2020', 4.820281565605037),
 ('beautiful', 4.804021044733257),
 ('finely', 4.74493212836325),
 ('gorgeous', 4.740574822994295),
 ('Age', 4.6913478822291435)]

In [34]:
# the ten tokens with the lowest weight, i.e. most strongly tied to negative reviews
sorted(weights.items(), key=lambda kv: kv[1])[:10]

[('harsh', -5.093750200806762),
 ('Simple', -4.919980925828125),
 ('dull', -4.852030263919617),
 ('vegetal', -4.783316371371566),
 ('watery', -4.653960350157523),
 ('everyday', -4.48863636973214),
 ('bland', -4.465908118654584),
 ('Porty', -4.382026634673881),
 ('pickled', -4.276666119016055),
 ('canned', -4.204692619390966)]

In [35]:
# number of tokens kept at each extreme of the weight ranking
most_count = 350
# Ties in weight are broken alphabetically by token. `weights` is built by iterating a set,
# whose order changes between interpreter runs (str hashing is randomized), so sorting on the
# weight alone would let the tokens picked at the cut-off differ from run to run.
most_pos = dict(sorted(weights.items(), key=lambda kv: (-kv[1], kv[0]))[:most_count])
most_neg = dict(sorted(weights.items(), key=lambda kv: (kv[1], kv[0]))[:most_count])

In [36]:
#sorted(most_pos.items(), key=lambda kv: kv[1], reverse=True)

In [37]:
#sorted(most_neg.items(), key=lambda kv: kv[1])

In [38]:
def evaluate(data):
    """Classify each document with the token lexicons and print the accuracy.

    A document is tokenized with the module-level ``tokenizer``; the weights
    of its tokens that appear in ``most_pos`` or ``most_neg`` are summed
    (a repeated token counts every time it occurs). A sum strictly greater
    than zero predicts label 1, anything else predicts label 0.

    Args:
        data: sized collection of ``(document, label)`` pairs.

    Returns:
        None. Prints ``<correct>/<total> (<percent>%) correct``; an empty
        ``data`` is reported as 0.00% instead of raising ZeroDivisionError.
    """
    total = len(data)
    correct = 0
    for doc, label in data:
        score = 0
        for token in tokenizer.tokenize(doc):
            # tokens that are not in a lexicon contribute 0
            score += most_pos.get(token, 0)
            score += most_neg.get(token, 0)
        # a score of exactly 0 (e.g. no lexicon token present) falls to the negative class
        pred = 1 if score > 0 else 0
        if pred == label:
            correct += 1
    # guard the division so that an empty collection does not crash the report
    percent = correct / total * 100 if total else 0.0
    print('{}/{} ({:2.2f}%) correct'.format(correct, total, percent))

In [39]:
# accuracy on the same examples the token weights were computed from
evaluate(training)

11866/12198 (97.28%) correct


In [40]:
# accuracy on the held-out examples, which played no part in computing the token weights
evaluate(validation)

1308/1356 (96.46%) correct
