In [1]:
import random
import pandas as pd

In [2]:
# display the full text of each entry
pd.set_option('display.max_colwidth', -1)

In [3]:
df = pd.read_csv('winemag-data_first150k.csv')

In [4]:
df = df[['description', 'points']]

In [5]:
df = df.sort_values('points')

In [6]:
# number of negative and positive reviews to use
size = 10000

In [7]:
negative = df['description'].tolist()[:size]

In [8]:
positive = df['description'].tolist()[-size:]

In [9]:
negative = [(doc, 0) for doc in negative]

In [10]:
negative = list(set(negative))

In [11]:
len(negative)

6650

In [12]:
positive = [(doc, 1) for doc in positive]

In [13]:
positive = list(set(positive))

In [14]:
len(positive)

6904

In [15]:
dataset = negative + positive

In [16]:
split = int(len(dataset) * .9)

In [17]:
split

12198

In [18]:
random.shuffle(dataset)

In [19]:
training = dataset[:split]

In [20]:
validation = dataset[split:]

In [21]:
len(training)

12198

In [22]:
len(validation)

1356

In [23]:
# make sure there are no shared documents between training and validation
set(training) & set(validation)

set()

In [24]:
import nltk
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()

In [25]:
from collections import Counter

In [26]:
negative = [doc for doc, label in training if label == 0]

In [27]:
positive = [doc for doc, label in training if label == 1]

In [28]:
neg_tokenized = [tokenizer.tokenize(doc) for doc in negative]
neg_counts = Counter([token for doc in neg_tokenized for token in doc])

In [29]:
pos_tokenized = [tokenizer.tokenize(doc) for doc in positive]
pos_counts = Counter([token for doc in pos_tokenized for token in doc])

In [30]:
neg2pos = [token for token, score in sorted([(token, pos_count/(neg_counts[token]+1)) for token, pos_count in pos_counts.most_common()], key=lambda x: x[1])]

In [31]:
halfway = len(neg2pos) // 2

In [32]:
most_pos = list(reversed(neg2pos[halfway:]))

In [33]:
most_neg = neg2pos[:halfway+1]

In [34]:
len(most_pos)

8279

In [35]:
len(most_neg)

8279

In [36]:
most_pos

['sample.',
 'wonderful',
 'beautifully',
 'delicious',
 '2018',
 'decade',
 '2020',
 'finely',
 'gorgeous',
 'wonderfully',
 'superb',
 'cellar.',
 'beautiful',
 'Barolo',
 'crème',
 'dramatic',
 'exceptional',
 'brilliant',
 'beauty',
 'enormous',
 'richly',
 'Hold',
 'long-term',
 '2013',
 'mountain',
 'terrific',
 '6–8',
 'fabulous',
 'velvet',
 '2015',
 'Barrel',
 'ageworthy',
 'elegant',
 'potential.',
 'Age',
 'important',
 '2019',
 'sophisticated',
 'absolutely',
 'impressively',
 '2014',
 'flashy',
 'deliciously',
 'complex.',
 'until',
 'complex',
 '2016',
 'crafted',
 'opulent',
 'spectacular',
 'explodes',
 '2025',
 '4–5',
 'refined',
 'densely',
 'develop',
 'explosive',
 'seamless',
 'Cellar',
 '92-94',
 '2012',
 'Complex',
 'delicious.',
 'built',
 'brûlée',
 'stylish',
 'powerhouse',
 '2017',
 'aging.',
 'estate',
 'detailed',
 '93-95',
 '5–6',
 '94-96',
 'terroir',
 'massive',
 '2022',
 'impeccable',
 'now–2020',
 '2030',
 'six',
 'succulent',
 'considerable',
 'greate

In [37]:
most_neg

['vegetal',
 'dull',
 'unripe',
 'Porty',
 'awkward',
 'mealy',
 'rubbery',
 'sugary',
 'thin',
 'strange',
 'odd',
 'simple',
 'flat',
 'Thin',
 'Sharp',
 'cough',
 'resiny',
 'raisiny',
 'oxidized',
 'sugared',
 'weak',
 'okay',
 'Smells',
 'underripe',
 'flabby',
 'falls',
 'bad',
 'Raw',
 'prickly',
 'Way',
 'generic',
 'smelling',
 'Barely',
 'hollow',
 'one-dimensional',
 'Tough',
 'grassy',
 'decent',
 'mildly',
 'pizza',
 'asparagus',
 'lacks',
 'rough',
 'bell',
 'banana',
 'stemmy',
 'waxy',
 'adequate',
 'fails',
 'forced',
 'stalky',
 'volatile',
 'inexpensive',
 'narrow',
 'artificial',
 'spritzy',
 'jagged',
 'Chunky',
 'pithy',
 'Lean',
 'overripe',
 'Too',
 'disappointing',
 'rate',
 'Burnt',
 'Nice',
 'bulky',
 'party',
 'sipper',
 'sweetened',
 'popcorn',
 'tad',
 'cheesy',
 'fiery',
 'wintergreen',
 'Herbal',
 'pleasant',
 'leafy',
 'lacking',
 'clumsy',
 'sweaty',
 'straightforward',
 'seafood',
 'pulse',
 'Flat',
 'briny',
 'flat.',
 'excessively',
 'candy.',
 'Tur

In [38]:
# training accuracy
correct = 0
total = 0
for doc, label in training:
    total += 1
    doc = tokenizer.tokenize(doc)
    for neg, pos in zip(most_neg, most_pos):
        if neg in doc:
            if label == 0:
                correct += 1
            break
        if pos in doc:
            if label == 1:
                correct += 1
            break
print('{}/{} ({:2.2f}%) correct'.format(correct, total, correct/total * 100))

11719/12198 (96.07%) correct


In [39]:
# validation accuracy
correct = 0
total = 0
mislabeled = []
for doc_string, label in validation:
    total += 1
    doc = tokenizer.tokenize(doc_string)
    for neg, pos in zip(most_neg, most_pos):
        if neg in doc:
            if label == 0:
                correct += 1
            else:
                mislabeled.append((doc_string, label, neg))
            break
        if pos in doc:
            if label == 1:
                correct += 1
            else:
                mislabeled.append((doc_string, label, pos))
            break
print('{}/{} ({:2.2f}%) correct'.format(correct, total, correct/total * 100))

1275/1356 (94.03%) correct


In [40]:
random.choice(mislabeled)

('The least interesting wine in the normally excellent Cartuxa range, this has black currant aromas, but disappoints with its leather character, and tannic, bitter cranberry flavors.',
 0,
 'least')