In [51]:
from packages import *
import utils
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter
import re

In [11]:
import tensorflow as tf
from tensorflow.keras import metrics
from sklearn.preprocessing import OneHotEncoder
import numpy as np

In [28]:
# Subtract 1 so the labels 1..5 become the class indices 0..4.
y = np.array([1, 5, 3, 3, 4, 2]) - 1
# One-hot encode the labels, presented to the encoder as a single-feature column.
# NOTE(review): newer scikit-learn releases rename `sparse=` to `sparse_output=` —
# confirm against the installed version before changing it.
pred = OneHotEncoder(sparse=False).fit_transform(y.reshape(-1, 1))

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [29]:
# Show the integer labels, then their one-hot encoding on the following lines.
print(y, pred, sep='\n')

[0 4 2 2 3 1]
[[1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0.]]


In [30]:
# Feed the integer labels `y` as y_true and the one-hot rows `pred` as y_pred.
# NOTE(review): the recorded output of this cell is 0.16666667; CategoricalAccuracy
# is documented for one-hot y_true, so the integer labels presumably explain the
# low score — confirm against the Keras documentation.
m = metrics.CategoricalAccuracy()
m.update_state(y_true=y, y_pred=pred)
print('Final result: ', m.result().numpy())

Final result:  0.16666667


In [31]:
# Same inputs as the previous cell, scored with the sparse variant of the metric.
# The recorded output is 1.0; `pred` is the one-hot encoding of `y` itself.
m = metrics.SparseCategoricalAccuracy()
m.update_state(y_true=y, y_pred=pred)
print('Final result: ', m.result().numpy())

Final result:  1.0


In [1]:
# Maps a table name to the set of column names stored for it. load_data() reads
# one '<table>-<column>.npy' file per entry.
LOOKUP = {
    'review': {
        'review_id', 'user_id', 'business_id',
        'stars', 'date', 'text',
        'useful', 'funny', 'cool',
    },
}

In [14]:
def load_data(lookup):
    """Load one ``.npy`` array per (table, column) pair named in ``lookup``.

    Parameters
    ----------
    lookup : mapping
        Maps a table name to an iterable of column names.

    Returns
    -------
    dict
        ``{table: {column: array}}`` where each array is read from
        ``<DATA_DIR>/dataset/<table>-<column>.npy``.

    Notes
    -----
    ``DATA_DIR`` and ``os`` are not defined in the visible cells; presumably
    they come from ``from packages import *`` — verify.
    """
    def _read_column(table, column):
        npy_path = os.path.join(DATA_DIR, 'dataset', '{}-{}.npy'.format(table, column))
        # Pickle loading is enabled for the 'text' column only.
        # NOTE(review): unpickling can execute arbitrary code; only load
        # 'text' files from a trusted source.
        return np.load(npy_path, allow_pickle=(column == 'text'))

    return {
        table: {column: _read_column(table, column) for column in lookup[table]}
        for table in lookup
    }

In [19]:
# Load every column listed in LOOKUP, then keep the 'review' table's
# column-name -> array mapping for the cells below.
data = load_data(LOOKUP)
review_data = data['review']

In [124]:
# Array loaded for the 'text' column of the 'review' table.
text_data = review_data['text']
# Number of leading entries of text_data that the tokenization cell slices.
# NOTE(review): N_DATA is reassigned to 20000 further down in this notebook, so
# running cells out of order changes the size of that slice.
N_DATA = 1000

In [125]:
# Flatten the word tokens of the first N_DATA reviews into a single list.
tokens = [
    token
    for review in text_data[:N_DATA]
    for token in word_tokenize(review)
]

In [166]:
def clean_token(t):
    """Lower-case a token and peel off one leading symbol, if present.

    If the lower-cased token starts with exactly one character that is neither
    a word character nor an apostrophe, and that character is immediately
    followed by word characters, only that run of word characters is returned
    (anything after the run is dropped). Otherwise the lower-cased token is
    returned unchanged; in particular, trailing punctuation on a token that
    starts with a word character is kept.

    Parameters
    ----------
    t : str
        A single token.

    Returns
    -------
    str
        The cleaned token.
    """
    t = t.lower()
    # Raw string: in a normal string literal `\w` is an invalid escape sequence,
    # which current Python versions warn about when the cell is compiled.
    m = re.match(r"^[^\w'](\w+).*", t)
    if m is not None:
        return m.group(1)
    return t

def clean_tokenize(s):
    """Tokenize ``s`` with ``word_tokenize`` and clean each token.

    Parameters
    ----------
    s : str
        Text to split into tokens.

    Returns
    -------
    list
        ``clean_token(tok)`` for every token, in tokenizer order.
    """
    return [clean_token(raw_token) for raw_token in word_tokenize(s)]

In [2]:
from packages import *
import numpy as np
import pandas as pd
import nbimporter
import tensorflow as tf
from tensorflow.keras.layers import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, r2_score
from sklearn.utils import shuffle
from nltk.tokenize import word_tokenize
import re

### Data Splits

In [3]:
def shuffle_sample(dictionary, n_sample=None, random_seed=42):
    """Shuffle every array in ``dictionary`` with one seed and keep a prefix.

    Each value is shuffled with the same ``random_state``. For arrays of equal
    length this applies the same permutation to every key, so entries stay
    aligned across keys.

    Parameters
    ----------
    dictionary : dict
        Maps names to equal-length sequences/arrays.
    n_sample : int or None, optional
        Number of leading shuffled items to keep; ``None`` keeps all of them.
    random_seed : int, optional
        Seed passed to ``shuffle`` as ``random_state``.

    Returns
    -------
    dict
        Same keys as ``dictionary``, each mapped to its shuffled, truncated
        values. An empty input yields an empty dict.

    Raises
    ------
    AssertionError
        If the values do not all have the same length.
    """
    # Nothing to align or shuffle. Without this guard, min() fails on an empty list.
    if not dictionary:
        return {}

    lengths = {len(values) for values in dictionary.values()}
    assert len(lengths) == 1, (
        "all arrays must have the same length, got lengths {}".format(sorted(lengths))
    )

    # A slice end of None keeps everything, so no branch on n_sample is needed.
    return {
        key: shuffle(array, random_state=random_seed)[:n_sample]
        for key, array in dictionary.items()
    }

In [4]:
# Three equal-length ranges, used to eyeball how each key is shuffled.
a = dict(A=range(10), B=range(10, 20), C=range(100, 110))
shuffle_sample(a, random_seed=24)

{'A': [9, 4, 8, 7, 5, 6, 1, 0, 3, 2],
 'B': [19, 14, 18, 17, 15, 16, 11, 10, 13, 12],
 'C': [109, 104, 108, 107, 105, 106, 101, 100, 103, 102]}

In [25]:
N_DATA = 20000      # number of shuffled reviews to keep
RANDOM_SEED = 42    # seed handed to shuffle_sample
TEST_RATIO = 0.33   # fraction of the sample held out for testing

# Load only the 'text' and 'stars' columns, then draw a seeded sample of N_DATA rows.
data_sample = shuffle_sample(
    load_data({'review': ['text', 'stars']})['review'],
    n_sample=N_DATA,
    random_seed=RANDOM_SEED,
)

# NOTE(review): the split uses its own literal seed (24) rather than RANDOM_SEED.
x_train, x_test, y_train, y_test = train_test_split(
    data_sample['text'],
    data_sample['stars'],
    test_size=TEST_RATIO,
    random_state=24,
    shuffle=True,
)

In [26]:
# Print (n_texts, n_labels) for the train split, then for the test split.
print((len(x_train), len(y_train)), (len(x_test), len(y_test)), sep='\n')

(13400, 13400)
(6600, 6600)


### Baseline Test

In [27]:
class BaselineModel:
    """Baseline star predictor: a CountVectorizer feeding a LinearSVC."""

    def __init__(self, max_features=400000):
        """Create the unfitted classifier and vectorizer.

        Parameters
        ----------
        max_features : int, optional
            Passed to ``CountVectorizer(max_features=...)``. The default of
            400000 is, per the original comment, the number of entries in GloVe.
        """
        self.model = LinearSVC()
        self.vectorizer = CountVectorizer(max_features=max_features)

    def train(self, reviews, stars):
        """Fit the vectorizer on ``reviews``, then fit the classifier on the result.

        Parameters
        ----------
        reviews : iterable of str
            Training texts.
        stars : array-like
            One target label per review.
        """
        features = self.vectorizer.fit_transform(reviews)
        self.model.fit(features, stars)

    def predict(self, reviews):
        """Return the classifier's predictions for ``reviews``.

        The texts are transformed with the already-fitted vectorizer, so
        ``train`` has to be called first.
        """
        features = self.vectorizer.transform(reviews)
        return self.model.predict(features)
        

In [28]:
# Fresh, unfitted baseline model; it is fitted by the train() call that follows.
baseline = BaselineModel()

In [29]:
%%time
# Fit the vectorizer and classifier on the training split; %%time reports the cost.
baseline.train(x_train, y_train)

CPU times: user 11.5 s, sys: 0 ns, total: 11.5 s
Wall time: 11.5 s


In [30]:
# NOTE(review): repeats the timed training call from the previous cell, so the
# same model is re-fitted on the same data.
baseline.train(x_train, y_train)

In [31]:
%%time
# Predict star labels for the held-out texts; %%time reports the cost.
predictions = baseline.predict(x_test)

CPU times: user 415 ms, sys: 0 ns, total: 415 ms
Wall time: 414 ms


In [32]:
# NOTE(review): repeats the timed prediction call from the previous cell and
# rebinds `predictions` to the new result.
predictions = baseline.predict(x_test)

In [33]:
# Macro-averaged F1 on the held-out split. scikit-learn's signature is
# f1_score(y_true, y_pred, ...), so the ground-truth labels go first.
f1_score(y_test, predictions, average='macro')

0.4751783760327564

In [34]:
# Micro-averaged F1 on the held-out split. scikit-learn's signature is
# f1_score(y_true, y_pred, ...), so the ground-truth labels go first.
f1_score(y_test, predictions, average='micro')

0.57