## dependencies

In [None]:
!pip install wandb -qqq
import wandb

[K     |████████████████████████████████| 1.7 MB 7.9 MB/s 
[K     |████████████████████████████████| 180 kB 64.8 MB/s 
[K     |████████████████████████████████| 97 kB 6.4 MB/s 
[K     |████████████████████████████████| 140 kB 55.5 MB/s 
[K     |████████████████████████████████| 63 kB 1.6 MB/s 
[?25h  Building wheel for subprocess32 (setup.py) ... [?25l[?25hdone
  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


In [None]:
# Log in to your W&B account
# (the recorded output of this cell shows the API key being appended to the
# netrc file, so the credential is stored by wandb rather than in the notebook)
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

## download the data

The `glove.6B.zip` and `Video_Games_5.json.gz` files are downloaded from their respective URLs. The downloads are done with the Python helper functions defined below.

Python functions are used to download the files, instead of `wget`, so that this notebook can also be executed in non-Colab environments.

In [None]:
import urllib.request as req
from urllib.parse import urlparse
import os
import progressbar
import zipfile
import gzip
import shutil
import json
import pandas as pd
import re
import string
import imblearn

# Handle to the progress bar of the download currently being tracked;
# None whenever no download is in flight.
pbar = None


def show_progress(block_num, block_size, total_size):
    """Report hook for ``urllib.request.urlretrieve`` that drives a progress bar.

    The bar is created lazily on the first call of a download, advanced on
    every subsequent call, and finished (and the module-level ``pbar`` reset
    to ``None``) once the reported byte count reaches ``total_size``.

    Args:
        block_num: Number of blocks transferred so far.
        block_size: Size of one block in bytes.
        total_size: Total size of the download in bytes. ``urlretrieve``
            reports ``-1`` when the server does not announce a size.
    """
    global pbar
    if total_size <= 0:
        # Unknown (-1) or empty download: there is no maximum to scale a bar
        # against, so skip progress reporting instead of building a bar with
        # an invalid maxval on every block.
        return

    if pbar is None:
        pbar = progressbar.ProgressBar(maxval=total_size)
        pbar.start()

    # block_num * block_size can overshoot on the last block, hence "<".
    downloaded = block_num * block_size
    if downloaded < total_size:
        pbar.update(downloaded)
    else:
        pbar.finish()
        pbar = None

def wget(url):
    """Download ``url`` into the current working directory unless already present.

    The local file name is the last component of the URL path. If a file with
    that name already exists, nothing is downloaded.

    Args:
        url: Address of the file to fetch.

    Returns:
        The local file name, relative to the current working directory.

    Raises:
        ValueError: If the URL path has no final component to use as a name.
    """
    filename = os.path.basename(urlparse(url).path)
    if not filename:
        raise ValueError(f'cannot derive a file name from url {url!r}')

    if os.path.isfile(filename):
        print(f'file {filename} has already been downloaded')
        return filename

    # Fetch into a scratch file and rename only on success, so an interrupted
    # download is never mistaken for a complete one by the isfile() check above.
    partial = filename + '.part'
    try:
        req.urlretrieve(url, partial, show_progress)
    except BaseException:
        # Also covers a kernel interrupt; clean up and let the error propagate.
        if os.path.exists(partial):
            os.remove(partial)
        raise
    os.replace(partial, filename)
    print(f'downloaded to {filename}')
    return filename

def unzip(filename, directory_to_extract_to=None):
    """Extract every member of the zip archive ``filename``.

    Args:
        filename: Path of the zip archive to read.
        directory_to_extract_to: Destination directory. When omitted, the
            working directory at call time is used (not the one that was
            current when this function was defined).
    """
    if directory_to_extract_to is None:
        directory_to_extract_to = os.getcwd()
    with zipfile.ZipFile(filename, 'r') as zip_ref:
        zip_ref.extractall(directory_to_extract_to)
        print(f'extraction done {zip_ref.namelist()}')

def gunzip(gzfile, fout):
    """Decompress the gzip archive ``gzfile`` into the file ``fout``.

    Both files are handled in binary mode and the payload is streamed from
    one to the other; a confirmation line is printed once the copy is done.
    """
    with gzip.open(gzfile, 'rb') as compressed, open(fout, 'wb') as target:
        shutil.copyfileobj(compressed, target)
    print(f'{gzfile} extracted to {fout}')


def parse(path):
    """Yield one decoded JSON object per line of a gzip-compressed file.

    Args:
        path: Path of a gzip file holding one JSON document per line.

    Yields:
        The object decoded from each line, in file order.
    """
    # The context manager closes the archive once the generator is exhausted
    # or closed, instead of leaving the handle open.
    with gzip.open(path, 'rb') as lines:
        for line in lines:
            yield json.loads(line)

def getDF(path):
    """Load the records produced by ``parse(path)`` into a DataFrame.

    Each record becomes one row, indexed by its 0-based position in the file.
    """
    records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records, orient='index')


# translation table sending every character of string.punctuation to a space
translator = dict.fromkeys(map(ord, string.punctuation), ord(' '))

def text_preprocessing(text):
    """
    Normalise a raw review string.

    Surrounding whitespace is stripped, the text is lower-cased and every
    remaining newline is replaced by a full stop.
    """
    return text.strip().lower().replace('\n', '.')


# filename = wget("https://nlp.stanford.edu/data/glove.6B.zip")
# unzip(filename)
Video_Games_5 = wget('http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Video_Games_5.json.gz')

# Read the file wget reported instead of repeating its name as a literal.
df = pd.read_json(Video_Games_5, lines=True, compression='gzip')

# Keep review text and rating, drop reviews without text, normalise the text,
# then drop any remaining incomplete rows and exact duplicates. Building the
# result as one chain avoids assigning a column onto a filtered slice.
df = (
    df[['reviewText', 'overall']]
    .dropna(subset=['reviewText'])
    .assign(reviewText=lambda frame: frame['reviewText'].apply(text_preprocessing))
    .dropna()
    .drop_duplicates()
)
print(df.shape)

100% (154050105 of 154050105) |##########| Elapsed Time: 0:00:02 Time:  0:00:02


downloaded to Video_Games_5.json.gz
(400985, 2)


## train test split

Split the dataset with stratification on the label distribution. The test size is 0.3, and a fixed random state is given so that the split is the same across the different lessons.

In [None]:
from sklearn.model_selection import train_test_split

# Stratify on the rating column and fix the seed so the split is repeatable.
df_train, df_test = train_test_split(
    df,
    test_size=0.3,
    stratify=df['overall'],
    random_state=42,
)

# Features are the review texts, targets are the ratings.
X_train, y_train = df_train['reviewText'], df_train['overall']
X_test, y_test = df_test['reviewText'], df_test['overall']

print(len(df_train), len(df_test))

280689 120296


## baseline classifier

A very naive classifier where the labels are assigned randomly.

The precision values for the baseline classifier follow the distribution of the output classes, and the recall is 20% for every class. The Matthews correlation coefficient is almost 0, which means that the classifier has no information about the output label. This is obvious here since we know how the model works, but a neural network may sometimes give such values too. That is a sign that the neural network is no better than a random classifier.

In [None]:
import random

# Seed the global generator so the random labels are repeatable.
random.seed(42)

def baseline_classifier(text):
    """
    Baseline classifier: ignores ``text`` and returns one of the ratings
    1-5, picked at random, as a float.
    """
    ratings = (1, 2, 3, 4, 5)
    return float(random.choice(ratings))

# Attach the random predictions with assign(), which binds a new frame to
# df_test, rather than writing a column into the frame returned by
# train_test_split (that write can trigger pandas' SettingWithCopyWarning).
df_test = df_test.assign(baseline_preds=df_test['reviewText'].apply(baseline_classifier))

In [None]:
from sklearn.metrics import accuracy_score, matthews_corrcoef
from imblearn.metrics import classification_report_imbalanced

# Column of df_test that holds the predictions being scored.
model_prediction_col = 'baseline_preds'

actual = df_test['overall']
predicted = df_test[model_prediction_col]

accuracy = accuracy_score(actual, predicted)
matthews_corrcoef_metric = matthews_corrcoef(actual, predicted)
report = classification_report_imbalanced(actual, predicted)

print('accuracy:', accuracy)
print('matthews_corrcoef:', matthews_corrcoef_metric)
print('classification_report:\n', report)

accuracy: 0.1979866329720024
matthews_corrcoef: -0.0028328122350226533
classification_report:
                    pre       rec       spe        f1       geo       iba       sup

        1.0       0.07      0.20      0.80      0.10      0.40      0.15      8386
        2.0       0.05      0.19      0.80      0.08      0.39      0.14      6711
        3.0       0.11      0.20      0.80      0.14      0.40      0.15     13262
        4.0       0.20      0.20      0.80      0.20      0.40      0.15     24579
        5.0       0.56      0.20      0.80      0.29      0.40      0.15     67358

avg / total       0.37      0.20      0.80      0.23      0.40      0.15    120296



In [None]:
architecture = 'random-classifier'

# Hyperparameters and run metadata tracked with the run.
run_config = {
    "architecture": architecture,
    "dataset": "Video_Games_5.json.gz",
}

# Open a run in the project where it will be logged. To log under a specific
# username or team, additionally pass entity=... (ex: entity="carey").
wandb.init(project="amazon-gamereviews", config=run_config)

# Name the run after the architecture and its generated id.
run_id = wandb.run.id
wandb.run.name = '_'.join((architecture, run_id))

# metrics whose maximum we are interested in
for metric_name in ("accuracy", "matthews_corrcoef"):
    wandb.define_metric(metric_name, summary="max")

# 2️⃣ Log metrics from your script to W&B
metrics = {"accuracy": accuracy, "matthews_corrcoef": matthews_corrcoef_metric}
wandb.log(metrics)

# Mark the run as finished
wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁
matthews_corrcoef,▁


## Statistical Model: Count Vectoriser + Naive Bayes

Hyperparameter tuning: search for the best parameters using grid search.

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

cv = CountVectorizer()
model_mult_nb = MultinomialNB()
# Candidate smoothing values (the first value, 0, is skipped); only consumed
# by the commented-out 'model__alpha' entry of the grid below.
alphas = np.linspace(0,2,20)[1:]

pipe = Pipeline(
    [('cv', cv),
     ('model', model_mult_nb)]
)

# Keys are "<pipeline step>__<parameter>"; every combination is tried.
params = {'cv__ngram_range': [(1,1), (1,2), (2,2)],
          'cv__analyzer': ['word', 'char', 'char_wb'],
          'model__fit_prior': [True, False],
#           'cv__max_features': [4000, 5000, 6000, None],
#           'cv__ngram_range': [(1,1)],
#           'cv__min_df': [1, 5, 10, 15],
#           'cv__max_df': [0.10, 0.15, 0.2, 0.3],
#           'model__alpha' : alphas
    }
gs = GridSearchCV(pipe, param_grid=params, cv=5, verbose=2)
print(gs.fit(X_train, y_train))
print(gs.best_params_)
# best_score_ is the mean cross-validated score of the best candidate, while
# gs.score(X_test, y_test) is measured on the held-out test split; label the
# two numbers accordingly instead of calling both "Train Score".
print("Best CV Score: ", round(gs.best_score_, 4))
print("Test Score: ", round(gs.score(X_test,y_test), 4))

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] END cv__analyzer=word, cv__ngram_range=(1, 1), model__fit_prior=True; total time=  47.7s
[CV] END cv__analyzer=word, cv__ngram_range=(1, 1), model__fit_prior=True; total time=  31.0s
[CV] END cv__analyzer=word, cv__ngram_range=(1, 1), model__fit_prior=True; total time=  31.3s
[CV] END cv__analyzer=word, cv__ngram_range=(1, 1), model__fit_prior=True; total time=  32.1s
[CV] END cv__analyzer=word, cv__ngram_range=(1, 1), model__fit_prior=True; total time=  31.3s
[CV] END cv__analyzer=word, cv__ngram_range=(1, 1), model__fit_prior=False; total time=  31.3s
[CV] END cv__analyzer=word, cv__ngram_range=(1, 1), model__fit_prior=False; total time=  32.0s
[CV] END cv__analyzer=word, cv__ngram_range=(1, 1), model__fit_prior=False; total time=  31.4s
[CV] END cv__analyzer=word, cv__ngram_range=(1, 1), model__fit_prior=False; total time=  31.3s
[CV] END cv__analyzer=word, cv__ngram_range=(1, 1), model__fit_prior=False; total time=  

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Word-unigram counts feeding a multinomial naive Bayes classifier.
count_features = CountVectorizer(analyzer="word", ngram_range=(1, 1))
naive_bayes = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
model = make_pipeline(count_features, naive_bayes)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the predictions against the test-set ratings.
actual = df_test['overall']
accuracy = accuracy_score(actual, y_pred)
matthews_corrcoef_metric = matthews_corrcoef(actual, y_pred)
report = classification_report_imbalanced(actual, y_pred)

print('accuracy:', accuracy)
print('matthews_corrcoef:', matthews_corrcoef_metric)
print('classification_report:\n', report)

accuracy: 0.617576644277449
matthews_corrcoef: 0.3382875062068449
classification_report:
                    pre       rec       spe        f1       geo       iba       sup

        1.0       0.49      0.53      0.96      0.51      0.72      0.49      8386
        2.0       0.33      0.07      0.99      0.12      0.27      0.06      6711
        3.0       0.37      0.27      0.94      0.31      0.50      0.24     13262
        4.0       0.41      0.28      0.89      0.33      0.50      0.24     24579
        5.0       0.71      0.87      0.54      0.78      0.69      0.49     67358

avg / total       0.57      0.62      0.71      0.58      0.61      0.39    120296



In [None]:
architecture = 'CV+NB'

# Hyperparameters and run metadata tracked with the run.
run_config = {
    "architecture": architecture,
    "dataset": "Video_Games_5.json.gz",
}

# Open a run in the project where it will be logged. To log under a specific
# username or team, additionally pass entity=... (ex: entity="carey").
wandb.init(project="amazon-gamereviews", config=run_config)

# Name the run after the architecture and its generated id.
run_id = wandb.run.id
wandb.run.name = '_'.join((architecture, run_id))

# metrics whose maximum we are interested in
for metric_name in ("accuracy", "matthews_corrcoef"):
    wandb.define_metric(metric_name, summary="max")

# 2️⃣ Log metrics from your script to W&B
metrics = {"accuracy": accuracy, "matthews_corrcoef": matthews_corrcoef_metric}
wandb.log(metrics)

# Mark the run as finished
wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁
matthews_corrcoef,▁


## TFIDF + naive bayes

It's worse than using the count vectoriser: the scores are considerably lower.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# TF-IDF features feeding a multinomial naive Bayes classifier, both with
# their default settings.
tfidf_features = TfidfVectorizer()
naive_bayes = MultinomialNB()
model = make_pipeline(tfidf_features, naive_bayes)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the predictions against the test-set ratings.
actual = df_test['overall']
accuracy = accuracy_score(actual, y_pred)
matthews_corrcoef_metric = matthews_corrcoef(actual, y_pred)
report = classification_report_imbalanced(actual, y_pred)

print('accuracy:', accuracy)
print('matthews_corrcoef:', matthews_corrcoef_metric)
print('classification_report:\n', report)

accuracy: 0.5610743499368225
matthews_corrcoef: 0.03788252553922388
classification_report:
                    pre       rec       spe        f1       geo       iba       sup

        1.0       0.82      0.01      1.00      0.02      0.09      0.01      8386
        2.0       0.00      0.00      1.00      0.00      0.00      0.00      6711
        3.0       0.32      0.00      1.00      0.00      0.03      0.00     13262
        4.0       0.37      0.00      1.00      0.01      0.06      0.00     24579
        5.0       0.56      1.00      0.01      0.72      0.08      0.01     67358

avg / total       0.48      0.56      0.44      0.41      0.07      0.01    120296



In [None]:
architecture = 'TFIDF+NB'

# Hyperparameters and run metadata tracked with the run.
run_config = {
    "architecture": architecture,
    "dataset": "Video_Games_5.json.gz",
}

# Open a run in the project where it will be logged. To log under a specific
# username or team, additionally pass entity=... (ex: entity="carey").
wandb.init(project="amazon-gamereviews", config=run_config)

# Name the run after the architecture and its generated id.
run_id = wandb.run.id
wandb.run.name = '_'.join((architecture, run_id))

# metrics whose maximum we are interested in
for metric_name in ("accuracy", "matthews_corrcoef"):
    wandb.define_metric(metric_name, summary="max")

# 2️⃣ Log metrics from your script to W&B
metrics = {"accuracy": accuracy, "matthews_corrcoef": matthews_corrcoef_metric}
wandb.log(metrics)

# Mark the run as finished
wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁
matthews_corrcoef,▁


##  count vectorizer + random oversampling + naive bayes

The Matthews correlation coefficient is now 0.33. The recall for class 1 has improved.

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from collections import Counter

# The vocabulary is fitted on the training reviews only; the test reviews are
# merely transformed with it.
vectorizer = CountVectorizer(analyzer="word", ngram_range=(1, 1))
X_train_title_vec = vectorizer.fit_transform(X_train)
X_test_title_vec = vectorizer.transform(X_test)

# Oversample the training data only. The sampler is seeded (same seed as the
# train/test split) so the resampled set, and therefore the reported metrics,
# are reproducible across runs.
X_resampled, y_resampled = RandomOverSampler(random_state=42).fit_resample(X_train_title_vec, y_train)
print(sorted(Counter(y_resampled).items()))

# for alpha in np.linspace(0,2,20)[1:]:
model = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
model.fit(X_resampled, y_resampled)
y_pred = model.predict(X_test_title_vec)

accuracy = accuracy_score(df_test['overall'], y_pred)
matthews_corrcoef_metric = matthews_corrcoef(df_test['overall'], y_pred)

print('accuracy:', accuracy)
print('matthews_corrcoef:', matthews_corrcoef_metric)
print('classification_report:\n', classification_report_imbalanced(df_test['overall'], y_pred))

[(1.0, 157168), (2.0, 157168), (3.0, 157168), (4.0, 157168), (5.0, 157168)]
accuracy: 0.5570093768703864
matthews_corrcoef: 0.33455421412072117
classification_report:
                    pre       rec       spe        f1       geo       iba       sup

        1.0       0.30      0.69      0.88      0.41      0.78      0.59      8386
        2.0       0.22      0.28      0.94      0.24      0.51      0.24      6711
        3.0       0.31      0.32      0.91      0.32      0.54      0.27     13262
        4.0       0.39      0.30      0.88      0.34      0.52      0.25     24579
        5.0       0.80      0.71      0.77      0.75      0.74      0.54     67358

avg / total       0.59      0.56      0.83      0.57      0.66      0.44    120296



In [None]:
architecture = 'CV+random_oversampling+NB'

# Hyperparameters and run metadata tracked with the run.
run_config = {
    "architecture": architecture,
    "dataset": "Video_Games_5.json.gz",
}

# Open a run in the project where it will be logged. To log under a specific
# username or team, additionally pass entity=... (ex: entity="carey").
wandb.init(project="amazon-gamereviews", config=run_config)

# Name the run after the architecture and its generated id.
run_id = wandb.run.id
wandb.run.name = '_'.join((architecture, run_id))

# metrics whose maximum we are interested in
for metric_name in ("accuracy", "matthews_corrcoef"):
    wandb.define_metric(metric_name, summary="max")

# 2️⃣ Log metrics from your script to W&B
metrics = {"accuracy": accuracy, "matthews_corrcoef": matthews_corrcoef_metric}
wandb.log(metrics)

# Mark the run as finished
wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁
matthews_corrcoef,▁
