<a href="https://colab.research.google.com/github/kscheibner/data/blob/master/NLP_Summative_draft1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Notebook Setup

In [1]:
!pip install transformers



In [None]:
!pip install vaderSentiment

In [82]:
import pickle
import numpy as np
import pandas as pd
import torch
import warnings
import re
import operator

from html import unescape
from collections import defaultdict, Counter
from string import punctuation
from matplotlib import pyplot as plt
from nltk.util import bigrams
from tqdm import tqdm
from scipy.spatial.distance import cosine

from sklearn.feature_extraction import _stop_words
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import BertModel, BertTokenizer

warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load and Explore the Data

In [4]:
# Define function to clean text
def clean(text):
    try:
      text = unescape(text)
      text = re.sub( '[^a-zA-Z]', ' ', text)
      text = [w.lower().strip(punctuation) for w in text.split()]
      return text
    except:
      return None

In [5]:
tok = BertTokenizer.from_pretrained('bert-base-uncased')

with open('bert_vocab.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(list(tok.vocab)))

In [6]:
# Load data
with open('drive/MyDrive/nlp_summative_data/clothing_reviews.p', 'rb') as f:
    data = pickle.load(f)

# rename columns to match naming convention
data.rename(columns={"verified": "label"},inplace=True)

In [7]:
# Clean text to remove special characters, numbers, and blank reviews
data = data[~data.text.isna()]
data['cleaned'] = data.text.apply(clean)
data = data[data.cleaned.apply(lambda x: len(x) > 0)]
data.head()

Unnamed: 0,rating,text,label,cleaned
0,5.0,"This book has beautiful photos, good and under...",True,"[this, book, has, beautiful, photos, good, and..."
1,5.0,Loved their approach in this book and that it ...,True,"[loved, their, approach, in, this, book, and, ..."
2,5.0,great,True,[great]
3,5.0,"Always love the way Eva thinks, and there are ...",True,"[always, love, the, way, eva, thinks, and, the..."
4,5.0,Nice patterns,True,"[nice, patterns]"


In [8]:
# Load fastText embeddings
with open('drive/MyDrive/formative4_data_2022/fasttext_vectors.p', 'rb') as f:
    fasttext_vecs = pickle.load(f)

In [9]:
# Define list of labels
labels = ['unverified','verified']

# Define dictionary for label look-up
label2id = {'unverified': 0, 'verified': 1}

# Define dictionary for reverse label look-up
id2label = {v: k for k, v in label2id.items()}

# Convert labels
data['label'] = data.label.apply(lambda x: int(x))

# FOR DEBUGGING: Reduce dataset size to make the task more computationally tractable
data, _ = train_test_split(data, train_size=400, stratify=data['label'], random_state=0)

# Inspect dataframe
data

Unnamed: 0,rating,text,label,cleaned
10479,5.0,My kids even told me that this is a nice walle...,1,"[my, kids, even, told, me, that, this, is, a, ..."
265609,5.0,These shoes are very comfortable and durable. ...,1,"[these, shoes, are, very, comfortable, and, du..."
448110,4.0,good material,1,"[good, material]"
1106270,4.0,I like the style. It will fit just about anybody.,0,"[i, like, the, style, it, will, fit, just, abo..."
908481,5.0,I love this! My son looks so cute in his onesie.,0,"[i, love, this, my, son, looks, so, cute, in, ..."
...,...,...,...,...
63745,2.0,This chain had a bend in it that I could not g...,1,"[this, chain, had, a, bend, in, it, that, i, c..."
1313355,5.0,The back pack is perfect to look at but is ver...,0,"[the, back, pack, is, perfect, to, look, at, b..."
931724,3.0,"The ring got opened the first day of use, so I...",0,"[the, ring, got, opened, the, first, day, of, ..."
745623,5.0,Love these! Love that I can throw them in the ...,0,"[love, these, love, that, i, can, throw, them,..."


In [10]:
# Split data into training, development, and test sets
train, dev_test = train_test_split(data, test_size=0.2, stratify=data['label'], random_state=0)
dev, test = train_test_split(dev_test, test_size=0.5, stratify=dev_test['label'], random_state=0)

## Compare/Contrast Descriptive Features

### Average Length

In [13]:
print("Average number of words per review:")
for l in labels:
  print("\t{}: {} words".format(l.upper(), np.array([len(r) for r in train[train['label']==label2id[l]].cleaned]).mean()))

Average number of words per review:
	UNVERIFIED: 53.325 words
	VERIFIED: 33.1125 words


In [29]:
print("Average number of characters per (uncleaned) review:")
for l in labels:
  print("\t{}: {} chars".format(l.upper(), np.array([len(r) for r in train[train['label']==label2id[l]].text]).mean()))

Average number of characters per (uncleaned) review:
	UNVERIFIED: 275.9875 chars
	VERIFIED: 170.60625 chars


### Average Rating

In [14]:
print("Average rating per review:")
for l in labels:
  print("\t{}: {} stars".format(l.upper(), train[train['label']==label2id[l]].rating.mean()))

Average rating per review:
	UNVERIFIED: 4.25 stars
	VERIFIED: 4.45625 stars


### Create Unigram Vocabularies for Each Category 

In [26]:
# stopwords + common particles remaining after removing apostrophes
stops = list(_stop_words.ENGLISH_STOP_WORDS) + ['t', 's','m','ve']

In [27]:
# Define function to create dictionary
def create_dict(review_list):
    # Create a list of all words in the reviews
    word_list = [w for r in review_list for w in r]
    # Create a dictionary of words and corresponding frequency counts 
    vocab_dict = Counter(word_list)
    # Return filtered dictionary, removing all stopwords
    return Counter({w: c for w, c in vocab_dict.items() if not w in stops})

#### View Most Frequent Unigrams

In [28]:
# Initialize data structures
vocab_dicts = defaultdict(Counter)

top30_df = pd.DataFrame(columns=labels)

for label in labels:
    vocab_dicts[label] = create_dict(train[train['label']==label2id[label]].cleaned)
    top30_df[label] = np.array(vocab_dicts[label].most_common()[:30])[:,0]

top30_df

Unnamed: 0,unverified,verified
0,size,size
1,fit,like
2,love,fit
3,great,great
4,like,wear
5,wear,comfortable
6,shirt,just
7,comfortable,good
8,good,pair
9,just,bought


#### Calculate Unigram Jaccard Similarity

In [29]:
# Function for calculating jacccard Similarity
def jaccard_sim(vocab_1, vocab_2):
    intersection = len(set(vocab_1).intersection(set(vocab_2)))
    union = len(set(vocab_1).union(set(vocab_2)))
    return intersection / union

In [30]:
# Initialize similarity table as dictionary
table = dict()

for l1 in labels:
    
    table[l1] = list()
    
    for l2 in labels:
        
        # Calculate Jaccard similarity 
        table[l1].append(jaccard_sim(vocab_dicts[l1], vocab_dicts[l2]))
        
# Display table
pd.DataFrame.from_dict(table, orient='index', columns=labels).reindex(index=[l for l in labels])

Unnamed: 0,unverified,verified
unverified,1.0,0.259128
verified,0.259128,1.0


### Create Bigram Vocabularies for Review Categories

In [20]:
# Define function to create dictionary
def create_bigram_dict(review_list):

    # Create a list of all bigrams in the reviews
    bigram_list = [b for r in review_list for b in list(bigrams(r))]

    # Create a dictionary of bigrams and corresponding frequencies
    vocab_dict = Counter(bigram_list) 

    # Return filtered bigram dictionary (no stopwords)
    return Counter({b: c for b, c in vocab_dict.items() if ((not (b[0] in stops)) and (not (b[1] in stops)))})

#### View Most Frequent Bigrams

In [21]:
# Initialize data structures
vocab_bi = defaultdict(Counter)

# Create vocabularies
for review, label in zip(train.cleaned, train.label):
    if review:
       vocab_bi[id2label[label]].update(bigrams([w for w in review if w not in stops]))
    
top30_bigrams_df = pd.DataFrame(columns=labels)

# Inspect most frequent bigrams
for l in labels:
    top30_bigrams_df[l] = np.array(vocab_bi[l].most_common(30), dtype=object)[:,0]

display(top30_bigrams_df)

Unnamed: 0,unverified,verified
0,"(honest, review)","(great, quality)"
1,"(received, product)","(feel, like)"
2,"(exchange, honest)","(smaller, size)"
3,"(product, discount)","(arrived, quickly)"
4,"(wear, size)","(year, old)"
5,"(quick, drying)","(love, shoes)"
6,"(water, shoes)","(fit, perfect)"
7,"(year, old)","(different, colors)"
8,"(drying, water)","(good, quality)"
9,"(discount, exchange)","(fit, good)"


#### Calculate Bigram Jaccard Similarity

In [22]:
# Initialize similarity matrix
sims_bi = defaultdict(Counter)

# Fill similarity matrix
for l_1 in labels:
    for l_2 in labels:
        sims_bi[l_1][l_2] = jaccard_sim(set(vocab_bi[l_1]), set(vocab_bi[l_2]))
        
# Display similarity matrix
pd.DataFrame.from_dict(sims_bi, orient='index', columns=labels).reindex(index=labels)

Unnamed: 0,unverified,verified
unverified,1.0,0.020076
verified,0.020076,1.0


### Cosine Similarity of Review Embeddings

In [31]:
# Define function to compute review embedding
def review2vector(review, vectors):
    return np.mean([vectors[w] for w in review if w in vectors and w not in stops], axis=0)

In [32]:
# Generate new column in dataframe containing review embeddings
train['vector'] = train['cleaned'].apply(lambda x: review2vector(x, fasttext_vecs))

  out=out, **kwargs)


In [33]:
# Create dictionary with aggregated review vectors
review_vectors = {l: np.mean(train[train.label==label2id[l]].vector, axis=0) for l in labels}

In [34]:
def cosine_sim(v_i, v_j):
    return 1 - cosine(v_i, v_j)

In [35]:
# Initialize table as dictionary
table = dict()

for l_i in labels:
    
    # Skip RANDOM category    
    if l_i == 'RANDOM':
        continue
    
    table[l_i] = list()
    
    for l_j in labels:
        
        # Calculate Cosine similarity
        table[l_i].append(cosine_sim(review_vectors[l_i], review_vectors[l_j]))

In [36]:
# Display table
pd.DataFrame.from_dict(table, orient='index', columns=labels).reindex(index=[l for l in labels])

Unnamed: 0,unverified,verified
unverified,1.0,0.989542
verified,0.989542,1.0


## Analyze Sentiment

In [30]:
def score2int(score):
    '''
    Utility function to classify the polarity of a review
    using textblob and vader.
    '''
    label = 1 # assume neutral
    
    # positive 
    if score > 0.05:
        label = 2
    
    # negative 
    elif score < -0.5:
        label = 0
        
    return label

In [31]:
# create vader sentiment analyser
vader_obj = SentimentIntensityAnalyzer()

# Describe avg sentiment of unverified & verified reviews
# 

train['textblob'] = train['text'].apply(lambda x: TextBlob(x).sentiment[0])
train['vader'] = train['text'].apply(lambda x: vader_obj.polarity_scores(x)['compound'])

for l_i in labels: 
    print(l_i.upper())
    display(train[train['label']==label2id[l_i]].describe())

UNVERIFIED


Unnamed: 0,rating,label,textblob,vader
count,160.0,160.0,160.0,160.0
mean,4.25,0.0,0.320126,0.629121
std,1.170927,0.0,0.280761,0.408304
min,1.0,0.0,-0.7,-0.7407
25%,4.0,0.0,0.145942,0.510275
50%,5.0,0.0,0.305808,0.7823
75%,5.0,0.0,0.5,0.924725
max,5.0,0.0,1.0,0.9974


VERIFIED


Unnamed: 0,rating,label,textblob,vader
count,160.0,160.0,160.0,160.0
mean,4.45625,1.0,0.315703,0.570189
std,0.96379,0.0,0.269772,0.386195
min,1.0,1.0,-0.25,-0.7711
25%,4.0,1.0,0.120833,0.42115
50%,5.0,1.0,0.270133,0.6823
75%,5.0,1.0,0.4925,0.866125
max,5.0,1.0,1.0,0.9801


# TSNE

# Build Classifiers

In [11]:
# Create dictionary for word look-up
word_counter = Counter([w for r in train.cleaned for w in r])
w2id = {w: i + 2 for i, w in enumerate(w for w, c in word_counter.most_common())}

# Create dictionary for reverse word look-up
id2w = {i: w for w, i in w2id.items()}

## Naive Bayes

In [116]:
# Define function to get P(w|c_i), class-conditional propbabilities for w
def naive_bayes_unsmoothed(vocab, labels):
    
    # Calculate unsmoothed probabilities
    probabilities = dict()
    
    for l_i in labels:
        
        # First, we create a partial copy of our vocab count dict, selecting only words that occur in both classes (--> no smoothing)
        probabilities[l_i] = {word: vocab[l_i][word] for word in vocab[l_i] if vocab['verified'][word] > 0 and vocab['unverified'][word] > 0}
        
        # Second, we take the sum of counts of words in this new dict
        total = sum(probabilities[l_i].values())
        
        # Last, we turn the counts for each word into probabilities by dividing them by that sum
        probabilities[l_i] = {word: probabilities[l_i][word] / total for word in probabilities[l_i]}
    
    return probabilities

In [117]:
# Train Naive Bayes without smoothing
probabilities_unsmoothed = naive_bayes_unsmoothed(vocab_dicts, labels)

In [118]:
def get_nb_predictions(categories, test_reviews, probabilities):

    # Initialize lists for storing ground truth labels and predictions
    lbls = list()
    predictions = list()

    # Loop over categories
    for l_i in labels:

        # Loop over test reviews
        for review in test[test['label']==labels.index(l_i)]:

            # Store ground truth
            lbls.append(l_i)

            # For each post, calculate scores for each of the two categories
            scores = {'verified': 0, 'unverified': 0}
            for word in review:
                if word in probabilities[l_i]:
                    scores["verified"] += np.log(probabilities["verified"][word])
                    scores["unverified"] += np.log(probabilities["unverified"][word])

            # Use higher score for prediction
            predictions.append(max(scores.items(), key=operator.itemgetter(1))[0])

    return lbls, predictions

In [119]:
lbls, predictions = get_nb_predictions(labels, test, probabilities_unsmoothed)

In [120]:
print("Our classifier is {:.2%} accurate on the test set".format(np.mean(pd.Series(lbls)==pd.Series(predictions))))

Our classifier is 50.00% accurate on the test set


## NB With Smoothing

In [121]:
# Naive Bayes with additive smoothing
# Define function to get P(w|c_i), class-conditional propbabilities for w

def naive_bayes_additive_smoothing(vocab, categories, smoothing_alpha):
    
    # Calculate unsmoothed probabilities
    probabilities = dict()
    
    for l_i in labels:
        
        probabilities[l_i] = dict()
        
        # First, consider all words that are in the vocab for either class
        for word in set(vocab["unverified"]).union(set(vocab["verified"])):
            # If they do exist in the current class c_i, store their count --> 1st order model
            if vocab[l_i][word]>0:
                probabilities[l_i][word] = vocab[l_i][word]
            else:
                probabilities[l_i][word] = smoothing_alpha
        
        # Second, we take the sum of counts of words in this new dict
        total = sum(probabilities[l_i].values())
        
        # Last, we turn the counts for each word into probabilities by dividing them by that sum
        probabilities[l_i] = {word: probabilities[l_i][word] / total for word in probabilities[l_i]}
    
    return probabilities

In [122]:
alpha = 0.000000001

In [123]:
# Train
probs = naive_bayes_additive_smoothing(vocab_dicts, labels, alpha)

# Get predictions on dev set
lbls, predictions = get_nb_predictions(labels, dev, probs)

# Calculate and store macro F1 on test set
f1_score(lbls, predictions, average="macro")

0.5

## Bert Classifier

#### Initialize Datasets and Model 

In [12]:
# Define dataset class
class BERTDataset(Dataset):

    def __init__(self, data):
        
        # Initialize tokenizer
        self.tok = BertTokenizer.from_pretrained('bert-base-uncased')
        
        # Truncate and encode reviews
        self.reviews = list(data.text.apply(self.tok.encode, max_length=100, truncation=True))
        
        # Store labels
        self.labels = list(data.label)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        review = self.reviews[idx]
        label = self.labels[idx]
        return review, label

In [13]:
# Define collate function
def bert_collate(batch):
    
    # Store batch size
    batch_size = len(batch)
    
    # Separate review and labels
    reviews = [r for r, _ in batch]
    labels = torch.tensor([l for _, l in batch]).long()
    
    # Store length of longest review in batch
    max_len = max(len(r) for r in reviews)
    
    # Create padded reviews and attention mask tensors 
    # (the latter to avoid performing attention on padding token indices)
    reviews_pad = torch.zeros((batch_size, max_len)).long()
    masks_pad = torch.zeros((batch_size, max_len)).long()
    for i, r in enumerate(reviews):
        reviews_pad[i, :len(r)] = torch.tensor(r)
        masks_pad[i, :len(r)] = 1
    
    return reviews_pad, masks_pad, labels

In [14]:
# Define BERT classifier
class BERTClassifier(nn.Module):

    def __init__(self):
        
        # Define network layers
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.linear = nn.Linear(768, 4)
        
        # Define dropout
        self.dropout = nn.Dropout(0.2)
        
        # Freeze BERT layers
        for n, p in self.bert.named_parameters():
            p.requires_grad = False

    def forward(self, reviews, masks):
        
        # Define flow of tensors through network
        output_bert = self.bert(reviews, attention_mask=masks)[0].mean(axis=1)
        return self.linear(self.dropout(output_bert))

In [15]:
%%time

# Create datasets
train_dataset = BERTDataset(train)
dev_dataset = BERTDataset(dev)
test_dataset = BERTDataset(test)

CPU times: user 872 ms, sys: 23.2 ms, total: 895 ms
Wall time: 12 s


In [16]:
# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=100, collate_fn=bert_collate, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=100, collate_fn=bert_collate)
test_loader = DataLoader(test_dataset, batch_size=100, collate_fn=bert_collate)

In [17]:
# Initialize model
model = BERTClassifier()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
# Define optimizer and training objective
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

#### Train model

In [19]:
# Define device and move model to CUDA if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [20]:
%%time

# Train model
for e in range(1, 6):

    model.train()

    for i, b in enumerate(tqdm(train_loader)):

        # Perform forward pass
        optimizer.zero_grad()
        reviews, masks, lbls = [t.to(device) for t in b]
        output = model(reviews, masks)
        loss = criterion(output, lbls)
        
        # Perform backpropagation and update weights
        loss.backward()
        optimizer.step()
  
    # Evaluate model on development data
    model.eval()

    y_true = list()
    y_pred = list()

    with torch.no_grad():
        for b in dev_loader:
            reviews, masks, lbls = [t.to(device) for t in b]
            output = model(reviews, masks)
            max_output = output.argmax(dim=1)
            y_true.extend(lbls.tolist())
            y_pred.extend(max_output.tolist())
            
    print('Accuracy after {} epoch(s): {:.2f}'.format(e, accuracy_score(y_true, y_pred)))

100%|██████████| 4/4 [00:01<00:00,  3.98it/s]


Accuracy after 1 epoch(s): 0.62


100%|██████████| 4/4 [00:00<00:00,  4.31it/s]


Accuracy after 2 epoch(s): 0.65


100%|██████████| 4/4 [00:00<00:00,  4.33it/s]


Accuracy after 3 epoch(s): 0.60


100%|██████████| 4/4 [00:00<00:00,  4.33it/s]


Accuracy after 4 epoch(s): 0.62


100%|██████████| 4/4 [00:00<00:00,  4.31it/s]

Accuracy after 5 epoch(s): 0.65
CPU times: user 5.53 s, sys: 81.8 ms, total: 5.61 s
Wall time: 5.63 s





#### Test model

In [21]:
%%time

# Evaluate model on test data
model.eval()

y_true = list()
y_pred = list()

with torch.no_grad():
    for b in test_loader:
        reviews, masks, lbls = [t.to(device) for t in b]
        output = model(reviews, masks)
        max_output = output.argmax(dim=1)
        y_true.extend(lbls.tolist())
        y_pred.extend(max_output.tolist())

print('Test accuracy: {:.2f}'.format(accuracy_score(y_true, y_pred)))

Test accuracy: 0.78
CPU times: user 142 ms, sys: 21 µs, total: 142 ms
Wall time: 141 ms


#### Confusion Matrix

In [22]:
# Initialize confusion matrix
c_matrix = defaultdict(Counter)

# Fill confusion matrix
for t, p in zip(y_true, y_pred):
    c_matrix[id2label[t]][id2label[p]] += 1

# Display confusion matrix
pd.DataFrame.from_dict(c_matrix, orient='index', columns=labels).reindex(index=labels)

Unnamed: 0,unverified,verified
unverified,14,6
verified,3,17
