In [1]:
import thinc
import random
import spacy
import GPUtil
import torch
import pandas as pd
from spacy.util import minibatch
from tqdm.auto import tqdm
import unicodedata
import wasabi
import numpy
from collections import Counter
from project_functions import *

### Check GPU

In [2]:
spacy.util.fix_random_seed(0)
is_using_gpu = spacy.prefer_gpu()
if is_using_gpu:
    torch.set_default_tensor_type("torch.cuda.FloatTensor")
    print("GPU Usage")
    GPUtil.showUtilization()

### Load Data

In [3]:
df = pd.read_csv('csv/sqr_comments_sentiment.csv')

In [4]:
df.shape

(985, 45)

### Clean comments

In [5]:
cleanText(df, 'comments')

### Change compound scores to binary

In [9]:
df['compound_binary'] = df['compound'].apply(lambda x: 0 if x < 0 else (1 if x > 0 else x))

### Create list of tuples column w/ (comments, sqr_rating) for each school

In [12]:
df['tuples'] = list(zip(df.comments, df.compound_binary))

### Functions for formatting and loading training and evaluation data

In [15]:
# Partitions tuples into text and labels for sqr_rating values
def _prepare_partition(text_label_tuples, *, preprocess=False):
    texts, labels = zip(*text_label_tuples)
    cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
    return texts, cats

# limit: how many examples to load from data, dev_size: size of hold-out set
def load_data(df, *, limit=0, dev_size=98): 
    """Load data, splitting off a held-out set."""
    if limit != 0:
        limit += dev_size 
    assert dev_size != 0
    
    # load training data
    train_data = df    
    # len(train_data) = 985 > dev_size = 98
    assert len(train_data) > dev_size    
    # training data is shuffled
    random.shuffle(train_data)
    # dev_data = first 98 entries of training data
    dev_data = train_data[:dev_size]
    # train_data = from 98th entry onwards of training data for length of 887
    train_data = train_data[dev_size:]
    # partition tuples into text and labels -> train_texts, train_labels
    train_texts, train_labels = _prepare_partition(train_data, preprocess=False) 
    # partition tuples into text and labels -> dev_texts, dev_labels
    dev_texts, dev_labels = _prepare_partition(dev_data, preprocess=False)
    return (train_texts, train_labels), (dev_texts, dev_labels)

### Create training and evaluation text and labels

In [16]:
(train_texts, train_cats), (eval_texts, eval_cats) = load_data(df['tuples'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x[i], x[j] = x[j], x[i]


### Load language model

In [17]:
nlp = spacy.load('en_trf_bertbaseuncased_lg')
print(nlp.pipe_names)

['sentencizer', 'trf_wordpiecer', 'trf_tok2vec']


### Instantiate classifier

In [18]:
textcat = nlp.create_pipe("trf_textcat", config={"architecture": "softmax_class_vector"})

### Add label to text classifier

In [19]:
 # add label to text classifier
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")

1

In [20]:
print("Labels:", textcat.labels)
nlp.add_pipe(textcat, last=True)
print(f"Using {len(train_texts)} training docs, {len(eval_texts)} evaluation")

Labels: ('POSITIVE', 'NEGATIVE')
Using 887 training docs, 98 evaluation


In [21]:
# total_words = sum(len(text.split()) for text in train_texts)
train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))

In [22]:
print('type(train_data) : {} of tuples'.format(type(train_data)))
print('')
print('type(train_data[0]) : {}'.format(type(train_data[0])))
print('type(train_data[0]) : {}'.format(train_data[0]))
print('')
print('type(train_data[0][0]) : {}'.format(type(train_data[0][0])))
print('len(train_data[0][0]) : {}'.format(len(train_data[0][0])))
print('train_data[0][0][0:7] : {}'.format(train_data[0][0][0:7]))
print('')
print('type(train_data[0][1]) : {}'.format(type(train_data[0][1])))
print('type(train_data[0][1]) : {}'.format(train_data[0][1]))
print('type(train_data[0][1][\'cats\']) : {}'.format(train_data[0][1]['cats']))
print('type(train_data[0][1][\'cats\'][\'6\']) : {}'.format(train_data[0][1]['cats']['6']))

type(train_data) : <class 'list'> of tuples

type(train_data[0]) : <class 'tuple'>
type(train_data[0]) : ('hawtree creek middle school has held its first commencement ceremony  the students from the schools first graduating class will be attending close to  different high schools starting in september read more a hrefhttpwwwqchroncomeditionssouthagamblepaysoffforsopstudentsarticleefafdcbbfcebchtmlherea ms  lets students take the lead in parentteacher conferences with resounding success read more a hrefhttpwwwqchroncomeditionssouthmssnewkindofparentteachermeetingarticleaacfbecbccdfehtmlherea', {'cats': {'POSITIVE': True, 'NEGATIVE': False}})

type(train_data[0][0]) : <class 'str'>
len(train_data[0][0]) : 490
train_data[0][0][0:7] : hawtree

type(train_data[0][1]) : <class 'dict'>
type(train_data[0][1]) : {'cats': {'POSITIVE': True, 'NEGATIVE': False}}
type(train_data[0][1]['cats']) : {'POSITIVE': True, 'NEGATIVE': False}


KeyError: '6'

---

### Set hyperparameters

In [23]:
n_iter=4
n_texts=1000 #Changed number of texts to 75 to relieve pressue on GPU memory
batch_size=8 #batch-szie changed to 4 to relieve pressure on GPU memory
learn_rate=2e-5
max_wpb=1000
pos_label="POSITIVE"

### Adaptive learning rate function

In [24]:
def cyclic_triangular_rate(min_lr, max_lr, period):
    it = 1
    while True:
        # https://towardsdatascience.com/adaptive-and-cyclical-learning-rates-using-pytorch-2bf904d18dee
        cycle = numpy.floor(1 + it / (2 * period))
        x = numpy.abs(it / period - 2 * cycle + 1)
        relative = max(0, 1 - x)
        yield min_lr + (max_lr - min_lr) * relative
        it += 1

### Evaluation function (precision, recall, f1)

In [25]:
# nlp = spaCy Language Transformer, texts = eval_texts, cats = eval_cats, pos_label = 'POSITIVE'
def evaluate(nlp, texts, cats, pos_label):
    tp = 0.0  # True positives
    fp = 0.0  # False positives
    fn = 0.0  # False negatives
    tn = 0.0  # True negatives
    total_words = sum(len(text.split()) for text in texts)
    with tqdm(total=total_words, leave=False) as pbar:
        for i, doc in enumerate(nlp.pipe(texts, batch_size=batch_size)):
            gold = cats[i]
            for label, score in doc.cats.items():
                if label not in gold:
                    continue
                if label != pos_label:
                    continue
                if score >= 0.5 and gold[label] >= 0.5:
                    tp += 1.0
                elif score >= 0.5 and gold[label] < 0.5:
                    fp += 1.0
                elif score < 0.5 and gold[label] < 0.5:
                    tn += 1
                elif score < 0.5 and gold[label] >= 0.5:
                    fn += 1
            pbar.update(len(doc.text.split()))
    precision = tp / (tp + fp + 1e-8)
    recall = tp / (tp + fn + 1e-8)
    if (precision + recall) == 0:
        f_score = 0.0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}

### Initialize the TextCategorizer, and create an optimizer.

In [26]:
optimizer = nlp.resume_training()
optimizer.alpha = 0.001
optimizer.trf_weight_decay = 0.005
optimizer.L2 = 0.0
learn_rates = cyclic_triangular_rate(
    learn_rate / 3, learn_rate * 3, 2 * len(train_data) // batch_size
    )
print("Training the model...")
print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))

pbar = tqdm(total=100, leave=False)
results = []
epoch = 0
step = 0
eval_every = 100
patience = 3
while True:
    # Train and evaluate
    losses = Counter()
    random.shuffle(train_data)
    batches = minibatch(train_data, size=batch_size)
    for batch in batches:
        optimizer.trf_lr = next(learn_rates)
        texts, annotations = zip(*batch)
        nlp.update(texts, annotations, sgd=optimizer, drop=0.1, losses=losses)
        pbar.update(1)
        if step and (step % eval_every) == 0:
            pbar.close()
            with nlp.use_params(optimizer.averages):
                # nlp = spaCy Language Transformer
                scores = evaluate(nlp, eval_texts, eval_cats, pos_label)
            results.append((scores["textcat_f"], step, epoch))
            print(
                "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(
                    losses["trf_textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )
            )
            pbar = tqdm(total=eval_every, leave=False)
        step += 1
    epoch += 1
    print(f"epoch {epoch}")
    # Stop if no improvement in HP.patience checkpoints
    if results:
        best_score, best_step, best_epoch = max(results)
        print(f"best score: {best_score}  best_step : {best_step}  best epoch : {best_epoch} ")
        print(f"break clause: {((step - best_step) // eval_every)}")
        if ((step - best_step) // eval_every) >= patience:
            break

    msg = wasabi.Printer()
    table_widths = [2, 4, 6]
    msg.info(f"Best scoring checkpoints")
    msg.row(["Epoch", "Step", "Score"], widths=table_widths)
    msg.row(["-" * width for width in table_widths])
    for score, step, epoch in sorted(results, reverse=True)[:10]:
        msg.row([epoch, step, "%.2f" % (score * 100)], widths=table_widths)

    # Test the trained model
    test_text = eval_texts[0]
    doc = nlp(test_text)
    print(test_text, doc.cats)

Training the model...
LOSS 	  P  	  R  	  F  


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=70064.0), HTML(value='')))

0.497	0.765	1.000	0.867


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

epoch 1
best score: 0.8670520230211501  best_step : 100  best epoch : 0 
break clause: 0
[38;5;4mℹ Best scoring checkpoints[0m
Epoch   Step   Score 
--   ----   ------
0    100    86.71 
child safety is not their concern children are being lined up outside the school with no supervision parents standing around in cluster where as children are unable to get in line because of this  security guard is either inside the school or standing at the top of the stairs the parent coordinator is never outside you only see her once in a while standing outside with the children they have a empty school yard where the children can line up by class just like they use to what going to happen come winter snow and the rain not to mention the cold they have to stand outside until the parent coordinator come out and tell them to go in  thats another thing she will tell the th graders to go into the school first while the nd graders have to stand outside you see there is many men shelters in the area as 

HBox(children=(FloatProgress(value=0.0, max=70064.0), HTML(value='')))

0.006	0.765	1.000	0.867


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=70064.0), HTML(value='')))

0.524	0.773	1.000	0.872


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

epoch 1
best score: 0.8720930231544078  best_step : 200  best epoch : 0 
break clause: 0
[38;5;4mℹ Best scoring checkpoints[0m
Epoch   Step   Score 
--   ----   ------
0    200    87.21 
0    100    86.71 
0    100    86.71 
child safety is not their concern children are being lined up outside the school with no supervision parents standing around in cluster where as children are unable to get in line because of this  security guard is either inside the school or standing at the top of the stairs the parent coordinator is never outside you only see her once in a while standing outside with the children they have a empty school yard where the children can line up by class just like they use to what going to happen come winter snow and the rain not to mention the cold they have to stand outside until the parent coordinator come out and tell them to go in  thats another thing she will tell the th graders to go into the school first while the nd graders have to stand outside you see ther

HBox(children=(FloatProgress(value=0.0, max=70064.0), HTML(value='')))

0.008	0.773	1.000	0.872


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=70064.0), HTML(value='')))

0.484	0.773	1.000	0.872


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

epoch 1
best score: 0.8720930231544078  best_step : 200  best epoch : 0 
break clause: 0
[38;5;4mℹ Best scoring checkpoints[0m
Epoch   Step   Score 
--   ----   ------
0    200    87.21 
0    200    87.21 
0    100    87.21 
0    100    86.71 
0    100    86.71 
child safety is not their concern children are being lined up outside the school with no supervision parents standing around in cluster where as children are unable to get in line because of this  security guard is either inside the school or standing at the top of the stairs the parent coordinator is never outside you only see her once in a while standing outside with the children they have a empty school yard where the children can line up by class just like they use to what going to happen come winter snow and the rain not to mention the cold they have to stand outside until the parent coordinator come out and tell them to go in  thats another thing she will tell the th graders to go into the school first while the nd grad

HBox(children=(FloatProgress(value=0.0, max=70064.0), HTML(value='')))

0.008	0.765	1.000	0.867


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=70064.0), HTML(value='')))

0.481	0.765	1.000	0.867


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

epoch 1
best score: 0.8720930231544078  best_step : 200  best epoch : 0 
break clause: 0
[38;5;4mℹ Best scoring checkpoints[0m
Epoch   Step   Score 
--   ----   ------
0    200    87.21 
0    200    87.21 
0    100    87.21 
0    200    86.71 
0    100    86.71 
0    100    86.71 
0    100    86.71 
child safety is not their concern children are being lined up outside the school with no supervision parents standing around in cluster where as children are unable to get in line because of this  security guard is either inside the school or standing at the top of the stairs the parent coordinator is never outside you only see her once in a while standing outside with the children they have a empty school yard where the children can line up by class just like they use to what going to happen come winter snow and the rain not to mention the cold they have to stand outside until the parent coordinator come out and tell them to go in  thats another thing she will tell the th graders to go i

HBox(children=(FloatProgress(value=0.0, max=70064.0), HTML(value='')))

0.010	0.765	1.000	0.867


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=70064.0), HTML(value='')))

0.511	0.765	1.000	0.867


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

epoch 1
best score: 0.8720930231544078  best_step : 200  best epoch : 0 
break clause: 0
[38;5;4mℹ Best scoring checkpoints[0m
Epoch   Step   Score 
--   ----   ------
0    200    87.21 
0    200    87.21 
0    100    87.21 
0    200    86.71 
0    200    86.71 
0    100    86.71 
0    100    86.71 
0    100    86.71 
0    100    86.71 
child safety is not their concern children are being lined up outside the school with no supervision parents standing around in cluster where as children are unable to get in line because of this  security guard is either inside the school or standing at the top of the stairs the parent coordinator is never outside you only see her once in a while standing outside with the children they have a empty school yard where the children can line up by class just like they use to what going to happen come winter snow and the rain not to mention the cold they have to stand outside until the parent coordinator come out and tell them to go in  thats another thin

HBox(children=(FloatProgress(value=0.0, max=70064.0), HTML(value='')))

0.009	0.765	1.000	0.867


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=70064.0), HTML(value='')))

0.503	0.765	1.000	0.867


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

KeyboardInterrupt: 