In [1]:
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
from sklearn.utils import resample
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier
from project_functions import *
import spacy
from spacy.lang.en import English
spacy.load('en')
parser = English()

# Spacy transformer
import thinc
import random
import GPUtil
import torch
from spacy.util import minibatch
from tqdm.auto import tqdm
import unicodedata
import wasabi
import numpy
from collections import Counter



OSError: [E050] Can't find model 'en'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

### Import DF

In [2]:
df = pd.read_csv('csv/sqr_comments_sentiment.csv')

---

### Clean Text

In [3]:
# Clean text function from project_functions file. Removes punctuation, whitespace, numbers, and makes text lowercase
cleanupText(df, 'comments')

---

### Upsample minority class to address class imbalance

In [4]:
# separate minority and majority classes
negative = df[df.compound_binary==0]
positive = df[df.compound_binary==1]

# upsample minority
negative_upsampled = resample(negative,
                          replace=True, # sample with replacement
                          n_samples=len(positive), # match number in majority class
                          random_state=23) # reproducible results

# combine majority and upsampled minority
df = pd.concat([positive, negative_upsampled])

---

### Train Test Split

In [5]:
#TRAIN/TEST SPLIT
X = df['comments'].values
y = df['compound_binary'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)

---

### Instantiate, fit, and encode using TfidfVectorizer

In [6]:
vectorizer = TfidfVectorizer()
tf_idf_data_train = vectorizer.fit_transform(X_train)
tf_idf_data_test = vectorizer.transform(X_test)

---

### Baseline: Dummy Classifier

In [7]:
#Fitting & predicting the Dummy Classifier (Baseline Model)
dclf = DummyClassifier() 
dclf.fit(tf_idf_data_train, y_train)
dummy_test_preds = dclf.predict(X_test)



In [8]:
# Get scores
dummy_prec_test_score = precision_score(y_test, dummy_test_preds)
dummy_recall_test_score = recall_score(y_test, dummy_test_preds)
dummy_f1_test_score = f1_score(y_test, dummy_test_preds, average='macro')
dummy_scores = pd.DataFrame({'Model':['Dummy Classifier'], 'Precision':[dummy_prec_test_score], 'Recall':[dummy_recall_test_score], 'Test F1':[dummy_f1_test_score]})
dummy_scores

Unnamed: 0,Model,Precision,Recall,Test F1
0,Dummy Classifier,0.490196,0.520833,0.505051


---

### Naive Bayes Classifier

In [9]:
# Instantiate Naieve Bayes Classifier
nb_classifier = MultinomialNB(alpha=.03)

# Predict using Naieve Bayes Classifier
nb_classifier.fit(tf_idf_data_train, y_train)
nb_train_preds = nb_classifier.predict(tf_idf_data_train)
nb_test_preds = nb_classifier.predict(tf_idf_data_test)

# Get scores
nb_prec_train_score = precision_score(y_train, nb_train_preds)
nb_prec_test_score = precision_score(y_test, nb_test_preds)
nb_recall_train_score = recall_score(y_train, nb_train_preds)
nb_recall_test_score = recall_score(y_test, nb_test_preds)
nb_f1_train_score = f1_score(y_train, nb_train_preds, average='macro')
nb_f1_test_score = f1_score(y_test, nb_test_preds, average='macro')
nb_scores = pd.DataFrame({'Model':['Naieve Bayes'], 'Train Precision':[nb_prec_train_score], 'Test Precision':[nb_prec_test_score], 'Train Recall':[nb_recall_train_score], 'Test Recall':[nb_recall_test_score], 'Train F1':[nb_f1_train_score], 'Test F1':[nb_f1_test_score]})
nb_scores

Unnamed: 0,Model,Train Precision,Test Precision,Train Recall,Test Recall,Train F1,Test F1
0,Naieve Bayes,0.998311,0.943878,0.985,0.963542,0.991582,0.954527


### Random Forest Classifier

In [10]:
# Instantiate Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=1000, min_samples_leaf=.001, n_jobs=-1)
# rf_classifier = RandomForestClassifier(max_depth=20, n_estimators=1000, min_samples_leaf=.001, n_jobs=-1)

# Predict using Random Forest Classifier
rf_classifier.fit(tf_idf_data_train, y_train)
rf_train_preds = rf_classifier.predict(tf_idf_data_train)
rf_test_preds = rf_classifier.predict(tf_idf_data_test)

# Get scores
rf_prec_train_score = precision_score(y_train, rf_train_preds)
rf_prec_test_score = precision_score(y_test, rf_test_preds)
rf_recall_train_score = recall_score(y_train, rf_train_preds)
rf_recall_test_score = recall_score(y_test, rf_test_preds)
rf_f1_train_score = f1_score(y_train, rf_train_preds, average='macro')
rf_f1_test_score = f1_score(y_test, rf_test_preds, average='macro')
rf_scores = pd.DataFrame({'Model':['Random Forest'], 'Train Precision':[rf_prec_train_score], 'Test Precision':[rf_prec_test_score], 'Train Recall':[rf_recall_train_score], 'Test Recall':[rf_recall_test_score], 'Train F1':[rf_f1_train_score], 'Test F1':[rf_f1_test_score]})
rf_scores

Unnamed: 0,Model,Train Precision,Test Precision,Train Recall,Test Recall,Train F1,Test F1
0,Random Forest,0.996593,0.944751,0.975,0.890625,0.98569,0.921452


### SVC Classifier using Spacy

In [11]:
# Train test split
train, test = train_test_split(df, random_state=333)

In [12]:
# Spacy's standard transformer
STOPLIST = set(stopwords.words('english') + list(ENGLISH_STOP_WORDS))
SYMBOLS = " ".join(string.punctuation).split(" ") + ["-", "...", "”", "”"]

class CleanTextTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        return [cleanText(text) for text in X]
    def fit(self, X, y=None, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}
    
def cleanText(text):
    text = text.strip().replace("\n", " ").replace("\r", " ")
    text = text.lower()
    return text

def tokenizeText(sample):
    tokens = parser(sample)
    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas
    tokens = [tok for tok in tokens if tok not in STOPLIST]
    tokens = [tok for tok in tokens if tok not in SYMBOLS]
    return tokens

In [21]:
# Instantiate vectorizer, classifier, and pipeline
vectorizer = TfidfVectorizer(tokenizer=tokenizeText)
clf = LinearSVC(tol=1e-5, C=1, dual=True, max_iter=2000)
pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer), ('clf', clf)])

# Create training and testing dependent/independent variables
train1 = train['comments'].tolist()
print(train1)
labelsTrain1 = train['compound_binary'].tolist()

test1 = test['comments'].tolist()
labelsTest1 = test['compound_binary'].tolist()


# Fit the LinearSVC pipeline to the training data
pipe.fit(train1, labelsTrain1)

# Training predictions 
train_preds = pipe.predict(train1)
svc_prec_train_score = precision_score(labelsTrain1, train_preds)
svc_recall_train_score = recall_score(labelsTrain1, train_preds)
svc_f1_train_score = f1_score(labelsTrain1, train_preds, average='macro')

# Testing predictions
preds = pipe.predict(test1)
svc_prec_test_score = precision_score(labelsTest1, preds)
svc_recall_test_score = recall_score(labelsTest1, preds)
svc_f1_test_score = f1_score(labelsTest1, preds, average='macro')

pd.DataFrame({'Model':['Linear SVC w/ Spacy'], 'Train Precision':[svc_prec_train_score], 'Test Precision':[svc_prec_test_score], 'Train Recall':[svc_recall_train_score], 'Test Recall':[svc_recall_test_score], 'Train F1':[svc_f1_train_score], 'Test F1':[svc_f1_test_score]})

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Unnamed: 0,Model,Train Precision,Test Precision,Train Recall,Test Recall,Train F1,Test F1
0,Linear SVC w/ Spacy,1.0,0.994624,0.99665,0.948718,0.998316,0.972182


---

### Sentiment Classifier using spaCy Transformer and BERT embeddings

In [10]:
import thinc
import random
import GPUtil
import torch
from spacy.util import minibatch
from tqdm.auto import tqdm
import unicodedata
import wasabi
import numpy
from collections import Counter
from project_functions import *

Checks if GPU is in usage

In [5]:
spacy.util.fix_random_seed(0)
is_using_gpu = spacy.prefer_gpu()
if is_using_gpu:
    torch.set_default_tensor_type("torch.cuda.FloatTensor")
    print("GPU Usage")
    GPUtil.showUtilization()
else:
    print('GPU not in use.')

GPU not in use.


Reload the dataframe because when creating the training and testing text and labels below it will not accept the upsampled data due to duplicates.

In [27]:
df = pd.read_csv('/Users/kevinmacmat/Desktop/capstone/csv/sqr_comments_sentiment.csv')
cleanupText(df, 'comments')
df.head()

Unnamed: 0,dbn,school_name,school_type,enrollment,rigor_instruction_rating,collab_teachers_rating,support_environ_rating,effective_lead_rating,fam_comm_ties_rating,trust_rating,...,pct_chronic_absent,teacher_attendance_rate,sqr_rating,borough,comments,pos,neg,neu,compound,compound_binary
0,01M015,P.S. 015 Roberto Clemente,Elementary,161,1.0,1.0,1.0,1.0,1.0,0.75,...,0.227,0.974,6.0,manhattan,ps is an extraordinary small school that goes...,0.173,0.006,0.821,0.9995,1.0
1,01M019,P.S. 019 Asher Levy,Elementary,239,1.0,1.0,0.75,1.0,1.0,0.75,...,0.343,0.966,6.0,manhattan,wrong person the negative comments dont do jus...,0.154,0.054,0.792,0.9959,1.0
2,01M020,P.S. 020 Anna Silver,Elementary,439,0.25,0.5,0.5,0.5,0.75,0.75,...,0.296,0.966,5.0,manhattan,while we have always lived with the notion tha...,0.259,0.017,0.724,0.9999,1.0
3,01M034,P.S. 034 Franklin D. Roosevelt,K-8,288,0.5,0.5,0.5,0.5,0.75,0.5,...,0.455,0.968,4.0,manhattan,the doe has tabled a proposal to combine ps w...,0.111,0.107,0.782,0.7877,1.0
4,01M063,The STAR Academy - P.S.63,Elementary,207,1.0,1.0,0.75,0.75,1.0,0.75,...,0.347,0.967,6.0,manhattan,this school is an amazing school because we ha...,0.181,0.05,0.769,0.9992,1.0


Create list of tuples column w/ (comments, compound_binary) for each school in order to fit formatting requirements.

In [28]:
# e.g. [('text', score) . . .]
df['tuples'] = list(zip(df.comments, df.compound_binary))
df['tuples'].head()

0    (ps  is an extraordinary small school that goe...
1    (wrong person the negative comments dont do ju...
2    (while we have always lived with the notion th...
3    (the doe has tabled a proposal to combine ps  ...
4    (this school is an amazing school because we h...
Name: tuples, dtype: object

Functions for splitting texts and labels, and loading data.

In [None]:
# Partitions tuples into text and labels for compound_binary values
def _prepare_partition(text_label_tuples, *, preprocess=False):
    # texts = tuple of sentence strings and labels = tuple of 0 or 1's
    # e.g. ('texts are here', 'texts . . . ', . . .)
    texts, labels = zip(*text_label_tuples)
    # [{'POSITIVE': False, 'NEGATIVE': True}, {'POSITIVE': True, . . .}, {. . .} . . .]
    cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
    return texts, cats

# limit: how many examples to load from data, dev_size: size of hold-out set
def load_data(df, *, limit=0, dev_size=98): 
    """Load data, splitting off a held-out set."""
    if limit != 0:
        limit += dev_size 
    assert dev_size != 0
    
    # load training data: df['tuples'] e.g. [('text', score), . . .]
    train_data = df    
    # len(train_data) = 985 > dev_size = 98
    assert len(train_data) > dev_size    
    # training data is shuffled
    random.shuffle(train_data)
    # dev_data = first 98 entries of training data
    dev_data = train_data[:dev_size]
    # train_data = from 98th entry onwards of training data for length of 887
    train_data = train_data[dev_size:]
    # partition tuples into text and labels -> train_texts, train_labels
    train_texts, train_labels = _prepare_partition(train_data, preprocess=False) 
    # partition tuples into text and labels -> dev_texts, dev_labels
    dev_texts, dev_labels = _prepare_partition(dev_data, preprocess=False)
    return (train_texts, train_labels), (dev_texts, dev_labels)

Create training and testing text and labels

In [None]:
(train_texts, train_cats), (eval_texts, eval_cats) = load_data(df['tuples'])

Load language model

In [None]:
nlp = spacy.load('en_trf_bertbaseuncased_lg')
print(nlp.pipe_names)

Instantiate classifier

In [None]:
textcat = nlp.create_pipe("trf_textcat", config={"architecture": "softmax_class_vector"})

Add labels to text classifier

In [None]:
 # add label to text classifier
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")

Add classifier to pipeline

In [None]:
print("Labels:", textcat.labels)
nlp.add_pipe(textcat, last=True)
print(f"Using {len(train_texts)} training docs, {len(eval_texts)} testing docs")

Format train_data as a list of tuples with text at index 0 and a dictionary of labels at index 1

In [None]:
train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))

Set hyperparameters

In [None]:
n_iter=4
n_texts=1000 
batch_size=8 
learn_rate=2e-5
max_wpb=1000
pos_label="POSITIVE"

Function for adaptive learning rate <br>
Info: https://towardsdatascience.com/adaptive-and-cyclical-learning-rates-using-pytorch-2bf904d18dee


In [None]:
def cyclic_triangular_rate(min_lr, max_lr, period):
    it = 1
    while True:
        # https://towardsdatascience.com/adaptive-and-cyclical-learning-rates-using-pytorch-2bf904d18dee
        cycle = numpy.floor(1 + it / (2 * period))
        x = numpy.abs(it / period - 2 * cycle + 1)
        relative = max(0, 1 - x)
        yield min_lr + (max_lr - min_lr) * relative
        it += 1

Evaluation function for precision, recall, and f1 scores

In [None]:
# nlp = spaCy Language Transformer, texts = eval_texts, cats = eval_cats, pos_label = 'POSITIVE'
def evaluate(nlp, texts, cats, pos_label):
    tp = 0.0  # True positives
    fp = 0.0  # False positives
    fn = 0.0  # False negatives
    tn = 0.0  # True negatives
    total_words = sum(len(text.split()) for text in texts)
    with tqdm(total=total_words, leave=False) as pbar:
        for i, doc in enumerate(nlp.pipe(texts, batch_size=batch_size)):
            gold = cats[i]
            for label, score in doc.cats.items():
                if label not in gold:
                    continue
                if label != pos_label:
                    continue
                if score >= 0.5 and gold[label] >= 0.5:
                    tp += 1.0
                elif score >= 0.5 and gold[label] < 0.5:
                    fp += 1.0
                elif score < 0.5 and gold[label] < 0.5:
                    tn += 1
                elif score < 0.5 and gold[label] >= 0.5:
                    fn += 1
            pbar.update(len(doc.text.split()))
    precision = tp / (tp + fp + 1e-8)
    recall = tp / (tp + fn + 1e-8)
    if (precision + recall) == 0:
        f_score = 0.0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}

Using stochastic gradient descent data is evaluated and optimized in batches 

In [None]:
optimizer = nlp.resume_training()
optimizer.alpha = 0.001
optimizer.trf_weight_decay = 0.005
optimizer.L2 = 0.0
learn_rates = cyclic_triangular_rate(
    learn_rate / 3, learn_rate * 3, 2 * len(train_data) // batch_size
    )
print("Training the model...")
print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))

pbar = tqdm(total=100, leave=False)
results = []
epoch = 0
step = 0
eval_every = 100
patience = 3
while True:
    # Train and evaluate
    losses = Counter()
    random.shuffle(train_data)
    batches = minibatch(train_data, size=batch_size)
    for batch in batches:
        optimizer.trf_lr = next(learn_rates)
        texts, annotations = zip(*batch)
        nlp.update(texts, annotations, sgd=optimizer, drop=0.1, losses=losses)
        pbar.update(1)
        if step and (step % eval_every) == 0:
            pbar.close()
            with nlp.use_params(optimizer.averages):
                # nlp = spaCy Language Transformer
                scores = evaluate(nlp, eval_texts, eval_cats, pos_label)
            results.append((scores["textcat_f"], step, epoch))
            print(
                "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(
                    losses["trf_textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )
            )
            pbar = tqdm(total=eval_every, leave=False)
        step += 1
    epoch += 1
    print(f"epoch {epoch}")
    # Stop if no improvement in HP.patience checkpoints
    if results:
        best_score, best_step, best_epoch = max(results)
        print(f"best score: {best_score}  best_step : {best_step}  best epoch : {best_epoch} ")
        print(f"break clause: {((step - best_step) // eval_every)}")
        if ((step - best_step) // eval_every) >= patience:
            break

    msg = wasabi.Printer()
    table_widths = [2, 4, 6]
    msg.info(f"Best scoring checkpoints")
    msg.row(["Epoch", "Step", "Score"], widths=table_widths)
    msg.row(["-" * width for width in table_widths])
    for score, step, epoch in sorted(results, reverse=True)[:10]:
        msg.row([epoch, step, "%.2f" % (score * 100)], widths=table_widths)

    # Test the trained model
    test_text = eval_texts[0]
    doc = nlp(test_text)
    print(test_text, doc.cats)