In [22]:
import numpy as np
import pandas as pd
import seaborn as sns
import random
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler, SequentialSampler
import transformers
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
import torch
import pandas as pd
from tqdm.notebook import tqdm
from transformers import AdamW, get_linear_schedule_with_warmup
import pickle
# Display settings: show up to 500 rows, and never truncate the text of a cell
# (max_colwidth=None) when a DataFrame is rendered.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', None)


In [23]:
def make_pairs(corpus):
    """Lazily yield each (item, following item) pair of an indexable sequence.

    A sequence with fewer than two items yields nothing.
    """
    position = 1
    total = len(corpus)
    while position < total:
        yield (corpus[position - 1], corpus[position])
        position += 1

def hmm_generate(text, n_words=20):
    """Generate text with a random walk over a word-to-next-word table.

    All strings in ``text`` are joined with spaces and split on whitespace
    to form the corpus. Every word is mapped to the list of words that
    directly follow it (duplicates kept, so frequent successors are drawn
    more often). The walk starts at a random corpus word and repeatedly
    draws a successor of the last word.

    Parameters
    ----------
    text : pandas.Series of str
        Source sentences.
    n_words : int, default 20
        Maximum number of words appended after the starting word.

    Returns
    -------
    str
        The generated words joined by single spaces. The result has fewer
        than ``n_words + 1`` words when the walk reaches a word that has no
        recorded successor, and is ``''`` when the corpus has no words.
    """
    corpus = text.str.cat(sep=' ').split()
    if not corpus:
        # np.random.choice raises on an empty sequence.
        return ''

    # Adjacent-word pairs are built inline so the function is self-contained.
    successors = {}
    for current_word, next_word in zip(corpus, corpus[1:]):
        successors.setdefault(current_word, []).append(next_word)

    chain = [np.random.choice(corpus)]
    for _ in range(n_words):
        candidates = successors.get(chain[-1])
        if not candidates:
            # Dead end: this word only occurs as the final token of the
            # corpus, so there is nothing to follow it. Stop instead of
            # raising KeyError.
            break
        chain.append(np.random.choice(candidates))

    return ' '.join(chain)

In [24]:
def get_dataloader(data, labels=None, shuffle=False):
    """Tokenize texts and wrap them, together with labels, in a DataLoader.

    Relies on the notebook-level ``tokenizer``, ``max_length`` and
    ``batch_size``.

    Parameters
    ----------
    data : sequence of str
        Texts to encode.
    labels : array-like, optional
        One label per text. When omitted, the notebook-level
        ``data_labels`` is used.
    shuffle : bool, default False
        When False the loader iterates in input order, so predictions
        collected batch by batch line up with the rows of ``data``. Pass
        True to draw examples with a ``RandomSampler`` instead.

    Returns
    -------
    torch.utils.data.DataLoader
        Batches of ``(input_ids, attention_mask, labels)``.
    """
    if labels is None:
        labels = data_labels

    # Encode: every sequence is truncated/padded to exactly max_length tokens.
    # padding='max_length' replaces the deprecated pad_to_max_length=True.
    encoded_data = tokenizer.batch_encode_plus(
        data,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        return_tensors='pt',
        max_length=max_length,
        truncation=True
    )

    dataset = TensorDataset(
        encoded_data['input_ids'],
        encoded_data['attention_mask'],
        torch.tensor(labels)
    )

    # Sequential order by default: a random sampler would scramble the
    # correspondence between output rows and input rows at inference time.
    sampler = RandomSampler(dataset) if shuffle else SequentialSampler(dataset)

    return DataLoader(
        dataset,
        sampler=sampler,
        batch_size=batch_size
    )

In [25]:
def evaluate(dataloader_val):
    """Run the global ``model`` over a dataloader without gradient tracking.

    Each batch is read positionally as (input ids, attention mask, labels).

    Returns a 3-tuple: the loss averaged over batches, the logits of all
    batches concatenated along axis 0, and the labels concatenated the same
    way (both as NumPy arrays).
    """
    model.eval()

    running_loss = 0
    predictions = []
    true_vals = []

    for raw_batch in dataloader_val:
        # Move every tensor of the batch onto the global `device`.
        on_device = tuple(tensor.to(device) for tensor in raw_batch)
        labels = on_device[2]

        with torch.no_grad():
            outputs = model(
                input_ids=on_device[0],
                attention_mask=on_device[1],
                labels=labels,
            )

        # The first two outputs are used as (loss, logits).
        running_loss += outputs[0].item()
        predictions.append(outputs[1].detach().cpu().numpy())
        true_vals.append(labels.cpu().numpy())

    return (
        running_loss / len(dataloader_val),
        np.concatenate(predictions, axis=0),
        np.concatenate(true_vals, axis=0),
    )

<b> Read in unseen data </b>

In [62]:
# Placeholder names col1..col10 for a file that is read without a header row.
col_names = [f'col{idx}' for idx in range(1, 11)]
df = pd.read_csv(
    "test.csv",
    names=col_names,
    header=None,
    low_memory=False,
)

<b> Clean unseen data </b>

In [63]:
# Keep the three columns of interest and rows 1-50 by position. Row 0 is
# skipped -- presumably it holds the file's own header line, since the CSV was
# read with header=None (TODO confirm). drop_duplicates() returns a new frame,
# so the later column assignments do not write into a slice of the raw data.
df = (
    df[['col3', 'col4', 'col6']]
    .iloc[1:51]
    .drop_duplicates()
)
df.columns = ['label', 'prompt', 'response']

# '_comma_' is a literal placeholder, so a plain (non-regex) replace suffices.
df['prompt'] = df['prompt'].str.replace('_comma_', ',', regex=False)
df['response'] = df['response'].str.replace('_comma_', ',', regex=False)

<b> Load and apply label dictionaries</b>

In [64]:
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
# Context managers close the file handles as soon as the objects are read.
with open('label_dict.pickle', 'rb') as fh:
    label_dict = pickle.load(fh)
with open('rev_label_dict.pickle', 'rb') as fh:
    rev_label_dict = pickle.load(fh)

In [65]:
# Substitute label values using label_dict; values that are not keys of the
# dict are left unchanged by Series.replace.
df['label'] = df.label.replace(label_dict)

<b> Loading Tokenizer </b>

In [66]:
# Lower-casing tokenizer for 'bert-base-uncased', the same checkpoint name the
# classification model is created from further down.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

<b> Set values </b>

In [67]:
# Token length every sequence is truncated/padded to in get_dataloader.
max_length = 512
# Number of examples per DataLoader batch.
batch_size = 32

In [68]:
# Label array that get_dataloader reads as a global when it builds the label tensor.
data_labels = df.label.values

In [69]:
# Seed every random number generator used in this notebook with the same value.
seed_val = 17
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(seed_val)

<b> Create data loaders </b>

In [70]:
# One loader per text column; both are paired with the same global data_labels.
dataloader_prompts = get_dataloader(df.prompt.values)
dataloader_responses = get_dataloader(df.response.values)

<b> Load BERT model </b>

In [71]:
# Build the classifier with one output per entry of label_dict. Saved weights
# are loaded into it from "working_BERT.model" in a later cell.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = len(label_dict),
    output_attentions=False,
    output_hidden_states=False
);

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [72]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device);

In [73]:
# Read the saved weights onto the CPU, then copy them into the model.
checkpoint_path = "working_BERT.model"
cpu_device = torch.device('cpu')
model.load_state_dict(torch.load(checkpoint_path, map_location=cpu_device))

<All keys matched successfully>

<b> Generate sentiment predictions for prompts and responses </b>

In [74]:
# Keep only the logits (second return value of evaluate); the average loss and
# the label array are discarded.
_, prompts_pred, _ = evaluate(dataloader_prompts)
_, responses_pred, _ = evaluate(dataloader_responses)

In [85]:
# Annotated data whose `response` column is the corpus for text generation below.
annotated = pd.read_csv("annotated_train.csv")

In [136]:
# Reference example: the row with index label 42 (looks like an arbitrary pick -- TODO confirm).
sample = annotated.loc[42]

In [123]:
# Responses whose prompt sentiment and top response topic both match the sample row.
same_sentiment = annotated.prompt_sentiment == sample.prompt_sentiment
same_topic = annotated.response_top_topic == sample.response_top_topic
sample_corpus = annotated.response[same_sentiment & same_topic]

<b> Text generation with Markov Chain </b>

In [135]:
# Generate from the responses filtered to the sample's sentiment and topic.
hmm_generate(sample_corpus)

'it. How can really stinks, I bet it is conditional people who does little jealous? Did they have a new job'

In [134]:
# Generate from every annotated response, without filtering.
hmm_generate(annotated.response)

'when I love getting a final exam, thank God of nowhere,lol. When did they cared for. After 3 months back in'

<b> Text generation with GPT-2 </b>

In [140]:
!pip install -q gpt-2-simple
!pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.3.0-cp37-cp37m-manylinux2010_x86_64.whl (320.4 MB)
[K     |███████████████████████████▋    | 276.2 MB 112.6 MB/s eta 0:00:01

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[K     |████████████████████████████████| 320.4 MB 30 kB/s 
Collecting keras-preprocessing<1.2,>=1.1.1
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 1.6 MB/s  eta 0:00:01
[?25hCollecting gast==0.3.3
  Downloading gast-0.3.3-py2.py3-none-any.whl (9.7 kB)
Collecting tensorboard<3,>=2.3.0
  Downloading tensorboard-2.3.0-py3-none-any.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 65.5 MB/s eta 0:00:01
Collecting absl-py>=0.7.0
  Downloading absl_py-0.10.0-py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 81.7 MB/s eta 0:00:01
[?25hCollecting opt-einsum>=2.3.2
  Downloading opt_einsum-3.3.0-py3-none-any.whl (65 kB)
[K     |████████████████████████████████| 65 kB 5.4 MB/s  eta 0:00:01
Collecting google-pasta>=0.1.8
  Downloading google_pasta-0.2.0-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 7.2 MB/s  eta 0:00:01
Collecting tensorflow-estimator<2.

In [139]:
import tensorflow
import gpt_2_simple as gpt2

ModuleNotFoundError: No module named 'tensorflow'