# Evaluating Classification Models of Personalized Persuasive Dialogue Systems

# Goal:
## 1 - Generalized combination of strategies that leads to successful persuasion / dialogue (Measured by donation in dataset)

## 2 - Importance of pysch profiles in determining approach to persuasion

# Work Log:
## July 16, 2021:
* Initialized Pandas data frames one containing persuader strategies used, and the other whether or not the dialogue ended in a donation (i.e. Successful dialogue or not). 
  * Both of these data frames are linked by specific dialogue identification numbers. 
* Successfully implemented an elementary 'features' method.
* Successfully implemented a simple Naive Bayes' Classifier that uses the 'features' method.

## July 17, 2021:
* Experimented with features
  * Concluded that setting features manually had next to no effect in classification model
  * Without any features the model had avg 60% of predicting correct
* Decided a BLSTM model would be the way to go

## July 18, 2021:
* Re-Worked data initialization feature to better suit PyTorch
  * Created a method for generating training and testing data with crucial features (i.e. Dialogue ID, Strategies Used, Score)
* Began work on BRNN implementation
  * Impelemented first version of BRNN class

## July 20, 2021

In [1]:

''' Imports '''

# General Imports
from os import path
import math
import random
import pandas as pd
import json


# NLTK Imports
import nltk
from nltk.classify import apply_features

# NumPy Imports
import numpy as np
from numpy import argmax

# PyTorch Imports
import torch
from torchtext.legacy import data
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import torch.utils
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm


In [2]:

''' Generating Data '''


def generate_json_data():
    '''
    Method Info - generate_json_data() :
    check if "persuasion_data.json" file already exists
    import data from .xlsx files
    create two Pandas dataframes
    create a list of tuples containing dialogue ID and score from one of the dataframes
    create a dictionary using dialogue IDs as keys and list of strategies used in the dialogue as values
    generate a .json file to hold data for torchtext processing in format {"Dialogue ID": ID str, "Strategies": Strategies list, "Score": score int}
    '''

    if path.exists("test_data.json") and path.exists("train_data.json"):
        print("Files already exists")
    else:

        ''' Initializing ID and Score data from 'info.xlsx' '''

        info = pd.read_excel(r'info.xlsx')
        info = info[info.B4 != 1]
        info = info.reset_index()

        # checking if donation was made to create binary score scheme
        score = []
        for row in info.itertuples():
            if row[6] > 0:
                score.append(1)
            else:
                score.append(0)
        info['Score'] = score

        # dropping unnecessary columns from dataframe
        info.drop(['index', 'B3', 'B4', 'B5', 'B6', 'B7'],
                  inplace=True, axis=1)

        # creating finished id_and_score list of tuples using generator
        id_and_score = sorted([(row[1], row[2]) for row in info.itertuples()])

        ''' Initializing ID and Strategy data from 'xlsx' file in the form of Pandas Dataframe '''

        main = pd.read_excel(r'dialog.xlsx')
        main = main[main.B4 != 1]
        main = main.reset_index()
        main.drop(['Unnamed: 0', 'index', 'B4', 'Turn', 'Unit', 'ee_label_1',
                   'ee_label_2', 'er_label_2', 'neg', 'neu', 'pos'], inplace=True, axis=1)

        ''' Transforming Data from Pandas Dataframe to Python Dictionary '''

        dialogue_IDs = []           # list of all dialogue IDs
        strats = []                 # temp list used for strats
        id_and_strat = {}           # dict for id and strat

        # iterating through main dataframe
        for pos, row in enumerate(main.itertuples()):

            if row[1] not in dialogue_IDs:
                # appending IDs to dialogue IDs list
                dialogue_IDs.append(row[1])

                # ID access mechanism
                current_ID = dialogue_IDs[-1]
                if len(dialogue_IDs) > 1:
                    prev_ID = dialogue_IDs[-2]
                else:
                    prev_ID = dialogue_IDs[-1]

                # appending strategy list to dictionary
                strats.insert(0, '<START>')
                strats.append('<END>')
                s = strats
                strats = []
                id_and_strat[prev_ID] = s

            # mechanism for last dialogue
            elif pos == (len(main)-1):

                strats.insert(0, '<START>')
                strats.append('<END>')
                s = strats
                strats = []
                id_and_strat[current_ID] = s

            # appending persuasion strategies to strategy list
            if current_ID == row[1]:
                strats.append(row[2])

        ''' Formatting and Outputing it to a .json file '''

        jsonFile1 = open("test_data.json", "w")
        jsonFile2 = open("train_data.json", "w")
        for i, (n, s) in enumerate(id_and_score):
            aDict = {"Identification": n,
                     "Content": id_and_strat[n], "Score": s}
            jsonString = json.dumps(aDict)
            # if i == 0:
            #     jsonFile1.write('{\n')
            #     jsonFile2.write('{\n')

            if i < (len(id_and_score)/2):
                jsonFile1.write(jsonString + "\n")
            # elif i == (len(id_and_score)-1):
            #     jsonFile1.write(jsonString + '\n}')
            #     jsonFile2.write(jsonString + '\n}')
            elif i >= (len(id_and_score)/2):
                jsonFile2.write(jsonString + "\n")

        jsonFile1.close()
        jsonFile2.close()

        ''' Output Message '''
        print("Files now exist")


# calling function
generate_json_data()


Files already exists


In [3]:

''' Settings '''

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters
input_size = 30
sequence_length = 1
num_layers = 2
hidden_size = 15  # expected input features
num_classes = 31
learning_rate = 0.00005
batch_size = 1
num_epochs = 11


In [4]:

''' Loading Data '''

# setting fields
# ide = data.Field(use_vocab=True)
content = data.Field(sequential=True, use_vocab=True)
score = data.Field(sequential=False, use_vocab=False)

fields = {"Content": ('c', content), "Score": ('s', score)}
# "Identification": ('i', ide),
# importing data from json files
train_data, test_data = data.TabularDataset.splits(
    path='',
    train="train_data.json",
    test="test_data.json",
    format="json",
    fields=fields
)


# building vocabulary
content.build_vocab(train_data,
                    max_size=1000,
                    min_freq=1)

print(len(content.vocab))

score.build_vocab(train_data,
                  max_size=2,
                  min_freq=1)
# setting iterators
train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data),
    batch_size=batch_size,
    device=device)


31


In [5]:

''' Creating a bidirectional LSTM '''


class BRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(BRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
                            batch_first=True, bidirectional=True)
        self.fc = nn.Linear((hidden_size*2), num_classes)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers*2, x.size(0),
                         self.hidden_size).to(device)
        c0 = torch.zeros(self.num_layers*2, x.size(0),
                         self.hidden_size).to(device)

        out, _ = self.lstm(x, (h0, c0))
        x = x.view(x.size(0), -1)
        out = self.fc(out[:, -1, :])

        return out


In [6]:
# Defining Checkpoint Methods

def save_checkpoint(state, filename='my_checkpoint.pth.tar'):
    print('=> Saving checkpoint')
    torch.save(state, filename)


def load_checkpoint(checkpoint, model, optimizer):
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])


In [7]:
# Initilaizing network

model = BRNN(input_size, hidden_size, num_layers, num_classes).to(device)

# Loss and optimizer

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


In [8]:
''' Checking Accuracy '''


def check_accuracy(iterator, model):
    num_correct = 0
    num_samples = 0

    # Set model to eval
    model.eval()

    with torch.no_grad():
        for batch in iterator:
            x = batch.c.to(device).squeeze(1)
            x = nn.functional.pad(x, (0, input_size-len(x)), 'constant', 99)
            x = data.reshape(batch_size, sequence_length,
                             input_size).to(device)

            y = batch.s.to(device)

            scores = model(torch.tensor(x, dtype=torch.float))
            _, predictions = scores.max(1)
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)

    # Toggle model back to train
    model.train()
    return num_correct / num_samples


In [9]:
''' Training the Network '''

total_step = len(train_iterator)
for epoch in range(num_epochs):
    for batch_idx, batch in enumerate(tqdm(train_iterator)):

        # Data Shape = [batch_size, seq_len, input_size]
        # num_sample
        # Get data to cuda if possible
        data = batch.c.to(device).squeeze(1)
        data = nn.functional.pad(
            data, (0, input_size-len(data)), 'constant', 99)
        data = data.reshape(batch_size, sequence_length, input_size).to(device)
        targets = batch.s.to(device)

        # forward
        scores = model(torch.tensor(data, dtype=torch.float))
        loss = criterion(scores, targets)

        # backward
        optimizer.zero_grad()
        loss.backward()

        # gradient descent update step/adam step
        optimizer.step()
    
print(f"Learning Rate: {learning_rate} Accuracy on training set: {check_accuracy(train_iterator, model)*100:2f}")


  scores = model(torch.tensor(data, dtype=torch.float))
100%|██████████| 150/150 [00:00<00:00, 264.24it/s]
100%|██████████| 150/150 [00:00<00:00, 502.06it/s]
100%|██████████| 150/150 [00:00<00:00, 466.12it/s]
100%|██████████| 150/150 [00:00<00:00, 419.59it/s]
100%|██████████| 150/150 [00:00<00:00, 500.86it/s]
100%|██████████| 150/150 [00:00<00:00, 494.10it/s]
100%|██████████| 150/150 [00:00<00:00, 489.27it/s]
100%|██████████| 150/150 [00:00<00:00, 435.92it/s]
100%|██████████| 150/150 [00:00<00:00, 378.33it/s]
100%|██████████| 150/150 [00:00<00:00, 414.24it/s]
100%|██████████| 150/150 [00:00<00:00, 448.70it/s]
  scores = model(torch.tensor(x, dtype=torch.float))
 25%|██▌       | 38/150 [00:00<00:00, 378.53it/s]

Learning Rate: 1e-05 Accuracy on training set: 60.666668


100%|██████████| 150/150 [00:00<00:00, 427.87it/s]
100%|██████████| 150/150 [00:00<00:00, 482.04it/s]
100%|██████████| 150/150 [00:00<00:00, 486.53it/s]
100%|██████████| 150/150 [00:00<00:00, 460.11it/s]
100%|██████████| 150/150 [00:00<00:00, 366.13it/s]
100%|██████████| 150/150 [00:00<00:00, 487.87it/s]
100%|██████████| 150/150 [00:00<00:00, 464.86it/s]
100%|██████████| 150/150 [00:00<00:00, 504.68it/s]
100%|██████████| 150/150 [00:00<00:00, 510.63it/s]
100%|██████████| 150/150 [00:00<00:00, 479.51it/s]
100%|██████████| 150/150 [00:00<00:00, 421.43it/s]
 31%|███       | 46/150 [00:00<00:00, 452.73it/s]

Learning Rate: 2e-05 Accuracy on training set: 60.666668


100%|██████████| 150/150 [00:00<00:00, 478.53it/s]
100%|██████████| 150/150 [00:00<00:00, 479.70it/s]
100%|██████████| 150/150 [00:00<00:00, 499.24it/s]
100%|██████████| 150/150 [00:00<00:00, 503.21it/s]
100%|██████████| 150/150 [00:00<00:00, 449.45it/s]
100%|██████████| 150/150 [00:00<00:00, 476.84it/s]
100%|██████████| 150/150 [00:00<00:00, 469.65it/s]
100%|██████████| 150/150 [00:00<00:00, 469.42it/s]
100%|██████████| 150/150 [00:00<00:00, 476.18it/s]
100%|██████████| 150/150 [00:00<00:00, 502.33it/s]
100%|██████████| 150/150 [00:00<00:00, 503.85it/s]
 69%|██████▉   | 104/150 [00:00<00:00, 519.21it/s]

Learning Rate: 3.0000000000000004e-05 Accuracy on training set: 60.666668


100%|██████████| 150/150 [00:00<00:00, 505.97it/s]
100%|██████████| 150/150 [00:00<00:00, 516.66it/s]
100%|██████████| 150/150 [00:00<00:00, 482.24it/s]
100%|██████████| 150/150 [00:00<00:00, 484.70it/s]
100%|██████████| 150/150 [00:00<00:00, 512.21it/s]
100%|██████████| 150/150 [00:00<00:00, 506.37it/s]
100%|██████████| 150/150 [00:00<00:00, 514.12it/s]
100%|██████████| 150/150 [00:00<00:00, 491.73it/s]
100%|██████████| 150/150 [00:00<00:00, 495.82it/s]
100%|██████████| 150/150 [00:00<00:00, 473.69it/s]
100%|██████████| 150/150 [00:00<00:00, 460.37it/s]
 35%|███▍      | 52/150 [00:00<00:00, 515.36it/s]

Learning Rate: 4e-05 Accuracy on training set: 60.666668


100%|██████████| 150/150 [00:00<00:00, 492.66it/s]
100%|██████████| 150/150 [00:00<00:00, 506.72it/s]
100%|██████████| 150/150 [00:00<00:00, 504.98it/s]
100%|██████████| 150/150 [00:00<00:00, 510.79it/s]
100%|██████████| 150/150 [00:00<00:00, 492.45it/s]
100%|██████████| 150/150 [00:00<00:00, 460.78it/s]
100%|██████████| 150/150 [00:00<00:00, 477.88it/s]
100%|██████████| 150/150 [00:00<00:00, 517.49it/s]
100%|██████████| 150/150 [00:00<00:00, 444.23it/s]
100%|██████████| 150/150 [00:00<00:00, 474.77it/s]
100%|██████████| 150/150 [00:00<00:00, 477.02it/s]
 32%|███▏      | 48/150 [00:00<00:00, 476.39it/s]

Learning Rate: 5e-05 Accuracy on training set: 60.666668


100%|██████████| 150/150 [00:00<00:00, 462.39it/s]
100%|██████████| 150/150 [00:00<00:00, 460.65it/s]
100%|██████████| 150/150 [00:00<00:00, 298.15it/s]
100%|██████████| 150/150 [00:00<00:00, 387.70it/s]
100%|██████████| 150/150 [00:00<00:00, 430.58it/s]
100%|██████████| 150/150 [00:00<00:00, 477.23it/s]
100%|██████████| 150/150 [00:00<00:00, 480.20it/s]
100%|██████████| 150/150 [00:00<00:00, 475.34it/s]
100%|██████████| 150/150 [00:00<00:00, 485.69it/s]
100%|██████████| 150/150 [00:00<00:00, 472.97it/s]
100%|██████████| 150/150 [00:00<00:00, 478.08it/s]
 33%|███▎      | 50/150 [00:00<00:00, 492.43it/s]

Learning Rate: 6.000000000000001e-05 Accuracy on training set: 60.666668


100%|██████████| 150/150 [00:00<00:00, 477.17it/s]
100%|██████████| 150/150 [00:00<00:00, 478.49it/s]
100%|██████████| 150/150 [00:00<00:00, 485.06it/s]
100%|██████████| 150/150 [00:00<00:00, 467.40it/s]
100%|██████████| 150/150 [00:00<00:00, 478.21it/s]
100%|██████████| 150/150 [00:00<00:00, 480.72it/s]
100%|██████████| 150/150 [00:00<00:00, 472.55it/s]
100%|██████████| 150/150 [00:00<00:00, 465.45it/s]
100%|██████████| 150/150 [00:00<00:00, 457.18it/s]
100%|██████████| 150/150 [00:00<00:00, 502.47it/s]
100%|██████████| 150/150 [00:00<00:00, 500.34it/s]
 34%|███▍      | 51/150 [00:00<00:00, 502.78it/s]

Learning Rate: 7.000000000000001e-05 Accuracy on training set: 60.666668


100%|██████████| 150/150 [00:00<00:00, 493.66it/s]
100%|██████████| 150/150 [00:00<00:00, 454.05it/s]
100%|██████████| 150/150 [00:00<00:00, 470.78it/s]
100%|██████████| 150/150 [00:00<00:00, 481.09it/s]
100%|██████████| 150/150 [00:00<00:00, 499.85it/s]
100%|██████████| 150/150 [00:00<00:00, 480.83it/s]
100%|██████████| 150/150 [00:00<00:00, 479.97it/s]
100%|██████████| 150/150 [00:00<00:00, 427.20it/s]
100%|██████████| 150/150 [00:00<00:00, 476.83it/s]
100%|██████████| 150/150 [00:00<00:00, 490.59it/s]
100%|██████████| 150/150 [00:00<00:00, 510.86it/s]


Learning Rate: 8e-05 Accuracy on training set: 60.666668


In [10]:
# print(f"Accuracy on training set: {check_accuracy(train_iterator, model)*100:2f}")
# print(f"Accuracy on test set: {check_accuracy(test_iterator, model)*100:.2f}")
