In [1]:
import re
import pickle
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
!pip install transformers

In [2]:
# read pickle file
pickle_in = open("plots_text.pickle","rb")
movie_plots = pickle.load(pickle_in)

# count of movie plot summaries
len(movie_plots)

500

In [3]:
# clean text
movie_plots = [re.sub("[^a-z' ]", "", i) for i in movie_plots]

In [4]:
# create sequences of length 5 tokens
def create_seq(text, seq_len = 5):
    
    sequences = []

    # if the number of tokens in 'text' is greater than 5
    if len(text.split()) > seq_len:
      for i in range(seq_len, len(text.split())):
        # select sequence of tokens
        seq = text.split()[i-seq_len:i+1]
        # add to the list
        sequences.append(" ".join(seq))

      return sequences

    # if the number of tokens in 'text' is less than or equal to 5
    else:
      
      return [text]

In [5]:
seqs = [create_seq(i) for i in movie_plots]

# merge list-of-lists into a single list
seqs = sum(seqs, [])

# count of sequences
len(seqs)

152644

In [6]:
# create inputs and targets (x and y)
x = []
y = []

for s in seqs:
  x.append(" ".join(s.split()[:-1]))
  y.append(" ".join(s.split()[1:]))

In [7]:
# create integer-to-token mapping
int2token = {}
cnt = 0

for w in set(" ".join(movie_plots).split()):
  int2token[cnt] = w
  cnt+= 1

# create token-to-integer mapping
token2int = {t: i for i, t in int2token.items()}

token2int["the"], int2token[14271]

(1921, 'micky')

In [8]:
token2int["progressive"]=16592
int2token[16592] = "wiprogressivengers"

In [9]:
token2int["wingers"]=16593
int2token[16593] = "wingers"

In [10]:
token2int["intellectuals"]=16594
int2token[165954] = "intellectuals"

In [11]:
token2int["transgenders"]=16595
int2token[16595] = "transgenders"

In [12]:
token2int["left-wing"]=16596
int2token[16595] = "left-wing"

In [13]:
token2int["right-wing"]=16597
int2token[16597] = "right-wing"

In [14]:
# set vocabulary size
vocab_size = len(int2token)
vocab_size

16597

In [15]:
def get_integer_seq(seq):
  return [token2int[w] for w in seq.split()]

# convert text sequences to integer sequences
x_int = [get_integer_seq(i) for i in x]
y_int = [get_integer_seq(i) for i in y]

# convert lists to numpy arrays
x_int = np.array(x_int)
y_int = np.array(y_int)

In [16]:
def get_batches(arr_x, arr_y, batch_size):
         
    # iterate through the arrays
    prv = 0
    for n in range(batch_size, arr_x.shape[0], batch_size):
      x = arr_x[prv:n,:]
      y = arr_y[prv:n,:]
      prv = n
      yield x, y

In [27]:
def sampl(input_text):

    ot = generator_gpt2(input_text, max_length=30, num_return_sequences=5)
    output_text = ot[1]['generated_text']
    return output_text

In [17]:
class WordLSTM(nn.Module):
    
    def __init__(self, n_hidden=256, n_layers=4, drop_prob=0.3, lr=0.001):
        super().__init__()

        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        self.emb_layer = nn.Embedding(vocab_size, 200)

        ## define the LSTM
        self.lstm = nn.LSTM(200, n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        ## define a dropout layer
        self.dropout = nn.Dropout(drop_prob)
        
        ## define the fully-connected layer
        self.fc = nn.Linear(n_hidden, vocab_size)      
    
    def forward(self, x, hidden):
        ''' Forward pass through the network. 
            These inputs are x, and the hidden/cell state `hidden`. '''

        ## pass input through embedding layer
        embedded = self.emb_layer(x)     
        
        ## Get the outputs and the new hidden state from the lstm
        lstm_output, hidden = self.lstm(embedded, hidden)
        
        ## pass through a dropout layer
        out = self.dropout(lstm_output)
        
        #out = out.contiguous().view(-1, self.n_hidden) 
        out = out.reshape(-1, self.n_hidden) 

        ## put "out" through the fully-connected layer
        out = self.fc(out)

        # return the final output and the hidden state
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data

        # if GPU is available
        if (torch.cuda.is_available()):
          hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                    weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        
        # if GPU is not available
        else:
          hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                    weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        
        return hidden

In [18]:
# instantiate the model
net = WordLSTM()

# push the model to GPU (avoid it if you are not using the GPU)
net.cuda()

print(net)

WordLSTM(
  (emb_layer): Embedding(16597, 200)
  (lstm): LSTM(200, 256, num_layers=4, batch_first=True, dropout=0.3)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=16597, bias=True)
)


In [19]:
def train(net, epochs=10, batch_size=32, lr=0.001, clip=1, print_every=32):
    
    # optimizer
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    
    # loss
    criterion = nn.CrossEntropyLoss()
    
    # push model to GPU
    net.cuda()
    
    counter = 0

    net.train()

    for e in range(epochs):

        # initialize hidden state
        h = net.init_hidden(batch_size)
        
        for x, y in get_batches(x_int, y_int, batch_size):
            counter+= 1
            
            # convert numpy arrays to PyTorch arrays
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            
            # push tensors to GPU
            inputs, targets = inputs.cuda(), targets.cuda()

            # detach hidden states
            h = tuple([each.data for each in h])

            # zero accumulated gradients
            net.zero_grad()
            
            # get the output from the model
            output, h = net(inputs, h)
            
            # calculate the loss and perform backprop
            loss = criterion(output, targets.view(-1))

            # back-propagate error
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)

            # update weigths
            opt.step()            
            
            if counter % print_every == 0:
            
              print("Epoch: {}/{}...".format(e+1, epochs),
                    "Step: {}...".format(counter))

In [20]:
# train the model
train(net, batch_size = 32, epochs=20, print_every=256)

Epoch: 1/20... Step: 256...
Epoch: 1/20... Step: 512...
Epoch: 1/20... Step: 768...
Epoch: 1/20... Step: 1024...
Epoch: 1/20... Step: 1280...
Epoch: 1/20... Step: 1536...
Epoch: 1/20... Step: 1792...
Epoch: 1/20... Step: 2048...
Epoch: 1/20... Step: 2304...
Epoch: 1/20... Step: 2560...
Epoch: 1/20... Step: 2816...
Epoch: 1/20... Step: 3072...
Epoch: 1/20... Step: 3328...
Epoch: 1/20... Step: 3584...
Epoch: 1/20... Step: 3840...
Epoch: 1/20... Step: 4096...
Epoch: 1/20... Step: 4352...
Epoch: 1/20... Step: 4608...
Epoch: 2/20... Step: 4864...
Epoch: 2/20... Step: 5120...
Epoch: 2/20... Step: 5376...
Epoch: 2/20... Step: 5632...
Epoch: 2/20... Step: 5888...
Epoch: 2/20... Step: 6144...
Epoch: 2/20... Step: 6400...
Epoch: 2/20... Step: 6656...
Epoch: 2/20... Step: 6912...
Epoch: 2/20... Step: 7168...
Epoch: 2/20... Step: 7424...
Epoch: 2/20... Step: 7680...
Epoch: 2/20... Step: 7936...
Epoch: 2/20... Step: 8192...
Epoch: 2/20... Step: 8448...
Epoch: 2/20... Step: 8704...
Epoch: 2/20... St

In [21]:
# predict next token
def predict(net, tkn, h=None):
         
  # tensor inputs
  x = np.array([[token2int[tkn]]])
  inputs = torch.from_numpy(x)
  
  # push to GPU
  inputs = inputs.cuda()

  # detach hidden state from history
  h = tuple([each.data for each in h])

  # get the output of the model
  out, h = net(inputs, h)

  # get the token probabilities
  p = F.softmax(out, dim=1).data

  p = p.cpu()

  p = p.numpy()
  p = p.reshape(p.shape[1],)

  # get indices of top 3 values
  top_n_idx = p.argsort()[-3:][::-1]

  # randomly select one of the three indices
  sampled_token_index = top_n_idx[random.sample([0,1,2],1)[0]]

  # return the encoded value of the predicted char and the hidden state
  return int2token[sampled_token_index], h

# function to generate text
#def sample(net, size, prime='it is'):
def sample(net, size, prime):

    # push to GPU
    net.cuda()
    
    net.eval()

    # batch size is 1
    h = net.init_hidden(1)

    toks = prime.split()

    # predict next token
    for t in prime.split():
      token, h = predict(net, t, h)
    
    toks.append(token)

    # predict subsequent tokens
    for i in range(size-1):
        token, h = predict(net, toks[-1], h)
        toks.append(token)

    return ' '.join(toks)

In [30]:
sampl("one of the")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'one of the first people who had been told by his brother that it would be time to move to an additional city] he did so and he put'

In [29]:
sampl("as soon as")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'as soon as she had finished, she raised an eyebrow at him as though he had heard her question whether he was still in the castle. Her face'

In [31]:
!pip install algoauditor



In [32]:
!pip install lethai

Collecting lethai
[?25l  Downloading https://files.pythonhosted.org/packages/2d/00/ab1df039ba463a5d17e457f07275cb8a1baf009141c8d66dc4052ba24f02/lethai-0.0.7-py3-none-any.whl (220kB)
[K     |█▌                              | 10kB 23.1MB/s eta 0:00:01[K     |███                             | 20kB 30.7MB/s eta 0:00:01[K     |████▌                           | 30kB 25.7MB/s eta 0:00:01[K     |██████                          | 40kB 18.7MB/s eta 0:00:01[K     |███████▍                        | 51kB 9.1MB/s eta 0:00:01[K     |█████████                       | 61kB 8.3MB/s eta 0:00:01[K     |██████████▍                     | 71kB 9.4MB/s eta 0:00:01[K     |███████████▉                    | 81kB 10.2MB/s eta 0:00:01[K     |█████████████▍                  | 92kB 10.9MB/s eta 0:00:01[K     |██████████████▉                 | 102kB 7.5MB/s eta 0:00:01[K     |████████████████▎               | 112kB 7.5MB/s eta 0:00:01[K     |█████████████████▉              | 122kB 7.5MB/s eta

In [33]:
import algoauditor
import lethai

In [34]:
username = "imhardikj"

In [35]:
api = "fb1c50701a2c47e5a40cff75977778fa"

In [36]:
def generator_a(input_text):
    """
    Keyword arguments:
    input_text - String

    The string argument should be provided to the ML model.
    The ML model would then generate an output string.
    This output string should be returned from the function.
    """

    output_text = sampl(input_text)
    return output_text

In [37]:
x = algoauditor.nlg.config(username,api)

In [38]:
x.check_discrimination(generator_a)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Dataset retrieved
Generating predictions...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Predictions generated
Running analysis on predictions...
Analysis completed. Opening browser to show the analysis...


In [25]:
from transformers import pipeline, set_seed
set_seed(42)

In [None]:
generator_gptneo = pipeline('text-generation', model='EleutherAI/gpt-neo-1.3B')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1347.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5312753599.0, style=ProgressStyle(descr…




In [None]:
def generator_b(input_text):
    """
    Keyword arguments:
    input_text - String

    The string argument should be provided to the ML model.
    The ML model would then generate an output string.
    This output string should be returned from the function.
    """

    ot = generator_gptneo(input_text, max_length=30, num_return_sequences=5)
    output_text = ot[1]['generated_text']
    return output_text

In [None]:
api_c = ""

In [None]:
xb = algoauditor.nlg.config(username,api_c)

In [None]:
xb.check_discrimination(generator_b)

In [26]:
generator_gpt2 = pipeline('text-generation', model='gpt2')
#generator_gpt2("Hello, I'm a language model,", max_length=30, num_return_sequences=5)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=665.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=548118077.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355256.0, style=ProgressStyle(descript…




In [None]:
def generator_c(input_text):
    """
    Keyword arguments:
    input_text - String

    The string argument should be provided to the ML model.
    The ML model would then generate an output string.
    This output string should be returned from the function.
    """

    ot = generator_gpt2(input_text, max_length=30, num_return_sequences=5)
    output_text = ot[1]['generated_text']
    return output_text

In [None]:
api_b = "20fdc9d187934558b7ce7f59cc93f01f"

In [None]:
xb = algoauditor.nlg.config(username,api_b)

In [None]:
xb.check_discrimination(generator_c)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Dataset retrieved
Generating predictions...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Predictions generated
Running analysis on predictions...
Analysis completed. Opening browser to show the analysis...


In [7]:
generator_xlnetbase = pipeline('text-generation', model='xlnet-base-cased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=467042463.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=798011.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1382015.0, style=ProgressStyle(descript…




In [8]:
def generator_d(input_text):
    """
    Keyword arguments:
    input_text - String

    The string argument should be provided to the ML model.
    The ML model would then generate an output string.
    This output string should be returned from the function.
    """

    ot = generator_xlnetbase(input_text, max_length=30, num_return_sequences=5)
    output_text = ot[1]['generated_text']
    return output_text

In [9]:
api_d = "04707b1f8b1e4e549d11511c712664de"

In [10]:
xd = algoauditor.nlg.config(username,api_d)

In [None]:
xd.check_discrimination(generator_d)

In [15]:
generator_xlnetlarge = pipeline('text-generation', model='xlnet-large-cased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=761.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1441285815.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=798011.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1382015.0, style=ProgressStyle(descript…




In [None]:
def generator_e(input_text):
    """
    Keyword arguments:
    input_text - String

    The string argument should be provided to the ML model.
    The ML model would then generate an output string.
    This output string should be returned from the function.
    """

    ot = generator_xlnetlarge(input_text, max_length=30, num_return_sequences=5)
    output_text = ot[1]['generated_text']
    return output_text

[{'generated_text': "Hello, I'm a language model, a kind of free-lance language modeler called ‘The Slumber Guy’. Now that it"}]

In [None]:
api_e = ""

In [None]:
xe = algoauditor.nlg.config(username,api_d)

In [None]:
xe.check_discrimination(generator_d)