In [None]:
!pip install pycld2 regex nltk gensim spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycld2
  Downloading pycld2-0.41.tar.gz (41.4 MB)
[K     |████████████████████████████████| 41.4 MB 1.2 MB/s 
Building wheels for collected packages: pycld2
  Building wheel for pycld2 (setup.py) ... [?25l[?25hdone
  Created wheel for pycld2: filename=pycld2-0.41-cp37-cp37m-linux_x86_64.whl size=9834370 sha256=909099cb6acd9bc42f42e9d45fb76af2b0587392c0d1f83176778226b5622c26
  Stored in directory: /root/.cache/pip/wheels/ed/e4/58/ed2e9f43c07d617cc81fe7aff0fc6e42b16c9cf6afe960b614
Successfully built pycld2
Installing collected packages: pycld2
Successfully installed pycld2-0.41


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
import pycld2 as cld2

# Decode the corpus explicitly as UTF-8 so the load does not depend on the
# runtime's locale default encoding.
with open('/content/drive/MyDrive/kialo_corpus.json', 'r', encoding='utf-8') as f:
    out = json.load(f)

# Use a regex to strip the offending characters (Unicode categories Cc and Cs)
# from the data, which otherwise cause
# (error: input contains invalid UTF-8 around byte ...)
# Ref: https://github.com/aboSamoor/polyglot/issues/71#issuecomment-707997790

import regex
import math

# Runs of code points in the Unicode categories Cc (control) and Cs (surrogate).
RE_BAD_CHARS = regex.compile(r"[\p{Cc}\p{Cs}]+")

def remove_bad_chars(text):
    """Return `text` with every run matched by RE_BAD_CHARS deleted."""
    cleaned = RE_BAD_CHARS.sub("", text)
    return cleaned

def detect_en(text):
    """Report whether cld2 labels every detected chunk of `text` as English.

    Returns True only when the last field of each entry in the vector returned
    by cld2 contains 'en'. An empty vector therefore also yields True.
    """
    _reliable, _bytes_found, _details, chunks = cld2.detect(text, returnVectors=True)
    return all('en' in chunk[-1] for chunk in chunks)

# Keep only the records whose cleaned text is detected as English, printing a
# progress line each time another 5% of the corpus has been processed.
out_filtered = []
prev_num = 0  # last 5%-milestone that has already been reported
total = len(out)
for i, record in enumerate(out):
    try:
        # The cleaned text is written back onto the record itself.
        record['text'] = remove_bad_chars(record['text'])
        if detect_en(record['text']):
            out_filtered.append(record)
    except Exception as e:
        # Show the offending text and stop at the first record that fails.
        print(record['text'])
        print(f'Exception {e} raised')
        break
    # Completed percentage rounded down to a multiple of 5.
    num_ = int(((i + 1) / total * 100) // 5) * 5
    if num_ != prev_num:
        # Uses its own loop variable so the current record is not overwritten.
        for milestone in range(prev_num + 5, num_ + 1, 5):
            print(f"{milestone}% of sentences done")
        prev_num = num_

5% of sentences done
10% of sentences done
15% of sentences done
20% of sentences done
25% of sentences done
30% of sentences done
35% of sentences done
40% of sentences done
45% of sentences done
50% of sentences done
55% of sentences done
60% of sentences done
65% of sentences done
70% of sentences done
75% of sentences done
80% of sentences done
85% of sentences done
90% of sentences done
95% of sentences done
100% of sentences done


In [None]:
import nltk
from nltk.stem import SnowballStemmer, WordNetLemmatizer
import re
import os
import pickle
import gensim
from tqdm import tqdm

# Fetch the NLTK corpora needed by WordNetLemmatizer (used in lemmatize_stemming).
nltk.download("wordnet")
nltk.download("omw-1.4")

# English Snowball stemmer shared by lemmatize_stemming.
stemmer = SnowballStemmer("english")

# Module-level state shared by the functions in this cell:
#   models     - per-topic-id dict filled by prep_docs with the keys
#                'topic', 'pro' and 'con' (token sequences).
#   word_map   - stem -> set of original words containing that stem
#                (filled by sentence_to_seq, cached by run_topic_modeling).
#   debug      - when True, model_subtopics prints the topics it found.
#   drive_path - root directory for model checkpoints and cached files.
models = {}
word_map = {}
debug = False
drive_path = '/content/drive/MyDrive'
#drive_path = './'

# Number of topics for the corpus-wide LDA model (passed to model_topics).
num_topics = 50

def lemmatize_stemming(text):
    """Lemmatize `text` as a verb with WordNet, then stem the lemma with `stemmer`."""
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)

# Tokenize and lemmatize
def preprocess(text):
    """Return the stemmed lemmas of the tokens in `text` that survive filtering.

    Tokens come from gensim's simple_preprocess; gensim stopwords and tokens of
    three characters or fewer are discarded.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(word)
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in stopwords
    ]

def sentence_to_seq(text):
    """Convert a sentence into its list of preprocessed tokens.

    Anything matching the URL pattern is removed before tokenizing. As a side
    effect, while no cached word map exists, the global `word_map` is extended
    with every whitespace-separated word of `text` that contains one of the
    resulting tokens as a substring.

    Args:
        text: Raw sentence.

    Returns:
        The list of tokens produced by `preprocess`.
    """
    split_words = set(text.split())
    tokens = preprocess(re.sub(r'http\S+', '', text))

    # Keep a mapping of stems to original words. The cache is looked up under
    # drive_path, which is where run_topic_modeling reads and writes it, not in
    # the current working directory.
    if not os.path.exists(os.path.join(drive_path, 'word_map.pkl')):
        for tk in set(tokens):
            for word in split_words:
                if tk in word:
                    word_map.setdefault(tk, set()).add(word)

    return tokens

def prep_docs(out_filtered):
  """Tokenize every neutral/pro/con sentence and index the results by topic id.

  For each record, the part of its 'id' before the dot is the topic id. The
  first 'neutral' sentence (if any) is stored under models[topic_id]['topic'],
  and every 'pro' / 'con' sentence is appended to the list of the same name.

  Returns:
    (all_docs, all_sents): the token sequences and the matching raw sentences,
    in processing order (neutral, then pro, then con for each record).
  """
  all_docs = []
  all_sents = []

  def _register(sentence):
    # Record the raw sentence, tokenize it, and record the token sequence.
    all_sents.append(sentence)
    seq = sentence_to_seq(sentence)
    all_docs.append(seq)
    return seq

  for entry in tqdm(out_filtered, ascii=True):
    topic_id, _ = entry['id'].strip().split('.')
    if not models.get(topic_id):
      models[topic_id] = {}
    topic_entry = models[topic_id]

    if entry['neutral']:
      topic_entry['topic'] = _register(entry['neutral'][0]['text'])

    for stance in ('pro', 'con'):
      for obj in entry[stance]:
        if not topic_entry.get(stance):
          topic_entry[stance] = []
        topic_entry[stance].append(_register(obj['text']))

  return all_docs, all_sents

def model_topics(processed_docs, num_topics=10):
    """Load the corpus-wide LDA model from `drive_path`, or train and save it.

    Args:
        processed_docs: Iterable of token lists, one per document.
        num_topics: Number of topics to train with. Only used when no
            checkpoint exists yet; a cached model is loaded as-is.

    Returns:
        The loaded or newly trained `gensim.models.LdaMulticore` model.
    """
    model_dir = os.path.join(drive_path, 'kialo_topics')
    os.makedirs(model_dir, exist_ok=True)
    model_path = os.path.join(model_dir, 'lda_kialo_topics.ckpt')
    if os.path.exists(model_path):
        return gensim.models.LdaMulticore.load(model_path)

    dictionary = gensim.corpora.Dictionary(processed_docs)
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
    lda_model = gensim.models.LdaMulticore(
        bow_corpus, num_topics = num_topics,
        id2word = dictionary, passes = 10, workers = 8
    )
    # Save the model
    lda_model.save(model_path)
    return lda_model

def model_subtopics(processed_docs, topic_id, stance):
    """Load or train the 10-topic LDA model for one (topic id, stance) pair.

    Args:
        processed_docs: Iterable of token lists, one per document.
        topic_id: Topic identifier, used in the checkpoint file name.
        stance: Stance label, used in the checkpoint file name.

    Returns:
        The relative path of the model checkpoint.
    """
    fname = f'models/lda_topics_{topic_id}_{stance}.ckpt'
    if os.path.exists(fname):
        lda_model = gensim.models.LdaMulticore.load(fname)
    else:
        dictionary = gensim.corpora.Dictionary(processed_docs)
        bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
        lda_model =  gensim.models.LdaMulticore(
            bow_corpus, num_topics = 10, id2word = dictionary, passes = 20, workers = 4
        )
        # Save the model; the checkpoint directory may not exist yet.
        os.makedirs(os.path.dirname(fname), exist_ok=True)
        lda_model.save(fname)

    if debug:
        # Print each discovered topic with its top-5 words (tokens)
        for idx, topic in lda_model.print_topics(num_words=5):
            # Each term looks like 0.123*"stem"; keep the part after the '*'.
            topic_words_raw = [x.strip().split('*')[-1] for x in topic.strip().split('+')]
            topic_words_mapped = [word_map.get(x.replace('"', '')) for x in topic_words_raw]
            # Prefer the shortest original word recorded for a stem; fall back to
            # the raw term when the stem is not in word_map.
            topic_words = [
                min(forms, key=len) if forms else raw
                for raw, forms in zip(topic_words_raw, topic_words_mapped)
            ]
            print(f"For topic ID {topic_id} and stance {stance}")
            print("Topic: {} => Words: {}".format(idx, ','.join(topic_words)))
    return fname

def run_topic_modeling():
  """Make sure the LDA checkpoint and the word-map cache exist, then load the map.

  The corpus is preprocessed whenever either artifact is missing, so that a
  missing word map is rebuilt from real data instead of being cached empty.

  Returns:
    (all_docs, all_sents) as produced by prep_docs, or two empty lists when
    both artifacts were already present and no preprocessing was needed.
  """
  global word_map
  all_docs = []
  all_sents = []
  topic_model_path = os.path.join(drive_path, 'kialo_topics', 'lda_kialo_topics.ckpt')
  word_map_path = os.path.join(drive_path, 'word_map.pkl')
  need_model = not os.path.exists(topic_model_path)
  need_word_map = not os.path.exists(word_map_path)

  if need_model or need_word_map:
    # prep_docs fills word_map as a side effect of sentence_to_seq.
    all_docs, all_sents = prep_docs(out_filtered)
  if need_model:
    model_topics(all_docs, num_topics=num_topics)

  if need_word_map:
    with open(word_map_path, 'wb') as f:
      pickle.dump(word_map, f)
  else:
    # NOTE: pickle.load can execute arbitrary code; this is only acceptable
    # because the file is a cache written by this notebook itself.
    with open(word_map_path, 'rb') as f:
      word_map = pickle.load(f)

  return all_docs, all_sents

all_docs, all_sents = run_topic_modeling()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
!python -m spacy download en_core_web_lg

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-lg==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.1/en_core_web_lg-3.4.1-py3-none-any.whl (587.7 MB)
[K     |████████████████████████████████| 587.7 MB 17 kB/s 
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.4.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [None]:

# Output directory for the overall topics (created later in this cell).
topics_path = os.path.join(drive_path, 'topics_overall')

import spacy
# spaCy pipeline whose dependency labels extract_entities reads.
nlp = spacy.load('en_core_web_lg')

from spacy.tokens import Span
from spacy.matcher import Matcher

# Sentence-tokenizer data for NLTK (sent_tokenize is imported below).
nltk.download('punkt')

# Load the corpus-wide LDA checkpoint from the path model_topics saves to.
topic_model_path = os.path.join(drive_path, 'kialo_topics', 'lda_kialo_topics.ckpt')
lda_model = gensim.models.LdaMulticore.load(topic_model_path)

def extract_entities(sents):
    """Pull a subject/object entity pair out of one sentence.

    The sentence is parsed with the module-level spaCy pipeline `nlp`. Tokens
    are scanned left to right (punctuation is skipped); the most recent
    compound prefix and modifier are prepended to a subject or object token.

    Args:
        sents: Text of a single sentence.

    Returns:
        A two-element list ``[subject, object]`` of stripped strings. An element
        is ``""`` when no token carried a matching dependency label; when
        several tokens match, the last one wins.
    """
    enti_one = ""
    enti_two = ""

    dep_prev_token = ""  # dependency tag of the previous non-punctuation token
    txt_prev_token = ""  # text of the previous non-punctuation token

    prefix = ""
    modifier = ""

    for tokn in nlp(sents):
        # Punctuation contributes nothing and must not update the "previous" state.
        if tokn.dep_ == "punct":
            continue

        # Compound word: remember it, joined with a directly preceding compound.
        if tokn.dep_ == "compound":
            prefix = tokn.text
            if dep_prev_token == "compound":
                prefix = txt_prev_token + " " + tokn.text

        # Modifier: remember it, joined with a directly preceding compound.
        if tokn.dep_.endswith("mod"):
            modifier = tokn.text
            if dep_prev_token == "compound":
                modifier = txt_prev_token + " " + tokn.text

        # Subject: emit it with the collected modifier/prefix and reset the state.
        # A substring test is used because `find(...) == True` compares the match
        # index with True and so only matches when the substring is at index 1.
        if "subj" in tokn.dep_:
            enti_one = modifier + " " + prefix + " " + tokn.text
            prefix = ""
            modifier = ""
            dep_prev_token = ""
            txt_prev_token = ""

        # Object: emit it with the collected modifier/prefix.
        if "obj" in tokn.dep_:
            enti_two = modifier + " " + prefix + " " + tokn.text

        # Remember this token for the next iteration.
        dep_prev_token = tokn.dep_
        txt_prev_token = tokn.text

    return [enti_one.strip(), enti_two.strip()]


os.makedirs(topics_path, exist_ok=True)

# One (initially empty) list per LDA topic index.
topic_wise_entities = {topic_idx: [] for topic_idx in range(num_topics)}
topic_wise_words = {topic_idx: [] for topic_idx in range(num_topics)}

from nltk.tokenize import sent_tokenize

# Collect the top-40 words of every topic, mapped back to readable words.
for idx, topic in lda_model.print_topics(num_topics=num_topics, num_words=40):
  # Each term looks like 0.123*"stem"; keep the part after the '*'.
  topic_words_raw = [term.strip().split('*')[-1] for term in topic.strip().split('+')]
  topic_words_mapped = [word_map.get(raw.replace('"', '')) for raw in topic_words_raw]
  # Shortest recorded original word per stem, or the raw term if none is known.
  topic_words = [
      min(forms, key=len) if forms else raw
      for raw, forms in zip(topic_words_raw, topic_words_mapped)
  ]
  print(f"Topic {idx}: {topic}")
  topic_wise_words[idx] = topic_words

# Persist the per-topic word lists next to the model checkpoint.
topic_words_json = os.path.join(drive_path, 'kialo_topics', 'topic_words.json')
with open(topic_words_json, 'w') as f:
  json.dump(topic_wise_words, f)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Topic 0: 0.059*"relationship" + 0.044*"medic" + 0.032*"treat" + 0.032*"respect" + 0.030*"ethic" + 0.029*"treatment" + 0.025*"doctor" + 0.022*"patient" + 0.017*"coupl" + 0.017*"peopl" + 0.017*"profession" + 0.016*"theft" + 0.015*"procedur" + 0.013*"homosexu" + 0.012*"steal" + 0.012*"warn" + 0.011*"virtu" + 0.011*"person" + 0.011*"enemi" + 0.010*"surgeri" + 0.009*"investig" + 0.009*"digniti" + 0.008*"chines" + 0.008*"hospit" + 0.008*"secret" + 0.008*"privaci" + 0.007*"confus" + 0.007*"perform" + 0.007*"cheat" + 0.006*"invas" + 0.006*"request" + 0.006*"donald" + 0.006*"kong" + 0.006*"hong" + 0.006*"starv" + 0.006*"undergo" + 0.006*"condemn" + 0.005*"euthanasia" + 0.005*"washington" + 0.005*"want"
Topic 1: 0.177*"caus" + 0.122*"harm" + 0.030*"justifi" + 0.027*"protest" + 0.023*"violent" + 0.023*"donat" + 0.018*"effect" + 0.017*"chariti" + 0.015*"peopl" + 0.015*"trigger" + 0.014*"trait" + 0.013*"psycholog" + 0.011*"carbon" + 0.010*"counter" + 0.010*"broad" + 0.009*"regim" + 0.008*"root" + 0

"\n# Get entities for each sentence and add for the topics\ndictionary = gensim.corpora.Dictionary(all_docs)\nfor idx in tqdm(range(len(all_docs)), ascii=True):\n  doc = all_docs[idx]\n  sent = all_sents[idx]\n  corpus = [dictionary.doc2bow(doc)]\n  top_topics = (\n      lda_model.get_document_topics(corpus, minimum_probability=0.0)\n  )\n  # Pick top topic for adding entities\n  top_topic = sorted(top_topics[0], key=lambda x: x[1], reverse=True)[0]\n  top_topic_id = top_topic[0]\n\n  entities = []\n  tk_sents = sent_tokenize(sent)\n  for ss in tk_sents:\n    e1, e2 = extract_entities(ss)\n    entities.append((e1, e2))\n  \n  # Add to the current topic as a single entry\n  topic_wise_entities[top_topic_id].append(entities)\n\ntopic_entities_json = os.path.join(drive_path, 'kialo_topics', 'topic_entities.json')\nwith open(topic_entities_json, 'w') as f:\n  json.dump(topic_wise_entities, f)\n"

In [None]:
import sys
import os
print(os.path.abspath('.'))
def generate_bow(input_sentence, aspect):
  """Find the most probable LDA topic of a sentence and return that topic's words.

  Args:
    input_sentence: Raw sentence to classify with the global `lda_model`.
    aspect: Accepted for call compatibility; not used by this function.

  Returns:
    The entry of `topic_wise_words` for the highest-probability topic.
  """
  # Use the dictionary stored with the trained model so token ids match the
  # ones the model was trained on, even when `all_docs` is empty. Rebuild from
  # `all_docs` only if the model carries no usable dictionary.
  dictionary = getattr(lda_model, 'id2word', None)
  if not hasattr(dictionary, 'doc2bow'):
    dictionary = gensim.corpora.Dictionary(all_docs)

  tokens = sentence_to_seq(input_sentence)
  bow = dictionary.doc2bow(tokens)
  topic_probs = lda_model.get_document_topics(bow, minimum_probability=0.0)

  # Pick the topic with the highest probability (first one on ties).
  top_topic_id, _ = max(topic_probs, key=lambda pair: pair[1])

  words_l = topic_wise_words[top_topic_id]
  return words_l

from subprocess import Popen, PIPE

def run_model(
    cond_text, grad_len=30, length=50, stepsize=0.01, kl_scale=0.09,
    num_samples=5, window_length=10, idx=1
):
  """Run run_pplm.py as a subprocess, echoing and logging its output.

  Every line the subprocess writes to stdout is printed and appended to
  <drive_path>/PPLM/arg_gen_outputs_<idx>.txt.

  Args:
    cond_text: Conditioning text passed as --cond_text.
    grad_len: Value for --grad_length.
    length: Value for --length.
    stepsize: Value for --stepsize.
    kl_scale: Value for --kl_scale.
    num_samples: Value for --num_samples.
    window_length: Value for --window_length.
    idx: Suffix of the output log file.

  Returns:
    The exit code of the subprocess.
  """
  pplm_dir = os.path.join(drive_path, 'PPLM')
  command = [
    'python', 'run_pplm.py', '-B', './arg_gen/b1.txt', '-D', 'generic', '--window_length', f'{window_length}',
    '--class_label', '0', '--cond_text', f'{cond_text}', '--grad_length', f'{grad_len}',
    '--length', f'{length}', '--gamma', '1.0', '--num_iterations', '5', '--num_samples', f'{num_samples}',
    '--stepsize', f'{stepsize}', '--kl_scale', f'{kl_scale}', '--gm_scale', '0.99', '--colorama',
    '--sample', '--discrim_weights', '/content/drive/MyDrive/PPLM/arg_gen/generic_classifier_head_epoch_50.pt',
    '--discrim_meta', '/content/drive/MyDrive/PPLM/arg_gen/generic_classifier_head_meta.json',
    '--verbosity', 'quiet'
  ]

  with open(os.path.join(pplm_dir, f'arg_gen_outputs_{idx}.txt'), 'ab') as f:
    # Run inside the PPLM directory so the relative script and BoW paths resolve
    # regardless of the notebook's current working directory. The context
    # manager closes the pipe and waits for the process to exit.
    with Popen(command, stdout=PIPE, cwd=pplm_dir) as process:
      for line in iter(process.stdout.readline, b""):
        # The pipe yields bytes: decode for the text console, log the raw bytes.
        sys.stdout.write(line.decode('utf-8', errors='replace'))
        f.write(line)

  return process.returncode

  #os.system(
  #    f"python run_pplm.py -B ./arg_gen/bow_topic.txt -D generic \
  #     --class_label 0 --cond_text '{cond_text}' --grad_length {grad_len} \
  #     --length {length} --gamma 1.0 --num_iterations 5 --num_samples 5 \
  #     --stepsize {stepsize} --kl_scale {kl_scale} --gm_scale 0.99 --colorama \
  #     --sample --discrim_weights /content/drive/MyDrive/PPLM/arg_gen/generic_classifier_head_epoch_8.pt \
  #     --discrim_meta /content/drive/MyDrive/PPLM/arg_gen/generic_classifier_head_meta.json --verbosity quiet"
  #)

/content


In [None]:
os.chdir(os.path.join(drive_path, 'PPLM'))
print(os.getcwd())

In [None]:
def BOW_writer(aspect, path='/content/drive/MyDrive/PPLM/arg_gen/b1.txt'):
  """Overwrite the bag-of-words file with the string form of `aspect`.

  Args:
    aspect: Value to write; it is converted with str().
    path: Destination file. Defaults to the PPLM bag-of-words file on Drive.

  Returns:
    True once the content has been written and the file closed.
  """
  # The with-block guarantees the data is flushed and the handle closed before
  # anything else reads the file.
  with open(path, 'w') as f:
    f.write(str(aspect))
  return True

In [None]:
!pip install -r /content/drive/MyDrive/PPLM/requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch==1.7.0
  Downloading torch-1.7.0-cp37-cp37m-manylinux1_x86_64.whl (776.7 MB)
[K     |████████████████████████████████| 776.7 MB 4.5 kB/s 
[?25hCollecting nltk==3.4.5
  Downloading nltk-3.4.5.zip (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 52.9 MB/s 
[?25hCollecting colorama==0.4.4
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Collecting transformers==3.4.0
  Downloading transformers-3.4.0-py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 59.9 MB/s 
[?25hCollecting torchtext==0.3.1
  Downloading torchtext-0.3.1-py3-none-any.whl (62 kB)
[K     |████████████████████████████████| 62 kB 1.2 MB/s 
Collecting dataclasses
  Downloading dataclasses-0.6-py3-none-any.whl (14 kB)
Collecting tokenizers==0.9.2
  Downloading tokenizers-0.9.2-cp37-cp37m-manylinux1_x86_64.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 

In [None]:
# Input sentences to generate arguments for; index-aligned with cond_text.
i_sentences = [
    "Is Lending Money At Interest Wrong?"
]

# Conditioning text handed to run_model for each input sentence.
cond_text = [
    "financial interest"
]


# idx selects the output log file suffix; num_samples is forwarded to run_model.
idx = 1
num_samples = 3
for i in tqdm(range(len(cond_text)), ascii=True):
  inp = i_sentences[i]
  cond = cond_text[i]
  # NOTE(review): `aspects` is not defined anywhere in the visible notebook;
  # it presumably came from a cell that is not shown. Confirm it exists and is
  # index-aligned with cond_text, otherwise this line raises NameError.
  aspect = aspects[i]
  # Write the aspect into the bag-of-words file before generating.
  bow = BOW_writer(aspect)
  if bow:
     generate_bow(inp, aspect=aspect)
     # Will generate num_samples perturbed samples for each input triplet
     # (inp, cond, aspect). Saved to ./arg_gen_outputs_{idx}.txt
     path_gen = os.path.join(drive_path,'PPLM', f'arg_gen_outputs_{idx}.txt')
     print(path_gen)
     #with open(path_gen, 'w+') as f:
     # 'w+' truncates the log and writes a header; run_model then appends to it.
     with open(path_gen,'w+') as f:
       f.write(f"Input: {inp}\n")
       f.write(f"Conditional text: {cond}\n")
       f.write(f"Aspect: {aspect}\n")
     run_model(cond, num_samples=num_samples, idx=idx)