<a href="https://colab.research.google.com/github/ldivrala/parapharing_and_text_information/blob/main/parapharsing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers



In [2]:
# from transformers import GPT2Tokenizer, TFGPT2Model
import tensorflow as tf
import pandas as pd

In [3]:
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# add the EOS token as PAD token to avoid warnings
model = TFGPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [4]:
train_data = pd.read_csv("/content/msr_paraphrase_train.txt", 
                         delimiter="\t", 
                         error_bad_lines=False).rename(
                             columns = {"#1 String": "text", "#2 String": "para"}
                             )
                         
train_data.dropna(inplace = True)  
train_data = train_data[train_data.Quality == 1]

train_data2 = pd.read_table("train.tsv")
train_data2.dropna(inplace = True) 
train_data2.drop(columns = "id", inplace = True)
train_data2 = train_data2[train_data2.label == 1]

train_data2.head()

b'Skipping line 102: expected 5 fields, saw 6\nSkipping line 656: expected 5 fields, saw 6\nSkipping line 867: expected 5 fields, saw 6\nSkipping line 880: expected 5 fields, saw 6\nSkipping line 980: expected 5 fields, saw 6\nSkipping line 1439: expected 5 fields, saw 6\nSkipping line 1473: expected 5 fields, saw 6\nSkipping line 1822: expected 5 fields, saw 6\nSkipping line 1952: expected 5 fields, saw 6\nSkipping line 2009: expected 5 fields, saw 6\nSkipping line 2230: expected 5 fields, saw 6\nSkipping line 2506: expected 5 fields, saw 6\nSkipping line 2523: expected 5 fields, saw 6\nSkipping line 2809: expected 5 fields, saw 6\nSkipping line 2887: expected 5 fields, saw 6\nSkipping line 2920: expected 5 fields, saw 6\nSkipping line 2944: expected 5 fields, saw 6\nSkipping line 3241: expected 5 fields, saw 6\nSkipping line 3358: expected 5 fields, saw 6\nSkipping line 3459: expected 5 fields, saw 6\nSkipping line 3491: expected 5 fields, saw 6\nSkipping line 3643: expected 5 fields

Unnamed: 0,sentence1,sentence2,label
1,The NBA season of 1975 -- 76 was the 30th seas...,The 1975 -- 76 season of the National Basketba...,1
3,When comparable rates of flow can be maintaine...,The results are high when comparable flow rate...,1
4,It is the seat of Zerendi District in Akmola R...,It is the seat of the district of Zerendi in A...,1
5,William Henry Henry Harman was born on 17 Febr...,"William Henry Harman was born in Waynesboro , ...",1
7,With a discrete amount of probabilities Formul...,Given a discrete set of probabilities formula ...,1


In [5]:
text_para1 = "<S> " + train_data["text"] + " </S>" + " <<<< " + "<P> " + train_data["para"] + " </P>"
text_para2 = "<S> " + train_data["para"] + " </S>" + " <<<< " + "<P> " + train_data["text"] + " </P>"
text_para3 = "<S> " + train_data2["sentence1"] + " </S>" + " <<<< " + "<P> " + train_data2["sentence2"] + " </P>"
text_para4 = "<S> " + train_data2["sentence2"] + " </S>" + " <<<< " + "<P> " + train_data2["sentence1"] + " </P>"

text_para1.head()

0    <S> Amrozi accused his brother, whom he called...
2    <S> They had published an advertisement on the...
4    <S> The stock rose $2.11, or about 11 percent,...
5    <S> Revenue in the first quarter of the year d...
7    <S> The DVD-CCA then appealed to the state Sup...
dtype: object

In [6]:
text_para = text_para1.append([text_para2, text_para3, text_para4], ignore_index=True)

text_para.tail()

48945    <S> The Romance language currently spoken in G...
48946    <S> It is necessary to note that k is a vector...
48947    <S> Tim Tim Henman won 6 -- 2 , 7 -- 6 against...
48948    <S> He was considered an active member of the ...
48949    <S> She was at Cork on 24 June , and arrived i...
dtype: object

In [7]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [8]:
from sklearn.utils import shuffle
text_para = shuffle(text_para)

text_para.iloc[0]

'<S> In 1989 , he travelled to Mozambique , Johannesburg , and Angola , South Africa on a peace-seeking mission . </S> <<<< <P> In 1989 he travelled to South Africa , Johannesburg and Angola , Mozambique on a peace-seeking mission . </P>'

In [9]:
batch_size = 8

def format_dataset():
    for i in range(len(text_para)):
      example = text_para.iloc[i]

      tokens = tokenizer(example, 
                         return_tensors="tf")
      
      yield ({
          "input_ids": tokens["input_ids"][0, :-1]
      }, {
          "labels": tokens["input_ids"][0, 1:]
      })

def make_dataset(dataset):

    dataset = tf.data.Dataset.from_generator(format_dataset, 
                                             output_signature= ({
                                                 "input_ids": tf.TensorSpec(shape=(None), dtype=tf.int32, name="input_ids"),
                                             },{
                                                 "labels": tf.TensorSpec(shape=(None), dtype=tf.int32, name="labels")
                                             }))
    dataset = dataset.cache()
    
    return dataset.padded_batch(batch_size, padded_shapes= ({"input_ids":[None]}, {"labels":[None]}))

In [10]:
train_ds = make_dataset(train_data)

In [11]:
for inputs, targets in train_ds.take(1):
    print(f'inputs["encoder_inputs"].shape: {inputs["input_ids"].shape}')
    # print(f'inputs["decoder_inputs"].shape: {inputs["decoder_inputs"].shape}')
    print(f"targets.shape: {targets['labels'].shape}")

inputs["encoder_inputs"].shape: (8, 87)
targets.shape: (8, 87)


In [12]:
# encode context the generation is conditioned on
input_check = "<S>" + train_data["text"].iloc[0] + "</S>" + "<<<<" 
input_ids = tokenizer.encode(input_check, return_tensors='tf')

# generate text until the output length (which includes the context length) reaches 50
greedy_output = model.generate(input_ids, max_length=100)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
<S>Amrozi accused his brother, whom he called "the witness", of deliberately distorting his evidence.</S><<<<S>Amrozi said he had been told by the witness that he had been told by the witness that he had been told by the witness that he had been told by the witness that he had been told by the witness that he had been told by the witness that he had been told by the witness that he had been told by the witness that he had


In [13]:
input = tf.keras.Input(shape= (None, ), name="input_ids", dtype = tf.int32)
model_output = model(input).logits
model_output = tf.keras.layers.Activation("softmax", name="labels")(model_output)
model2 = tf.keras.Model(input, model_output)

In [14]:
model2.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, None)]            0         
_________________________________________________________________
tfgp_t2lm_head_model (TFGPT2 TFCausalLMOutputWithPast( 124439808 
_________________________________________________________________
labels (Activation)          (None, None, 50257)       0         
Total params: 124,439,808
Trainable params: 124,439,808
Non-trainable params: 0
_________________________________________________________________


In [15]:
model2.compile("adam", tf.keras.losses.SparseCategoricalCrossentropy(), metrics = ["acc"])

In [16]:
model2.fit(train_ds)



<keras.callbacks.History at 0x7fabb7e75c10>

In [17]:
model2.fit(train_ds, epochs = 20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fa9ee126750>

In [20]:
# encode context the generation is conditioned on
input_check = "<S>" + train_data["text"].iloc[0] + "</S>" + "<<<<" 
input_ids = tokenizer.encode(input_check, return_tensors='tf')

# generate text until the output length (which includes the context length) reaches 50
greedy_output = model.generate(input_ids, max_length=100)

print("Output:\n" + 50 * '-')
output_para = tokenizer.decode(greedy_output[0], skip_special_tokens=True)
output_para = output_para.split("<P>")[1].split("</P>")[0]

print(input_check)
print(output_para)

Output:
--------------------------------------------------
<S>Amrozi accused his brother, whom he called "the witness", of deliberately distorting his evidence.</S><<<<
 Referring to him as only "the witness", Amrozi accused his brother of deliberately distorting his evidence. </P> Referring to him as "the witness", Arraja said. </P> Referring to him. </P> The Associated Press afterward, a spokeswoman,


In [71]:
# import numpy as np

In [70]:
# encode context the generation is conditioned on
input_check = "<S>" + "how are you?" + "</S>" + "<<<<" 
input_ids = tokenizer.encode(input_check, return_tensors='tf')

# generate text until the output length (which includes the context length) reaches 50
greedy_output = model.generate(input_ids, max_length=100, num_return_sequences = 20, do_sample=True)

print("Output:\n" + 50 * '-')
output_para = []

for i in range(20):
  para = tokenizer.decode(greedy_output[i], skip_special_tokens=True)
  output_para.append(para.split("<P>")[1].split("</P>")[0])

print(input_check)
print(output_para[0])


Output:
--------------------------------------------------
<S>how are you?</S><<<<
 In Perú fall there are issues such as ESPN,: For the first amber tournament there are those supporting the bottom-of-the-de-the-de-the-de-the-deighth. 


In [222]:
import tensorflow_hub as hub

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder/4'.
INFO:absl:Downloaded https://tfhub.dev/google/universal-sentence-encoder/4, Total size: 987.47MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder/4'.


In [269]:
sentence = "Machine learning is an application of artificial intelligence (AI)\
 that provides systems the ability to automatically learn and improve from experience\
  without being explicitly programmed. Machine learning focuses on the development of \
  computer programs that can access data and use it to learn for themselves.\
The process of learning begins with observations or data, such as examples, direct \
experience, or instruction, in order to look for patterns in data and make better \
decisions in the future based on the examples that we provide. The primary aim is \
to allow the computers learn automatically without human intervention or assistance \
and adjust actions accordingly."

sentence_part = sentence.split(".")
sentence_output = embed(tf.constant([sentence]))[0]
sentence_part_output = embed(tf.constant(sentence_part))

def cosine_similarity(vector,matrix):
   return ( np.sum(vector*matrix,axis=1) / ( np.sqrt(np.sum(matrix**2,axis=1)) * np.sqrt(np.sum(vector**2)) ) )[::-1]

In [270]:
(sentence_output.numpy() @ sentence_part_output.numpy().T).argsort()[::-1]

array([0, 2, 1, 3, 4])

In [271]:
sentence_part[0]

'Machine learning is an application of artificial intelligence (AI) that provides systems the ability to automatically learn and improve from experience  without being explicitly programmed'

In [290]:
import regex as re

sentence_words = re.findall("[\w]+" ,sentence)
sentence_words = sentence_words + re.findall("[\w]+ [\w]+", sentence)

sentence_words_output = embed(sentence_words)

In [291]:
sentence_words_similirity = (sentence_output.numpy() @ sentence_words_output.numpy().T).argsort()[::-1]
sentence_words_similirity

array([114, 102, 105,   8, 124, 109, 145,  26,  48,   1, 117,  24, 137,
       113,  90, 133, 144, 122, 129,  53,  68,  37, 126, 119, 135,   7,
        32,  60, 107, 142,  75,  42,  16,  91, 146,  15,  92,  51,  85,
        18,  33,  25,   0, 149, 100, 147,  44,  11, 106, 104,  10, 125,
        66,  94, 139,  72, 115, 132, 116,  27,  30, 118,  79,  56, 110,
        58,  20,  76,  13,  93,  21, 128, 108,  95, 140, 111, 148, 136,
        49,  50,   4, 138,  23,  46, 121,   6,  82, 112, 130, 127,  36,
        35, 120,  39,  99,   3,  71,  19, 123, 141,  52,  96,  59,  55,
       134,  77,  28,  88,  61,  67,  73, 143,  87,  63,  41,  14,  97,
        31,   5,  47,  78,  89,  12,  29,  45,  83,  74, 103,  54, 101,
        40,  17,  98,  69,  38,  57,  62,  84,  80,  34,   9, 131,  81,
        65,  43,   2,  86,  22,  70,  64])

In [292]:
np.array(sentence_words)[sentence_words_similirity]

array(['Machine learning', 'Machine learning', 'artificial intelligence',
       'AI', 'of learning', 'automatically learn', 'learn automatically',
       'learning', 'learning', 'learning', 'computer programs',
       'programmed', 'future based', 'explicitly programmed', 'computers',
       'in data', 'the computers', 'learn for', 'or instruction', 'data',
       'data', 'data', 'observations or', 'access data',
       'better decisions', 'intelligence', 'computer', 'instruction',
       'systems the', 'aim is', 'future', 'learn', 'learn', 'learn',
       'without human', 'automatically', 'automatically', 'observations',
       'aim', 'improve', 'programs', 'Machine', 'Machine',
       'adjust actions', 'actions', 'intervention or', 'themselves',
       'systems', 'that provides', 'application of', 'provides',
       'begins with', 'patterns', 'human', 'examples that', 'decisions',
       'focuses on', 'for patterns', 'the development', 'focuses',
       'development', 'that can', 'e