In [1]:
!pip install transformers
!pip install sentencepiece
!pip install wordpiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 6.5 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 43.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 4.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 46.9 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstall

In [2]:
%%writefile gpt2.py
import numpy as np
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import imp
import random
from typing import List, Union
#encode = imp.load_source('encoding', 'C:\\Users\\admin\\Desktop\\GP papers\\sample\\mds20_stega-master\\src\\utils\\encoding.py')
from io import StringIO
import heapq
from heapq import heappop, heappush
import sys
TOKEN_COUNT = 16
TOKEN_COUNT_LOG = int(np.log2(TOKEN_COUNT))


def init_model(name='gpt2'):
    model = GPT2LMHeadModel.from_pretrained(name, pad_token_id=50256)
    tokenizer = GPT2Tokenizer.from_pretrained(name, pad_token_id=50256)
    model.eval()
    return model, tokenizer 



def generate(model, init_tokens, seq, tokenizer,message):
    generate_count = len(seq) % TOKEN_COUNT_LOG
    if generate_count != 0:
        generate_count = TOKEN_COUNT_LOG - generate_count

    seq = seq + [0] * generate_count
    
    max_idx = len(seq) // TOKEN_COUNT_LOG

    tokens = tokenizer.encode(init_tokens, return_tensors='pt')
    init_len = tokens.shape[1]
    

    for idx in range(max_idx):
        out = model.generate(
            tokens,
            max_length=1 + idx + init_len,
            num_return_sequences=TOKEN_COUNT,
            num_beams=TOKEN_COUNT,
            early_stopping=True
        )

      
        
        #idx refers to the probability in each index of the stegotext need then it matches the indx with the candidate if yes then this probability is the right one
        

        candidates = out[:, -1]
       
        idx= Indexing(candidates,message);
        print('index of candidate chosen in list of candidates: ', idx, 'candidate: ', tokenizer.decode(candidates[idx].item()))
        candidate= candidates[idx].item()
        tokens = torch.cat(
            (tokens, torch.tensor([[candidate]])), axis=-1 #concatinates all the given tensors
        )

    text = tokenizer.decode(tokens[0],skip_special_tokens=True) #we decode the generated output so that it can be presented in human-readable format instead of some cryptic token indices.


    return text




def Indexing(ids: List[int], message: StringIO) -> int: 
        assert len(ids) > 0
        if len(ids) == 1:
            return ids[0]
        capacity = len(ids).bit_length() - 1
        #print(capacity)
        bits_str = message.read(capacity)
        #print("secret message bit sequence chunk: ",bits_str)
        if len(bits_str) < capacity:
            padding: str = '0' * (capacity - len(bits_str))
            bits_str = bits_str + padding
            message.write(padding)
        index = int(bits_str, 2) #from binary to number
        #print("index",index)
        return index
    
def _test(seq_len,message):
    message_io = StringIO(message) 

    seq = list(
        np.random.binomial(1, 0.5, seq_len)
    )

    model, tokenizer = init_model()
    
  
# Open the file in read mode
    with open("academic.txt", "r") as file:
      data = file.read();
      words = data.split();
      n=0
      
    # Generating a random number for word position
    
      word_pos = random.randint(0, len(words)-1)
      print('Randomly selected word: ',words[word_pos])
    
      text1 = generate(model, "Al-Sisi", seq, tokenizer,message_io)
      text = text1.replace("\n", " ")
      print('GPT2 Output: ',f'{text}',end='. ')
        

def isLeaf(root):
    return root.left is None and root.right is None
         
# A Tree node
class Node:
    def __init__(self, ch, freq, left=None, right=None):
        self.ch = ch
        self.freq = freq
        self.left = left
        self.right = right
 
    
    def __lt__(self, other):
        return self.freq < other.freq
 
 
# Traverse the Huffman Tree and store Huffman Codes in a dictionary
def encode(root, s, huffman_code):
 
    if root is None:
        return
 
    # found a leaf node
    if isLeaf(root):
        huffman_code[root.ch] = s if len(s) > 0 else '1'
 
    encode(root.left, s + '0', huffman_code)
    encode(root.right, s + '1', huffman_code)
    
def buildHuffmanTree(text):
 
    # base case: empty string
    if len(text) == 0:
        return
 
    # count the frequency of appearance of each character
    # and store it in a dictionary
    freq = {i: text.count(i) for i in set(text)}
 
    # Create a priority queue to store live nodes of the Huffman tree.
    pq = [Node(k, v) for k, v in freq.items()]
    heapq.heapify(pq)
 
    # do till there is more than one node in the queue
    while len(pq) != 1:
 
        # Remove the two nodes of the highest priority
        # (the lowest frequency) from the queue
 
        left = heappop(pq)
        right = heappop(pq)
 
        # create a new internal node with these two nodes as children and
        # with a frequency equal to the sum of the two nodes' frequencies.
        # Add the new node to the priority queue.
 
        total = left.freq + right.freq
        heappush(pq, Node(None, total, left, right))
 
    # `root` stores pointer to the root of Huffman Tree
    root = pq[0]
 
    # traverse the Huffman tree and store the Huffman codes in a dictionary
    huffmanCode = {}
    encode(root, '', huffmanCode)
 
 
    # print the encoded string
    s = ''
    for c in text:
        s += huffmanCode.get(c)
 
    return s;

if __name__ == '__main__':
    model, tokenizer = init_model()
    message= input("Enter secret message ")
    tobits= buildHuffmanTree(message)
    print('Secret message encoded into bit sequence: ',tobits)
    

   
    _test(100,tobits)

Writing gpt2.py


In [4]:
!python3 gpt2.py

  import imp
Enter secret message Help
Secret message encoded into bit sequence:  01111000
Randomly selected word:  the
index of candidate chosen in list of candidates:  7 candidate:  :
index of candidate chosen in list of candidates:  8 candidate:   A
index of candidate chosen in list of candidates:  0 candidate:   Muslim
index of candidate chosen in list of candidates:  0 candidate:   Brotherhood
index of candidate chosen in list of candidates:  0 candidate:  -
index of candidate chosen in list of candidates:  0 candidate:  led
index of candidate chosen in list of candidates:  0 candidate:   government
index of candidate chosen in list of candidates:  0 candidate:   in
index of candidate chosen in list of candidates:  0 candidate:   Egypt
index of candidate chosen in list of candidates:  0 candidate:   has
index of candidate chosen in list of candidates:  0 candidate:   been
index of candidate chosen in list of candidates:  0 candidate:   accused
index of candidate chosen in list of 