In [1]:
import torch
import os
import transformers
print(transformers.__version__)

4.46.3


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [14]:
# Define the path to the DS 5690 directory
BASE_DIR = '/content/drive/My Drive/Colab Notebooks/DS 5690/final_project/poetic-gpt-chinese'
model_path = os.path.join(BASE_DIR, 'save.model.pth')
# Ensure the directory exists
import os
os.makedirs(BASE_DIR, exist_ok=True)

In [4]:
# Import the AutoTokenizer class from the transformers library
from transformers import AutoTokenizer

# Load the tokenizer
# The 'AutoTokenizer.from_pretrained()' method is used to load a pre-trained tokenizer.
# Here, we are loading the tokenizer for the 'uer/gpt2-chinese-cluecorpussmall' model,
# which is a version of GPT-2 trained on a Chinese corpus (CLUECorpusSmall).
tokenizer = AutoTokenizer.from_pretrained('uer/gpt2-chinese-cluecorpussmall')

# Print the details of the tokenizer object
# This will display information about the loaded tokenizer, such as its configuration and supported vocabularies.
print(tokenizer)

# Tokenization demo
# The 'batch_encode_plus()' method tokenizes a batch of input sentences.
# Each sentence is split into smaller sub-word units (tokens) according to the tokenizer's vocabulary and rules.
# Here, two Chinese poems are passed as a list to the method for batch processing.
encoded = tokenizer.batch_encode_plus([
    # The first poem
    # Original (Chinese):
    '欲出未出光辣达,千山万山如火发.须臾走向天上来,逐却残星赶却月.',
    # Translation (English):
    # "The light emerges but not yet fully, illuminating mountains as if ablaze.
    # In an instant, it rushes to the sky, chasing away the lingering stars and the moon."

    # The second poem
    # Original (Chinese):
    '满目江山四望幽,白云高卷嶂烟收.日回禽影穿疏木,风递猿声入小楼.远岫似屏横碧落,断帆如叶截中流.',
    # Translation (English):
    # "Vast landscapes stretch out serenely, as white clouds roll high and mountain mist recedes.
    # The sun reflects bird shadows through sparse trees, and wind carries the cries of apes to a small tower.
    # Distant peaks rise like screens in the azure sky, and broken sails drift like leaves across the midstream."
])

print(encoded)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/217 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/577 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

BertTokenizerFast(name_or_path='uer/gpt2-chinese-cluecorpussmall', vocab_size=21128, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
{'input_ids': [[101, 3617, 1139, 3313, 1139, 1045, 6793, 6809, 117, 1283, 2255, 674, 2255, 1963, 4125, 1355, 11

In [5]:
import torch

# Define a custom Dataset class for handling a simple dataset
# This class inherits from 'torch.utils.data.Dataset', a PyTorch utility for datasets.
class Dataset(torch.utils.data.Dataset):

    def __init__(self):
        """
        Initialization method for the Dataset class.
        - This method loads a file named 'chinese_poems.txt',
          reads its content line by line, and stores the cleaned lines in a list.
        """
        with open('/content/drive/My Drive/Colab Notebooks/DS 5690/final_project/poetic-gpt-chinese/chinese_poems.txt') as f:
            # Read all lines from the file
            lines = f.readlines()
        # Strip any leading/trailing whitespace from each line
        lines = [i.strip() for i in lines]

        # Store the processed lines as the dataset
        self.lines = lines

    def __len__(self):
        """
        Returns the total number of samples in the dataset.
        - This allows PyTorch to determine the size of the dataset when iterating.
        """
        return len(self.lines)

    def __getitem__(self, i):
        """
        Retrieves a single sample from the dataset.
        - `i` is the index of the sample to retrieve.
        - Returns the line of text corresponding to the index `i`.
        """
        return self.lines[i]


# Instantiate the custom Dataset
dataset = Dataset()

# Retrieve the size of the dataset and the first sample
len_dataset = len(dataset)   # Number of samples in the dataset
first_sample = dataset[0]    # The first line of the dataset

# Display the results
len_dataset, first_sample

(304752, '欲出未出光辣达,千山万山如火发.须臾走向天上来,逐却残星赶却月.')

In [6]:
import torch
import os
import pandas as pd

# Define a custom Dataset class for handling a more complex dataset
class Dataset(torch.utils.data.Dataset):

    def __init__(self):
        """
        Initialization method for the Dataset class.
        - This method reads multiple CSV files from the 'more_datas' directory,
          processes the text data, and applies filtering and cleaning steps.
        """
        data = []  # List to store data from all files

        # Iterate through all files in the 'more_datas' directory
        for i in os.listdir('/content/drive/My Drive/Colab Notebooks/DS 5690/final_project/poetic-gpt-chinese/more_datas'):
            # Skip system-related files such as Jupyter notebook checkpoints
            if i == '.ipynb_checkpoints':
                continue
            # Read each CSV file into a pandas DataFrame and append it to the list
            data.append(pd.read_csv(f'/content/drive/My Drive/Colab Notebooks/DS 5690/final_project/poetic-gpt-chinese/more_datas/{i}'))

        # Concatenate all the DataFrames into a single DataFrame
        data = pd.concat(data).reset_index()

        # Select the column
        data = data['内容']

        # Remove leading and trailing whitespace from each entry
        data = data.str.strip()

        # Remove specific punctuation characters (e.g., 《》“”「」)
        # Use regex to identify and remove these characters
        data = data.str.replace('[《》“”「」]', '', regex=True)

        # Apply regex-based filtering
        # Only keep entries that match the specified pattern:
        # ^[\w，。？、！：；]+$ matches text containing Chinese characters, punctuation, and underscores.
        select = data.str.match('^[\w，。？、！：；]+$', na=False)
        data = data[select]

        # Normalize punctuation for consistency:
        # Replace '？！；' with '。' (convert question marks, exclamation points, and semicolons to periods)
        data = data.str.replace('[？！；]', '。', regex=True)
        # Replace '、：' with '，' (convert list markers and colons to commas)
        data = data.str.replace('[、：]', '，', regex=True)

        # Store the cleaned and filtered data
        self.data = data

    def __len__(self):
        """
        Returns the total number of samples in the dataset.
        - This allows PyTorch utilities to determine the dataset size.
        """
        return len(self.data)

    def __getitem__(self, i):
        """
        Retrieves a single sample from the dataset.
        - `i` is the index of the sample to retrieve.
        - Returns the text data (as a string) at the specified index.
        """
        return self.data.iloc[i]


# Instantiate the custom Dataset
dataset = Dataset()

# Retrieve the size of the dataset and the first sample
len_dataset = len(dataset)   # Number of samples in the dataset
first_sample = dataset[0]    # The first text entry in the dataset

# Display the results
len_dataset, first_sample

(839587, '四时运灰琯，一夕变冬春。送寒馀雪尽，迎岁早梅新。')

In [7]:
def collate_fn(data):
    """
    Custom collate function to process a batch of data for the DataLoader.

    Parameters:
    - data (list): A batch of samples from the dataset (list of strings).

    Returns:
    - data (dict): A dictionary containing tokenized data in PyTorch tensor format, including:
        - 'input_ids': Encoded token IDs for the input text.
        - 'attention_mask': Attention masks for the input text.
        - 'labels': A copy of the 'input_ids', which is often used as labels for language modeling tasks.
    """
    # Tokenize the batch of text data
    data = tokenizer.batch_encode_plus(data,          # Input data (list of strings)
                                       padding=True,  # Pad sequences to the same length
                                       truncation=True,  # Truncate sequences to the maximum length
                                       max_length=512,   # Maximum token length per sequence
                                       return_tensors='pt')  # Return results as PyTorch tensors

    # Clone 'input_ids' to use as labels
    # This is a typical practice in language modeling where the model predicts the next token
    data['labels'] = data['input_ids'].clone()

    return data


# Create the DataLoader
# The DataLoader iterates over the dataset and provides batches of data for training/testing.
loader = torch.utils.data.DataLoader(
    dataset=dataset,       # The dataset object created earlier
    batch_size=8,          # Number of samples per batch
    collate_fn=collate_fn, # Custom collate function for processing batches
    shuffle=True,          # Shuffle the dataset at every epoch
    drop_last=True,        # Drop the last incomplete batch if the dataset size is not divisible by batch_size
)

# Iterate over the DataLoader to get the first batch
for i, data in enumerate(loader):
    break  # Stop after processing the first batch

# Print the keys and shapes of the tensors in the batch
for k, v in data.items():
    print(k, v.shape)  # k is the key (e.g., 'input_ids'), v.shape is the shape of the tensor

# Get the total number of batches in the DataLoader
len_loader = len(loader)  # Number of batches

len_loader

input_ids torch.Size([8, 130])
token_type_ids torch.Size([8, 130])
attention_mask torch.Size([8, 130])
labels torch.Size([8, 130])


104948

In [8]:
from transformers import AutoModelForCausalLM, GPT2Model
import torch

# Load the pre-trained causal language model
# 'AutoModelForCausalLM' is used to load a model suitable for causal language modeling tasks (e.g., GPT-2).
# The model 'uer/gpt2-chinese-cluecorpussmall' is a GPT-2 variant fine-tuned on a Chinese corpus.
model = AutoModelForCausalLM.from_pretrained('uer/gpt2-chinese-cluecorpussmall')

# Calculate the total number of model parameters
# Use 'model.parameters()' to access all parameters of the model.
# 'numel()' counts the total elements in each parameter tensor.
# The result is divided by 10,000 to display the parameter count in tens of thousands.
print(f"Number of model parameters (in tens of thousands): {sum(p.numel() for p in model.parameters()) / 10000}")

# Perform a forward pass through the model without gradient computation
# 'torch.no_grad()' disables gradient tracking, which reduces memory usage during inference.
with torch.no_grad():
    # Perform the forward pass with the tokenized input data
    # 'data' is expected to be the batch created by the DataLoader, containing 'input_ids' and other necessary tensors.
    out = model(**data)

# Extract and print the loss and the shape of the logits
# 'out' is a dictionary containing:
#   - 'loss': The loss value if labels are provided in the input.
#   - 'logits': The raw output predictions (scores before applying softmax) from the model.
print("Loss:", out['loss'])
print("Logits shape:", out['logits'].shape)


pytorch_model.bin:   0%|          | 0.00/421M [00:00<?, ?B/s]

Number of model parameters (in tens of thousands): 10206.8736
Loss: tensor(9.4741)
Logits shape: torch.Size([8, 130, 21128])


In [40]:
def generate(text, row, col):
    """
    Generates text based on a given input seed using a causal language model.

    Parameters:
    - text (str): The input seed text for generation.
    - row (int): Number of rows for the output structure.
    - col (int): Number of columns for the output structure.

    The function uses a nested loop function (`generate_loop`) to iteratively generate tokens
    until the desired text length or structure is achieved.
    """

    def generate_loop(data):
        """
        Performs the iterative generation loop to produce tokens.

        Parameters:
        - data (dict): A dictionary containing the tokenized input data.

        Returns:
        - data (dict): Updated data with additional generated tokens.
        """
        with torch.no_grad():
            # Perform a forward pass through the model
            out = model(**data)

        # Extract logits (predictions) from the model's output
        out = out['logits']  # Shape: [batch_size, sequence_length, vocab_size]
        out = out[:, -1]     # Take the last token's logits. Shape: [batch_size, vocab_size]

        # Filter tokens to keep only the top 50 probabilities
        topk_value = torch.topk(out, 50).values  # Get the top 50 values
        topk_value = topk_value[:, -1].unsqueeze(dim=1)  # Take the 50th value (threshold)

        # Mask out logits below the threshold (assign them negative infinity)
        out = out.masked_fill(out < topk_value, -float('inf'))

        # Prevent generation of special symbols
        out[:, tokenizer.sep_token_id] = -float('inf')
        out[:, tokenizer.unk_token_id] = -float('inf')
        out[:, tokenizer.pad_token_id] = -float('inf')

        # Prevent generation of specific punctuation symbols
        for i in '，。':
            out[:, tokenizer.get_vocab()[i]] = -float('inf')

        # Sample the next token based on probabilities (no replacement)
        out = out.softmax(dim=1)         # Convert logits to probabilities
        out = out.multinomial(num_samples=1)  # Sample one token per batch

        # Enforce punctuation at certain positions based on the column structure
        c = data['input_ids'].shape[1] / (col + 1)  # Calculate the position
        if c % 1 == 0:  # If it's at the end of a row
            if c % 2 == 0:  # Alternate between '。' and '，'
                out[:, 0] = tokenizer.get_vocab()['。']
            else:
                out[:, 0] = tokenizer.get_vocab()['，']

        # Append the new token to the input sequence
        data['input_ids'] = torch.cat([data['input_ids'], out], dim=1)

        # Update attention and token type masks
        data['attention_mask'] = torch.ones_like(data['input_ids'])
        data['token_type_ids'] = torch.zeros_like(data['input_ids'])

        # Clone the updated input as labels for loss calculation
        data['labels'] = data['input_ids'].clone()

        # Check if the generated sequence has reached the target length
        if data['input_ids'].shape[1] >= row * col + row + 1:
            return data

        # Recursively call the function until the condition is met
        return generate_loop(data)

    # Prepare the input data (repeat the seed text 3 times for generation)
    data = tokenizer.batch_encode_plus([text] * 3, return_tensors='pt')
    data['input_ids'] = data['input_ids'][:, :-1]  # Remove the end token for initial input
    data['attention_mask'] = torch.ones_like(data['input_ids'])  # Initialize attention mask
    data['token_type_ids'] = torch.zeros_like(data['input_ids'])  # Initialize token type IDs
    data['labels'] = data['input_ids'].clone()  # Clone input IDs as labels

    # Start the recursive generation loop
    data = generate_loop(data)

    # Decode and print the generated sequences
    for i in range(3):
        print(i, tokenizer.decode(data['input_ids'][i]))


# Generate text using the function
generate('秋高气爽', row=4, col=5)


0 [CLS] 秋 高 气 爽 人 ， 春 暖 新 衣 风 。 长 安 未 来 事 ， 时 间 流 向 长 。
1 [CLS] 秋 高 气 爽 宜 ， 冬 半 正 是 阳 。 万 里 风 沙 如 ， 水 清 秋 月 一 。
2 [CLS] 秋 高 气 爽 秋 ， 松 香 十 里 来 。 松 阴 欲 尽 夕 ， 松 枝 正 照 阳 。


In [10]:
import torch
from torch.cuda.amp import GradScaler, autocast
from transformers import AdamW
from transformers.optimization import get_scheduler
import datetime
import time

# Define the training function with Mixed Precision and Time Estimation
def train_and_save():
    """
    Train the model using Mixed Precision Training, log progress, and save the model.
    Includes:
    - Logging metrics every 1000 steps.
    - Estimated time remaining for training.
    - Mixed Precision Training with GradScaler and autocast.
    """
    global model  # Use the global model variable
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model.to(device)

    # Initialize the optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=5e-5)
    scheduler = get_scheduler(
        name='linear',
        num_warmup_steps=0,
        num_training_steps=len(loader),
        optimizer=optimizer
    )

    # Mixed Precision: Initialize GradScaler
    scaler = GradScaler()

    # Set the model to training mode
    model.train()

    # Get the total number of batches
    total_batches = len(loader)

    # Start timing
    start_time = time.time()

    # Iterate through the DataLoader
    for i, data in enumerate(loader):
        # Move data to the appropriate device
        for k in data.keys():
            data[k] = data[k].to(device)

        # Mixed Precision: Forward pass with autocast
        with autocast():
            out = model(**data)
            loss = out['loss']

        # Backward pass with GradScaler
        scaler.scale(loss).backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Optimizer step with GradScaler
        scaler.step(optimizer)
        scaler.update()

        # Scheduler step
        scheduler.step()

        # Zero gradients for the next step
        optimizer.zero_grad()
        model.zero_grad()

        # Estimate time remaining every 100 steps
        if i % 100 == 0:
            elapsed_time = time.time() - start_time
            avg_time_per_batch = elapsed_time / (i + 1)
            remaining_batches = total_batches - (i + 1)
            estimated_time_left = remaining_batches * avg_time_per_batch
            formatted_time_left = str(datetime.timedelta(seconds=int(estimated_time_left)))

            # Log progress
            print(f"Step {i}/{total_batches}: Loss={loss.item():.4f}, Estimated time left: {formatted_time_left}")

        # Log metrics every 1000 steps
        if i % 1000 == 0:
            labels = data['labels'][:, 1:]  # Shifted true labels
            out = out['logits'].argmax(dim=2)[:, :-1]  # Predicted labels

            # Mask padding tokens for accuracy calculation
            select = labels != 0
            labels = labels[select]
            out = out[select]

            # Calculate accuracy
            accuracy = (labels == out).sum().item() / labels.numel()

            # Get the current learning rate
            lr = optimizer.state_dict()['param_groups'][0]['lr']

            # Log the current metrics
            print(f"Step {i}: Loss={loss.item():.4f}, LR={lr:.6e}, Accuracy={accuracy:.4f}")

    # Save the model
    model = model.to('cpu')  # Move the model to CPU for saving
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")  # Add a timestamp
    save_path = os.path.join(BASE_DIR, f"save.model.pth")
    torch.save(model.state_dict(), save_path)  # Save the state_dict

    print(f"Model saved as '{save_path}'")

# Call the training function
train_and_save()


  scaler = GradScaler()
  with autocast():


Step 0/104948: Loss=9.5048, Estimated time left: 2 days, 5:17:30
Step 0: Loss=9.5048, LR=4.999952e-05, Accuracy=0.0692
Step 100/104948: Loss=8.7636, Estimated time left: 2:06:31
Step 200/104948: Loss=5.7700, Estimated time left: 1:46:12
Step 300/104948: Loss=4.5879, Estimated time left: 1:38:54
Step 400/104948: Loss=2.5003, Estimated time left: 1:35:03
Step 500/104948: Loss=3.1449, Estimated time left: 1:33:08
Step 600/104948: Loss=2.7162, Estimated time left: 1:31:33
Step 700/104948: Loss=2.9393, Estimated time left: 1:30:11
Step 800/104948: Loss=2.8048, Estimated time left: 1:29:18
Step 900/104948: Loss=2.4995, Estimated time left: 1:28:31
Step 1000/104948: Loss=2.3800, Estimated time left: 1:28:02
Step 1000: Loss=2.3800, LR=4.952310e-05, Accuracy=0.1806
Step 1100/104948: Loss=2.3934, Estimated time left: 1:27:32
Step 1200/104948: Loss=2.8190, Estimated time left: 1:27:14
Step 1300/104948: Loss=2.4244, Estimated time left: 1:26:57
Step 1400/104948: Loss=2.8616, Estimated time left: 1

In [15]:
model = torch.load(model_path)



  model = torch.load(model_path)


In [16]:
!pip install gradio



In [41]:
from transformers import AutoModelForCausalLM

# Define the model architecture
model = AutoModelForCausalLM.from_pretrained('uer/gpt2-chinese-cluecorpussmall')

# Load the weights into the model
model.load_state_dict(torch.load(model_path))
model.eval()  # Set to evaluation mode


# Test the model by generating text
generate("秋日", row=4, col=5)

  model.load_state_dict(torch.load(model_path))


0 [CLS] 秋 日 明 月 无 ， 春 风 有 旧 阳 。 春 来 无 限 好 ， 日 已 无 尽 新 。
1 [CLS] 秋 日 何 处 闻 ， 松 篱 松 梢 见 。 不 识 春 香 草 ， 深 深 见 草 时 。
2 [CLS] 秋 日 清 流 多 ， 月 里 江 水 清 。 何 须 人 作 山 ， 亦 自 东 山 有 。


In [42]:
import io
import sys
import gradio as gr

# Wrapper function for Gradio to capture printed output
def gradio_generate(seed_text, rows, cols):
    """
    Wrapper for the generate function to use with Gradio.
    Captures the printed output and returns it as a string.
    """
    # Redirect standard output to capture prints
    old_stdout = sys.stdout
    new_stdout = io.StringIO()
    sys.stdout = new_stdout

    # Call the original generate function
    generate(seed_text, row=int(rows), col=int(cols))

    # Restore standard output and fetch printed content
    sys.stdout = old_stdout
    output = new_stdout.getvalue()

    return output

# Gradio interface
interface = gr.Interface(
    fn=gradio_generate,  # Use the wrapper function
    inputs=[
        gr.Textbox(lines=2, label="Seed Text", placeholder="Enter the seed text for the poem..."),
        gr.Number(label="Rows", value=4, precision=0),
        gr.Number(label="Columns", value=5, precision=0),
    ],
    outputs="text",
    title="Interactive Poem Generator",
    description="Enter a seed text and customize the structure (rows and columns). The model will generate a structured Chinese poem."
)

# Launch the Gradio interface
interface.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://5c1eeadbc2c5867b6a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


