In [1]:
import torch
import pandas as pd
import os
import string
from nltk.tokenize import word_tokenize
from collections import defaultdict
import numpy as np 
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
import torch.optim as optim

curr_dir = "/home/hyarrava/Text_generation_using_lstm/"

all_headlines = []

for filename in os.listdir(os.path.join(curr_dir, "data/")):
    if 'Articles' in filename:
        articles_df = pd.read_csv(os.path.join(curr_dir, "data/", filename))
        all_headlines.extend(list(articles_df.headline.values))
        break

all_headlines = [h for h in all_headlines if h!="Unknown"]

def clean_text(txt):
    txt = ''.join(v for v in txt if v not in string.punctuation).lower()
    txt = txt.encode("utf-8").decode("ascii", "ignore")
    return txt

corpus = [clean_text(x) for x in all_headlines]

def build_word_index(corpus):
    word_index = defaultdict(int)
    index =0

    for line in corpus:
        tokens = word_tokenize(line.lower())
        for token in tokens:
            if token not in word_index:
                word_index[token] = index
                index+=1
    return word_index


def get_sequence_of_tokens(corpus, word_index):
    input_sequences = [] 
    for line in corpus:
        tokens = word_tokenize(line.lower())
        token_list = [word_index[token] for token in tokens] ## list of tokens for each sentence or line       
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
            
    return input_sequences

word_index = build_word_index(corpus)
total_words = len(word_index)
input_sequences = get_sequence_of_tokens(corpus, word_index)

### Padding

def to_categorical_numpy(labels, num_classes):
    # Create a zero matrix of shape (number of labels, number of classes)
    one_hot_encoded = np.zeros((labels.size, num_classes))
    
    # Set the appropriate elements to 1 based on the labels
    one_hot_encoded[np.arange(labels.size), labels] = 1
    
    return one_hot_encoded

def making_features_labels(sequences):
    predictor, labels = sequences[:,:-1], sequences[:,-1]
    labels = to_categorical_numpy(labels, total_words)  # total_words should be defined globally or passed in
    return predictor, labels

def pad_sequences(sequences, maxlen=None, padding='pre', truncating='pre', value=0):
    if maxlen is None:
        maxlen = max(len(seq) for seq in sequences)

    padded_sequences = []
    for seq in sequences:
        if len(seq) < maxlen:
            if padding == 'post':
                padded_seq = seq + [value] * (maxlen - len(seq))
            else:
                padded_seq = [value] * (maxlen - len(seq)) + seq
        else:
            padded_seq = seq

        if len(seq) > maxlen:
            if truncating == 'pre':
                truncated_seq = seq[-maxlen:]
            else:
                truncated_seq = seq[:maxlen]

        padded_sequences.append(padded_seq)

    return np.array(padded_sequences), maxlen


# Revised workflow to generate features and labels
def generate_padded_sequences(input_sequences):
    padded_sequences, max_sequence_len = pad_sequences(input_sequences, padding='pre')
    predictors, labels = making_features_labels(padded_sequences)
    
    return predictors, labels, max_sequence_len


predictor, labels, max_sequence_len = generate_padded_sequences(input_sequences)


predictor.shape, labels.shape

max_index = np.max(predictor)
print("Max index in predictor:", max_index)
print("Total words:", total_words)

# Ensure that max_index is within bounds
if max_index >= total_words:
    raise ValueError(f"Index {max_index} in predictor exceeds the maximum index allowed ({total_words - 1}). Please check your data preprocessing.")

# Check for negative indices
if np.any(predictor < 0):
    raise ValueError("Negative indices found in predictor. All indices should be non-negative.")

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(predictor, labels, test_size=0.2, random_state=42)

print("Train Data Shape:", X_train.shape)
print("Test Data Shape:", X_test.shape)

## Model creation

class Text_generation(nn.Module):
    def __init__(self, maxlen_seq, total_words):
        super().__init__()
        self.num_lstm_layers = 3
        self.layer1 = nn.Embedding(num_embeddings=total_words, embedding_dim=256)
        self.layer2 = nn.LSTM(256, 32, num_layers = self.num_lstm_layers,  batch_first=True, dropout=0.3, bidirectional = True)
        self.layer3 = nn.Dropout(0.3)
        self.layer4 = nn.Linear(32*2, total_words) ## *2 is for bidirectional
        self.layernorm = nn.LayerNorm(32*2)
    def forward(self, predictor):
        out = self.layer1(predictor)         # Embedding layer
        out, (hn, cn) = self.layer2(out)     # LSTM returns (output, (h_n, c_n)), unpack both
        out = self.layernorm(out[:, -1, :])
        out = self.layer3(out)               # Dropout layer (expects a tensor, not a tuple)
        out = self.layer4(out)     # Only take the output of the last time step from LSTM
        return out

def prepare_dataset(X_train, y_train, X_test, y_test, batch_size=8):
    # Convert numpy arrays to PyTorch tensors
    X_train_tensor = torch.tensor(X_train, dtype=torch.long)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32) 
    X_test_tensor = torch.tensor(X_test, dtype=torch.long)
    y_test_tensor = torch.tensor(y_test, dtype=torch.float32)
    
    # Create DataLoader for batch processing
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    return train_loader, test_loader

# Training function with checkpoint saving
def train_model_with_checkpoint(model, train_loader, criterion, optimizer, num_epochs=20, checkpoint_dir='checkpoints/'):
    # Create a directory to save checkpoints if it doesn't exist
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    
    model.train()
    
    for epoch in range(num_epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / len(train_loader):.4f}')
        
        # Save the model checkpoint after every epoch
        checkpoint_path = os.path.join(checkpoint_dir, f'model_epoch_{epoch+1}.pth')
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': running_loss / len(train_loader),
        }, checkpoint_path)
        print(f'Model checkpoint saved at {checkpoint_path}')

    return model


def evaluate_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)   # Binary classification threshold
            labels = torch.argmax(labels, dim =1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    accuracy = 100 * correct / total
    print(f'Test Accuracy: {accuracy:.2f}%')


batch_size = 8
num_epochs = 30
learning_rate = 0.001

# Prepare data (assuming X_train, y_train, X_test, y_test are already prepared as numpy arrays)
train_loader, test_loader = prepare_dataset(X_train, y_train, X_test, y_test, batch_size)

# Initialize model, criterion (loss function), and optimizer
model = Text_generation(max_sequence_len, total_words)
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)  # Binary cross-entropy loss for binary classification
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
model = train_model_with_checkpoint(model, train_loader, criterion, optimizer, num_epochs)
torch.save(model.state_dict(), 'model_weights.pth')


# Evaluate the model
evaluate_model(model, test_loader)

def build_index_word(word_index):
    return {index: word for word, index in word_index.items()}

def encode_text(text, word_index):
    tokens = text.lower().split()  # Tokenize and lower the text
    return [word_index[token] for token in tokens if token in word_index]

def decode_index(index, index_word):
    return index_word.get(index, "")

def generate_text(seed_text, next_words, model, max_sequence_len, word_index, index_word):
    for _ in range(next_words):
        # Encode the seed text
        token_list = encode_text(seed_text, word_index)
        
        # Pad the sequences
        token_list, _ = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        
        # Convert to tensor and ensure it's of long type
        token_tensor = torch.tensor(token_list, dtype=torch.long).squeeze(0)  # Remove any extra dim if it exists

        # Add a batch dimension (batch_size=1)
        token_tensor = token_tensor.unsqueeze(0)  # Shape: (1, max_sequence_len-1)
        
        # Predict the next word index
        with torch.no_grad():  # Disable gradient calculation for inference
            predicted = model(token_tensor).argmax(dim=1).item()  # Use argmax to get the predicted index

        # Map the predicted index back to a word
        output_word = decode_index(predicted, index_word)
        
        # Append the predicted word to the seed text
        seed_text += " " + output_word
        
    return seed_text.title()


Max index in predictor: 3580
Total words: 3581
Train Data Shape: (6445, 17)
Test Data Shape: (1612, 17)
Epoch [1/30], Loss: 7.7735
Model checkpoint saved at checkpoints/model_epoch_1.pth
Epoch [2/30], Loss: 7.2890
Model checkpoint saved at checkpoints/model_epoch_2.pth
Epoch [3/30], Loss: 7.0338
Model checkpoint saved at checkpoints/model_epoch_3.pth
Epoch [4/30], Loss: 6.7729
Model checkpoint saved at checkpoints/model_epoch_4.pth
Epoch [5/30], Loss: 6.4988
Model checkpoint saved at checkpoints/model_epoch_5.pth
Epoch [6/30], Loss: 6.2122
Model checkpoint saved at checkpoints/model_epoch_6.pth
Epoch [7/30], Loss: 5.9568
Model checkpoint saved at checkpoints/model_epoch_7.pth
Epoch [8/30], Loss: 5.6653
Model checkpoint saved at checkpoints/model_epoch_8.pth
Epoch [9/30], Loss: 5.4405
Model checkpoint saved at checkpoints/model_epoch_9.pth
Epoch [10/30], Loss: 5.1733
Model checkpoint saved at checkpoints/model_epoch_10.pth
Epoch [11/30], Loss: 4.9689
Model checkpoint saved at checkpoint

In [12]:
# Example usage
index_word = build_index_word(word_index)
print(generate_text("Trial of Killer’s", 20, model, max_sequence_len, word_index, index_word))

Trial Of Killer’S Killers Widow Scared Victim Of Stormy Danielss Legal Fund Joins North Korea Gamble Or Marvel China Plans To Trump For
