In [None]:
pip install rouge_score

In [None]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import gc
from rouge_score import rouge_scorer

In [None]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [6]:
input_files = [
    '/kaggle/input/persian-wikipedia-dataset/Persian-WikiText-1.txt',
    '/kaggle/input/persian-wikipedia-dataset/Persian-WikiText-2.txt',
    '/kaggle/input/persian-wikipedia-dataset/Persian-WikiText-3.txt',
    '/kaggle/input/persian-wikipedia-dataset/Persian-WikiText-4.txt',
    '/kaggle/input/persian-wikipedia-dataset/Persian-WikiText-5.txt',
    '/kaggle/input/persian-wikipedia-dataset/Persian-WikiText-6.txt',
    '/kaggle/input/persian-wikipedia-dataset/Persian-WikiText-7.txt',
    '/kaggle/input/persian-wikipedia-dataset/Persian-WikiText-8.txt',
    '/kaggle/input/persian-wikipedia-dataset/Persian-WikiText-9.txt'
]

output_file = '/kaggle/working/persian_wikipedia.txt'

with open(output_file, 'w', encoding='utf-8') as outfile:
    for fname in input_files:
        with open(fname, 'r', encoding='utf-8') as infile:
            outfile.write(infile.read())

In [None]:
class PersianWikiDataset(Dataset):
    def __init__(self, text_file, sequence_length, subset_size=None):
        self.sequence_length = sequence_length
        with open(text_file, 'r', encoding='utf-8') as f:
            self.text = f.read()

        if subset_size:
            self.text = self.text[:subset_size]
        
        self.chars = sorted(list(set(self.text)))
        self.char_to_idx = {char: idx for idx, char in enumerate(self.chars)}
        self.idx_to_char = {idx: char for idx, char in enumerate(self.chars)}

        self.encoded_text = [self.char_to_idx[char] for char in self.text]

    def __len__(self):
        return len(self.encoded_text) - self.sequence_length

    def __getitem__(self, idx):
        x = self.encoded_text[idx:idx + self.sequence_length]
        y = self.encoded_text[idx + 1:idx + self.sequence_length + 1]
        return torch.tensor(x), torch.tensor(y)

In [None]:
class TextGeneratorModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super(TextGeneratorModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden):
        x = self.embedding(x)
        out, hidden = self.lstm(x, hidden)
        out = self.fc(out.reshape(out.size(0) * out.size(1), out.size(2)))
        return out, hidden

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.num_layers, batch_size, self.hidden_dim).zero_(),
                  weight.new(self.num_layers, batch_size, self.hidden_dim).zero_())
        return hidden

In [None]:
def clear_memory():
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
def train_model(model, dataloader, epochs, lr, accumulation_steps=4):
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    model.zero_grad()

    for epoch in range(epochs):
        hidden = model.init_hidden(dataloader.batch_size)
        hidden = tuple([each.to(device) for each in hidden])
        
        for i, (x, y) in enumerate(dataloader):
            if x.size(0) != hidden[0].size(1):  # Check batch size consistency
                hidden = model.init_hidden(x.size(0))
                hidden = tuple([each.to(device) for each in hidden])
            
            x, y = x.to(device), y.to(device)  # Move data to GPU
            hidden = tuple([each.data.to(device) for each in hidden])
            output, hidden = model(x, hidden)
            loss = criterion(output, y.view(-1))
            loss.backward()
            
            if (i + 1) % accumulation_steps == 0:
                optimizer.step()
                model.zero_grad()

            # Clear memory periodically
            clear_memory()

        print(f'Epoch: {epoch + 1}/{epochs}, Loss: {loss.item()}')

In [None]:
def evaluate_model(model, dataloader):
    model.eval()
    total_loss = 0
    criterion = nn.CrossEntropyLoss()
    with torch.no_grad():
        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            hidden = model.init_hidden(x.size(0))  # Initialize hidden state with current batch size
            hidden = tuple([each.to(device) for each in hidden])
            output, hidden = model(x, hidden)
            loss = criterion(output, y.view(-1))
            total_loss += loss.item()
    
    perplexity = torch.exp(torch.tensor(total_loss / len(dataloader)))
    return perplexity.item()

In [None]:
# Initialize device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Use a smaller subset for initial testing
subset_size = 100000  # Use 1 million characters for example
sequence_length = 30  # Shorter sequence length
text_file = '/kaggle/working/persian_wikipedia.txt'
dataset = PersianWikiDataset(text_file, sequence_length, subset_size=subset_size)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)  # Smaller batch size

In [None]:
# Initialize model and move to device
vocab_size = len(dataset.chars)
embedding_dim = 128
hidden_dim = 128  # Reduced hidden dimension
num_layers = 1    # Reduced number of layers
model = TextGeneratorModel(vocab_size, embedding_dim, hidden_dim, num_layers).to(device)

In [7]:
# Train model
epochs = 5
lr = 0.001
train_model(model, dataloader, epochs, lr)

# Evaluate model
perplexity = evaluate_model(model, dataloader)
print(f'Perplexity: {perplexity}')

Epoch: 1/5, Loss: 1.7002850770950317
Epoch: 2/5, Loss: 1.5726206302642822
Epoch: 3/5, Loss: 1.9303953647613525
Epoch: 4/5, Loss: 1.3571174144744873
Epoch: 5/5, Loss: 1.3596223592758179
Perplexity: 4.25123929977417


In [9]:
# Generate some text using the model and compare it to a reference text
reference = "some reference text"
predicted = "generated text"
rouge_scores = compute_rouge(predicted, reference)
print(rouge_scores)

{'rouge1': Score(precision=0.5, recall=0.3333333333333333, fmeasure=0.4), 'rougeL': Score(precision=0.5, recall=0.3333333333333333, fmeasure=0.4)}


In [None]:
def compute_rouge(predicted, reference):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, predicted)
    return scores