In [23]:
# CNN-RNN Model with Beam Search for Chest X-Ray Report Generation
# Modified to fix repetition issues and improve generation quality

import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
import matplotlib.pyplot as plt
from PIL import Image
import pickle
import time
import tqdm
import pydicom
import string
import certifi
import ssl
import heapq
import math

# Fix SSL certificate issues in macOS
import os
os.environ['SSL_CERT_FILE'] = certifi.where()
os.environ['REQUESTS_CA_BUNDLE'] = certifi.where()

# Simple tokenizer to avoid NLTK dependency
def simple_tokenize(text):
    """Simple tokenizer that splits text on whitespace and punctuation."""
    if not isinstance(text, str):
        return []
    # Remove punctuation and replace with space
    for char in string.punctuation:
        text = text.replace(char, ' ' + char + ' ')

    # Split on whitespace and filter empty tokens
    tokens = [token for token in text.lower().split() if token.strip()]
    return tokens

# Define paths (keep these the same)
base_path = '/Users/simeon/Documents/DLH/content/mimic-cxr-project'
data_dir = os.path.join(base_path, 'data')
files_path = os.path.join(base_path, 'new_files')
output_dir = os.path.join(base_path, 'output')
reports_dir = os.path.join(base_path, 'reports')
models_dir = os.path.join(base_path, 'models')

# Create output directories
os.makedirs(output_dir, exist_ok=True)
os.makedirs(models_dir, exist_ok=True)

# Import the report parser module
import sys
sys.path.append(f"{base_path}/modules")
from report_parser import parse_report, MIMIC_RE
print("Successfully imported report parser module")

# Load train and test data
train_df = pd.read_csv(os.path.join(data_dir, 'train.tsv'), sep='\t')
test_df = pd.read_csv(os.path.join(data_dir, 'test.tsv'), sep='\t')

print(f"Train data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define image transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Enhanced Vocabulary class with frequency logging
class Vocabulary:
    def __init__(self, freq_threshold=2):
        self.itos = {0: "<PAD>", 1: "< SOS >", 2: "<EOS>", 3: "<UNK>"}
        self.stoi = {"<PAD>": 0, "< SOS >": 1, "<EOS>": 2, "<UNK>": 3}
        self.freq_threshold = freq_threshold
        self.frequencies = {}  # Track word frequencies

    def __len__(self):
        return len(self.itos)

    def build_vocabulary(self, sentence_list):
        self.frequencies = {}
        idx = 4

        for sentence in sentence_list:
            # Use simple tokenizer
            for word in simple_tokenize(sentence):
                if word not in self.frequencies:
                    self.frequencies[word] = 0
                self.frequencies[word] += 1

                if word not in self.stoi and self.frequencies[word] >= self.freq_threshold:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1

        # Print top 20 most frequent words for debugging
        print("Top 20 most frequent words:")
        for word, freq in sorted(self.frequencies.items(), key=lambda x: x[1], reverse=True)[:20]:
            print(f"{word}: {freq}")

    def numericalize(self, text):
        # Use simple tokenizer
        tokenized_text = simple_tokenize(text)

        return [
            self.stoi[token] if token in self.stoi else self.stoi["<UNK>"]
            for token in tokenized_text
        ]

# Class to load and preprocess the data
class ChestXRayReportDataset(Dataset):
    def __init__(self, df, is_train=True, transform=None, max_seq_length=100):
        self.df = df
        self.is_train = is_train
        self.transform = transform
        self.max_seq_length = max_seq_length
        self.reports = []

        # Extract reports if training
        if self.is_train:
            print("Extracting report texts for training data...")
            for _, row in tqdm.tqdm(self.df.iterrows(), total=len(self.df)):
                subject_id = row['subject_id']
                study_id = row['study_id']

                # Construct path to report (FIXED PATH)
                subject_prefix = f"p{str(subject_id)[:2]}"
                subject_dir = f"p{subject_id}"
                study_dir = f"s{study_id}"

                # Try different path structures
                report_paths = [
                    os.path.join(reports_dir, subject_prefix, subject_dir, f"{study_dir}.txt"),  # Original
                    os.path.join(reports_dir, 'new_files', subject_prefix, subject_dir, f"{study_dir}.txt"),  # With 'new_files'
                    os.path.join(reports_dir, 'files', subject_prefix, subject_dir, f"{study_dir}.txt"),  # With 'files'
                    os.path.join(reports_dir, subject_prefix, subject_dir, study_dir, "report.txt")  # Alternative structure
                ]

                for report_path in report_paths:
                    try:
                        if os.path.exists(report_path):
                            report = parse_report(report_path)
                            if 'findings' in report and report['findings']:
                                self.reports.append((row['dicom_id'], report['findings']))
                                break  # Found a valid report, move to next row
                    except Exception as e:
                        continue  # Try next path

            print(f"Extracted {len(self.reports)} reports from training data")

            # If still no reports, add some dummy data for testing
            if len(self.reports) == 0:
                print("WARNING: No reports found! Adding dummy data for testing...")
                # Create a few dummy reports to allow model testing
                for i in range(min(10, len(self.df))):
                    self.reports.append((self.df.iloc[i]['dicom_id'], "This is a dummy report for testing purposes."))

        # Build DICOM paths
        self.dicom_paths = []
        for _, row in self.df.iterrows():
            subject_id = row['subject_id']
            study_id = row['study_id']
            dicom_id = row['dicom_id']

            # Construct path to DICOM file
            subject_prefix = f"p{str(subject_id)[:2]}"
            subject_dir = f"p{subject_id}"
            study_dir = f"s{study_id}"
            dicom_file = f"{dicom_id}.dcm"
            dicom_path = os.path.join(files_path, subject_prefix, subject_dir, study_dir, dicom_file)

            if os.path.exists(dicom_path):
                self.dicom_paths.append((dicom_id, dicom_path))

    def __len__(self):
        return len(self.dicom_paths)

    def __getitem__(self, idx):
        dicom_id, dicom_path = self.dicom_paths[idx]

        # Load and transform image
        try:
            ds = pydicom.dcmread(dicom_path)
            pixel_array = ds.pixel_array

            # Normalize and convert to RGB
            pixel_array = pixel_array / np.max(pixel_array)
            img = np.uint8(pixel_array * 255)

            # Convert to RGB
            if len(img.shape) == 2:
                img_rgb = np.stack([img, img, img], axis=2)
            elif img.shape[2] == 1:
                img_rgb = np.concatenate([img, img, img], axis=2)
            else:
                img_rgb = img

            pil_img = Image.fromarray(img_rgb)
            if self.transform:
                image = self.transform(pil_img)
        except Exception as e:
            # Create blank image if loading fails
            image = torch.zeros(3, 224, 224)

        # Return image and report for training, only image for testing
        if self.is_train:
            report_text = ""
            for report_id, report in self.reports:
                if report_id == dicom_id:
                    report_text = report
                    break
            return image, report_text, dicom_id
        else:
            return image, dicom_id

    # Added method to get all reports for vocabulary building
    def get_all_reports(self):
        return [report for _, report in self.reports]

# Improved CNN-RNN model with attention and beam search
class CNNRNNModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, cnn_feature_size):
        super(CNNRNNModel, self).__init__()

        # CNN encoder (DenseNet121)
        print("Loading DenseNet121 model with pretrained weights...")
        self.densenet = models.densenet121(pretrained=True)
        self.densenet.classifier = nn.Linear(cnn_feature_size, embed_size)

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embed_size)

        # Initial projection for CNN features
        self.feature_projection = nn.Linear(embed_size, hidden_size)

        # LSTM decoder
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)

        # Output layer
        self.fc = nn.Linear(hidden_size, vocab_size)

        # Dropout for regularization
        self.dropout = nn.Dropout(0.5)

        # Store dimensions
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.embed_size = embed_size

    def forward(self, images, captions=None, teacher_forcing_ratio=0.5):
        # Extract features from images using CNN encoder
        # [batch_size, 3, 224, 224] -> [batch_size, embed_size]
        features = self.densenet(images)

        # Project features to hidden state size
        projected_features = self.feature_projection(features)

        # Training mode with captions
        if captions is not None:
            batch_size = features.size(0)
            caption_length = captions.size(1)

            # Initialize tensor for storing outputs
            outputs = torch.zeros(batch_size, caption_length, self.vocab_size).to(device)

            # Initialize hidden and cell state with projected image features
            h = projected_features.unsqueeze(0)  # [1, batch_size, hidden_size]
            c = torch.zeros(1, batch_size, self.hidden_size).to(device)

            # First input to LSTM is the SOS token
            x = self.embedding(torch.ones(batch_size, 1, dtype=torch.long).to(device))  # [batch_size, 1, embed_size]

            # Generate caption one word at a time
            for t in range(caption_length):
                # Forward through LSTM
                lstm_out, (h, c) = self.lstm(x, (h, c))  # lstm_out: [batch_size, 1, hidden_size]

                # Apply simple attention (we'll skip complex attention for now to avoid dimension issues)
                # Forward through fully connected layer
                out = self.fc(lstm_out)  # [batch_size, 1, vocab_size]

                # Store output
                outputs[:, t] = out.squeeze(1)  # [batch_size, vocab_size]

                # Teacher forcing
                use_teacher_force = torch.rand(1).item() < teacher_forcing_ratio

                if use_teacher_force and t < caption_length - 1:
                    # Use ground truth as next input
                    x = self.embedding(captions[:, t+1].unsqueeze(1))  # [batch_size, 1, embed_size]
                else:
                    # Use predicted word as next input
                    pred_token = out.argmax(2)  # [batch_size, 1]
                    x = self.embedding(pred_token)  # [batch_size, 1, embed_size]

            return outputs

        # Inference mode (using beam search)
        else:
            return self.beam_search(images, beam_size=3, max_length=100)

    def beam_search(self, images, beam_size=3, max_length=100, temperature=1.0, repetition_penalty=1.2):
        """
        Beam search implementation for better text generation
        with repetition penalty to avoid word repetition
        """
        batch_size = images.size(0)
        results = []

        for idx in range(batch_size):
            # Extract features for a single image
            image_features = self.densenet(images[idx:idx+1])  # [1, embed_size]

            # Project features to hidden state size
            projected_features = self.feature_projection(image_features)  # [1, hidden_size]

            # Initialize hidden and cell states
            h = projected_features.unsqueeze(0)  # [1, 1, hidden_size]
            c = torch.zeros(1, 1, self.hidden_size).to(device)  # [1, 1, hidden_size]

            # Start with SOS token
            start_token = torch.ones(1, 1, dtype=torch.long).to(device)  # [1, 1]
            start_embedding = self.embedding(start_token)  # [1, 1, embed_size]

            # First forward pass to get initial predictions
            lstm_out, (h, c) = self.lstm(start_embedding, (h, c))  # lstm_out: [1, 1, hidden_size]

            # Get initial logits
            logits = self.fc(lstm_out)  # [1, 1, vocab_size]
            logits = logits.squeeze(0).squeeze(0)  # [vocab_size]

            # Apply temperature sampling
            logits = logits / temperature

            # Apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # [vocab_size]

            # Get top-k candidates
            topk_probs, topk_indices = torch.topk(probs, beam_size)  # [beam_size]

            # Initialize beams
            beams = [(math.log(prob.item()), [index.item()], h, c, {index.item(): 1})
                     for prob, index in zip(topk_probs, topk_indices)]

            # Beam search
            for _ in range(max_length - 1):
                new_beams = []

                # Expand each beam
                for log_prob, seq, hidden, cell, word_counts in beams:
                    # Check if sequence ended
                    if seq[-1] == 2:  # <EOS> token
                        new_beams.append((log_prob, seq, hidden, cell, word_counts))
                        continue

                    # Prepare input for next step (last token)
                    token = torch.tensor([[seq[-1]]], dtype=torch.long).to(device)
                    token_embedding = self.embedding(token)  # [1, 1, embed_size]

                    # Forward pass
                    lstm_out, (new_hidden, new_cell) = self.lstm(token_embedding, (hidden, cell))

                    # No attention for simplicity

                    # Get logits
                    logits = self.fc(lstm_out).squeeze(0).squeeze(0)  # [vocab_size]

                    # Apply repetition penalty
                    for word_idx, count in word_counts.items():
                        logits[word_idx] /= repetition_penalty * count

                    # Apply temperature
                    logits = logits / temperature

                    # Apply softmax
                    probs = F.softmax(logits, dim=-1)  # [vocab_size]

                    # Get top-k candidates for this beam
                    topk_probs, topk_indices = torch.topk(probs, beam_size)  # [beam_size]

                    # Add to new beams
                    for prob, index in zip(topk_probs, topk_indices):
                        # Create new sequence
                        new_seq = seq + [index.item()]

                        # Update word counts for repetition penalty
                        new_word_counts = word_counts.copy()
                        if index.item() in new_word_counts:
                            new_word_counts[index.item()] += 1
                        else:
                            new_word_counts[index.item()] = 1

                        # Calculate new log probability
                        new_log_prob = log_prob + math.log(prob.item())

                        # Add beam
                        new_beams.append((new_log_prob, new_seq, new_hidden, new_cell, new_word_counts))

                # Keep top beams
                beams = sorted(new_beams, key=lambda x: x[0], reverse=True)[:beam_size]

                # Check if all beams end with <EOS>
                if all(seq[-1] == 2 for _, seq, _, _, _ in beams):
                    break

            # Get best beam
            best_beam = max(beams, key=lambda x: x[0])
            best_seq = best_beam[1]

            # Remove < SOS > and <EOS> tokens if present
            if best_seq[0] == 1:  # < SOS >
                best_seq = best_seq[1:]
            if best_seq[-1] == 2:  # <EOS>
                best_seq = best_seq[:-1]

            # Add to results
            results.append(torch.tensor(best_seq))

        # Create padded tensor
        max_len = max([len(seq) for seq in results]) if results else 0
        padded_results = torch.zeros(batch_size, max_len, dtype=torch.long)
        for i, seq in enumerate(results):
            end = len(seq)
            padded_results[i, :end] = seq

        return padded_results

# Function to generate dummy data for testing the model
def create_dummy_batch(batch_size=2, seq_len=10, vocab_size=100):
    # Create dummy images
    images = torch.randn(batch_size, 3, 224, 224)

    # Create dummy captions
    captions = torch.randint(0, vocab_size, (batch_size, seq_len))

    return images, captions

# Main execution function
def run(training_mode=False):
    # Create dataset for training
    print("Creating training dataset...")
    train_dataset = ChestXRayReportDataset(train_df, is_train=True, transform=transform)

    # Build vocabulary - FIXED: Use the get_all_reports method
    print("Building vocabulary...")
    vocab = Vocabulary()
    # Get all reports from the dataset
    all_reports = train_dataset.get_all_reports()
    vocab.build_vocabulary(all_reports)
    print(f"Built vocabulary with {len(vocab)} tokens")

    # Save vocabulary
    vocab_path = os.path.join(models_dir, 'vocab.pkl')
    with open(vocab_path, 'wb') as f:
        pickle.dump(vocab, f)

    # Create data loaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=32,
        shuffle=True,
        num_workers=0  # Set to 0 to avoid multiprocessing issues on Mac
    )

    # Initialize model
    model = CNNRNNModel(
        vocab_size=len(vocab),
        embed_size=256,
        hidden_size=512,
        cnn_feature_size=1024
    )

    # Move model to device
    model = model.to(device)

    # Test model with dummy data to verify dimensions
    print("Testing model with dummy data...")
    dummy_images, dummy_captions = create_dummy_batch(batch_size=2, seq_len=10, vocab_size=len(vocab))
    dummy_images = dummy_images.to(device)
    dummy_captions = dummy_captions.to(device)

    # Forward pass
    try:
        with torch.no_grad():
            outputs = model(dummy_images, dummy_captions)
            print(f"Dummy forward pass successful! Output shape: {outputs.shape}")
    except Exception as e:
        print(f"Error in dummy forward pass: {e}")
        return

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding tokens
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=16, gamma=0.5)  # LR decay every 16 epochs

    # Load model path (but we won't load the model as requested)
    model_path = os.path.join(models_dir, 'cnn_rnn_improved.pth')

    if training_mode:
        print("Training new model from scratch...")
        # Training loop
        num_epochs = 64  # Change to 64 for full training
        best_loss = float('inf')

        for epoch in range(num_epochs):
            model.train()
            train_loss = 0

            # Gradually decrease teacher forcing
            teacher_forcing_ratio = max(0.5 - (epoch // 16) * 0.1, 0.0)

            for i, (images, captions, _) in enumerate(tqdm.tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")):
                images = images.to(device)

                # Tokenize and pad captions
                tokenized_captions = []
                for caption in captions:
                    if caption:
                        tokens = [1]  # < SOS >
                        tokens.extend(vocab.numericalize(caption))
                        tokens.append(2)  # <EOS>
                    else:
                        tokens = [1, 2]  # < SOS >, <EOS>
                    tokenized_captions.append(tokens)

                # Pad sequences
                padded_captions = []
                for tokens in tokenized_captions:
                    if len(tokens) > 100:
                        padded_captions.append(tokens[:100])
                    else:
                        padded_captions.append(tokens + [0] * (100 - len(tokens)))

                captions_tensor = torch.tensor(padded_captions).to(device)

                # Zero the gradients
                optimizer.zero_grad()

                # Forward pass
                outputs = model(images, captions_tensor, teacher_forcing_ratio)

                # Reshape for loss calculation
                outputs = outputs.reshape(-1, outputs.shape[2])
                targets = captions_tensor.reshape(-1)

                # Calculate loss
                loss = criterion(outputs, targets)

                # Backward pass
                loss.backward()

                # Clip gradients
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

                # Update weights
                optimizer.step()

                # Update training loss
                train_loss += loss.item()

                # Print progress
                if (i + 1) % 50 == 0:
                    print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}")

                # Break after a few batches for testing
                #if i >= 3:  # Remove this line for full training
                #    break

            # Update learning rate
            scheduler.step()

            # Calculate average loss for the epoch
            avg_loss = train_loss / len(train_loader)  # Calculate average over all batches
            print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Teacher forcing: {teacher_forcing_ratio:.2f}")

            # Save model if it's the best so far
            if avg_loss < best_loss:
                best_loss = avg_loss
                torch.save(model.state_dict(), model_path)
                print(f"Model saved with improved loss: {best_loss:.4f}")

    # Create test dataset
    print("Creating test dataset...")
    test_dataset = ChestXRayReportDataset(test_df, is_train=False, transform=transform)

    test_loader = DataLoader(
        test_dataset,
        batch_size=1,  # Process one at a time for generation
        shuffle=False,
        num_workers=0  # Set to 0 to avoid multiprocessing issues on Mac
    )

    # Generate reports using beam search
    print("Generating reports using beam search...")
    model.eval()

    generated_reports = {}

    with torch.no_grad():
        # Process a few samples for demonstration
        max_samples = 3
        sample_count = 0

        for images, dicom_ids in tqdm.tqdm(test_loader):
            if sample_count >= max_samples:
                break

            images = images.to(device)
            dicom_id = dicom_ids[0]  # Batch size is 1

            try:
                # Generate caption using beam search
                outputs = model(images)  # Uses beam search by default

                # Convert token indices to words
                caption = []
                for token_id in outputs[0]:
                    token_id = token_id.item()
                    if token_id == 2:  # <EOS>
                        break
                    if token_id > 3:  # Skip <PAD>, < SOS >, <EOS>, <UNK>
                        caption.append(vocab.itos[token_id])

                # Join words into a sentence, properly handling punctuation
                report_text = ""
                for word in caption:
                    if word in string.punctuation:
                        report_text += word
                    else:
                        if report_text and not report_text.endswith(' '):
                            report_text += ' '
                        report_text += word

                generated_reports[dicom_id] = report_text

                sample_count += 1
                print(f"Generated report {sample_count}/{max_samples}")
            except Exception as e:
                print(f"Error generating report for {dicom_id}: {e}")

    # Show sample reports
    print("\nSample generated reports:")
    for dicom_id, report in generated_reports.items():
        print(f"\nDICOM ID: {dicom_id}")
        print(f"Report: {report}")

    # Save generated reports
    if generated_reports:
        report_df = pd.DataFrame({
            'dicom_id': list(generated_reports.keys()),
            'generated': list(generated_reports.values())
        })

        output_file = os.path.join(output_dir, 'cnn_rnn_beam_search.tsv')
        report_df.to_csv(output_file, sep='\t', index=False)

        print(f"Generated {len(generated_reports)} reports and saved to {output_file}")
    else:
        print("No reports were generated")

# For this example, setting training_mode to True to train a new model
# Change to False to only generate reports using a pretrained model
if __name__ == "__main__":
    run(training_mode=True)

Successfully imported report parser module
Train data shape: (4370, 3)
Test data shape: (1688, 3)
Using device: cpu
Creating training dataset...
Extracting report texts for training data...


100%|██████████| 4370/4370 [00:01<00:00, 2950.77it/s]


Extracted 4362 reports from training data
Building vocabulary...
Top 20 most frequent words:
.: 24934
the: 15302
is: 12317
,: 5867
no: 4919
are: 4840
there: 4762
of: 4663
right: 4461
and: 4139
in: 3957
left: 3548
pleural: 3147
pneumothorax: 3025
effusion: 2869
lung: 2539
to: 2499
a: 2433
with: 2416
or: 2239
Built vocabulary with 1925 tokens
Loading DenseNet121 model with pretrained weights...
Testing model with dummy data...
Dummy forward pass successful! Output shape: torch.Size([2, 10, 1925])
Training new model from scratch...


Epoch 1/64:  36%|███▋      | 50/137 [08:17<14:34, 10.05s/it]

Epoch [1/64], Step [50/137], Loss: 4.2419


Epoch 1/64:  73%|███████▎  | 100/137 [16:56<06:18, 10.22s/it]

Epoch [1/64], Step [100/137], Loss: 3.3553


Epoch 1/64: 100%|██████████| 137/137 [23:11<00:00, 10.16s/it]


Epoch [1/64], Loss: 3.8802, Teacher forcing: 0.50
Model saved with improved loss: 3.8802


Epoch 2/64:  36%|███▋      | 50/137 [08:06<13:53,  9.58s/it]

Epoch [2/64], Step [50/137], Loss: 2.7814


Epoch 2/64:  73%|███████▎  | 100/137 [15:05<04:58,  8.07s/it]

Epoch [2/64], Step [100/137], Loss: 2.1393


Epoch 2/64: 100%|██████████| 137/137 [20:07<00:00,  8.82s/it]


Epoch [2/64], Loss: 2.5424, Teacher forcing: 0.50
Model saved with improved loss: 2.5424


Epoch 3/64:  36%|███▋      | 50/137 [07:01<12:00,  8.28s/it]

Epoch [3/64], Step [50/137], Loss: 2.3667


Epoch 3/64:  73%|███████▎  | 100/137 [13:51<05:02,  8.17s/it]

Epoch [3/64], Step [100/137], Loss: 1.7809


Epoch 3/64: 100%|██████████| 137/137 [19:04<00:00,  8.35s/it]


Epoch [3/64], Loss: 2.2265, Teacher forcing: 0.50
Model saved with improved loss: 2.2265


Epoch 4/64:  36%|███▋      | 50/137 [06:46<14:39, 10.11s/it]

Epoch [4/64], Step [50/137], Loss: 2.3172


Epoch 4/64:  73%|███████▎  | 100/137 [13:12<04:50,  7.84s/it]

Epoch [4/64], Step [100/137], Loss: 2.0434


Epoch 4/64: 100%|██████████| 137/137 [17:59<00:00,  7.88s/it]


Epoch [4/64], Loss: 2.0461, Teacher forcing: 0.50
Model saved with improved loss: 2.0461


Epoch 5/64:  36%|███▋      | 50/137 [06:30<11:17,  7.79s/it]

Epoch [5/64], Step [50/137], Loss: 2.0133


Epoch 5/64:  73%|███████▎  | 100/137 [12:45<04:41,  7.61s/it]

Epoch [5/64], Step [100/137], Loss: 1.9434


Epoch 5/64: 100%|██████████| 137/137 [17:35<00:00,  7.71s/it]


Epoch [5/64], Loss: 2.0028, Teacher forcing: 0.50
Model saved with improved loss: 2.0028


Epoch 6/64:  36%|███▋      | 50/137 [06:30<11:19,  7.81s/it]

Epoch [6/64], Step [50/137], Loss: 1.8146


Epoch 6/64:  73%|███████▎  | 100/137 [12:53<04:36,  7.47s/it]

Epoch [6/64], Step [100/137], Loss: 1.6385


Epoch 6/64: 100%|██████████| 137/137 [17:38<00:00,  7.73s/it]


Epoch [6/64], Loss: 1.9508, Teacher forcing: 0.50
Model saved with improved loss: 1.9508


Epoch 7/64:  36%|███▋      | 50/137 [06:33<11:41,  8.06s/it]

Epoch [7/64], Step [50/137], Loss: 2.1403


Epoch 7/64:  73%|███████▎  | 100/137 [12:51<04:34,  7.41s/it]

Epoch [7/64], Step [100/137], Loss: 1.9054


Epoch 7/64: 100%|██████████| 137/137 [17:28<00:00,  7.65s/it]


Epoch [7/64], Loss: 1.8939, Teacher forcing: 0.50
Model saved with improved loss: 1.8939


Epoch 8/64:  36%|███▋      | 50/137 [06:33<11:21,  7.83s/it]

Epoch [8/64], Step [50/137], Loss: 2.1788


Epoch 8/64:  73%|███████▎  | 100/137 [12:58<04:35,  7.45s/it]

Epoch [8/64], Step [100/137], Loss: 2.0571


Epoch 8/64: 100%|██████████| 137/137 [17:28<00:00,  7.66s/it]


Epoch [8/64], Loss: 1.8912, Teacher forcing: 0.50
Model saved with improved loss: 1.8912


Epoch 9/64:  36%|███▋      | 50/137 [06:32<11:16,  7.78s/it]

Epoch [9/64], Step [50/137], Loss: 1.7195


Epoch 9/64:  73%|███████▎  | 100/137 [13:02<04:49,  7.83s/it]

Epoch [9/64], Step [100/137], Loss: 1.9606


Epoch 9/64: 100%|██████████| 137/137 [17:35<00:00,  7.70s/it]


Epoch [9/64], Loss: 1.8129, Teacher forcing: 0.50
Model saved with improved loss: 1.8129


Epoch 10/64:  36%|███▋      | 50/137 [06:26<11:17,  7.79s/it]

Epoch [10/64], Step [50/137], Loss: 1.6728


Epoch 10/64:  73%|███████▎  | 100/137 [12:56<04:48,  7.80s/it]

Epoch [10/64], Step [100/137], Loss: 1.1948


Epoch 10/64: 100%|██████████| 137/137 [17:39<00:00,  7.73s/it]


Epoch [10/64], Loss: 1.7681, Teacher forcing: 0.50
Model saved with improved loss: 1.7681


Epoch 11/64:  36%|███▋      | 50/137 [06:24<11:42,  8.08s/it]

Epoch [11/64], Step [50/137], Loss: 2.0423


Epoch 11/64:  73%|███████▎  | 100/137 [13:01<04:52,  7.92s/it]

Epoch [11/64], Step [100/137], Loss: 1.7048


Epoch 11/64: 100%|██████████| 137/137 [17:51<00:00,  7.82s/it]


Epoch [11/64], Loss: 1.7632, Teacher forcing: 0.50
Model saved with improved loss: 1.7632


Epoch 12/64:  36%|███▋      | 50/137 [06:19<11:35,  7.99s/it]

Epoch [12/64], Step [50/137], Loss: 1.5623


Epoch 12/64:  73%|███████▎  | 100/137 [12:58<04:52,  7.90s/it]

Epoch [12/64], Step [100/137], Loss: 2.3188


Epoch 12/64: 100%|██████████| 137/137 [17:47<00:00,  7.79s/it]


Epoch [12/64], Loss: 1.7367, Teacher forcing: 0.50
Model saved with improved loss: 1.7367


Epoch 13/64:  36%|███▋      | 50/137 [06:15<10:50,  7.48s/it]

Epoch [13/64], Step [50/137], Loss: 2.0579


Epoch 13/64:  73%|███████▎  | 100/137 [12:53<04:54,  7.96s/it]

Epoch [13/64], Step [100/137], Loss: 2.3465


Epoch 13/64: 100%|██████████| 137/137 [17:42<00:00,  7.76s/it]


Epoch [13/64], Loss: 1.7240, Teacher forcing: 0.50
Model saved with improved loss: 1.7240


Epoch 14/64:  36%|███▋      | 50/137 [06:28<11:36,  8.00s/it]

Epoch [14/64], Step [50/137], Loss: 1.2237


Epoch 14/64:  73%|███████▎  | 100/137 [13:26<05:10,  8.38s/it]

Epoch [14/64], Step [100/137], Loss: 1.5515


Epoch 14/64: 100%|██████████| 137/137 [18:32<00:00,  8.12s/it]


Epoch [14/64], Loss: 1.7002, Teacher forcing: 0.50
Model saved with improved loss: 1.7002


Epoch 15/64:  36%|███▋      | 50/137 [06:45<11:17,  7.78s/it]

Epoch [15/64], Step [50/137], Loss: 1.6734


Epoch 15/64:  73%|███████▎  | 100/137 [13:37<05:11,  8.42s/it]

Epoch [15/64], Step [100/137], Loss: 1.3075


Epoch 15/64: 100%|██████████| 137/137 [18:43<00:00,  8.20s/it]


Epoch [15/64], Loss: 1.6965, Teacher forcing: 0.50
Model saved with improved loss: 1.6965


Epoch 16/64:  36%|███▋      | 50/137 [06:50<11:41,  8.06s/it]

Epoch [16/64], Step [50/137], Loss: 2.6812


Epoch 16/64:  73%|███████▎  | 100/137 [13:41<05:09,  8.37s/it]

Epoch [16/64], Step [100/137], Loss: 1.7123


Epoch 16/64: 100%|██████████| 137/137 [18:49<00:00,  8.25s/it]


Epoch [16/64], Loss: 1.7224, Teacher forcing: 0.50


Epoch 17/64:  36%|███▋      | 50/137 [06:49<11:30,  7.94s/it]

Epoch [17/64], Step [50/137], Loss: 2.7163


Epoch 17/64:  73%|███████▎  | 100/137 [13:20<04:57,  8.03s/it]

Epoch [17/64], Step [100/137], Loss: 1.8673


Epoch 17/64: 100%|██████████| 137/137 [18:06<00:00,  7.93s/it]


Epoch [17/64], Loss: 2.1664, Teacher forcing: 0.40


Epoch 18/64:  36%|███▋      | 50/137 [06:28<10:47,  7.45s/it]

Epoch [18/64], Step [50/137], Loss: 1.4739


Epoch 18/64:  73%|███████▎  | 100/137 [12:41<04:45,  7.71s/it]

Epoch [18/64], Step [100/137], Loss: 2.1472


Epoch 18/64: 100%|██████████| 137/137 [17:27<00:00,  7.65s/it]


Epoch [18/64], Loss: 2.1131, Teacher forcing: 0.40


Epoch 19/64:  36%|███▋      | 50/137 [06:30<11:20,  7.83s/it]

Epoch [19/64], Step [50/137], Loss: 2.0267


Epoch 19/64:  73%|███████▎  | 100/137 [12:44<04:31,  7.35s/it]

Epoch [19/64], Step [100/137], Loss: 2.0255


Epoch 19/64: 100%|██████████| 137/137 [17:28<00:00,  7.65s/it]


Epoch [19/64], Loss: 2.1665, Teacher forcing: 0.40


Epoch 20/64:  36%|███▋      | 50/137 [06:31<11:23,  7.86s/it]

Epoch [20/64], Step [50/137], Loss: 2.0215


Epoch 20/64:  73%|███████▎  | 100/137 [12:54<04:35,  7.45s/it]

Epoch [20/64], Step [100/137], Loss: 2.2655


Epoch 20/64: 100%|██████████| 137/137 [17:34<00:00,  7.70s/it]


Epoch [20/64], Loss: 2.1050, Teacher forcing: 0.40


Epoch 21/64:  36%|███▋      | 50/137 [06:31<11:22,  7.85s/it]

Epoch [21/64], Step [50/137], Loss: 2.3564


Epoch 21/64:  73%|███████▎  | 100/137 [13:15<04:53,  7.93s/it]

Epoch [21/64], Step [100/137], Loss: 2.5915


Epoch 21/64: 100%|██████████| 137/137 [18:12<00:00,  7.97s/it]


Epoch [21/64], Loss: 2.1106, Teacher forcing: 0.40


Epoch 22/64:  36%|███▋      | 50/137 [07:00<12:12,  8.41s/it]

Epoch [22/64], Step [50/137], Loss: 1.9745


Epoch 22/64:  73%|███████▎  | 100/137 [13:56<04:55,  7.99s/it]

Epoch [22/64], Step [100/137], Loss: 1.8214


Epoch 22/64: 100%|██████████| 137/137 [18:51<00:00,  8.26s/it]


Epoch [22/64], Loss: 2.0972, Teacher forcing: 0.40


Epoch 23/64:  36%|███▋      | 50/137 [07:01<12:11,  8.40s/it]

Epoch [23/64], Step [50/137], Loss: 1.6776


Epoch 23/64:  73%|███████▎  | 100/137 [13:56<04:52,  7.91s/it]

Epoch [23/64], Step [100/137], Loss: 2.1219


Epoch 23/64: 100%|██████████| 137/137 [18:49<00:00,  8.24s/it]


Epoch [23/64], Loss: 2.0264, Teacher forcing: 0.40


Epoch 24/64:  36%|███▋      | 50/137 [07:01<12:09,  8.39s/it]

Epoch [24/64], Step [50/137], Loss: 2.0630


Epoch 24/64:  73%|███████▎  | 100/137 [13:53<04:30,  7.30s/it]

Epoch [24/64], Step [100/137], Loss: 2.0920


Epoch 24/64: 100%|██████████| 137/137 [18:26<00:00,  8.07s/it]


Epoch [24/64], Loss: 2.0702, Teacher forcing: 0.40


Epoch 25/64:  36%|███▋      | 50/137 [06:35<12:07,  8.36s/it]

Epoch [25/64], Step [50/137], Loss: 2.2483


Epoch 25/64:  73%|███████▎  | 100/137 [13:39<05:15,  8.53s/it]

Epoch [25/64], Step [100/137], Loss: 1.8366


Epoch 25/64: 100%|██████████| 137/137 [18:25<00:00,  8.07s/it]


Epoch [25/64], Loss: 2.0940, Teacher forcing: 0.40


Epoch 26/64:  36%|███▋      | 50/137 [06:48<11:48,  8.14s/it]

Epoch [26/64], Step [50/137], Loss: 1.9323


Epoch 26/64:  73%|███████▎  | 100/137 [13:45<05:08,  8.34s/it]

Epoch [26/64], Step [100/137], Loss: 2.2272


Epoch 26/64: 100%|██████████| 137/137 [18:35<00:00,  8.14s/it]


Epoch [26/64], Loss: 2.0962, Teacher forcing: 0.40


Epoch 27/64:  36%|███▋      | 50/137 [06:47<11:52,  8.18s/it]

Epoch [27/64], Step [50/137], Loss: 2.0933


Epoch 27/64:  73%|███████▎  | 100/137 [13:44<04:58,  8.07s/it]

Epoch [27/64], Step [100/137], Loss: 1.8536


Epoch 27/64: 100%|██████████| 137/137 [18:40<00:00,  8.18s/it]


Epoch [27/64], Loss: 2.0409, Teacher forcing: 0.40


Epoch 28/64:  36%|███▋      | 50/137 [06:48<12:56,  8.92s/it]

Epoch [28/64], Step [50/137], Loss: 2.2824


Epoch 28/64:  73%|███████▎  | 100/137 [13:50<04:53,  7.94s/it]

Epoch [28/64], Step [100/137], Loss: 2.1093


Epoch 28/64: 100%|██████████| 137/137 [18:33<00:00,  8.13s/it]


Epoch [28/64], Loss: 2.0489, Teacher forcing: 0.40


Epoch 29/64:  36%|███▋      | 50/137 [06:29<12:18,  8.49s/it]

Epoch [29/64], Step [50/137], Loss: 1.9542


Epoch 29/64:  73%|███████▎  | 100/137 [13:09<04:55,  7.99s/it]

Epoch [29/64], Step [100/137], Loss: 2.0111


Epoch 29/64: 100%|██████████| 137/137 [17:56<00:00,  7.86s/it]


Epoch [29/64], Loss: 2.0453, Teacher forcing: 0.40


Epoch 30/64:  36%|███▋      | 50/137 [06:10<11:08,  7.69s/it]

Epoch [30/64], Step [50/137], Loss: 1.6681


Epoch 30/64:  73%|███████▎  | 100/137 [12:44<04:54,  7.96s/it]

Epoch [30/64], Step [100/137], Loss: 2.3520


Epoch 30/64: 100%|██████████| 137/137 [17:30<00:00,  7.67s/it]


Epoch [30/64], Loss: 2.0092, Teacher forcing: 0.40


Epoch 31/64:  36%|███▋      | 50/137 [06:12<10:47,  7.45s/it]

Epoch [31/64], Step [50/137], Loss: 1.8406


Epoch 31/64:  73%|███████▎  | 100/137 [12:42<04:52,  7.91s/it]

Epoch [31/64], Step [100/137], Loss: 1.9048


Epoch 31/64: 100%|██████████| 137/137 [17:29<00:00,  7.66s/it]


Epoch [31/64], Loss: 2.0189, Teacher forcing: 0.40


Epoch 32/64:  36%|███▋      | 50/137 [06:18<10:46,  7.43s/it]

Epoch [32/64], Step [50/137], Loss: 1.9738


Epoch 32/64:  73%|███████▎  | 100/137 [12:45<04:49,  7.81s/it]

Epoch [32/64], Step [100/137], Loss: 1.5485


Epoch 32/64: 100%|██████████| 137/137 [17:32<00:00,  7.68s/it]


Epoch [32/64], Loss: 2.0473, Teacher forcing: 0.40


Epoch 33/64:  36%|███▋      | 50/137 [06:26<10:49,  7.47s/it]

Epoch [33/64], Step [50/137], Loss: 2.2652


Epoch 33/64:  73%|███████▎  | 100/137 [12:49<04:48,  7.80s/it]

Epoch [33/64], Step [100/137], Loss: 2.6378


Epoch 33/64: 100%|██████████| 137/137 [17:38<00:00,  7.72s/it]


Epoch [33/64], Loss: 2.6422, Teacher forcing: 0.30


Epoch 34/64:  36%|███▋      | 50/137 [06:53<11:24,  7.86s/it]

Epoch [34/64], Step [50/137], Loss: 2.2011


Epoch 34/64:  73%|███████▎  | 100/137 [13:43<05:15,  8.53s/it]

Epoch [34/64], Step [100/137], Loss: 2.0883


Epoch 34/64: 100%|██████████| 137/137 [18:54<00:00,  8.28s/it]


Epoch [34/64], Loss: 2.6263, Teacher forcing: 0.30


Epoch 35/64:  36%|███▋      | 50/137 [06:55<11:15,  7.76s/it]

Epoch [35/64], Step [50/137], Loss: 2.8333


Epoch 35/64:  73%|███████▎  | 100/137 [13:41<05:12,  8.44s/it]

Epoch [35/64], Step [100/137], Loss: 1.8503


Epoch 35/64: 100%|██████████| 137/137 [18:56<00:00,  8.30s/it]


Epoch [35/64], Loss: 2.5869, Teacher forcing: 0.30


Epoch 36/64:  36%|███▋      | 50/137 [06:58<11:19,  7.81s/it]

Epoch [36/64], Step [50/137], Loss: 2.4964


Epoch 36/64:  73%|███████▎  | 100/137 [13:39<05:09,  8.37s/it]

Epoch [36/64], Step [100/137], Loss: 2.8879


Epoch 36/64: 100%|██████████| 137/137 [18:46<00:00,  8.22s/it]


Epoch [36/64], Loss: 2.5851, Teacher forcing: 0.30


Epoch 37/64:  36%|███▋      | 50/137 [06:45<11:21,  7.84s/it]

Epoch [37/64], Step [50/137], Loss: 2.7200


Epoch 37/64:  73%|███████▎  | 100/137 [12:57<04:38,  7.54s/it]

Epoch [37/64], Step [100/137], Loss: 2.0836


Epoch 37/64: 100%|██████████| 137/137 [17:44<00:00,  7.77s/it]


Epoch [37/64], Loss: 2.5719, Teacher forcing: 0.30


Epoch 38/64:  36%|███▋      | 50/137 [06:37<12:32,  8.65s/it]

Epoch [38/64], Step [50/137], Loss: 2.2541


Epoch 38/64:  73%|███████▎  | 100/137 [13:05<04:39,  7.54s/it]

Epoch [38/64], Step [100/137], Loss: 2.7497


Epoch 38/64: 100%|██████████| 137/137 [17:50<00:00,  7.82s/it]


Epoch [38/64], Loss: 2.5359, Teacher forcing: 0.30


Epoch 39/64:  36%|███▋      | 50/137 [06:41<12:30,  8.62s/it]

Epoch [39/64], Step [50/137], Loss: 3.3804


Epoch 39/64:  73%|███████▎  | 100/137 [13:10<04:46,  7.74s/it]

Epoch [39/64], Step [100/137], Loss: 2.5471


Epoch 39/64: 100%|██████████| 137/137 [18:02<00:00,  7.90s/it]


Epoch [39/64], Loss: 2.5398, Teacher forcing: 0.30


Epoch 40/64:  36%|███▋      | 50/137 [06:39<11:45,  8.11s/it]

Epoch [40/64], Step [50/137], Loss: 2.4646


Epoch 40/64:  73%|███████▎  | 100/137 [13:08<04:36,  7.48s/it]

Epoch [40/64], Step [100/137], Loss: 2.2485


Epoch 40/64: 100%|██████████| 137/137 [17:47<00:00,  7.79s/it]


Epoch [40/64], Loss: 2.5828, Teacher forcing: 0.30


Epoch 41/64:  36%|███▋      | 50/137 [06:30<11:28,  7.91s/it]

Epoch [41/64], Step [50/137], Loss: 2.9763


Epoch 41/64:  73%|███████▎  | 100/137 [12:52<04:27,  7.23s/it]

Epoch [41/64], Step [100/137], Loss: 3.3077


Epoch 41/64: 100%|██████████| 137/137 [17:19<00:00,  7.58s/it]


Epoch [41/64], Loss: 2.5816, Teacher forcing: 0.30


Epoch 42/64:  36%|███▋      | 50/137 [06:26<11:25,  7.88s/it]

Epoch [42/64], Step [50/137], Loss: 2.5879


Epoch 42/64:  73%|███████▎  | 100/137 [12:51<04:43,  7.67s/it]

Epoch [42/64], Step [100/137], Loss: 2.6664


Epoch 42/64: 100%|██████████| 137/137 [17:20<00:00,  7.60s/it]


Epoch [42/64], Loss: 2.5400, Teacher forcing: 0.30


Epoch 43/64:  36%|███▋      | 50/137 [06:21<11:09,  7.70s/it]

Epoch [43/64], Step [50/137], Loss: 2.3308


Epoch 43/64:  73%|███████▎  | 100/137 [12:56<04:49,  7.83s/it]

Epoch [43/64], Step [100/137], Loss: 2.9702


Epoch 43/64: 100%|██████████| 137/137 [17:36<00:00,  7.71s/it]


Epoch [43/64], Loss: 2.5598, Teacher forcing: 0.30


Epoch 44/64:  36%|███▋      | 50/137 [06:22<11:41,  8.06s/it]

Epoch [44/64], Step [50/137], Loss: 2.2564


Epoch 44/64:  73%|███████▎  | 100/137 [13:15<05:07,  8.31s/it]

Epoch [44/64], Step [100/137], Loss: 2.5993


Epoch 44/64: 100%|██████████| 137/137 [18:13<00:00,  7.98s/it]


Epoch [44/64], Loss: 2.5840, Teacher forcing: 0.30


Epoch 45/64:  36%|███▋      | 50/137 [06:23<11:26,  7.89s/it]

Epoch [45/64], Step [50/137], Loss: 1.5462


Epoch 45/64:  73%|███████▎  | 100/137 [13:14<05:15,  8.53s/it]

Epoch [45/64], Step [100/137], Loss: 2.0833


Epoch 45/64: 100%|██████████| 137/137 [18:14<00:00,  7.99s/it]


Epoch [45/64], Loss: 2.5812, Teacher forcing: 0.30


Epoch 46/64:  36%|███▋      | 50/137 [06:42<11:07,  7.67s/it]

Epoch [46/64], Step [50/137], Loss: 2.1533


Epoch 46/64:  73%|███████▎  | 100/137 [13:25<04:59,  8.08s/it]

Epoch [46/64], Step [100/137], Loss: 2.4689


Epoch 46/64: 100%|██████████| 137/137 [18:22<00:00,  8.05s/it]


Epoch [46/64], Loss: 2.5257, Teacher forcing: 0.30


Epoch 47/64:  36%|███▋      | 50/137 [06:31<11:06,  7.67s/it]

Epoch [47/64], Step [50/137], Loss: 2.4304


Epoch 47/64:  73%|███████▎  | 100/137 [13:11<04:59,  8.09s/it]

Epoch [47/64], Step [100/137], Loss: 2.4261


Epoch 47/64: 100%|██████████| 137/137 [18:07<00:00,  7.94s/it]


Epoch [47/64], Loss: 2.5600, Teacher forcing: 0.30


Epoch 48/64:  36%|███▋      | 50/137 [06:36<11:02,  7.62s/it]

Epoch [48/64], Step [50/137], Loss: 2.8065


Epoch 48/64:  73%|███████▎  | 100/137 [13:19<05:02,  8.17s/it]

Epoch [48/64], Step [100/137], Loss: 2.3207


Epoch 48/64: 100%|██████████| 137/137 [18:17<00:00,  8.01s/it]


Epoch [48/64], Loss: 2.5992, Teacher forcing: 0.30


Epoch 49/64:  36%|███▋      | 50/137 [06:36<11:12,  7.74s/it]

Epoch [49/64], Step [50/137], Loss: 3.6201


Epoch 49/64:  73%|███████▎  | 100/137 [13:05<05:06,  8.30s/it]

Epoch [49/64], Step [100/137], Loss: 3.3677


Epoch 49/64: 100%|██████████| 137/137 [18:03<00:00,  7.91s/it]


Epoch [49/64], Loss: 3.2525, Teacher forcing: 0.20


Epoch 50/64:  36%|███▋      | 50/137 [06:49<12:04,  8.32s/it]

Epoch [50/64], Step [50/137], Loss: 3.0865


Epoch 50/64:  73%|███████▎  | 100/137 [13:15<04:57,  8.03s/it]

Epoch [50/64], Step [100/137], Loss: 3.0071


Epoch 50/64: 100%|██████████| 137/137 [18:18<00:00,  8.02s/it]


Epoch [50/64], Loss: 3.2070, Teacher forcing: 0.20


Epoch 51/64:  36%|███▋      | 50/137 [06:45<11:49,  8.16s/it]

Epoch [51/64], Step [50/137], Loss: 3.2024


Epoch 51/64:  73%|███████▎  | 100/137 [13:11<04:45,  7.72s/it]

Epoch [51/64], Step [100/137], Loss: 3.1094


Epoch 51/64: 100%|██████████| 137/137 [18:13<00:00,  7.98s/it]


Epoch [51/64], Loss: 3.2482, Teacher forcing: 0.20


Epoch 52/64:  36%|███▋      | 50/137 [06:47<11:54,  8.21s/it]

Epoch [52/64], Step [50/137], Loss: 2.6375


Epoch 52/64:  73%|███████▎  | 100/137 [13:19<04:45,  7.71s/it]

Epoch [52/64], Step [100/137], Loss: 3.4411


Epoch 52/64: 100%|██████████| 137/137 [18:19<00:00,  8.03s/it]


Epoch [52/64], Loss: 3.2623, Teacher forcing: 0.20


Epoch 53/64:  36%|███▋      | 50/137 [06:42<11:39,  8.04s/it]

Epoch [53/64], Step [50/137], Loss: 3.4473


Epoch 53/64:  73%|███████▎  | 100/137 [13:28<04:40,  7.58s/it]

Epoch [53/64], Step [100/137], Loss: 3.5385


Epoch 53/64: 100%|██████████| 137/137 [18:20<00:00,  8.03s/it]


Epoch [53/64], Loss: 3.2290, Teacher forcing: 0.20


Epoch 54/64:  36%|███▋      | 50/137 [07:07<11:51,  8.18s/it]

Epoch [54/64], Step [50/137], Loss: 3.2482


Epoch 54/64:  73%|███████▎  | 100/137 [13:49<04:32,  7.37s/it]

Epoch [54/64], Step [100/137], Loss: 2.9895


Epoch 54/64: 100%|██████████| 137/137 [18:23<00:00,  8.05s/it]


Epoch [54/64], Loss: 3.2797, Teacher forcing: 0.20


Epoch 55/64:  36%|███▋      | 50/137 [06:36<11:25,  7.88s/it]

Epoch [55/64], Step [50/137], Loss: 3.8555


Epoch 55/64:  73%|███████▎  | 100/137 [13:09<04:51,  7.88s/it]

Epoch [55/64], Step [100/137], Loss: 3.4029


Epoch 55/64: 100%|██████████| 137/137 [17:40<00:00,  7.74s/it]


Epoch [55/64], Loss: 3.1896, Teacher forcing: 0.20


Epoch 56/64:  36%|███▋      | 50/137 [06:35<11:23,  7.86s/it]

Epoch [56/64], Step [50/137], Loss: 3.2059


Epoch 56/64:  73%|███████▎  | 100/137 [13:08<04:50,  7.84s/it]

Epoch [56/64], Step [100/137], Loss: 3.8957


Epoch 56/64: 100%|██████████| 137/137 [17:43<00:00,  7.76s/it]


Epoch [56/64], Loss: 3.2369, Teacher forcing: 0.20


Epoch 57/64:  36%|███▋      | 50/137 [06:31<11:44,  8.10s/it]

Epoch [57/64], Step [50/137], Loss: 3.4469


Epoch 57/64:  73%|███████▎  | 100/137 [13:03<04:50,  7.85s/it]

Epoch [57/64], Step [100/137], Loss: 3.7335


Epoch 57/64: 100%|██████████| 137/137 [17:40<00:00,  7.74s/it]


Epoch [57/64], Loss: 3.1807, Teacher forcing: 0.20


Epoch 58/64:  36%|███▋      | 50/137 [06:28<13:13,  9.12s/it]

Epoch [58/64], Step [50/137], Loss: 3.1961


Epoch 58/64:  73%|███████▎  | 100/137 [13:03<04:50,  7.85s/it]

Epoch [58/64], Step [100/137], Loss: 3.0853


Epoch 58/64: 100%|██████████| 137/137 [17:51<00:00,  7.82s/it]


Epoch [58/64], Loss: 3.2704, Teacher forcing: 0.20


Epoch 59/64:  36%|███▋      | 50/137 [06:14<11:16,  7.78s/it]

Epoch [59/64], Step [50/137], Loss: 3.4476


Epoch 59/64:  73%|███████▎  | 100/137 [12:51<04:53,  7.94s/it]

Epoch [59/64], Step [100/137], Loss: 3.1472


Epoch 59/64: 100%|██████████| 137/137 [17:40<00:00,  7.74s/it]


Epoch [59/64], Loss: 3.2095, Teacher forcing: 0.20


Epoch 60/64:  36%|███▋      | 50/137 [06:14<10:44,  7.41s/it]

Epoch [60/64], Step [50/137], Loss: 3.0944


Epoch 60/64:  73%|███████▎  | 100/137 [12:43<04:49,  7.82s/it]

Epoch [60/64], Step [100/137], Loss: 3.1422


Epoch 60/64: 100%|██████████| 137/137 [17:31<00:00,  7.68s/it]


Epoch [60/64], Loss: 3.2511, Teacher forcing: 0.20


Epoch 61/64:  36%|███▋      | 50/137 [06:23<10:46,  7.43s/it]

Epoch [61/64], Step [50/137], Loss: 3.3110


Epoch 61/64:  73%|███████▎  | 100/137 [12:49<04:50,  7.85s/it]

Epoch [61/64], Step [100/137], Loss: 3.5716


Epoch 61/64: 100%|██████████| 137/137 [17:36<00:00,  7.71s/it]


Epoch [61/64], Loss: 3.2161, Teacher forcing: 0.20


Epoch 62/64:  36%|███▋      | 50/137 [06:25<11:01,  7.61s/it]

Epoch [62/64], Step [50/137], Loss: 3.3112


Epoch 62/64:  73%|███████▎  | 100/137 [13:01<05:14,  8.49s/it]

Epoch [62/64], Step [100/137], Loss: 3.2080


Epoch 62/64: 100%|██████████| 137/137 [18:09<00:00,  7.95s/it]


Epoch [62/64], Loss: 3.2076, Teacher forcing: 0.20


Epoch 63/64:  36%|███▋      | 50/137 [06:54<11:25,  7.87s/it]

Epoch [63/64], Step [50/137], Loss: 3.9996


Epoch 63/64:  73%|███████▎  | 100/137 [13:30<05:02,  8.18s/it]

Epoch [63/64], Step [100/137], Loss: 3.6753


Epoch 63/64: 100%|██████████| 137/137 [18:42<00:00,  8.19s/it]


Epoch [63/64], Loss: 3.2136, Teacher forcing: 0.20


Epoch 64/64:  36%|███▋      | 50/137 [07:04<12:22,  8.53s/it]

Epoch [64/64], Step [50/137], Loss: 3.2165


Epoch 64/64:  73%|███████▎  | 100/137 [13:42<04:48,  7.81s/it]

Epoch [64/64], Step [100/137], Loss: 2.8529


Epoch 64/64: 100%|██████████| 137/137 [18:33<00:00,  8.13s/it]


Epoch [64/64], Loss: 3.2178, Teacher forcing: 0.20
Creating test dataset...
Generating reports using beam search...


  0%|          | 2/1688 [00:00<03:53,  7.22it/s]

Generated report 1/3
Generated report 2/3


  0%|          | 3/1688 [00:00<04:28,  6.27it/s]


Generated report 3/3

Sample generated reports:

DICOM ID: b8e14e3b-545cd663-a00812c0-9e772d64-b3d40e32
Report: portable portable upright view of the chest. there ekg leads are present. the lungs lungs appear appear clear without without focal or or pneumothorax effusion effusion

DICOM ID: 8c295118-0d590369-21de2213-312374d8-50c8a349
Report: portable portable upright view of the chest. there ekg leads are present. the lungs lungs appear appear clear without without focal or or pneumothorax effusion effusion

DICOM ID: fc75e61c-ee134385-a5d0e01b-695f8125-2ed13ad2
Report: portable portable upright view of the chest. there ekg leads are present. the lungs lungs appear appear clear without without focal or or pneumothorax effusion effusion
Generated 3 reports and saved to /Users/simeon/Documents/DLH/content/mimic-cxr-project/output/cnn_rnn_beam_search.tsv
