In [1]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np
import os

In [2]:
# Pick the compute device once: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [1]:
# Directory (relative to the working directory) that holds the corpus text
# files; the filtering cell joins it with "train_raw.txt" and "train_fixed.txt".
txt_files_path = "wikitext2"

In [4]:
# Filter the raw training text by <unk> density.
# Each non-blank line of train_raw.txt is copied unchanged to train_fixed.txt
# only when the share of its whitespace-separated tokens equal to '<unk>' is at
# most max_unk_ratio. Blank lines are neither written nor counted.
max_unk_ratio = 0.03

# Running statistics, reported once the pass is complete.
total_lines = kept_lines = dropped_lines = 0
total_unk = removed_unk = 0

train_raw_path = os.path.join(txt_files_path, "train_raw.txt")
train_fixed_path = os.path.join(txt_files_path, "train_fixed.txt")

with open(train_raw_path, 'r', encoding='utf-8') as fin:
    with open(train_fixed_path, 'w', encoding='utf-8') as fout:
        for line in fin:
            # split() with no separator already ignores leading/trailing whitespace.
            toks = line.split()
            if not toks:
                continue

            total_lines += 1
            unk_count = toks.count('<unk>')
            total_unk += unk_count

            # Too many unknown tokens: discard the line and record what was lost.
            if unk_count / len(toks) > max_unk_ratio:
                dropped_lines += 1
                removed_unk += unk_count
                continue

            # Write the original line (including its newline), not the token list.
            fout.write(line)
            kept_lines += 1

print(
    f"Total lines processed: {total_lines}",
    f"Lines kept: {kept_lines}",
    f"Lines dropped: {dropped_lines}",
    f"Total <unk> tokens encountered: {total_unk}",
    f"<unk> tokens removed (in dropped lines): {removed_unk}",
    sep="\n",
)


Total lines processed: 23767
Lines kept: 16712
Lines dropped: 7055
Total <unk> tokens encountered: 54625
<unk> tokens removed (in dropped lines): 36440


In [7]:
def display_first_lines(file_path="wikitext2/train.txt", num_lines=5):
    """
    Print a numbered preview of the beginning of a UTF-8 text file.

    A header naming the file is printed first, followed by up to
    ``num_lines`` lines. Each line is shown with its 1-based position and
    with surrounding whitespace stripped. Problems opening or reading the
    file are reported with a printed message rather than raised.

    Args:
        file_path: Path to the text file
        num_lines: Number of lines to display (default: 5)
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            print(f"First {num_lines} lines of {file_path}:")
            for index, raw_line in enumerate(handle):
                # Stop as soon as the requested number of lines has been shown.
                if index >= num_lines:
                    break
                number = index + 1
                text = raw_line.strip()
                print(f"{number}: {text}")
    except FileNotFoundError:
        # A missing file gets its own, more specific message.
        print(f"Error: File not found at {file_path}")
    except Exception as err:
        # Any other failure (permissions, decoding, ...) is reported, not raised.
        print(f"Error reading file: {err}")

# Example usage: called with no arguments, so the function's defaults apply
# (file_path="wikitext2/train.txt", num_lines=5).
display_first_lines()

First 5 lines of wikitext2/train.txt:
1: = Valkyria Chronicles III =
2: Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . <unk> the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " .
3: The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game 