In [1]:
import pandas as pd

In [None]:
# Load a single CSV of the newspaper corpus; the path is relative to the
# notebook's working directory.
# NOTE(review): the directory name says 2023 while the file name says 2022 —
# confirm this is the intended file.
raw_data = pd.read_csv('NIKL_NEWSPAPER_2023_CSV/NEWSPAPER_2022_1.csv')

# TODO: option to read all csv files

Selects only the sentence column and makes a new DataFrame df with one column: each row is a Korean sentence.

In [None]:
df = pd.DataFrame(raw_data['sentence'])

Calls generate_labels (from preprocess.py) to:

1) Normalize and remove spaces from each sentence,

2) Produce a binary label sequence marking where spaces originally were,

3) Collect the set of all characters seen (chars),

4) Return the augmented DataFrame (with unspaced and labels columns) and the character set.

In [None]:
import preprocess

df, chars = preprocess.generate_labels(df)

In [None]:
df.head()

Builds two mappings from the character set (chars) collected above:

1) char2idx: maps each character (plus a special <PAD> token) to a unique integer index.

2) idx2char: the inverse lookup (list of characters by index).


In [None]:
char2idx, idx2char = preprocess.generate_mappings(chars)

Converts each row of df into PyTorch tensors:

1) input_tensor: list of 1D LongTensors where each element is the index of a character in the unspaced text.

2) label_tensor: list of 1D LongTensors of the same length, with 1s where a space should follow and 0s otherwise.

In [None]:
input_tensor, label_tensor = preprocess.generate_tensors(df, char2idx)

Set up for training

In [None]:
import glob
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence

from model        import KoreanSpacingSolver
from cnn_model    import KoreanSpacingCNN
from Transformer_model import KoreanSpacingTransformer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)


Train/Val/Test Split

In [None]:
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence

class SpacingDataset(Dataset):
    """Pairs each input sequence with its label sequence, by position.

    Args:
        inputs: sized, indexable collection of input sequences.
        labels: sized, indexable collection of label sequences; must contain
            exactly as many items as ``inputs``.

    Raises:
        ValueError: if ``inputs`` and ``labels`` differ in length. Without
            this check the mismatch would only surface as an IndexError
            part-way through an epoch.
    """

    def __init__(self, inputs, labels):
        if len(inputs) != len(labels):
            raise ValueError(
                f"inputs and labels must have the same length, "
                f"got {len(inputs)} and {len(labels)}"
            )
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        """Number of (input, label) pairs."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the ``(input, label)`` pair stored at position ``idx``."""
        return self.inputs[idx], self.labels[idx]

def collate_fn(batch):
    """Pad a list of ``(input, label)`` pairs into two batch-first tensors.

    Inputs are padded with 0 and labels with -100, then both are moved to
    the notebook-level ``device``.

    Args:
        batch: sequence of ``(input_tensor, label_tensor)`` pairs.

    Returns:
        Tuple ``(padded_inputs, padded_labels)`` on ``device``.
    """
    sequences, targets = zip(*batch)
    padded_inputs = pad_sequence(sequences, batch_first=True, padding_value=0).to(device)
    # -100 is the label padding value; the training code builds its loss with
    # ignore_index=-100 so padded positions do not contribute.
    padded_targets = pad_sequence(targets, batch_first=True, padding_value=-100).to(device)
    return padded_inputs, padded_targets

# build and split
# Wrap the per-sentence input/label tensors in a Dataset so they can be
# split and batched.
full_ds = SpacingDataset(input_tensor, label_tensor)
n = len(full_ds)
# 80% train / 10% validation. int() truncates, so the test split takes the
# remainder and the three sizes always add up to n.
n_train = int(0.8 * n)
n_val   = int(0.1 * n)
n_test  = n - n_train - n_val

# A generator with a fixed seed makes the random split repeatable across runs.
train_ds, val_ds, test_ds = random_split(
    full_ds, [n_train, n_val, n_test],
    generator=torch.Generator().manual_seed(42)
)

# DataLoaders
# Only the training loader shuffles; validation and test keep a fixed order.
# collate_fn pads each batch and moves it to `device`.
batch_size = 256
train_loader = DataLoader(train_ds,  batch_size=batch_size, shuffle=True,  collate_fn=collate_fn)
val_loader   = DataLoader(val_ds,    batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader  = DataLoader(test_ds,   batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

print(f"Train/Val/Test sizes: {len(train_ds)}/{len(val_ds)}/{len(test_ds)}")


In [None]:
from tqdm.notebook import tqdm
import time
import torch.nn as nn
import torch.optim as optim

def train_epoch(model, loader, optimizer, criterion):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Args:
        model: module called as ``model(X)``; its output is unpacked as a
            3-D ``(B, S, C)`` tensor of logits.
        loader: sized iterable yielding ``(X, Y)`` batches.
        optimizer: optimizer whose ``zero_grad()`` / ``step()`` are called
            around each backward pass.
        criterion: loss called with ``(B*S, C)`` logits and ``(B*S,)`` targets.

    Returns:
        Sum of the per-batch losses divided by ``len(loader)``.

    Raises:
        ZeroDivisionError: if ``loader`` contains no batches.
    """
    model.train()
    total = 0
    for X, Y in loader:
        logits = model(X)
        B, S, C = logits.shape
        # reshape rather than view: a model may return non-contiguous logits
        # (e.g. after a permute/transpose), on which view() raises.
        loss = criterion(logits.reshape(-1, C), Y.reshape(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total += loss.item()
    return total / len(loader)

def eval_epoch(model, loader, criterion):
    """Return the mean batch loss of ``model`` over ``loader`` without training.

    The model is switched to eval mode and gradients are disabled.

    Args:
        model: module called as ``model(X)``; its output is unpacked as a
            3-D ``(B, S, C)`` tensor of logits.
        loader: sized iterable yielding ``(X, Y)`` batches.
        criterion: loss called with ``(B*S, C)`` logits and ``(B*S,)`` targets.

    Returns:
        Sum of the per-batch losses divided by ``len(loader)``.

    Raises:
        ZeroDivisionError: if ``loader`` contains no batches.
    """
    model.eval()
    total = 0
    with torch.no_grad():
        for X, Y in loader:
            logits = model(X)
            B, S, C = logits.shape
            # reshape rather than view: a model may return non-contiguous
            # logits (e.g. after a permute/transpose), on which view() raises.
            total += criterion(logits.reshape(-1, C), Y.reshape(-1)).item()
    return total / len(loader)

def fit(model, train_loader, val_loader, epochs=5, lr=1e-3):
    """Train ``model`` with Adam and print train/validation loss per epoch.

    The model is moved to the notebook-level ``device``. The loss is
    cross-entropy with ``ignore_index=-100``, so positions labelled -100
    do not contribute.

    Args:
        model: module returning 3-D ``(B, S, C)`` logits for a batch ``X``.
        train_loader: sized iterable of ``(X, Y)`` training batches.
        val_loader: sized iterable of ``(X, Y)`` validation batches.
        epochs: number of passes over ``train_loader``.
        lr: Adam learning rate.

    Returns:
        None. Progress is reported through a tqdm bar and one printed line
        per epoch.
    """
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss(ignore_index=-100)

    def _batch_loss(inputs, targets):
        # Flatten (B, S, C) logits and (B, S) targets so the loss is taken
        # per character position.
        logits = model(inputs)
        _, _, n_classes = logits.shape
        return criterion(logits.reshape(-1, n_classes), targets.reshape(-1))

    for epoch in range(1, epochs + 1):
        # Training pass: the progress bar shows the latest batch loss.
        model.train()
        train_total = 0.0
        started = time.time()
        batches = tqdm(train_loader, desc=f"Epoch {epoch}/{epochs}", leave=False)
        for inputs, targets in batches:
            optimizer.zero_grad()
            loss = _batch_loss(inputs, targets)
            loss.backward()
            optimizer.step()

            loss_value = loss.item()
            train_total += loss_value
            batches.set_postfix(batch_loss=f"{loss_value:.4f}")

        # Timing covers the training pass only, not validation.
        epoch_time = time.time() - started
        avg_train = train_total / len(train_loader)

        # Validation pass: no gradients, eval mode.
        model.eval()
        val_total = 0.0
        with torch.no_grad():
            for inputs, targets in val_loader:
                val_total += _batch_loss(inputs, targets).item()
        avg_val = val_total / len(val_loader)

        summary = (
            f"Epoch {epoch}/{epochs} — "
            f"train_loss: {avg_train:.4f}  "
            f"val_loss: {avg_val:.4f}  "
            f"time: {epoch_time:.1f}s"
        )
        print(summary)

Hyperparameter Tuning

In [None]:
# Instantiate KoreanSpacingSolver (imported from model) and move it to `device`.
# vocab_size is the number of entries in char2idx.
lstm = KoreanSpacingSolver(
    vocab_size=len(char2idx),
    embedding_dim=256,
    hidden_dim=512
).to(device)

In [None]:
# Instantiate KoreanSpacingCNN (imported from cnn_model) and move it to `device`.
# vocab_size is the number of entries in char2idx.
cnn = KoreanSpacingCNN(
    vocab_size=len(char2idx),
    embedding_dim=200,
    num_filters=100,
    kernel_sizes=[3,5],
    dropout=0.2
).to(device)

In [None]:
# Instantiate KoreanSpacingTransformer (imported from Transformer_model) and
# move it to `device`. vocab_size is the number of entries in char2idx.
transformer = KoreanSpacingTransformer(
    vocab_size=len(char2idx),
    d_model=256,
    nhead=8,
    num_encoder_layers=4,
    dim_feedforward=512,
    dropout=0.1,
    max_len=1000,  # NOTE(review): presumably the longest sequence the model accepts — confirm no sentence exceeds it
    num_labels=2
).to(device)

Training

In [None]:
# LSTM model
fit(lstm, train_loader, val_loader, epochs=5, lr=1e-3)


In [None]:
# CNN model
fit(cnn, train_loader, val_loader, epochs=5, lr=1e-3)


In [None]:
#Transformer model
fit(transformer, train_loader, val_loader, epochs=5, lr=1e-3)

Set up for testing

In [None]:
import torch.nn as nn

def test_model(model, test_loader):
    model.eval()
    criterion = nn.CrossEntropyLoss(ignore_index=-100)
    total_loss = 0.0
    correct, total = 0, 0

    with torch.no_grad():
        for X, Y in test_loader:
            logits = model(X)               
            B, S, C = logits.shape

            # 1a) accumulate loss
            loss = criterion(logits.reshape(-1, C), Y.reshape(-1))
            total_loss += loss.item()

            # 1b) compute spacing‐accuracy
            preds = logits.argmax(-1)      
            mask  = (Y != -100)            # ignore padded positions
            correct += (preds[mask] == Y[mask]).sum().item()
            total   += mask.sum().item()

    avg_loss = total_loss / len(test_loader)
    accuracy = correct / total
    print(f"Test  — loss: {avg_loss:.4f}, accuracy: {accuracy:.4%}")

Testing

In [None]:
print("Evaluating LSTM on test set:")
test_model(lstm, test_loader)

In [None]:
print("Evaluating CNN on test set:")
test_model(cnn, test_loader)

In [None]:
print("Evaluating Transformer on test set:")
test_model(transformer, test_loader)

Save Model + Mapping

In [None]:
# Save the LSTM weights together with both character mappings in one file,
# so the checkpoint carries the vocabulary it was trained with.
torch.save({
    'lstm_state_dict': lstm.state_dict(),
    'char2idx':        char2idx,
    'idx2char':        idx2char
}, 'spacing_models_lstm.pt')
print("Saved LSTM weight and mapping to spacing_models_lstm.pt")

In [None]:
# Save the CNN weights together with both character mappings in one file,
# so the checkpoint carries the vocabulary it was trained with.
torch.save({
    'cnn_state_dict':  cnn.state_dict(),
    'char2idx':        char2idx,
    'idx2char':        idx2char
}, 'spacing_models_cnn.pt')
# The message now names the file actually written (it previously said
# spacing_models.pt).
print("Saved CNN weight and mapping to spacing_models_cnn.pt")

In [None]:
# Save the transformer weights together with both character mappings in one
# file, so the checkpoint carries the vocabulary it was trained with.
torch.save({
    'transformer_state_dict':  transformer.state_dict(),
    'char2idx':        char2idx,
    'idx2char':        idx2char
}, 'spacing_models_transformer.pt')
print("Saved transformer weight and mapping to spacing_models_transformer.pt")