In [1]:
import pandas as pd

In [2]:
# Load one newspaper-corpus CSV; the cells shown below only use its 'sentence' column.
raw_data = pd.read_csv('NIKL_NEWSPAPER_2023_CSV/NEWSPAPER_2022_1.csv')

Selects only the sentence column and makes a new DataFrame df with one column: each row is a Korean sentence.

In [3]:
# Keep just the text column, promoted from a Series to a one-column DataFrame.
df = raw_data['sentence'].to_frame()

Calls generate_labels (from preprocess.py) to:

1) Normalize and remove spaces from each sentence,

2) Produce a binary label sequence marking where spaces originally were,

3) Collect the set of all characters seen (chars),

4) Return the augmented DataFrame (with unspaced and labels columns) and the character set.

In [4]:
import preprocess

# NOTE(review): `df` is rebound to the frame returned by generate_labels, so re-running
# this cell on its own passes the already-processed frame back in. Re-run the cell that
# builds `df` from `raw_data` first if this one needs to be executed again.
df, chars = preprocess.generate_labels(df)

In [5]:
df.head()

Unnamed: 0,sentence,unspaced,labels
0,[영상]“위기를 이겨내고 일상으로” 부산 기관장 신년사,"[영상]""위기를이겨내고일상으로""부산기관장신년사","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, ..."
1,취임 2년 차를 맞는 박형준 부산시장은 코로나19 위기를 이겨내고 있는 부산시민들에...,취임2년차를맞는박형준부산시장은코로나19위기를이겨내고있는부산시민들에게감사의인사를먼저전했다.,"[0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, ..."
2,"박 시장은 “코로나19는 번번이 우리의 희망의 길목을 막아섰고, 지금도 여전히 민생...","박시장은""코로나19는번번이우리의희망의길목을막아섰고,지금도여전히민생을위협하고있다""며""...","[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, ..."
3,그러면서 “숱한 어려움을 헤쳐온 위대한 시민의 힘으로 2022년을 코로나를 극복하고...,"그러면서""숱한어려움을헤쳐온위대한시민의힘으로2022년을코로나를극복하고일상을회복하는해,...","[0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, ..."
4,박 시장은 “치밀한 전략으로 2030세계박람회 유치를 위한 국제박람회기구 현지 실사...,"박시장은""치밀한전략으로2030세계박람회유치를위한국제박람회기구현지실사를성공적으로이끌고...","[1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, ..."


Builds two mappings from the character set (`chars`) collected above:

1) char2idx: maps each character (plus a special <PAD> token) to a unique integer index.

2) idx2char: the inverse lookup (list of characters by index).


In [6]:
char2idx, idx2char = preprocess.generate_mappings(chars)

Converts each row of df into PyTorch tensors:

1) input_tensor: list of 1D LongTensors where each element is the index of a character in the unspaced text.

2) label_tensor: list of 1D LongTensors of the same length, with 1s where a space should follow and 0s otherwise.

In [7]:
input_tensor, label_tensor = preprocess.generate_tensors(df, char2idx)

Set Up for Training

In [8]:
import torch

from lstm_model import KoreanSpacingLSTM
from cnn_model import KoreanSpacingCNN
from transformer_model import KoreanSpacingTransformer

# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print("Using device:", device)


Using device: cuda


Train/Val/Test Split

In [9]:
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence

class SpacingDataset(Dataset):
    """Index-aligned pairing of encoded sentences with their spacing labels.

    Item ``i`` is the tuple ``(inputs[i], labels[i])``.
    """

    def __init__(self, inputs, labels):
        # Both sequences are stored exactly as given; no copying or validation.
        self.inputs, self.labels = inputs, labels

    def __len__(self):
        # The dataset size is taken from the inputs sequence alone.
        return len(self.inputs)

    def __getitem__(self, idx):
        sample = self.inputs[idx]
        target = self.labels[idx]
        return sample, target

def collate_fn(batch):
    """Pad a batch of (input, label) tensor pairs and move both to `device`.

    Inputs are right-padded with 0 and labels with -100, the same value that is
    passed as ``ignore_index`` to the loss elsewhere in this notebook, so padded
    positions do not contribute to the loss.
    NOTE(review): 0 is presumably the <PAD> index from generate_mappings — confirm.

    Returns:
        A ``(padded_inputs, padded_labels)`` tuple of batch-first tensors.
    """
    sequences, targets = zip(*batch)
    padded_inputs = pad_sequence(sequences, batch_first=True, padding_value=0)
    padded_labels = pad_sequence(targets, batch_first=True, padding_value=-100)
    return padded_inputs.to(device), padded_labels.to(device)

# Wrap the tensors in a dataset, then carve out 80% train / 10% validation and
# give whatever remains to the test split.
full_ds = SpacingDataset(input_tensor, label_tensor)
n = len(full_ds)
n_train, n_val = int(0.8 * n), int(0.1 * n)
n_test = n - n_train - n_val

# A fixed seed keeps the split identical across runs.
split_generator = torch.Generator().manual_seed(42)
train_ds, val_ds, test_ds = random_split(
    full_ds,
    [n_train, n_val, n_test],
    generator=split_generator,
)

# Only the training loader shuffles; validation and test keep a fixed order.
batch_size = 256
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader, test_loader = (
    DataLoader(split, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    for split in (val_ds, test_ds)
)

print(f"Train/Val/Test sizes: {len(train_ds)}/{len(val_ds)}/{len(test_ds)}")


Train/Val/Test sizes: 800000/100000/100000


In [10]:
from tqdm.notebook import tqdm
import time
import torch.nn as nn
import torch.optim as optim

def train_epoch(model, loader, optimizer, criterion):
    """Run one optimisation pass over `loader` and return the mean per-batch loss.

    Args:
        model: Callable module mapping an input batch to 3-D logits; the last
            dimension is treated as the class dimension.
        loader: Sized iterable yielding ``(X, Y)`` batches.
        optimizer: Optimizer that updates the model's parameters.
        criterion: Loss taking ``(N, C)`` logits and ``(N,)`` targets.

    Returns:
        Sum of the batch losses divided by ``len(loader)``.

    Raises:
        ZeroDivisionError: If `loader` is empty.
    """
    model.train()
    total = 0
    for X, Y in loader:
        optimizer.zero_grad()
        logits = model(X)
        _, _, num_classes = logits.shape
        # reshape (not view): logits may be non-contiguous, e.g. when a model
        # transposes a channel-first output, and view() raises on such tensors.
        loss = criterion(logits.reshape(-1, num_classes), Y.reshape(-1))
        loss.backward()
        optimizer.step()
        total += loss.item()
    return total / len(loader)

def eval_epoch(model, loader, criterion):
    """Compute the mean per-batch loss over `loader` without updating the model.

    The model is switched to eval mode and gradients are disabled.

    Args:
        model: Callable module mapping an input batch to 3-D logits; the last
            dimension is treated as the class dimension.
        loader: Sized iterable yielding ``(X, Y)`` batches.
        criterion: Loss taking ``(N, C)`` logits and ``(N,)`` targets.

    Returns:
        Sum of the batch losses divided by ``len(loader)``.

    Raises:
        ZeroDivisionError: If `loader` is empty.
    """
    model.eval()
    total = 0
    with torch.no_grad():
        for X, Y in loader:
            logits = model(X)
            _, _, num_classes = logits.shape
            # reshape (not view): view() raises on non-contiguous logits such
            # as a transposed channel-first output.
            total += criterion(logits.reshape(-1, num_classes), Y.reshape(-1)).item()
    return total / len(loader)

def fit(model, train_loader, val_loader, epochs=5, lr=1e-3):
    """Train `model` with Adam and print train/validation loss after each epoch.

    The model is moved to the module-level `device`. The loss is cross-entropy
    that ignores positions labelled -100. Both reported losses are the sum of
    per-batch losses divided by the number of batches; the reported time covers
    the training pass only.

    Args:
        model: Module returning 3-D logits, (B, S, C), for an input batch.
        train_loader: Sized iterable of ``(X, Y)`` training batches.
        val_loader: Sized iterable of ``(X, Y)`` validation batches.
        epochs: Number of passes over `train_loader`.
        lr: Adam learning rate.

    Returns:
        None. Progress is reported through a tqdm bar and one printed line per epoch.
    """
    model.to(device)
    criterion = nn.CrossEntropyLoss(ignore_index=-100)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    def flat_loss(inputs, targets):
        # Forward pass, then flatten logits and labels so the criterion sees
        # one row per sequence position.
        scores = model(inputs)
        _, _, num_classes = scores.shape
        return criterion(scores.reshape(-1, num_classes), targets.reshape(-1))

    for epoch in range(1, epochs + 1):
        # Training pass; the bar's postfix shows the most recent batch loss.
        model.train()
        train_total = 0.0
        started_at = time.time()
        bar = tqdm(train_loader, desc=f"Epoch {epoch}/{epochs}", leave=False)
        for inputs, targets in bar:
            optimizer.zero_grad()
            loss = flat_loss(inputs, targets)
            loss.backward()
            optimizer.step()

            loss_value = loss.item()
            train_total += loss_value
            bar.set_postfix(batch_loss=f"{loss_value:.4f}")

        epoch_time = time.time() - started_at
        avg_train = train_total / len(train_loader)

        # Validation pass: eval mode, no gradient tracking.
        model.eval()
        val_total = 0.0
        with torch.no_grad():
            for inputs, targets in val_loader:
                val_total += flat_loss(inputs, targets).item()
        avg_val = val_total / len(val_loader)

        print(f"Epoch {epoch}/{epochs} — "
              f"train_loss: {avg_train:.4f}  "
              f"val_loss: {avg_val:.4f}  "
              f"time: {epoch_time:.1f}s")

Hyperparameter Tuning and Training

In [17]:
# LSTM run: hyperparameters gathered in one place, then 10 epochs at lr=1e-3.
lstm_kwargs = dict(
    vocab_size=len(char2idx),
    embedding_dim=128,
    hidden_dim=128,
)
lstm = KoreanSpacingLSTM(**lstm_kwargs).to(device)
fit(lstm, train_loader, val_loader, epochs=10, lr=1e-3)

Epoch 1/10:   0%|          | 0/3125 [00:00<?, ?it/s]

Epoch 1/10 — train_loss: 0.1101  val_loss: 0.0756  time: 48.1s


Epoch 2/10:   0%|          | 0/3125 [00:00<?, ?it/s]

Epoch 2/10 — train_loss: 0.0757  val_loss: 0.0670  time: 50.4s


Epoch 3/10:   0%|          | 0/3125 [00:00<?, ?it/s]

Epoch 3/10 — train_loss: 0.0690  val_loss: 0.0632  time: 49.9s


Epoch 4/10:   0%|          | 0/3125 [00:00<?, ?it/s]

Epoch 4/10 — train_loss: 0.0659  val_loss: 0.0618  time: 47.6s


Epoch 5/10:   0%|          | 0/3125 [00:00<?, ?it/s]

Epoch 5/10 — train_loss: 0.0637  val_loss: 0.0598  time: 49.0s


Epoch 6/10:   0%|          | 0/3125 [00:00<?, ?it/s]

Epoch 6/10 — train_loss: 0.0618  val_loss: 0.0588  time: 51.0s


Epoch 7/10:   0%|          | 0/3125 [00:00<?, ?it/s]

Epoch 7/10 — train_loss: 0.0605  val_loss: 0.0578  time: 50.7s


Epoch 8/10:   0%|          | 0/3125 [00:00<?, ?it/s]

Epoch 8/10 — train_loss: 0.0594  val_loss: 0.0572  time: 50.9s


Epoch 9/10:   0%|          | 0/3125 [00:00<?, ?it/s]

Epoch 9/10 — train_loss: 0.0585  val_loss: 0.0567  time: 50.8s


Epoch 10/10:   0%|          | 0/3125 [00:00<?, ?it/s]

Epoch 10/10 — train_loss: 0.0578  val_loss: 0.0561  time: 50.7s


In [23]:
# CNN run: hyperparameters gathered in one place, then 10 epochs at lr=3e-3.
cnn_kwargs = dict(
    vocab_size=len(char2idx),
    embedding_dim=128,
    num_filters=128,
    kernel_sizes=[3, 5],
    dropout=0.1,
)
cnn = KoreanSpacingCNN(**cnn_kwargs).to(device)
fit(cnn, train_loader, val_loader, epochs=10, lr=3e-3)

Epoch 1/10:   0%|          | 0/3125 [00:00<?, ?it/s]

Epoch 1/10 — train_loss: 0.1077  val_loss: 0.0826  time: 26.8s


Epoch 2/10:   0%|          | 0/3125 [00:00<?, ?it/s]

Epoch 2/10 — train_loss: 0.0831  val_loss: 0.0760  time: 26.8s


Epoch 3/10:   0%|          | 0/3125 [00:00<?, ?it/s]

Epoch 3/10 — train_loss: 0.0780  val_loss: 0.0729  time: 27.0s


Epoch 4/10:   0%|          | 0/3125 [00:00<?, ?it/s]

Epoch 4/10 — train_loss: 0.0753  val_loss: 0.0709  time: 26.7s


Epoch 5/10:   0%|          | 0/3125 [00:00<?, ?it/s]

Epoch 5/10 — train_loss: 0.0735  val_loss: 0.0696  time: 27.1s


Epoch 6/10:   0%|          | 0/3125 [00:00<?, ?it/s]

Epoch 6/10 — train_loss: 0.0722  val_loss: 0.0688  time: 26.7s


Epoch 7/10:   0%|          | 0/3125 [00:00<?, ?it/s]

Epoch 7/10 — train_loss: 0.0712  val_loss: 0.0683  time: 26.9s


Epoch 8/10:   0%|          | 0/3125 [00:00<?, ?it/s]

Epoch 8/10 — train_loss: 0.0705  val_loss: 0.0678  time: 26.6s


Epoch 9/10:   0%|          | 0/3125 [00:00<?, ?it/s]

Epoch 9/10 — train_loss: 0.0699  val_loss: 0.0671  time: 26.6s


Epoch 10/10:   0%|          | 0/3125 [00:00<?, ?it/s]

Epoch 10/10 — train_loss: 0.0693  val_loss: 0.0669  time: 27.7s


In [None]:
# Transformer run: hyperparameters gathered in one place, then 3 epochs at lr=1e-3.
transformer_kwargs = dict(
    vocab_size=len(char2idx),
    d_model=256,
    nhead=8,
    num_encoder_layers=4,
    dim_feedforward=512,
    dropout=0.1,
    max_len=1000,
    num_labels=2,
)
transformer = KoreanSpacingTransformer(**transformer_kwargs).to(device)
fit(transformer, train_loader, val_loader, epochs=3, lr=1e-3)

Set up for testing

In [12]:
import torch.nn as nn
from sklearn.metrics import precision_score, recall_score, f1_score

def test_model(model, test_loader):
    """Evaluate `model` on `test_loader` and print loss, accuracy, precision, recall and F1.

    Positions whose label is -100 (padding) are excluded from the loss and from
    every metric. Precision, recall and F1 are computed by scikit-learn with its
    default settings over all remaining positions.

    Args:
        model: Module returning 3-D logits, (B, S, C), for an input batch.
        test_loader: Sized iterable of ``(X, Y)`` batches.

    Returns:
        None. The metrics are printed.

    Raises:
        ZeroDivisionError: If `test_loader` is empty or contains no unpadded position.
    """
    model.eval()
    criterion = nn.CrossEntropyLoss(ignore_index=-100)
    total_loss = 0.0
    correct, total = 0, 0
    # One tensor per batch, concatenated once at the end. Extending a Python
    # list element-by-element would create one numpy scalar object for every
    # character in the test set.
    pred_chunks = []
    label_chunks = []

    with torch.no_grad():
        for X, Y in test_loader:
            logits = model(X)
            _, _, num_classes = logits.shape

            # Accumulate the batch loss.
            loss = criterion(logits.reshape(-1, num_classes), Y.reshape(-1))
            total_loss += loss.item()

            # Accuracy over real (unpadded) positions only.
            preds = logits.argmax(-1)
            mask = (Y != -100)
            kept_preds = preds[mask]
            kept_labels = Y[mask]
            correct += (kept_preds == kept_labels).sum().item()
            total += mask.sum().item()

            pred_chunks.append(kept_preds.cpu())
            label_chunks.append(kept_labels.cpu())

    avg_loss = total_loss / len(test_loader)
    accuracy = correct / total
    all_preds = torch.cat(pred_chunks).numpy()
    all_labels = torch.cat(label_chunks).numpy()
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    print(f"Test  — loss: {avg_loss:.4f}, accuracy: {accuracy:.4%}")
    print(f"precision: {precision:.4f}, recall: {recall:.4f}, "
          f"F1 score: {f1:.4f}")

Testing

In [18]:
print("Evaluating LSTM on test set:")
test_model(lstm, test_loader)

Evaluating LSTM on test set:
Test  — loss: 0.0567, accuracy: 97.7645%
precision: 0.9618, recall: 0.9563, F1 score: 0.9590


In [24]:
print("Evaluating CNN on test set:")
test_model(cnn, test_loader)

Evaluating CNN on test set:
Test  — loss: 0.0672, accuracy: 97.3414%
precision: 0.9535, recall: 0.9491, F1 score: 0.9513


In [39]:
print("Evaluating Transformer on test set:")
test_model(transformer, test_loader)

Evaluating Transformer on test set:
Test  — loss: 0.3102, accuracy: 86.5673%
precision: 0.8177, recall: 0.6549, F1 score: 0.7273


Save Model + Mapping

In [20]:
# Bundle the LSTM weights with both vocabulary mappings in a single checkpoint file.
lstm_bundle = {
    'lstm_state_dict': lstm.state_dict(),
    'char2idx': char2idx,
    'idx2char': idx2char,
}
torch.save(lstm_bundle, 'spacing_model_lstm.pt')
print("Saved LSTM weight and mapping to spacing_model_lstm.pt")

Saved LSTM weight and mapping to spacing_model_lstm.pt


In [25]:
# Bundle the CNN weights with both vocabulary mappings in a single checkpoint file.
cnn_bundle = {
    'cnn_state_dict': cnn.state_dict(),
    'char2idx': char2idx,
    'idx2char': idx2char,
}
torch.save(cnn_bundle, 'spacing_model_cnn.pt')
print("Saved CNN weight and mapping to spacing_model_cnn.pt")

Saved CNN weight and mapping to spacing_model_cnn.pt


In [40]:
# Bundle the Transformer weights with both vocabulary mappings in a single checkpoint file.
transformer_bundle = {
    'transformer_state_dict': transformer.state_dict(),
    'char2idx': char2idx,
    'idx2char': idx2char,
}
torch.save(transformer_bundle, 'spacing_model_transformer.pt')
print("Saved transformer weight and mapping to spacing_model_transformer.pt")


Saved transformer weight and mapping to spacing_model_transformer.pt


Load Saved Models and Make Predictions on Text

In [None]:
import torch
from lstm_model import KoreanSpacingLSTM
from cnn_model import KoreanSpacingCNN
from transformer_model import KoreanSpacingTransformer

# Define the device here as well, so this section can run on a fresh kernel
# without first executing the training cells.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# map_location remaps the saved tensors onto the current device, so checkpoints
# written on a GPU machine can still be loaded on a CPU-only one.
# The constructor arguments below must match the ones used when each model was trained.
lstm_checkpoint = torch.load('spacing_model_lstm.pt', map_location=device)
lstm = KoreanSpacingLSTM(
    vocab_size=len(lstm_checkpoint['char2idx']),
    embedding_dim=128,
    hidden_dim=128
).to(device)
lstm.load_state_dict(lstm_checkpoint['lstm_state_dict'])
lstm.eval()

cnn_checkpoint = torch.load('spacing_model_cnn.pt', map_location=device)
cnn = KoreanSpacingCNN(
    vocab_size=len(cnn_checkpoint['char2idx']),
    embedding_dim=128,
    num_filters=128,
    kernel_sizes=[3,5],
    dropout=0.1
).to(device)
cnn.load_state_dict(cnn_checkpoint['cnn_state_dict'])
cnn.eval()

transformer_checkpoint = torch.load('spacing_model_transformer.pt', map_location=device)
transformer = KoreanSpacingTransformer(
    vocab_size=len(transformer_checkpoint['char2idx']),
    d_model=256,
    nhead=8,
    num_encoder_layers=4,
    dim_feedforward=512,
    dropout=0.1,
    max_len=1000,
    num_labels=2
).to(device)
transformer.load_state_dict(transformer_checkpoint['transformer_state_dict'])
transformer.eval()

# The character mappings are read from the LSTM checkpoint; each checkpoint stores
# its own copy under the same keys.
char2idx = lstm_checkpoint['char2idx']
idx2char = lstm_checkpoint['idx2char']

print("Loaded all models.")

In [57]:
def predict(model, text: str) -> str:
    """Insert spaces into `text` according to `model`'s per-character predictions.

    Each character is mapped to its index through the module-level `char2idx`
    (characters missing from the mapping fall back to index 0). A space is
    appended after every character whose predicted label is 1, including the
    last one.

    Args:
        model: Module returning logits of shape (1, len(text), C) for a
            (1, len(text)) index tensor.
        text: Text to re-space.

    Returns:
        `text` with predicted spaces inserted. Empty input is returned
        unchanged without calling the model.
    """
    if not text:
        # Nothing to tag; this also avoids feeding a zero-length sequence to the model.
        return text

    indices = [char2idx.get(character, 0) for character in text]
    input_ = torch.tensor(indices, dtype=torch.long).unsqueeze(0).to(device)

    # Run inference in eval mode so layers such as dropout are inactive, then
    # restore whatever mode the caller had the model in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(input_)
            predictions = logits.argmax(dim=-1)[0].cpu().numpy()
    finally:
        model.train(was_training)

    # Collect the pieces and join once rather than growing a string in a loop.
    pieces = []
    for i, character in enumerate(text):
        pieces.append(character)
        if predictions[i] == 1:
            pieces.append(' ')
    return ''.join(pieces)

In [58]:
text = '이문장은띄어쓰기가없는문장입니다.'

# Run the same unspaced sentence through each model in turn.
lstm_prediction, cnn_prediction, transformer_prediction = (
    predict(net, text) for net in (lstm, cnn, transformer)
)

print('Korean text without spaces:', text)
for caption, spaced in (
    ('LSTM model prediction:', lstm_prediction),
    ('CNN model prediction:', cnn_prediction),
    ('Transformer model prediction:', transformer_prediction),
):
    print(caption, spaced)

Korean text without spaces: 이문장은띄어쓰기가없는문장입니다.
LSTM model prediction: 이 문장은 띄어 쓰기가 없는 문장입니다.
CNN model prediction: 이 문장은 띄어쓰기가 없는 문장입니다.
Transformer model prediction: 이 문장은 띄어쓰기가 없는 문장입니다. 
