# Imports

In [25]:
!pip install --upgrade --force-reinstall nltk

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Using cached click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting joblib (from nltk)
  Using cached joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting regex>=2021.8.3 (from nltk)
  Using cached regex-2024.11.6-cp312-cp312-win_amd64.whl.metadata (41 kB)
Collecting tqdm (from nltk)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting colorama (from click->nltk)
  Using cached colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Using cached regex-2024.11.6-cp312-cp312-win_amd64.whl (273 kB)
Using cached click-8.2.1-py3-none-any.whl (102 kB)
Using cached joblib-1.5.1-py3-none-any.whl (307 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Using cached colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: regex, joblib, colorama, tqdm, click, nltk
  Attempting uninstall: regex
  

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spyder 5.5.1 requires keyring>=17.0.0, but you have keyring 8.7 which is incompatible.


In [26]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\k\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [27]:
import os
import re
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
import torch
from torch.nn.utils.rnn import pad_sequence

#

# Text processing

In [100]:
def file_to_sentence_list(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()


    sentences = [sentence.strip() for sentence in re.split(
        r'(?<=[.!?])\s+', text) if sentence.strip()]

    return sentences

file_path = '../data/raw/data.txt'
text_data = file_to_sentence_list(file_path)

In [101]:
len(text_data)

6216

In [None]:
import re
from collections import defaultdict, Counter

def preprocess_text(text):
    """Normalize and prepare Darija text for BPE"""
    # Remove Arabic diacritics
    text = re.sub(r'[ًٌٍَُِّْ]', '', text)
    # Normalize alef variants
    text = re.sub(r'[إأآا]', 'ا', text)
    # Convert ta-marbuta to ha
    text = re.sub(r'ة', 'ه', text)
    
    # Normalize Romanized variations
    replacements = {
        r'[éèêë]': 'e',
        r'[àâä]': 'a',
        r'[îï]': 'i',
        r'[ôö]': 'o',
        r'[ûüù]': 'u',
        r'ch': 'sh',
        r'9': 'q',
        r'5': 'kh',
    }
    for pattern, repl in replacements.items():
        text = re.sub(pattern, repl, text)
    
    return text.lower()

def get_vocab(text):
    """Extract word frequencies with word boundary info for BPE"""
    words = re.findall(r'\w+', text)
    words_with_space = ['_' + word for word in words]
    return Counter(words_with_space)

def get_stats(vocab):
    """Count pairs of adjacent symbols in vocab"""
    pairs = defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols)-1):
            pairs[(symbols[i], symbols[i+1])] += freq
    return pairs

def merge_vocab(pair, vocab):
    """Apply a merge operation to the vocabulary"""
    v_out = {}
    bigram = re.escape(' '.join(pair))
    pattern = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    
    for word in vocab:
        # Merge the best pair in the word
        w_out = pattern.sub(''.join(pair), word)
        v_out[w_out] = vocab[word]
    return v_out

def train_bpe(text, num_merges=20, min_freq=100):
    """Train BPE on Darija text"""
    text = preprocess_text(text)
    vocab = get_vocab(text)
    # Initialize vocab keys as sequences of chars separated by spaces
    vocab = {' '.join(word): freq for word, freq in vocab.items()}
    
    merges = []
    for i in range(num_merges):
        pairs = get_stats(vocab)
        if not pairs:
            print("No more pairs to merge.")
            break
        best_pair = max(pairs, key=pairs.get)
        if pairs[best_pair] < min_freq:
            print(f"Stopping early at merge {i+1} due to low frequency: {pairs[best_pair]} < {min_freq}")
            break
        merges.append(best_pair)
        vocab = merge_vocab(best_pair, vocab)
    return merges

def apply_bpe(text, merges):
    """Segment text using learned BPE rules"""
    text = preprocess_text(text)
    words = re.findall(r'\w+', text)
    words = ['_' + word for word in words]  # Add space before each word
    merges_set = set(merges)
    segmented_words = []
    
    for word in words:
        word_chars = list(word)
        i = 0
        while i < len(word_chars) - 1:
            pair = (word_chars[i], word_chars[i+1])
            if pair in merges_set:
                # Merge pair tokens
                word_chars[i] = word_chars[i] + word_chars[i+1]
                del word_chars[i+1]
                # After merge, check again from current position
            else:
                i += 1
        segmented_words.extend(word_chars)
    return segmented_words


text = '\n'.join(text_data)
merges = train_bpe(text , num_merges=2000, min_freq=1)

print("Learned merges:")
for i, (a, b) in enumerate(merges):
    print(f"{i+1}. {a} + {b} -> {a+b}")

segmented = apply_bpe(text, merges)

print("\nSegmented words (first 10):")
for word_tokens in segmented[:10]:
    print(word_tokens)


Learned merges:
1. s + h -> sh
2. _ + l -> _l
3. _ + m -> _m
4. h + a -> ha
5. _ + k -> _k
6. _ + t -> _t
7. a + n -> an
8. o + u -> ou
9. _ + b -> _b
10. _ + n -> _n
11. _ + d -> _d
12. l + a -> la
13. _ + 3 -> _3
14. r + a -> ra
15. _m + a -> _ma
16. y + a -> ya
17. w + a -> wa
18. ha + d -> had
19. _ + f -> _f
20. l + i -> li
21. _ + g -> _g
22. _l + i -> _li
23. e + r -> er
24. _ + s -> _s
25. t + i -> ti
26. _ + sh -> _sh
27. h + i -> hi
28. n + a -> na
29. _ + wa -> _wa
30. t + a -> ta
31. m + a -> ma
32. d + a -> da
33. e + k -> ek
34. _ + had -> _had
35. 3 + a -> 3a
36. _k + a -> _ka
37. a + l -> al
38. d + i -> di
39. e + n -> en
40. s + s -> ss
41. sh + i -> shi
42. n + i -> ni
43. r + i -> ri
44. e + l -> el
45. k + h -> kh
46. _ + a -> _a
47. _ + ra -> _ra
48. b + a -> ba
49. q + a -> qa
50. _k + an -> _kan
51. e + d -> ed
52. _d + i -> _di
53. an + a -> ana
54. _d + a -> _da
55. had + i -> hadi
56. _3 + la -> _3la
57. g + hi -> ghi
58. _ + j -> _j
59. _ + w -> _w
60. a + f

In [115]:
merges[:10]

[('s', 'h'),
 ('_', 'l'),
 ('_', 'm'),
 ('h', 'a'),
 ('_', 'k'),
 ('_', 't'),
 ('a', 'n'),
 ('o', 'u'),
 ('_', 'b'),
 ('_', 'n')]

In [116]:
def generate_ngrams(tokenized_sentences, max_len=5):
    sequences = []
    for sentence in tokenized_sentences:
        for i in range(1,len(sentence)):
            context = sentence[max(0,i-max_len):i]
            target = sentence[i]
            sequences.append((context, target))
        
    return sequences

In [121]:
# First, preprocess and tokenize sentences into words, then apply BPE on each word:
bpe_tokenized_sentences = []
for sentence in text_data:
    bpe_tokens_in_sentence = apply_bpe(sentence, merges)
    
    bpe_tokenized_sentences.append(bpe_tokens_in_sentence)


In [122]:
bpe_tokenized_sentences

[['_ho',
  'ma',
  '_mk',
  'h',
  'b',
  'b',
  'yi',
  'n',
  '_s',
  'hi',
  '_h',
  'a',
  'ja',
  '_a',
  'na',
  '_mt',
  'i',
  'q',
  'q',
  'en'],
 ['_ba',
  'yn',
  'a',
  '_ho',
  'ma',
  '_tay',
  '7a',
  'wl',
  'o',
  '_i',
  'b',
  'qa',
  'w',
  '_mb',
  'r',
  'rd',
  'in'],
 ['_lo',
  'ti',
  'lat',
  '_mab',
  'a',
  'yn',
  'a',
  'sh',
  '_fih',
  'om',
  '_mo',
  'ri',
  '7i',
  'n',
  '_bz',
  'za',
  'f'],
 ['_gh',
  'al',
  'i',
  'ba',
  'n',
  '_gh',
  'a',
  'y',
  'j',
  'r',
  'ri',
  'w',
  '_3l',
  'i',
  'h',
  '_mn',
  '_lk',
  'hd',
  'ma'],
 ['_tab', '3a', 'n', '_r', 'a', 'h', '_mk', 'ta', '2', 'eb'],
 ['_tay', 'ba', 'll', 'i', 'a', '_gh', 'an', 'm', 'shi'],
 ['_a', 'ra', '_lia', '_dak', '_sa', 'c'],
 ['_gh', 'an', 'mr', 'ed'],
 ['_knt',
  '_di',
  'ma',
  '_3a',
  'rf',
  '_a',
  'n',
  'na',
  'ha',
  '_b',
  'gh',
  'at',
  'na',
  '_nm',
  'out',
  'ou',
  '_b',
  'gh',
  'it',
  '_n3',
  'ref',
  '_s',
  'h',
  '7a',
  'l',
  '_b',
  'qa',
  '_l

In [123]:
from collections import Counter

# Flatten list of tokens from all sentences
all_bpe_tokens = [token for sent in bpe_tokenized_sentences for token in sent]

bpe_token_counts = Counter(all_bpe_tokens)

# Optional threshold to filter rare tokens
threshold = 5
vocab = [token for token, count in bpe_token_counts.items() if count > threshold]

# Build vocab to index dictionary
word2idx = {'<PAD>': 0, '<UNK>': 1}
for idx, token in enumerate(sorted(vocab), start=2):
    word2idx[token] = idx
    
idx2word = {idx: token for token, idx in word2idx.items()}
def replace_rare_words(sentence):
    return [word if word in word2idx else '<UNK>' for word in sentence]

processed_sentences = [replace_rare_words(sentence) for sentence in bpe_tokenized_sentences]

# Invert the dictionary
idx2word = {idx: word for word, idx in word2idx.items()}


In [124]:
all_words_in_processed_sentences = [word for sentence in processed_sentences for word in sentence]
word_counts_in_processed_sentences = Counter(all_words_in_processed_sentences)

In [125]:
len(word_counts_in_processed_sentences)

763

In [126]:
word_counts_in_processed_sentences['<UNK>']

26

In [127]:
ngrams = generate_ngrams(processed_sentences, max_len=5)

# Sample output:
for i in range(5):
    print(ngrams[i])

(['_ho'], 'ma')
(['_ho', 'ma'], '_mk')
(['_ho', 'ma', '_mk'], 'h')
(['_ho', 'ma', '_mk', 'h'], 'b')
(['_ho', 'ma', '_mk', 'h', 'b'], 'b')


In [128]:
sequences = [([word2idx[word] for word in context], word2idx[target]) for context, target in ngrams]

In [129]:
sequences

[([132], 555),
 ([132, 555], 242),
 ([132, 555, 242], 481),
 ([132, 555, 242, 481], 374),
 ([132, 555, 242, 481, 374], 374),
 ([555, 242, 481, 374, 374], 751),
 ([242, 481, 374, 374, 751], 568),
 ([481, 374, 374, 751, 568], 288),
 ([374, 374, 751, 568, 288], 492),
 ([374, 751, 568, 288, 492], 129),
 ([751, 568, 288, 492, 129], 360),
 ([568, 288, 492, 129, 360], 507),
 ([288, 492, 129, 360, 507], 54),
 ([492, 129, 360, 507, 54], 569),
 ([129, 360, 507, 54, 569], 248),
 ([360, 507, 54, 569, 248], 498),
 ([507, 54, 569, 248, 498], 608),
 ([54, 569, 248, 498, 608], 608),
 ([569, 248, 498, 608, 608], 445),
 ([68], 752),
 ([68, 752], 360),
 ([68, 752, 360], 132),
 ([68, 752, 360, 132], 555),
 ([68, 752, 360, 132, 555], 318),
 ([752, 360, 132, 555, 318], 22),
 ([360, 132, 555, 318, 22], 733),
 ([132, 555, 318, 22, 733], 577),
 ([555, 318, 22, 733, 577], 134),
 ([318, 22, 733, 577, 134], 374),
 ([22, 733, 577, 134, 374], 609),
 ([733, 577, 134, 374, 609], 721),
 ([577, 134, 374, 609, 721], 236

In [130]:
def left_pad_sequence(seq, max_len):
    return [0] * (max_len - len(seq)) + seq


In [131]:
import torch

# 1. Left pad function
def left_pad_sequence(seq, max_len):
    return torch.tensor([0] * (max_len - len(seq)) + seq.tolist())

# 2. Prepare inputs and targets
inputs = [torch.tensor(seq[0]) for seq in sequences]
targets = [seq[1] for seq in sequences]

# 3. Pad all inputs to same length (e.g., max_len = 5)
max_len = 5
padded_inputs = torch.stack([left_pad_sequence(seq, max_len) for seq in inputs])

# 4. Convert targets to tensor
targets = torch.tensor(targets)


In [132]:
padded_inputs

tensor([[  0,   0,   0,   0, 132],
        [  0,   0,   0, 132, 555],
        [  0,   0, 132, 555, 242],
        ...,
        [700, 284, 360, 608, 553],
        [284, 360, 608, 553, 207],
        [360, 608, 553, 207, 741]])

In [133]:
from torch.utils.data import Dataset, DataLoader

class NgramDataset(Dataset):
    def __init__(self, padded_inputs, targets):
        self.padded_inputs = padded_inputs
        self.targets = targets

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        return self.padded_inputs[idx], self.targets[idx]

In [134]:
dataset = NgramDataset(padded_inputs, targets)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [135]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class NextWordPredictor(nn.Module):
    def __init__(self, vocab_size, embedding_dim=50, hidden_size=128, num_layers=2):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.bi_lstm = nn.LSTM(input_size=embedding_dim,
                               hidden_size=hidden_size,
                               num_layers=num_layers,
                               bidirectional=True,
                               batch_first=True)
        self.fc = nn.Linear(hidden_size * 2, vocab_size)  # *2 for bidirectional

    def forward(self, x):
        # x: (batch_size, sequence_length)
        embedded = self.embedding(x)  # (batch_size, seq_len, embedding_dim)
        lstm_out, _ = self.bi_lstm(embedded)  # (batch_size, seq_len, hidden_size*2)
        last_hidden = lstm_out[:, -1, :]  # take output of the last time step
        output = self.fc(last_hidden)  # (batch_size, vocab_size)
        return output


In [136]:
model = NextWordPredictor(vocab_size=len(word2idx))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [137]:
len(dataset)

609554

In [90]:
from tqdm import tqdm  # Make sure to install tqdm if not already: pip install tqdm

num_epochs = 1

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    correct = 0
    total = 0

    loop = tqdm(dataloader, desc=f"Epoch [{epoch+1}/{num_epochs}]", leave=False)

    for inputs, targets in loop:
        inputs, targets = inputs.to(device), targets.to(device)

        # Forward pass
        outputs = model(inputs)

        # Compute loss
        loss = criterion(outputs, targets)
        epoch_loss += loss.item()

        # Backward pass + optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accuracy
        _, predicted = torch.max(outputs, dim=1)
        correct += (predicted == targets).sum().item()
        total += targets.size(0)

        # Update tqdm progress bar with loss and accuracy info
        loop.set_postfix(loss=loss.item(), acc=(correct / total) * 100)

    avg_loss = epoch_loss / len(dataloader)
    accuracy = correct / total * 100

    print(f"Epoch [{epoch+1}/{num_epochs}] - Loss: {avg_loss:.4f} - Accuracy: {accuracy:.2f}%")


                                                                                       

Epoch [1/1] - Loss: 4.1496 - Accuracy: 25.07%




In [91]:
import torch
import torch.nn.functional as F

def predict_next_word(model, input_seq, word2idx, idx2word, max_len=5, device='cpu'):
    model.eval()
    
    # Tokenize & numericalize input_seq (a list of tokens or token IDs)
    # If input_seq is words, convert to indices:
    if isinstance(input_seq[0], str):
        input_seq = [word2idx.get(w, word2idx['<UNK>']) for w in input_seq]

    # Left pad to max_len
    input_tensor = torch.tensor([0] * (max_len - len(input_seq)) + input_seq).unsqueeze(0).to(device)
    
    with torch.no_grad():
        outputs = model(input_tensor)  # (1, vocab_size)
        probs = F.softmax(outputs, dim=1)
        predicted_idx = torch.argmax(probs, dim=1).item()

    predicted_word = idx2word.get(predicted_idx, '<UNK>')
    return predicted_word


In [95]:
input_seq = "salam hy"
predict_next_word(model, input_seq.split(), word2idx, idx2word, max_len=5, device=device)

'h'