## Urdu Word Segmentation
Let's start by loading in the packages.

## Dataset
Load the dataset.

In [1]:
import re
import codecs
import unicodedata
import pycrfsuite
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [8]:
# Read the corpus line by line and store one normalised sentence per line.
sentences = list()
# The context manager closes the file even if a line fails to decode or normalise.
with codecs.open('Data/data_diacritized.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # Canonical composition, so equivalent code-point sequences become identical.
        line = unicodedata.normalize('NFC',line)
        # Collapse runs of two or more whitespace characters into one space.
        # NOTE: a single trailing newline is not matched by this pattern, so it
        # stays in the stored sentence.
        line = re.sub(r"\s{2,}", " ", line)
        #Comment below 7 lines to get results with diacritization
        # (each substitution deletes one diacritic mark from the line)
        line = re.sub(u"ِ", "", line)
        line = re.sub(u"ُ", "", line)
        line = re.sub(u"َ", "", line)
        line = re.sub(u"ْ", "", line)
        line = re.sub(u"ٰ", "", line)
        line = re.sub(u"ً", "", line)
        line = re.sub(u"ّ", "", line)
        sentences.append(line)

print ("No. of sentences in the dataset:",len(sentences))


No. of sentences in the dataset: 4325


In [9]:
# The last 825 sentences form the test set; everything before them is training data.
# Tokens are obtained by whitespace-splitting each sentence.
test_tokens = [token for held_out in sentences[-825:] for token in held_out.split()]
train_tokens = [token for seen in sentences[:-825] for token in seen.split()]
print ("No. of tokens in test set:",len(test_tokens))
print ("No. of tokens in train set:",len(train_tokens))

No. of tokens in test set: 21088
No. of tokens in train set: 90151


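Prepare sentences for training by removing spaces and zero-width non-joiners (ZWNJ, U+200C). Each remaining character is paired with a label: 0 if it continues the current word or sub-word, 1 if it directly follows a space (beginning of a word), and 2 if it directly follows a ZWNJ (beginning of a sub-word).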

In [10]:
def prepare_sentence(sentence):
    """Convert a space-separated sentence into (character, label) pairs.

    Spaces and zero-width non-joiners (U+200C) are dropped from the text;
    what they separated is recorded in the label of the next kept character:

    * 0 -- the character continues the current word or sub-word,
    * 1 -- the character directly follows one or more spaces,
    * 2 -- the character directly follows a zero-width non-joiner. When both
      a space and a joiner precede a character, 2 takes precedence.

    Every zero-width non-joiner is removed, including consecutive ones and
    one in final position, so the returned characters match what
    ``segment_sentence`` feeds to the tagger.

    Args:
        sentence: Text whose words are separated by the space character.

    Returns:
        A list of ``(char, label)`` tuples where ``label`` is the int 0, 1 or 2.
        An empty sentence yields an empty list.
    """
    zwnj = u"\u200C"
    pairs = []
    # Label to give the next kept character; reset after each kept character.
    pending = 0
    # Single pass: no list is mutated while being iterated and no quadratic
    # position lookup is needed.
    for c in sentence:
        if c == zwnj:
            pending = 2
        elif c == " ":
            # A joiner seen since the last kept character keeps priority.
            if pending != 2:
                pending = 1
        else:
            pairs.append((c, pending))
            pending = 0
    return pairs

In [11]:
# Label every sentence of the corpus (training and test portions alike).
prepared_sentences = [prepare_sentence(raw_sentence) for raw_sentence in sentences]

## Features

Next, define some features.
- N-grams consisting of the current character and up to three preceding and three succeeding characters
- Whether the current character is a digit
- Whether the current character is a non-joiner
- Unicode general category of the current character
- Bidirectional class of the current character

In [12]:
def checkdigit(char):
    """Report whether `char` is one of the ten digit characters listed below.

    Only the characters in the tuple count; ASCII digits are not among them.

    Returns:
        The string "true" or "false" (not a bool), ready to be appended to a
        feature name.
    """
    urdu_digits = (u'۱', u'۲', u'۳', u'۴', u'۵', u'۶', u'۷', u'۸', u'۹', u'۰')
    return "true" if char in urdu_digits else "false"

In [13]:
def isnonjoiner(char):
    """Report whether `char` is one of the non-joiner letters listed below.

    Returns:
        The string "true" or "false" (not a bool), ready to be appended to a
        feature name.
    """
    non_joining_letters = (u'ا', u'د', u'ڈ', u'ز', u'ذ', u'ر', u'ڑ', u'ژ', u'و', u'ے')
    return "true" if char in non_joining_letters else "false"

In [14]:
def create_char_features(sentence, i):
    """Build the feature strings for the character at index `i`.

    Args:
        sentence: Indexable sequence whose items expose the character at
            index 0 -- e.g. (char, label) tuples, or a plain string.
        i: Index of the character to describe.

    Returns:
        A list of feature strings: a bias term, properties of the current
        character, and character n-grams reaching up to three positions to
        the left and right. "BOS" / "EOS" mark the first / last position.
    """
    def ch(offset):
        # Character stored at position i + offset.
        return sentence[i + offset][0]

    cur = ch(0)
    feats = [
        'bias',
        'char=' + cur,
        'char.isdigit=' + checkdigit(cur),
        'char.isnonjoiner=' + isnonjoiner(cur),
        'char.category=' + unicodedata.category(cur),
        'char.direction=' + unicodedata.bidirectional(cur),
    ]

    # ---- left context: each wider window is only reachable if the narrower one exists
    if i < 1:
        feats.append("BOS")
    else:
        p1 = ch(-1)
        feats.append('char-1=' + p1)
        feats.append('char-1:0=' + p1 + cur)
        if i >= 2:
            p2 = ch(-2)
            feats.append('char-2=' + p2)
            feats.append('char-2:0=' + p2 + p1 + cur)
            feats.append('char-2:-1=' + p2 + p1)
            if i >= 3:
                p3 = ch(-3)
                feats.append('char-3:0=' + p3 + p2 + p1 + cur)
                feats.append('char-3:-1=' + p3 + p2 + p1)

    # ---- right context: number of characters after position i
    following = len(sentence) - i - 1
    if following < 1:
        feats.append("EOS")
    else:
        n1 = ch(1)
        feats.append('char+1=' + n1)
        feats.append('char:+1=' + cur + n1)
        if following >= 2:
            n2 = ch(2)
            feats.append('char+2=' + n2)
            feats.append('char:+2=' + cur + n1 + n2)
            feats.append('char+1:+2=' + n1 + n2)
            if following >= 3:
                n3 = ch(3)
                feats.append('char:+3=' + cur + n1 + n2 + n3)
                feats.append('char+1:+3=' + n1 + n2 + n3)

    return feats

def create_sentence_features(prepared_sentence):
    """Return one feature list per position of `prepared_sentence`, in order.

    `prepared_sentence` is passed unchanged to `create_char_features`, so it
    may be a list of (char, label) pairs or a plain string.
    """
    per_char_features = []
    for position in range(len(prepared_sentence)):
        per_char_features.append(create_char_features(prepared_sentence, position))
    return per_char_features

def create_sentence_labels(prepared_sentence):
    """Return the label (item 1 of every entry) as a string, in order."""
    label_strings = []
    for entry in prepared_sentence:
        label_strings.append(str(entry[1]))
    return label_strings

In [15]:
# Training portion: every prepared sentence except the last 825 (the test set).
X_train = list(map(create_sentence_features, prepared_sentences[:-825]))
y_train = list(map(create_sentence_labels, prepared_sentences[:-825]))

## Train the model

To train the model, we create a pycrfsuite.Trainer, load the training data into it and call its 'train' method.

In [16]:
# Create the trainer with verbose output switched off, then hand it every
# training sentence as a (feature sequence, label sequence) pair.
trainer = pycrfsuite.Trainer(verbose=False)

for sentence_features, sentence_labels in zip(X_train, y_train):
    trainer.append(sentence_features, sentence_labels)

Set the training parameters. We will use the L-BFGS training algorithm with Elastic Net (L1 + L2) regularization.

In [17]:
# Hyper-parameters for training, collected in one place before being applied.
crf_params = {
    'c1': 1.0,  # coefficient for L1 penalty
    'c2': 1e-3, # coefficient for L2 penalty
    'max_iterations': 60, # stop earlier
    'feature.possible_transitions': True # include transitions that are possible, but not observed
}
trainer.set_params(crf_params)

Train the model.

In [18]:
# Train on the appended sequences; the model path given here is the same
# path the tagger opens in the prediction section.
trainer.train('Model/urdu-word-segmentation.crfsuite')

## Make predictions

To use the trained model, create a pycrfsuite.Tagger, open the model file and call its 'tag' method.

In [21]:
# Load the model file written by the training step into a tagger.
# `tagger` is used as a global by segment_sentence and by the evaluation loop.
tagger = pycrfsuite.Tagger()
tagger.open('Model/urdu-word-segmentation.crfsuite')

<contextlib.closing at 0x11ca0ca00>

Let's segment a sentence to see how it works.

In [23]:
def segment_sentence(sentence, verbose=True):
    """Re-segment `sentence` with the global `tagger`.

    All spaces and zero-width non-joiners (U+200C) are removed from the
    input, each remaining character is tagged, and the separators are then
    re-inserted from the predicted labels: a space before a character
    tagged "1", a zero-width non-joiner before a character tagged "2".

    Args:
        sentence: Text to segment; existing spaces and joiners are ignored.
        verbose: When true (the default), print the predicted label
            sequence before returning.

    Returns:
        The NFC-normalised text with the predicted separators inserted.
    """
    # The corpus lines are NFC-normalised when they are loaded, so apply the
    # same normalisation here; otherwise equivalent input spelled with
    # different code points would produce features unseen during training.
    sentence = unicodedata.normalize('NFC', sentence)
    sentence = sentence.replace(" ", "").replace(u"\u200C", "")
    prediction = tagger.tag(create_sentence_features(sentence))
    if verbose:
        print (prediction)
    # Separator to emit in front of a character, keyed by its predicted label;
    # any other label (i.e. "0") inserts nothing.
    separators = {"1": " ", "2": u"\u200C"}
    return "".join(
        separators.get(label, "") + char
        for label, char in zip(prediction, sentence)
    )

In [24]:
# Segment a sentence that still carries diacritics.
# NOTE: the loading cell strips diacritics from the corpus by default, so the
# tagger may not have seen these marks during training.
print(segment_sentence(u"ہم ایک آزاد براہِ‌راست سرمایہ‌کاری پالیسی کو جاری رکھنے کی لمبی تاریخ رکھتے ہیں مِسٹر دلارا کہتے ہیں"))

['0', '0', '1', '0', '0', '1', '0', '0', '0', '1', '0', '0', '0', '1', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '2', '0', '0', '0', '1', '0', '0', '0', '0', '0', '1', '0', '1', '0', '0', '0', '1', '0', '0', '0', '0', '1', '0', '1', '0', '0', '0', '1', '0', '0', '0', '0', '1', '0', '0', '0', '0', '1', '0', '0', '1', '0', '0', '0', '0', '1', '0', '0', '0', '0', '1', '0', '0', '0', '1', '0', '0']
ہم ایک آزاد براہ ِراست سرمایہ‌کاری پالیسی کو جاری رکھنے کی لمبی تاریخ رکھتے ہیں مِسٹر دلارا کہتے ہیں


In [25]:
# Segment a sentence without diacritics; the input's spaces and zero-width
# non-joiners are discarded inside segment_sentence and predicted afresh.
print(segment_sentence(u"ہم ایک آزاد براہ‌راست سرمایہ‌کاری پالیسی کو جاری رکھنے کی لمبی تاریخ رکھتے ہیں مسٹر دلارا کہتے ہیں"))

['0', '0', '1', '0', '0', '1', '0', '0', '0', '1', '0', '0', '0', '2', '0', '0', '0', '1', '0', '0', '0', '0', '0', '2', '0', '0', '0', '1', '0', '0', '0', '0', '0', '1', '0', '1', '0', '0', '0', '1', '0', '0', '0', '0', '1', '0', '1', '0', '0', '0', '1', '0', '0', '0', '0', '1', '0', '0', '0', '0', '1', '0', '0', '1', '0', '0', '0', '1', '0', '0', '0', '0', '1', '0', '0', '0', '1', '0', '0']
ہم ایک آزاد براہ‌راست سرمایہ‌کاری پالیسی کو جاری رکھنے کی لمبی تاریخ رکھتے ہیں مسٹر دلارا کہتے ہیں


## Evaluate the model
Segment all sentences in our test set.

In [26]:
# Flatten gold and predicted labels over the 825 held-out sentences so they
# can be compared character by character.
y_true = []
y_pred = []
for held_out_sentence in prepared_sentences[-825:]:
    y_pred.extend(tagger.tag(create_sentence_features(held_out_sentence)))
    y_true.extend(create_sentence_labels(held_out_sentence))

Print the classification report to check the results. Here
'I' (label "0") denotes the continuation of a word or sub-word,
'Bw' (label "1") denotes the beginning of a word,
'Bs' (label "2") denotes the beginning of a sub-word.

In [27]:
# Pin the label order explicitly so each display name is always paired with
# the intended label ("0" -> I, "1" -> Bw, "2" -> Bs), even if one label is
# absent from both y_true and y_pred. This is the same label order that is
# passed to confusion_matrix.
label_order = ["0", "1", "2"]
target_names = ['I', 'Bw', 'Bs']
print(classification_report(y_true, y_pred, labels=label_order, target_names=target_names))

              precision    recall  f1-score   support

           I       0.99      0.99      0.99     59665
          Bw       0.97      0.97      0.97     20264
          Bs       0.91      0.80      0.85      1200

    accuracy                           0.98     81129
   macro avg       0.96      0.92      0.94     81129
weighted avg       0.98      0.98      0.98     81129



Print confusion matrix to see class wise stats.

In [28]:
# Rows are the true labels and columns the predicted labels, both in the
# order "0", "1", "2" (each row sums to that label's support in the report).
print(confusion_matrix(y_true, y_pred, labels=["0", "1", "2"]))

[[59149   474    42]
 [  574 19637    53]
 [  119   116   965]]
