In [1]:
import sys
import re
import glob
import codecs
import unicodedata
import os
import wave
import contextlib
import numpy as np
import pycrfsuite
from sklearn.metrics import classification_report
from sklearn.utils import shuffle

In [2]:
# Load the corpus: one sentence per line, NFC-normalised, with runs of
# whitespace collapsed to a single space.
#
# Combining marks removed from every line: kasra, damma, fatha, sukun,
# superscript alef, fathatan and shadda.
# Comment out the `diacritics.sub(...)` line below to get results with diacritization.
diacritics = re.compile(u"[\u0650\u064F\u064E\u0652\u0670\u064B\u0651]")

sentences = list()
# os.path.join keeps the path portable and avoids a backslash inside a string
# literal; the `with` block closes the file even if a line fails to process.
with codecs.open(os.path.join('Data', 'data_diacritized.txt'), 'r', encoding='utf-8') as f:
    for line in f:
        line = unicodedata.normalize('NFC', line)
        line = re.sub(r"\s{2,}", " ", line)
        line = diacritics.sub("", line)
        # NOTE(review): the line terminator is not stripped, so it stays on the
        # stored sentence - confirm this is intended.
        sentences.append(line)

# Single-argument print with %-formatting gives the same text on Python 2 and 3.
print("No. of sentences in the dataset: %d" % len(sentences))
print(sentences[0])

No. of sentences in the dataset: 4325
مسٹر لیمنز نے کہا یہ توالدی ہنر‌مندی اس پودے کی نمو کو نقصان نہیں پہنچاتی ہے



In [517]:
# Optional: shuffle the corpus before it is split (results may vary).
# random_state=0 fixes the permutation produced by a single run of this cell.
# NOTE(review): this cell rebinds `sentences`, so running it a second time
# shuffles the already-shuffled list. Its recorded execution count is also out
# of sequence with the neighbouring cells - confirm whether the outputs shown
# further down were produced with or without this step.
random_sentences = shuffle(sentences, random_state=0)
sentences = random_sentences
print(sentences[0])

تجزیہ‌نگار مایوس تھے کہ سرمایہ‌کاروں نے جارجیا پیسیفک کی گریٹ ناتھرن نیکوسا کے ذخائر کے لیے ڈالر ۳۱۸ بلین کی بولی کے لیے دکھائی گئی گرم‌جوشی بہت تیزی سے ختم ہو گئی



In [3]:
# Whitespace-separated tokens of the held-out part (last 825 sentences) and of
# the training part (everything before them); used only for the counts below.
test_tokens = [token for held_out in sentences[-825:] for token in held_out.split()]
train_tokens = [token for kept in sentences[:-825] for token in kept.split()]
print("No. of tokens in test set: %d" % len(test_tokens))
print("No. of tokens in train set: %d" % len(train_tokens))

No. of tokens in test set: 21088
No. of tokens in train set: 90151


In [4]:
def prepare_sentence(sentence):
    """Turn a sentence into a list of ``(char, label)`` pairs.

    Spaces (``" "``) and zero-width non-joiners (U+200C) are removed from the
    character sequence and encoded as a label on the character that follows:

    * ``0`` - the character continues the current word
    * ``1`` - the character is preceded by one or more spaces
    * ``2`` - the character is preceded by a ZWNJ (wins over a space if both
      occur before the same character)

    A separator with no following character (end of input) produces no label.

    The sentence is consumed in a single pass. The previous implementation
    deleted list items while enumerating the same list, which left a ZWNJ in
    the output (carrying label 2) whenever two ZWNJs were adjacent, and kept a
    trailing ZWNJ as an ordinary character.

    :param sentence: text with words separated by single spaces
    :return: list of ``(char, label)`` tuples, label being 0, 1 or 2
    """
    zwnj = u"\u200C"
    chars = []
    labels = []
    pending = 0  # label waiting to be attached to the next kept character

    for c in sentence:
        if c == " ":
            # Never downgrade a pending ZWNJ label to a plain space label.
            pending = max(pending, 1)
        elif c == zwnj:
            pending = 2
        else:
            chars.append(c)
            labels.append(pending)
            pending = 0

    return list(zip(chars, labels))

In [5]:
# Label every sentence of the corpus character by character, in corpus order.
prepared_sentences = [prepare_sentence(raw_sentence) for raw_sentence in sentences]

# Spot-check one prepared sentence.
print(list(prepared_sentences[100]))

[(u'\u0645', 0), (u'\u0648', 0), (u'\u0627', 0), (u'\u0632', 0), (u'\u0646', 0), (u'\u06c1', 0), (u'\u06a9', 1), (u'\u06d2', 0), (u'\u0630', 1), (u'\u0631', 0), (u'\u06cc', 0), (u'\u0639', 0), (u'\u06d2', 0), (u'\u067e', 1), (u'\u06c1', 0), (u'\u0644', 0), (u'\u06d2', 0), (u'\u0646', 1), (u'\u0648', 0), (u'\u0645', 1), (u'\u06c1', 0), (u'\u06cc', 0), (u'\u0646', 0), (u'\u0648', 0), (u'\u06ba', 0), (u'\u06a9', 1), (u'\u06d2', 0), (u'\u0644', 1), (u'\u06cc', 0), (u'\u06d2', 0), (u'\u0632', 1), (u'\u06cc', 0), (u'\u0631', 0), (u'\u0648', 0), (u'\u06a9', 0), (u'\u0633', 0), (u'\u0646', 1), (u'\u06d2', 0), (u'\u0688', 1), (u'\u0627', 0), (u'\u0644', 0), (u'\u0631', 0), (u'\u06f1', 1), (u'\u06f2', 0), (u'\u06f9', 0), (u'\u06f7', 0), (u'\u0628', 1), (u'\u0644', 0), (u'\u06cc', 0), (u'\u0646', 0), (u'\u06a9', 1), (u'\u06d2', 0), (u'\u0645', 1), (u'\u0627', 0), (u'\u0644', 0), (u'\u06cc', 0), (u'\u06c1', 0), (u'\u067e', 1), (u'\u0631', 0), (u'\u0688', 1), (u'\u0627', 0), (u'\u0644', 0), (u'\u06

In [6]:
def checkdigit(char):
    """Tell whether ``char`` is one of the ten digit characters listed below.

    :param char: the character to test
    :return: the string ``"true"`` or ``"false"`` (not a bool)
    """
    known_digits = (u'۱', u'۲', u'۳', u'۴', u'۵', u'۶', u'۷', u'۸', u'۹', u'۰')
    return "true" if char in known_digits else "false"

In [7]:
def isnonjoiner(char):
    """Tell whether ``char`` belongs to the non-joiner letter list below.

    :param char: the character to test
    :return: the string ``"true"`` or ``"false"`` (not a bool)
    """
    listed_letters = (u'ا', u'د', u'ڈ', u'ز', u'ذ', u'ر', u'ڑ', u'ژ', u'و', u'ے')
    return "true" if char in listed_letters else "false"

In [8]:
def create_char_features(sentence, i):
    """Build the feature list for the character at index ``i``.

    The k-th character is read as ``sentence[k][0]``, so ``sentence`` may be a
    list of ``(char, label)`` pairs or a plain string.

    Feature order:
      1. ``'bias'`` and properties of the current character (identity, digit
         flag, non-joiner flag, Unicode category, bidirectional class);
      2. left context: previous character and n-grams reaching back up to
         three characters, or ``"BOS"`` when there is no previous character;
      3. right context: next character and n-grams reaching forward up to
         three characters, or ``"EOS"`` when there is no next character.

    :param sentence: indexable sequence as described above
    :param i: index of the character to describe
    :return: list of feature strings
    """
    def gram(first, last):
        # Characters at offsets first..last (inclusive) relative to i, joined.
        return "".join([sentence[i + offset][0] for offset in range(first, last + 1)])

    current = sentence[i][0]
    to_the_right = len(sentence) - i - 1  # how many characters follow index i

    features = ['bias']
    features.append('char=' + current)
    features.append('char.isdigit=' + checkdigit(current))
    features.append('char.isnonjoiner=' + isnonjoiner(current))
    features.append('char.category=' + unicodedata.category(current))
    features.append('char.direction=' + unicodedata.bidirectional(current))

    # ---- left context ----
    if i < 1:
        features.append("BOS")
    else:
        features += [
            'char-1=' + gram(-1, -1),
            'char-1:0=' + gram(-1, 0),
        ]

    if i >= 2:
        features += [
            'char-2=' + gram(-2, -2),
            'char-2:0=' + gram(-2, 0),
            'char-2:-1=' + gram(-2, -1),
        ]

    if i >= 3:
        features += [
            'char-3:0=' + gram(-3, 0),
            'char-3:-1=' + gram(-3, -1),
        ]

    # ---- right context ----
    if to_the_right < 1:
        features.append("EOS")
    else:
        features += [
            'char+1=' + gram(1, 1),
            'char:+1=' + gram(0, 1),
        ]

    if to_the_right >= 2:
        features += [
            'char+2=' + gram(2, 2),
            'char:+2=' + gram(0, 2),
            'char+1:+2=' + gram(1, 2),
        ]

    if to_the_right >= 3:
        features += [
            'char:+3=' + gram(0, 3),
            'char+1:+3=' + gram(1, 3),
        ]

    return features



def create_sentence_features(prepared_sentence):
    """Return one feature list per position of ``prepared_sentence``, in order."""
    per_position = []
    for index in range(len(prepared_sentence)):
        per_position.append(create_char_features(prepared_sentence, index))
    return per_position

def create_sentence_labels(prepared_sentence):
    """Return the second element of every item in ``prepared_sentence`` as a string."""
    label_strings = []
    for part in prepared_sentence:
        label_strings.append(str(part[1]))
    return label_strings

In [9]:
# Training portion: every prepared sentence except the last 825.
# X_train holds per-character feature lists, y_train the matching label strings.
X_train, y_train = [], []
for ps in prepared_sentences[:-825]:
    X_train.append(create_sentence_features(ps))
    y_train.append(create_sentence_labels(ps))

In [10]:
# Show the features and labels of the first training sentence.
print(X_train[0])
print(y_train[0])

[['bias', u'char=\u0645', 'char.isdigit=false', 'char.isnonjoiner=false', 'char.category=Lo', 'char.direction=AL', 'BOS', u'char+1=\u0633', u'char:+1=\u0645\u0633', u'char+2=\u0679', u'char:+2=\u0645\u0633\u0679', u'char+1:+2=\u0633\u0679', u'char:+3=\u0645\u0633\u0679\u0631', u'char+1:+3=\u0633\u0679\u0631'], ['bias', u'char=\u0633', 'char.isdigit=false', 'char.isnonjoiner=false', 'char.category=Lo', 'char.direction=AL', u'char-1=\u0645', u'char-1:0=\u0645\u0633', u'char+1=\u0679', u'char:+1=\u0633\u0679', u'char+2=\u0631', u'char:+2=\u0633\u0679\u0631', u'char+1:+2=\u0679\u0631', u'char:+3=\u0633\u0679\u0631\u0644', u'char+1:+3=\u0679\u0631\u0644'], ['bias', u'char=\u0679', 'char.isdigit=false', 'char.isnonjoiner=false', 'char.category=Lo', 'char.direction=AL', u'char-1=\u0633', u'char-1:0=\u0633\u0679', u'char-2=\u0645', u'char-2:0=\u0645\u0633\u0679', u'char-2:-1=\u0645\u0633', u'char+1=\u0631', u'char:+1=\u0679\u0631', u'char+2=\u0644', u'char:+2=\u0679\u0631\u0644', u'char+1:+2=\

In [11]:
# Create a quiet trainer and hand it every training sentence as a
# (feature sequence, label sequence) pair.
trainer = pycrfsuite.Trainer(verbose=False)
for sent_features, sent_labels in zip(X_train, y_train):
    trainer.append(sent_features, sent_labels)

In [12]:
# Training options passed to the trainer in one call.
crf_params = {
    'c1': 1.0,                              # L1 regularisation coefficient
    'c2': 1e-3,                             # L2 regularisation coefficient
    'max_iterations': 60,                   # cap on optimiser iterations
    'feature.possible_transitions': True,   # also generate unseen label transitions
}
trainer.set_params(crf_params)

In [13]:
# Train and write the model file.
# The backslash is escaped explicitly: an unescaped "\u" inside a plain string
# literal is kept verbatim by Python 2 but is a syntax error on Python 3. The
# resulting path string is unchanged.
# NOTE(review): the separator is Windows-style; os.path.join would make the
# path portable, but must then be changed wherever the model is opened too.
trainer.train('Model\\urdu-word-segmentation.crfsuite')

In [14]:
# Load the trained model into a tagger.
# The backslash is escaped explicitly: an unescaped "\u" inside a plain string
# literal is kept verbatim by Python 2 but is a syntax error on Python 3. The
# resulting path string is unchanged.
tagger = pycrfsuite.Tagger()
tagger.open('Model\\urdu-word-segmentation.crfsuite')

<contextlib.closing at 0x4273d5f8>

In [15]:
def segment_sentence(sentence, verbose=True):
    """Re-segment ``sentence`` with the trained tagger.

    All spaces and ZWNJs (U+200C) are removed from the input, the remaining
    characters are tagged, and a separator is re-inserted in front of each
    character according to its predicted tag: ``"1"`` -> space,
    ``"2"`` -> ZWNJ, anything else -> nothing.

    Uses the module-level ``tagger``.

    :param sentence: text to segment
    :param verbose: when true (the default, matching the earlier behaviour),
        print the raw list of predicted tags before returning
    :return: the segmented text
    """
    sentence = sentence.replace(" ", "").replace(u"\u200C", "")
    prediction = tagger.tag(create_sentence_features(sentence))
    if verbose:
        # Single-argument print: same output on Python 2 and Python 3.
        print(prediction)

    separator_for = {"1": " ", "2": u"\u200C"}
    # Collect the pieces and join once instead of growing a string in a loop.
    pieces = [separator_for.get(tag, "") + char for char, tag in zip(sentence, prediction)]
    return "".join(pieces)

In [513]:
# Segment a sentence whose text still contains diacritics. The spaces and ZWNJs
# typed here are removed by segment_sentence and predicted again by the model.
print(segment_sentence(u"ہم ایک آزاد براہِ‌راست سرمایہ‌کاری پالیسی کو جاری رکھنے کی لمبی تاریخ رکھتے ہیں مِسٹر دلارا کہتے ہیں"))

['0', '0', '1', '0', '0', '1', '0', '0', '0', '1', '0', '0', '0', '0', '2', '0', '0', '0', '1', '0', '0', '0', '0', '0', '2', '0', '0', '0', '1', '0', '0', '0', '0', '0', '1', '0', '1', '0', '0', '0', '1', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '1', '0', '0', '0', '0', '1', '0', '0', '1', '0', '0', '0', '0', '1', '0', '0', '0', '0', '1', '0', '0', '0', '1', '0', '0']
ہم ایک آزاد براہِ‌راست سرمایہ‌کاری پالیسی کو جاری رکھنے کیلمبی تاریخ رکھتے ہیں مِسٹر دلارا کہتے ہیں


In [16]:
# Segment the same kind of sentence without diacritics. The spaces and ZWNJs
# typed here are removed by segment_sentence and predicted again by the model.
print(segment_sentence(u"ہم ایک آزاد براہ‌راست سرمایہ‌کاری پالیسی کو جاری رکھنے کی لمبی تاریخ رکھتے ہیں مسٹر دلارا کہتے ہیں"))

['0', '0', '1', '0', '0', '1', '0', '0', '0', '1', '0', '0', '0', '2', '0', '0', '0', '1', '0', '0', '0', '0', '0', '2', '0', '0', '0', '1', '0', '0', '0', '0', '0', '1', '0', '1', '0', '0', '0', '1', '0', '0', '0', '0', '1', '0', '1', '0', '0', '0', '1', '0', '0', '0', '0', '1', '0', '0', '0', '0', '1', '0', '0', '1', '0', '0', '0', '1', '0', '0', '0', '0', '1', '0', '0', '0', '1', '0', '0']
ہم ایک آزاد براہ‌راست سرمایہ‌کاری پالیسی کو جاری رکھنے کی لمبی تاریخ رکھتے ہیں مسٹر دلارا کہتے ہیں


In [17]:
# Tag every held-out sentence (the last 825) and flatten gold and predicted
# labels into two parallel lists for scoring.
y_true, y_pred = [], []
for held_out in prepared_sentences[-825:]:
    y_true.extend(create_sentence_labels(held_out))
    y_pred.extend(tagger.tag(create_sentence_features(held_out)))

In [18]:
# Label strings as produced by create_sentence_labels, paired positionally with
# their display names. Passing `labels` explicitly keeps the names aligned even
# if one class happens to be absent from the evaluated data.
label_ids = ['0', '1', '2']
target_names = ['No space', 'Space', 'ZWNJ']
print(classification_report(y_true, y_pred, labels=label_ids, target_names=target_names))

             precision    recall  f1-score   support

   No space       0.99      0.99      0.99     59665
      Space       0.97      0.97      0.97     20264
       ZWNJ       0.91      0.80      0.85      1200

avg / total       0.98      0.98      0.98     81129

