In [66]:
import pandas as pd
import json
from sklearn_crfsuite import CRF, metrics

### Loading the Data

In [67]:
# Read the POS-tagged sample corpus into a DataFrame. The path is relative to
# the notebook's working directory.
df = pd.read_csv('data/pos_tagged_sample.csv')
# Preview the first rows; the `tagged` column is what the preprocessing step consumes.
df.head()

Unnamed: 0,id,original,segmented,tagged
0,1,د افغانستان د کرکټ بورډ سرپرست مشر میرویس اشرف...,دBافغانستانBدBکرکټBبورډBسرپرستBمشرBمیرویسSاشرف...,"[[""د"", ""IN""], [""افغانستان"", ""NN.P""], [""د"", ""IN..."
1,2,طالبان لا د خپل مشرتابه د ټاکلو لپاره هم کوم م...,طالبانBلاBدBخپلBمشرتابهBدBټاکلوBلپارهBهمBکومBم...,"[[""طالبان"",""NN.C.2""],[""لا"",""RB""],[""د"",""IN""],[""..."
2,3,له ستونزو سره سره د افغانستان اوسنی اقتصادي وض...,لهBستونزوBسرهSسرهBدBافغانستانBاوسنیBاقتصاديBوض...,"[[""له"",""IN""],[""ستونزو"",""NN.C.2""],[""سره سره"",""R..."
3,4,چې امریکايي ځواکونو افغانستان ته د ترهګرو د ځپ...,چېBامریکاييBځواکونوBافغانستانBتهBدBترهګروBدBځپ...,"[[""چې"",""CC""],[""امریکايي"",""JJ""],[""ځواکونو"",""NN...."
4,5,هغوی پوښتنې کولې او مولوي صاحب يې ځوابونه ويل,هغویBپوښتنېBکولېBاوBمولويSصاحبBيېBځوابونهBويل,"[[""هغوی"",""PR.P.iii""],[""پوښتنې"",""NN.C.2""],[""کول..."


### Preprocessing

In [69]:
# Each entry of the `tagged` column is a JSON-encoded list of [word, tag] pairs.
tagged_sentences = df['tagged']

# Decode every sentence into a list of (word, tag) tuples with a simplified tag set.
sentences = []
for raw_sentence in tagged_sentences:
    sentence_ = []
    for pair in json.loads(raw_sentence):
        word, tag = pair[0], pair[1]
        # Drop the dots from hierarchical tags (e.g. 'NN.C.2' -> 'NNC2'), then
        # shorten the three common-noun variants to compact labels.
        tag = (
            tag.replace('.', '')
            .replace('NNC2', 'NNS')
            .replace('NNC1M', 'NNM')
            .replace('NNC1F', 'NNF')
        )
        sentence_.append((word, tag))
    sentences.append(sentence_)

# Hold out the first 20% of the sentences for testing and train on the rest.
# The split must slice `sentences` (the list built above); the previous code
# sliced an undefined name, `dataset`, which fails on a fresh kernel.
cutoff = int(.20 * len(sentences))
train = sentences[cutoff:]
test = sentences[:cutoff]

print('Total: ', len(sentences))
print('Train: ', len(train))
print('Test: ', len(test))

Total:  500
Train:  400
Test:  100


### Feature Extraction

In [70]:
def features(sentence, index):
    """Build the feature dictionary for the token at ``index`` of ``sentence``.

    Args:
        sentence: Sequence of token strings (the untagged sentence).
        index: Position of the token to describe.

    Returns:
        dict: Features of the token itself (surface form, position flags,
        length, digit flag, prefixes/suffixes of length 1-3) followed by
        features of the previous and next tokens (surface form, length,
        prefixes/suffixes of length 1-2) and the surface form of the tokens
        two positions away. Missing neighbours are represented by ''.
    """
    def word_at(position):
        # A one-element slice yields '' when it falls outside the sentence
        # instead of raising IndexError.
        return ''.join(sentence[position:position + 1])

    def affix(text, size, from_end=False):
        # An affix is only emitted when the word is at least two characters
        # longer than the affix itself; otherwise the feature is ''.
        if len(text) < size + 2:
            return ''
        return text[-size:] if from_end else text[:size]

    token = sentence[index]
    context = (
        # (key prefix, adjacent word, word two positions away)
        ('prev', word_at(index - 1), word_at(index - 2) if index >= 2 else ''),
        ('next', word_at(index + 1), word_at(index + 2)),
    )

    feats = {
        'token': token,
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'length': len(token),
        'is_numeric': token.isdigit(),
    }
    for size in (1, 2, 3):
        feats['pfx_' + str(size)] = affix(token, size)
    for size in (1, 2, 3):
        feats['sfx_' + str(size)] = affix(token, size, from_end=True)

    for side, near, far in context:
        key = side + '_1'
        feats[key] = near
        feats[key + '_len'] = len(near)
        for size in (1, 2):
            feats[key + '_pfx_' + str(size)] = affix(near, size)
        for size in (1, 2):
            feats[key + '_sfx_' + str(size)] = affix(near, size, from_end=True)
        feats[side + '_2'] = far

    return feats

def transform_to_dataset(tagged_sentences):
    """Convert tagged sentences into parallel feature and label sequences.

    Args:
        tagged_sentences: Iterable of sentences, each a sequence of
            ``(token, tag)`` pairs.

    Returns:
        tuple: ``(X, y)`` where ``X[i]`` is the list of per-token feature
        dicts produced by ``features`` for sentence ``i`` and ``y[i]`` is the
        list of that sentence's tags, in the same order.
    """
    feature_rows = []
    label_rows = []
    for pairs in tagged_sentences:
        # Feature extraction only sees the bare words, never the tags.
        words = [word for word, _ in pairs]
        feature_rows.append([features(words, pos) for pos, _ in enumerate(words)])
        label_rows.append([label for _, label in pairs])
    return feature_rows, label_rows

# Featurise both splits the same way: X holds per-token feature dicts, y the tags.
(X_train, y_train), (X_test, y_test) = (
    transform_to_dataset(train),
    transform_to_dataset(test),
)

### Model Training

In [75]:
# Linear-chain CRF trained with L-BFGS; c1 and c2 are the L1 and L2
# regularisation coefficients.
model = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
model.fit(X_train, y_train)

### Testing

In [74]:
y_pred = model.predict(X_test)

# Token-level scores over the flattened test sequences. Precision, recall and
# F1 are macro-averaged; accuracy is the plain fraction of correctly tagged tokens.
averaging = {'average': 'macro'}
prec, rec, f1 = (
    scorer(y_test, y_pred, **averaging)
    for scorer in (
        metrics.flat_precision_score,
        metrics.flat_recall_score,
        metrics.flat_f1_score,
    )
)
acc = metrics.flat_accuracy_score(y_test, y_pred)
print(prec, rec, f1, acc)

0.7696129659555626 0.7060889619761475 0.7266334982702931 0.8892601431980907
