In [1]:
import pandas as pd
import textwrap, joblib, random
from sklearn_crfsuite import CRF, metrics

# Load Data

In [8]:
df = pd.read_csv('dataset/sample.csv', header=None)
df = df.loc[:,0]
tagged_sentences = []
for sent in df:
    sent = sent.strip(' B').split('B')
    tagged_sent = []
    for word in sent:
        word = word.split('S')
        for i, w in enumerate(word):
            if(i+1 < len(word)): el = (w,'S')
            else: el = (w,'B')
            tagged_sent.append(el)
    tagged_sentences.append(tagged_sent)

random.seed(42)
random.shuffle(tagged_sentences)    
cutoff = int(.8 * len(tagged_sentences))
train = tagged_sentences[:cutoff]
test = tagged_sentences[cutoff:]

print('Total: ', len(tagged_sentences))
print('Train: ', len(train))
print('Test: ', len(test))

Total:  3000
Train:  2400
Test:  600


# Features Extraction

In [4]:
def features(sentence, index):
    token = sentence[index]
    prev_1 = ''.join(sentence[index-1:index+1])
    next_1 = ''.join(sentence[index+1:index+2])
    
    features = {
        'token': token,
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'length': len(token),
        'is_numeric': token.isdigit(),
        'pfx_1': token[0] if(len(token) > 2) else '',
        'pfx_2': token[:2] if(len(token) > 3) else '',
        'pfx_3': token[:3] if(len(token) > 4) else '',
        'sfx_1': token[-1] if(len(token) > 2) else '',
        'sfx_2': token[-2:] if(len(token) > 3) else '',
        'sfx_3': token[-3:] if(len(token) > 4) else '',
        
        'prev_1': prev_1,
        'prev_1_len': len(prev_1),
        'prev_1_pfx_1': '' if (not prev_1 or len(prev_1)<3) else prev_1[0],
        'prev_1_pfx_2': '' if (not prev_1 or len(prev_1)<4) else prev_1[:2],
        'prev_1_sfx_1': '' if (not prev_1 or len(prev_1)<3) else prev_1[-1],
        'prev_1_sfx_2': '' if (not prev_1 or len(prev_1)<4) else prev_1[-2:],
        'prev_2': '' if(index<2) else ''.join(sentence[index-2:index-1]),
        
        'next_1': next_1,
        'next_1_len': len(next_1),
        'next_1_pfx_1': '' if (not next_1 or len(next_1)<3) else next_1[0],       
        'next_1_pfx_2': '' if (not next_1 or len(next_1)<4) else next_1[:2],       
        'next_1_sfx_1': '' if (not next_1 or len(next_1)<3) else next_1[-1],       
        'next_1_sfx_2': '' if (not next_1 or len(next_1)<4) else next_1[-2:],       
        'next_2': ''.join(sentence[index+2:index+3])
      }
    return features

def transform_to_dataset(tagged_sentences):
    X, y = [], []
    for tagged in tagged_sentences:
        untagged = [first for first, second in tagged]
        X.append([features(untagged, index) for index in range(len(untagged))])
        y.append([tag for _, tag in tagged])
    return X, y

X_train, y_train = transform_to_dataset(train)
X_test, y_test = transform_to_dataset(test)

# Training

In [5]:
model = CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100, all_possible_transitions=True)
model.fit(X_train, y_train)
joblib.dump(model, 'models/segmenter.sav')

['models/segmenter.sav']

# Testing

In [6]:
y_pred = model.predict(X_test)

prec = round(metrics.flat_precision_score(y_test, y_pred, average='macro')*100, 2)
rec = round(metrics.flat_recall_score(y_test, y_pred, average='macro')*100, 2)
f1 = round(metrics.flat_f1_score(y_test, y_pred, average='macro')*100, 2)
acc = round(metrics.flat_accuracy_score(y_test, y_pred)*100, 2)

print(' P\t R\t Acc\t F1')
row = f'{prec}\t{rec}\t{acc}\t{f1}\n'
print(row)

 P	 R	 Acc	 F1
95.55	93.77	98.69	94.64

