In [1]:
import pandas as pd
import textwrap, joblib, random
from sklearn_crfsuite import CRF, metrics

In [2]:
df = pd.read_csv('dataset/sample.csv', header=None)
df = df.loc[:,0]
tagged_sentences = []
for sent in df:
    sent = sent.replace('B', 'S')
    sent = 'J'.join(sent)
    sent = sent.replace('JSJ', 'S')
    sent+='S'
    sent = textwrap.wrap(sent, 2)
    tagged_sent = []
    for pair in sent:
        el = (pair[0], pair[1])
        tagged_sent.append(el)
    tagged_sentences.append(tagged_sent)

random.seed(42)
random.shuffle(tagged_sentences)    
cutoff = int(.80 * len(tagged_sentences))
train = tagged_sentences[:cutoff]
test = tagged_sentences[cutoff:]

print('Total: ', len(tagged_sentences))
print('Train: ', len(train))
print('Test: ', len(test))

Total:  3000
Train:  2400
Test:  600


In [3]:
def features(sentence, index):
    char = sentence[index]
    features = {
        'c': char,
        'is_first': index == 0,
        'is_last': index == len(sentence)-1
    }
    for n in range(1,5):
        features['prev_'+str(n)] = ''.join(sentence[index-n:index+1])
        features['next_'+str(n)] = ''.join(sentence[index+1:index+n+2]) 
    return features

def transform_to_dataset(tagged_sentences):
    X, y = [], []
    for tagged in tagged_sentences:
        untagged = [first for first, second in tagged]
        X.append([features(untagged, index) for index in range(len(untagged))])
        y.append([tag for _, tag in tagged])
    return X, y

X_train, y_train = transform_to_dataset(train)
X_test, y_test = transform_to_dataset(test)

In [4]:
model = CRF(algorithm = 'lbfgs', c1 = 0.1, c2 = 0.1, max_iterations = 100, all_possible_transitions=True)
model.fit(X_train, y_train)
joblib.dump(model, 'models/proofing.sav')

['models/proofing.sav']

In [5]:
y_pred = model.predict(X_test)

prec = round(metrics.flat_precision_score(y_test, y_pred, average='macro')*100, 2)
rec = round(metrics.flat_recall_score(y_test, y_pred, average='macro')*100, 2)
f1 = round(metrics.flat_f1_score(y_test, y_pred, average='macro')*100, 2)
acc = round(metrics.flat_accuracy_score(y_test, y_pred)*100, 2)

print(' P\t R\t F1\t Acc')
row = f'{prec}\t{rec}\t{f1}\t{acc}\n'
print(row)

 P	 R	 F1	 Acc
98.03	97.76	97.89	98.28

