### Conditional Random Fields (CRFs)

In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import sklearn_crfsuite
import os
import random
import nltk
import re
from collections import Counter, defaultdict, namedtuple, OrderedDict
from itertools import chain
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from io import BytesIO
from itertools import chain
from nltk.corpus import stopwords

In [101]:
# Record type for one unlabelled sentence; it has a single `words` field.
# NOTE(review): the namedtuple typename is "Sentence" although the variable is
# called Test_Sentence, so instances print as Sentence(words=...).
Test_Sentence = namedtuple("Sentence", "words")

def getMeTestSentences(data):
    """Turn a mapping of unlabelled sentences into lists of 1-tuples.

    Every value of ``data`` must expose a ``words`` sequence.  Each word is
    wrapped in a one-element tuple ``(word,)`` so the word is reachable as
    ``item[0]``.

    Returns:
        A list with one inner list per key of ``data``, in iteration order.
    """
    return [[(word,) for word in data[key].words] for key in data]

def read_test_data(filename):
    """Parse an unlabelled, tab-separated corpus into sentence records.

    The file is split into sentence blocks on empty lines.  Each line of a
    block is stripped and split on tabs; the first column is dropped and the
    line is kept only when exactly one column (the word) remains.

    Args:
        filename: path of the file to read.

    Returns:
        An OrderedDict mapping a 1-based block index to
        ``Test_Sentence(words)``.  A block without any usable line still
        yields an entry with an empty ``words`` tuple.
    """
    with open(filename, 'r') as f:
        blocks = f.read().split("\n\n")

    parsed = OrderedDict()
    for number, block in enumerate(blocks, start=1):
        words = []
        for line in block.split("\n"):
            columns = line.strip().split("\t")[1:]
            # Only rows of the form "<index>\t<word>" contribute a word.
            if len(columns) == 1:
                words.append(columns[0])
        parsed[number] = Test_Sentence(tuple(words))
    return parsed
        
class TestDataset(namedtuple("_TDataset", "sentences keys vocab X N")):
    """Immutable container for an unlabelled corpus.

    Fields:
        sentences: dict mapping sentence key -> parsed sentence record.
        keys: tuple of sentence keys, in the order returned by the reader.
        vocab: frozenset of every distinct word.
        X: tuple of per-sentence word sequences, aligned with ``keys``.
        N: total number of words over all sentences.

    ``len()`` and iteration are overridden to operate on ``sentences``
    instead of on the tuple fields.
    """

    def __new__(cls, tagfile, datafile, train_test_split=0.8, seed=None):
        # tagfile, train_test_split and seed are accepted but not used here.
        parsed = read_test_data(datafile)
        ordered_keys = tuple(parsed.keys())
        per_sentence_words = tuple(parsed[key].words for key in ordered_keys)
        vocabulary = frozenset(chain.from_iterable(per_sentence_words))
        total_words = sum(1 for _ in chain.from_iterable(per_sentence_words))

        return super().__new__(cls, dict(parsed), ordered_keys, vocabulary,
                               per_sentence_words, total_words)

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

    def __iter__(self):
        """Iterate over ``(key, sentence)`` pairs."""
        return iter(self.sentences.items())

In [67]:
# Record type for one labelled sentence, with fields `words` and `tags`.
Sentence = namedtuple("Sentence", "words tags")
def getMeSentences(data):
    """Turn a mapping of labelled sentences into lists of (word, tag) pairs.

    Every value of ``data`` must expose ``words`` and ``tags`` sequences;
    they are zipped position by position.

    Returns:
        A list with one inner list per key of ``data``, in iteration order.
    """
    return [list(zip(data[key].words, data[key].tags)) for key in data]

def read_data(filename, word_counts=None):
    """Parse a tagged, tab-separated corpus into sentence records.

    The file is split into sentence blocks on empty lines.  Each line of a
    block is stripped and split on tabs; the first column is dropped and the
    line is kept only when exactly two columns (word, tag) remain.  Words
    whose frequency is exactly 1 are replaced by the placeholder 'UNK'.

    Args:
        filename: path of the file to read.
        word_counts: optional mapping word -> frequency used to decide which
            words are rare.  When omitted, frequencies are counted from the
            words of this file, so the function does not depend on a global
            ``counter`` having been created beforehand.

    Returns:
        An OrderedDict mapping a 1-based block index to
        ``Sentence(words, tags)``.  A block without any usable line still
        yields an entry with empty tuples.
    """
    with open(filename, 'r') as f:
        blocks = f.read().split("\n\n")

    # Keep only the (word, tag) rows, grouped per sentence block.
    parsed_blocks = []
    for block in blocks:
        rows = []
        for line in block.split("\n"):
            columns = line.strip().split("\t")[1:]
            if len(columns) == 2:
                rows.append(columns)
        parsed_blocks.append(rows)

    if word_counts is None:
        word_counts = Counter(word for rows in parsed_blocks for word, _ in rows)

    sentences = OrderedDict()
    for index, rows in enumerate(parsed_blocks, start=1):
        # .get(..., 0) also works for a plain dict passed by the caller.
        words = tuple('UNK' if word_counts.get(word, 0) == 1 else word
                      for word, _ in rows)
        tags = tuple(tag for _, tag in rows)
        sentences[index] = Sentence(words, tags)
    return sentences

def read_tags(filename):
    """Read the tag inventory from a file containing one tag per line.

    Surrounding whitespace is stripped and blank lines are skipped, so a
    trailing newline does not add an empty-string tag to the result.

    Args:
        filename: path of the tag file.

    Returns:
        A frozenset of the tag strings found in the file.
    """
    with open(filename, 'r') as f:
        stripped = (line.strip() for line in f)
        return frozenset(tag for tag in stripped if tag)

class Subset(namedtuple("BaseSet", "sentences keys vocab X tagset Y N stream")):
    """Immutable view over the sentences selected by ``keys``.

    Fields:
        sentences: dict key -> sentence record, restricted to ``keys``.
        keys: the key collection exactly as passed in.
        vocab: frozenset of distinct words in the selected sentences.
        X: tuple of per-sentence word sequences, aligned with ``keys``.
        tagset: frozenset of distinct tags in the selected sentences.
        Y: tuple of per-sentence tag sequences, aligned with ``keys``.
        N: total number of words in the selected sentences.
        stream: zero-argument callable returning a fresh iterator over all
            (word, tag) pairs.

    ``len()`` and iteration are overridden to operate on ``sentences``
    instead of on the tuple fields.
    """

    def __new__(cls, sentences, keys):
        chosen = {k: sentences[k] for k in keys}
        word_seqs = tuple(chosen[k].words for k in keys)
        tag_seqs = tuple(chosen[k].tags for k in keys)

        flat_words = list(chain.from_iterable(word_seqs))
        flat_tags = list(chain.from_iterable(tag_seqs))
        pairs = tuple(zip(flat_words, flat_tags))

        # The bound __iter__ of the pair tuple is stored, so each call to
        # stream() starts a new pass over the same data.
        return super().__new__(cls, chosen, keys, frozenset(flat_words), word_seqs,
                               frozenset(flat_tags), tag_seqs, len(flat_words),
                               pairs.__iter__)

    def __len__(self):
        """Return the number of selected sentences."""
        return len(self.sentences)

    def __iter__(self):
        """Iterate over ``(key, sentence)`` pairs."""
        return iter(self.sentences.items())

class Dataset(namedtuple("_Dataset", "sentences keys vocab X tagset Y training_set testing_set N stream")):
    """Immutable container for a tagged corpus plus a random train/test split.

    Fields:
        sentences: dict key -> sentence record for the whole corpus.
        keys: tuple of sentence keys in reader order.
        vocab: frozenset of distinct words in the corpus.
        X: tuple of per-sentence word sequences, aligned with ``keys``.
        tagset: frozenset of tags read from ``tagfile`` (not derived from
            the corpus).
        Y: tuple of per-sentence tag sequences, aligned with ``keys``.
        training_set / testing_set: ``Subset`` objects over the shuffled
            keys before / after the split point.
        N: total number of words in the corpus.
        stream: zero-argument callable returning a fresh iterator over all
            (word, tag) pairs.

    ``len()`` and iteration are overridden to operate on ``sentences``
    instead of on the tuple fields.
    """

    def __new__(cls, tagfile, datafile, train_test_split=0.8, seed=None):
        tagset = read_tags(tagfile)
        sentences = read_data(datafile)
        keys = tuple(sentences.keys())
        word_seqs = tuple(sentences[k].words for k in keys)
        tag_seqs = tuple(sentences[k].tags for k in keys)
        flat_words = list(chain.from_iterable(word_seqs))

        # Shuffle a copy of the keys; seeding touches the global `random`
        # state and only happens when a seed is supplied.
        shuffled = list(keys)
        if seed is not None:
            random.seed(seed)
        random.shuffle(shuffled)
        cut = int(train_test_split * len(shuffled))
        training = Subset(sentences, shuffled[:cut])
        testing = Subset(sentences, shuffled[cut:])

        pairs = tuple(zip(flat_words, chain.from_iterable(tag_seqs)))
        return super().__new__(cls, dict(sentences), keys, frozenset(flat_words),
                               word_seqs, tagset, tag_seqs, training, testing,
                               len(flat_words), pairs.__iter__)

    def __len__(self):
        """Return the number of sentences in the corpus."""
        return len(self.sentences)

    def __iter__(self):
        """Iterate over ``(key, sentence)`` pairs."""
        return iter(self.sentences.items())

In [68]:
# Load the tagged training corpus and split its sentences 80/20 into
# training and held-out subsets.
# NOTE(review): no `seed` is passed, so unless the global RNG was seeded
# elsewhere the shuffle (and therefore the split) differs between runs.
data = Dataset("tags-universal.txt", "S21-gene-train.txt", train_test_split=0.8)

In [70]:
# Label list handed to the metric functions; it comes from the tag file
# rather than from the tags that actually occur in the data.
classes = list(data.tagset)

In [6]:
# Make sure the NLTK stopword corpus is available, then keep the English
# list as a set so membership tests are cheap.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /Users/hunar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
def iscamelcase(string):
    """Report whether any alphabetic run of ``string`` has an inner capital.

    Non-alphabetic characters act as separators.  A run that consists only
    of upper-case letters (or is empty) is ignored; any other run counts as
    camel case when an upper-case letter appears after its first character.

    Returns:
        True if at least one run qualifies, otherwise False.
    """
    separators = {ord(ch): ' ' for ch in string if not ch.isalpha()}
    for piece in string.translate(separators).split(' '):
        if all(ch.isupper() for ch in piece):
            # All-caps (or empty) pieces are acronyms/gaps, not camel case.
            continue
        if any(ch.isupper() for ch in piece[1:]):
            return True
    return False

#### Feature extraction



In [71]:
def word2features(sent, i):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    ``sent`` is a sequence of tuples whose first element is the word.  The
    result combines features of the word itself with features of its left
    and right neighbours; a 'BOS' / 'EOS' flag is set instead when the
    corresponding neighbour does not exist.

    NOTE(review): stop-word membership is tested on the word as written, not
    on its lower-cased form, and ``np.log(len(word))`` is -inf for an empty
    word -- confirm both are intended.
    """
    current = sent[i][0]
    current_lower = current.lower()
    features = {
        'bias': 1.0,
        'word.lower()': current_lower,
        'word[-3:]': current[-3:],
        'word[-2:]': current[-2:],
        'word.isupper()': current.isupper(),
        'word.istitle()': current.istitle(),
        'word.isdigit()': current.isdigit(),
        'word.isStopword()': current in stop_words,
        'word.isalnum()': current.isalnum(),
        'word.endWithASE()': current_lower.endswith('ase'),
        'word.endWithIN()': current_lower.endswith('in'),
        'word.logWordLength': np.log(len(current)),
        'word.isCamelCase': iscamelcase(current),
    }

    # Left context.
    if i <= 0:
        features['BOS'] = True
    else:
        before = sent[i - 1][0]
        before_lower = before.lower()
        features['-1:word.lower()'] = before_lower
        features['-1:word.istitle()'] = before.istitle()
        features['-1:word.isupper()'] = before.isupper()
        features['-1:word.isStopword()'] = before in stop_words
        features['-1:word.isalnum()'] = before.isalnum()
        features['-1:word.endWithASE()'] = before_lower.endswith('ase')
        features['-1:word.endWithIN()'] = before_lower.endswith('in')
        features['-1:word.logWordLength'] = np.log(len(before))
        features['isPreviousWordDash()'] = before == '-'
        features['isPreviousBracket()'] = before in ('(', '[')

    # Right context.
    if i >= len(sent) - 1:
        features['EOS'] = True
    else:
        after = sent[i + 1][0]
        after_lower = after.lower()
        features['+1:word.lower()'] = after_lower
        features['+1:word.istitle()'] = after.istitle()
        features['+1:word.isupper()'] = after.isupper()
        features['+1:word.isStopword()'] = after in stop_words
        features['+1:word.isalnum()'] = after.isalnum()
        features['+1:word.endWithASE()'] = after_lower.endswith('ase')
        features['+1:word.endWithIN()'] = after_lower.endswith('in')
        features['+1:word.logWordLength'] = np.log(len(after))
        features['isNextWordDash()'] = after == '-'
        features['isNextBracket()'] = after in (')', ']')

    return features

def sent2features(sent):
    """Return the list of feature dicts for every position of ``sent``."""
    return [word2features(sent, position) for position, _ in enumerate(sent)]

def sent2labels(sent):
    """Return the labels of a sentence given as (token, label) pairs."""
    labels = []
    for _token, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the tokens of a sentence given as (token, label) pairs."""
    tokens = []
    for token, _label in sent:
        tokens.append(token)
    return tokens

In [72]:
# Build per-sentence feature dicts (X_*) and gold label sequences (y_*) for
# the training and held-out subsets.
# NOTE(review): getMeSentences is re-evaluated on every line, so each subset
# is converted twice.
X_train = [sent2features(s) for s in getMeSentences(data.training_set.sentences)]
X_test = [sent2features(s) for s in getMeSentences(data.testing_set.sentences)]
y_train = [sent2labels(s) for s in getMeSentences(data.training_set.sentences)]
y_test = [sent2labels(s) for s in getMeSentences(data.testing_set.sentences)]

In [73]:
# Baseline model: L-BFGS training with c1 = c2 = 0.1, at most 100 iterations,
# with all_possible_transitions enabled; fitted on the training subset.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [80]:
# Predict tags for the held-out sentences and display the weighted F1 over
# the labels in `classes`.
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=classes)

0.9513313418418229

In [25]:
# Per-label precision / recall / F1 on the held-out subset.
print(metrics.flat_classification_report(y_test, y_pred, labels = classes))



              precision    recall  f1-score   support

           I       0.78      0.72      0.75      4928
           B       0.82      0.73      0.77      3309
           O       0.97      0.98      0.98     68134

    accuracy                           0.95     76371
   macro avg       0.86      0.81      0.83     76371
weighted avg       0.95      0.95      0.95     76371



In [None]:
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV

# Hyper-parameter search: c1 and c2 are left unset on the estimator and are
# sampled from exponential distributions by RandomizedSearchCV.
# NOTE: this rebinds the global `crf` to an unfitted estimator.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=classes)

# search (n_iter=50 sampled settings, cv=3, n_jobs=-1)
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

In [None]:
# Summarise the search: chosen parameters, their CV score, and the size of
# the best estimator (its `size_` attribute divided by one million).
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

In [None]:
# Switch the global `crf` to the best model found by the search and
# re-evaluate it on the held-out subset (this overwrites `y_pred`).
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred, labels=classes))

In [26]:
from collections import Counter

def print_transitions(trans_features):
    """Print one "from -> to weight" line per transition feature.

    ``trans_features`` is an iterable of ``((label_from, label_to), weight)``
    items.
    """
    for transition, weight in trans_features:
        label_from, label_to = transition
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

# List every learned transition weight, highest first.
print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common())

Top likely transitions:
O      -> O       1.757874
I      -> I       1.360925
O      -> B       0.763143
B      -> I       0.728457
B      -> O       -0.604968
I      -> O       -0.921152
B      -> B       -5.131351
I      -> B       -5.647604
O      -> I       -10.845987


In [27]:
def print_state_features(state_features):
    """Print one "weight label attribute" line per state feature.

    ``state_features`` is an iterable of ``((attr, label), weight)`` items.
    """
    for feature, weight in state_features:
        attr, label = feature
        print("%0.6f %-8s %s" % (weight, label, attr))

# Show the three state features with the largest weights ...
print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(3))

# ... and the three with the smallest (most negative) weights.
print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-3:])

Top positive:
8.399155 O        BOS
7.610397 B        BOS
6.913589 B        word.lower():interferon

Top negative:
-3.870480 B        word.isStopword()
-3.968371 B        word[-2:]:Da
-4.521928 B        word.isdigit()


### ELI5

ELI5 is a Python package that helps debug machine learning classifiers and explain their predictions. ELI5 can display the weights of sklearn_crfsuite.CRF models.

In [28]:
import eli5


# Render the transition table and the top 10 weighted features per label.
eli5.show_weights(crf, top=10)

From \ To,B,I,O
B,-5.131,0.728,-0.605
I,-5.648,1.361,-0.921
O,0.763,-10.846,1.758

Weight?,Feature,Unnamed: 2_level_0
Weight?,Feature,Unnamed: 2_level_1
Weight?,Feature,Unnamed: 2_level_2
+7.610,BOS,
+6.914,word.lower():interferon,
+5.679,word.lower():ets,
+4.979,word.lower():fibrinogen,
+4.734,word.lower():histone,
+4.336,word.lower():albumin,
+4.335,word[-2:]:4p,
+4.294,word[-2:]:g1,
+4.231,word[-2:]:k2,
… 9160 more positive …,… 9160 more positive …,

Weight?,Feature
+7.610,BOS
+6.914,word.lower():interferon
+5.679,word.lower():ets
+4.979,word.lower():fibrinogen
+4.734,word.lower():histone
+4.336,word.lower():albumin
+4.335,word[-2:]:4p
+4.294,word[-2:]:g1
+4.231,word[-2:]:k2
… 9160 more positive …,… 9160 more positive …

Weight?,Feature
+4.167,word.lower():sites
+3.606,-1:word.lower():hly
+3.450,word.lower():region
+3.302,-1:word.lower():gcn3
+3.274,-1:word.lower():cych
+3.242,-1:word.lower():long
+3.227,-1:word.lower():5
+3.166,-1:word.lower():activation
+3.149,-1:word.lower():homeotic
+3.101,+1:word.lower():35

Weight?,Feature
+8.399,BOS
+6.560,word.lower():release
+5.250,word.lower():increase
+5.194,word.lower():disease
+5.025,word.lower():contains
+4.459,word.lower():phase
+4.416,word.lower():within
+4.184,word.lower():decrease
+4.168,word.lower():represses
+4.119,-1:word.lower():spc1


In [15]:
# Experiment: retrain with a much larger c1 (200), only 20 iterations and
# all_possible_transitions disabled, then inspect the resulting weights.
# NOTE: this rebinds the global `crf`; later cells use whichever model was
# fitted last.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=200,
    c2=0.1,
    max_iterations=20,
    all_possible_transitions=False,
)
crf.fit(X_train, y_train)
eli5.show_weights(crf, top=10)

From \ To,B,I,O
B,0.0,1.965,-1.455
I,0.0,2.447,-0.983
O,2.815,0.0,2.49

Weight?,Feature,Unnamed: 2_level_0
Weight?,Feature,Unnamed: 2_level_1
Weight?,Feature,Unnamed: 2_level_2
+1.122,-1:word.lower():the,
+0.944,-1:word.lower():of,
+0.890,+1:word.lower():-,
+0.719,word.isupper(),
+0.618,-1:word.lower():and,
… 14 more positive …,… 14 more positive …,
… 14 more negative …,… 14 more negative …,
-0.335,word[-2:]:-,
-0.335,word.lower():-,
-0.335,word[-3:]:-,

Weight?,Feature
+1.122,-1:word.lower():the
+0.944,-1:word.lower():of
+0.890,+1:word.lower():-
+0.719,word.isupper()
+0.618,-1:word.lower():and
… 14 more positive …,… 14 more positive …
… 14 more negative …,… 14 more negative …
-0.335,word[-2:]:-
-0.335,word.lower():-
-0.335,word[-3:]:-

Weight?,Feature
+0.684,-1:word.lower():-
+0.619,word[-3:]:ase
+0.506,word[-3:]:tor
+0.446,word[-2:]:se
+0.430,word[-2:]:ne
+0.395,word[-3:]:ene
+0.371,word.lower():gene
+0.333,word.isdigit()
… 23 more positive …,… 23 more positive …
… 36 more negative …,… 36 more negative …

Weight?,Feature
+2.106,EOS
+0.911,word[-2:]:ed
+0.790,BOS
+0.787,word[-2:]:.
+0.787,word[-3:]:.
+0.787,word.lower():.
+0.738,word.lower():the
+0.737,word[-2:]:he
… 48 more positive …,… 48 more positive …
… 43 more negative …,… 43 more negative …


In [16]:
# Refit with the baseline settings (c1 = c2 = 0.1, 100 iterations, all
# transitions) and show only the transition table.
# NOTE: this rebinds the global `crf` again.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
crf.fit(X_train, y_train);
eli5.show_weights(crf, top=5, show=['transition_features'])

From \ To,B,I,O
B,-5.696,1.307,-0.873
I,-6.373,2.111,-1.05
O,-0.249,-10.799,1.407


To make the output easier to read, we can inspect only a subset of the tags.

In [29]:
# Restrict the display to the listed labels, in the order given.
eli5.show_weights(crf, top=10, targets=['O', 'B', 'I'])

From \ To,O,B,I
O,1.758,0.763,-10.846
B,-0.605,-5.131,0.728
I,-0.921,-5.648,1.361

Weight?,Feature,Unnamed: 2_level_0
Weight?,Feature,Unnamed: 2_level_1
Weight?,Feature,Unnamed: 2_level_2
+8.399,BOS,
+6.560,word.lower():release,
+5.250,word.lower():increase,
+5.194,word.lower():disease,
+5.025,word.lower():contains,
+4.459,word.lower():phase,
+4.416,word.lower():within,
+4.184,word.lower():decrease,
+4.168,word.lower():represses,
+4.119,-1:word.lower():spc1,

Weight?,Feature
+8.399,BOS
+6.560,word.lower():release
+5.250,word.lower():increase
+5.194,word.lower():disease
+5.025,word.lower():contains
+4.459,word.lower():phase
+4.416,word.lower():within
+4.184,word.lower():decrease
+4.168,word.lower():represses
+4.119,-1:word.lower():spc1

Weight?,Feature
+7.610,BOS
+6.914,word.lower():interferon
+5.679,word.lower():ets
+4.979,word.lower():fibrinogen
+4.734,word.lower():histone
+4.336,word.lower():albumin
+4.335,word[-2:]:4p
+4.294,word[-2:]:g1
+4.231,word[-2:]:k2
… 9160 more positive …,… 9160 more positive …

Weight?,Feature
+4.167,word.lower():sites
+3.606,-1:word.lower():hly
+3.450,word.lower():region
+3.302,-1:word.lower():gcn3
+3.274,-1:word.lower():cych
+3.242,-1:word.lower():long
+3.227,-1:word.lower():5
+3.166,-1:word.lower():activation
+3.149,-1:word.lower():homeotic
+3.101,+1:word.lower():35


Or check only some of the features for all tags.

In [31]:
# Show only features whose name starts with "is" (the dash/bracket context
# flags), one table per label.
eli5.show_weights(crf, top=10, feature_re='^is',
                  horizontal_layout=False, show=['targets'])



Weight?,Feature
0.121,isNextWordDash()
0.067,isPreviousBracket()
-0.094,isNextBracket()
-0.859,isPreviousWordDash()

Weight?,Feature
0.88,isNextBracket()
0.713,isPreviousWordDash()
-0.161,isPreviousBracket()
-0.382,isNextWordDash()

Weight?,Feature
0.256,isNextWordDash()
0.065,isPreviousBracket()
-0.081,isNextBracket()
-0.304,isPreviousWordDash()


In [30]:
# Show only features whose name matches '^word.is'.
# NOTE(review): the '.' is an unescaped regex wildcard, so any character is
# accepted between "word" and "is" -- confirm that is intended.
eli5.show_weights(crf, top=10, feature_re='^word.is',
                  horizontal_layout=False, show=['targets'])

Weight?,Feature
+6.914,word.lower():interferon
+5.679,word.lower():ets
+4.979,word.lower():fibrinogen
+4.734,word.lower():histone
+4.336,word.lower():albumin
+4.335,word[-2:]:4p
+4.294,word[-2:]:g1
+4.231,word[-2:]:k2
+4.182,word[-2:]:k1
… 7193 more positive …,… 7193 more positive …

Weight?,Feature
+4.167,word.lower():sites
+3.450,word.lower():region
+2.958,word.lower():hydrolases
+2.948,word.lower():hormone
+2.828,word.lower():promoters
+2.764,word.lower():histocompatibility
+2.754,word.lower():sequence
+2.668,word.lower():ras
… 3864 more positive …,… 3864 more positive …
… 657 more negative …,… 657 more negative …

Weight?,Feature
+6.560,word.lower():release
+5.250,word.lower():increase
+5.194,word.lower():disease
+5.025,word.lower():contains
+4.459,word.lower():phase
+4.416,word.lower():within
+4.184,word.lower():decrease
+4.168,word.lower():represses
+4.044,word.lower():min
+3.998,word.lower():t1


## Writing To Files

In [81]:
# Write the predictions for the held-out subset, one token per line as
# "<1-based position>\t<word>\t<predicted tag>", with an empty line after
# each sentence.  `k` indexes y_pred, which is assumed to be in the same
# order as iteration over data.testing_set.sentences.
with open('yoursystemoutput.txt', 'w') as f:
    k = 0
    for key in data.testing_set.sentences:
        for i,val in enumerate(zip(data.testing_set.sentences[key].words,y_pred[k])):
            f.write("\t".join([str(i+1),val[0],val[1]]) + "\n")
        k += 1
        f.write("\n")

# Write the matching gold-standard file in the same format, using the tags
# stored with each held-out sentence.
with open('goldstandardfile.txt', 'w') as f:
    for key in data.testing_set.sentences:
        for i,val in enumerate(zip(data.testing_set.sentences[key].words,data.testing_set.sentences[key].tags)):
            f.write("\t".join([str(i+1),val[0],val[1]]) + "\n")
        f.write("\n")

In [20]:
# Tag the unlabelled test file with the current `crf` model and write the
# result as "<1-based position>\t<word>\t<predicted tag>" lines, with an
# empty line after each sentence.
test_data = TestDataset("tags-universal.txt", "S21-gene-test.txt")

X_testFinal = [sent2features(s) for s in getMeTestSentences(test_data.sentences)]

y_predTestFinal = crf.predict(X_testFinal)

with open('testFinal.txt', 'w') as f:
    k = 0  # position of the current sentence in y_predTestFinal
    for key in test_data.sentences:
        for i,val in enumerate(zip(test_data.sentences[key].words,y_predTestFinal[k])):
            f.write("\t".join([str(i+1),val[0],val[1]]) + "\n")
        k += 1
        f.write("\n")

# Experimentation



In [None]:


def getMeSentences(data):
    """Turn labelled sentences into (word, tag) lists with 'UNK' substitution.

    Works like a plain zip of ``words`` and ``tags`` for every value of
    ``data``, except that a word which is not a key of the global
    ``counter`` is replaced by the placeholder 'UNK'.

    Returns:
        A list with one inner list per key of ``data``, in iteration order.
    """
    sentences = []
    for key in data:
        sentences.append([
            (word, tag) if word in counter else ('UNK', tag)
            for word, tag in zip(data[key].words, data[key].tags)
        ])
    return sentences

In [None]:
# Rebuild the held-out features with whichever getMeSentences is currently
# defined; this overwrites the earlier X_test.
X_test = [sent2features(s) for s in getMeSentences(data.testing_set.sentences)]


In [None]:
# Display the converted held-out sentences for inspection.
getMeSentences(data.testing_set.sentences)

In [None]:
# Count how often each word occurs in the training file.  Lines are parsed
# the same way as in the readers: strip, split on tabs, drop the first
# column, keep rows with exactly two remaining columns (word, tag).
# The resulting global `counter` is looked up by helper functions defined in
# other cells, so this cell has to be run before they are called.
with open('S21-gene-train.txt', 'r') as f:
        sentence_lines = [l.split("\n") for l in f.read().split("\n\n")]
        index = 1  # not used below
        temp2 = []
        for s in sentence_lines:
            temp = []
            for l in s:
                temp.append(l.strip().split("\t")[1:])    
            for val in temp:
                if len(val) == 2:
                    temp2.append(val[0])                        
        counter = Counter(temp2)
# Collect the words that occur exactly once and report how many there are.
temp = []
for key in counter:
    if counter[key] == 1:
        temp.append(key)

# print(temp)
print(len(temp))


17197


In [119]:
# Same prediction/export steps as for the main test file, applied to a
# different input file and written to 'testFinal_hunar2.txt'.
test_data = TestDataset("tags-universal.txt", "S21-gene-test_hunar.txt")

X_testFinal = [sent2features(s) for s in getMeTestSentences(test_data.sentences)]

y_predTestFinal = crf.predict(X_testFinal)

with open('testFinal_hunar2.txt', 'w') as f:
    k = 0  # position of the current sentence in y_predTestFinal
    for key in test_data.sentences:
        for i,val in enumerate(zip(test_data.sentences[key].words,y_predTestFinal[k])):
            f.write("\t".join([str(i+1),val[0],val[1]]) + "\n")
        k += 1
        f.write("\n")

In [116]:
# Display the converted test sentences for inspection.
getMeTestSentences(test_data.sentences)

[[('UNK', ''), (',',), ('UNK', ''), ('UNK', '')]]

In [114]:
# def getMeTestSentences(data):
#     sentences = []
#     for key in data:
#         sentence = []
#         for val in zip(data[key].words):
#             if val[0] not in counter:
#                 sentence.append('UNK')
#             else:
#                 sentence.append(val)
#         sentences.append(sentence)
#     return sentences

def getMeTestSentences(data):
    """Turn unlabelled sentences into tuple lists with 'UNK' substitution.

    A word that is a key of the global ``counter`` is kept as the 1-tuple
    ``(word,)``; any other word becomes the 2-tuple ``('UNK', '')``.  In
    both cases the word is element 0 of the tuple.

    Returns:
        A list with one inner list per key of ``data``, in iteration order.
    """
    sentences = []
    for key in data:
        sentences.append([
            (word,) if word in counter else ('UNK', '')
            for word in data[key].words
        ])
    return sentences

In [117]:
# Recompute the test features with the currently defined getMeTestSentences.
X_testFinal = [sent2features(s) for s in getMeTestSentences(test_data.sentences)]

In [98]:
# Display the converted test sentences (the output shown comes from an
# earlier execution, per the cell number).
getMeTestSentences(test_data.sentences)

[['UNK', (',',), 'UNK', 'UNK']]

In [118]:
# Display the feature dicts built for the test sentences.
X_testFinal

[[{'bias': 1.0,
   'word.lower()': 'unk',
   'word[-3:]': 'UNK',
   'word[-2:]': 'NK',
   'word.isupper()': True,
   'word.istitle()': False,
   'word.isdigit()': False,
   'word.isStopword()': False,
   'word.isalnum()': True,
   'word.endWithASE()': False,
   'word.endWithIN()': False,
   'word.logWordLength': 1.0986122886681098,
   'word.isCamelCase': False,
   'BOS': True,
   '+1:word.lower()': ',',
   '+1:word.istitle()': False,
   '+1:word.isupper()': False,
   '+1:word.isStopword()': False,
   '+1:word.isalnum()': False,
   '+1:word.endWithASE()': False,
   '+1:word.endWithIN()': False,
   '+1:word.logWordLength': 0.0,
   'isNextWordDash()': False,
   'isNextBracket()': False},
  {'bias': 1.0,
   'word.lower()': ',',
   'word[-3:]': ',',
   'word[-2:]': ',',
   'word.isupper()': False,
   'word.istitle()': False,
   'word.isdigit()': False,
   'word.isStopword()': False,
   'word.isalnum()': False,
   'word.endWithASE()': False,
   'word.endWithIN()': False,
   'word.logWordLeng