# Assignment2NLP19: POS Tagging on hi-ud.conllu using CRFSuite

### ReadFile(filename, delimiter) 

In [1]:
def ReadFile(filename, delimiter):
    """Read a POS-tagged corpus file into a list of sentences.

    The first line of the file is skipped as a header. Every other line is
    split on ``delimiter``: field 1 (with surrounding double quotes removed)
    is the word and field 2 is its POS tag. A line consisting of exactly two
    delimiters, or a blank line, marks the end of the current sentence.

    Args:
        filename: Path of the file to read.
        delimiter: Field separator used in the file (a comma or a tab in
            this notebook).

    Returns:
        A list of sentences; each sentence is a non-empty list of
        (word, POS_tag) tuples.

    Raises:
        IndexError: If a data line has fewer than three fields.
    """
    sentence_break = delimiter * 2
    sentences = []
    sentence = []

    # The corpus is Hindi text; decode explicitly as UTF-8 instead of relying
    # on the platform's default encoding, which is not UTF-8 everywhere.
    with open(filename, "r", encoding="utf-8") as f:
        next(f, None)  # skip the header line without loading the whole file

        for line in f:
            row = line.rstrip("\n")

            # Sentence boundary: the explicit "<delim><delim>" marker or a
            # blank line. Works even when the last line has no newline.
            if row == sentence_break or not row.strip():
                if sentence:  # never emit empty sentences
                    sentences.append(sentence)
                    sentence = []
                continue

            # NOTE(review): a plain split does not honour quoted fields that
            # contain the delimiter itself - confirm the data never has them.
            fields = row.strip().split(delimiter)
            sentence.append((fields[1].strip('"'), fields[2]))

    # Flush the last sentence when the file does not end with a separator.
    if sentence:
        sentences.append(sentence)

    return sentences

## Features Chosen:
**Word**           - The Word Itself  
**Word.Lower()**   - The Word reduced to lowercase  
**Word.isTitle()** - Boolean True if first character is in UpperCase  
**Word.isUpper()** - Boolean True if all characters of the string are UpperCase  
**Word.isDigit()** - Boolean True if all characters of the string are Digits  
**Prefix-1**       - Word[0:1]  
**Prefix-2**       - Word[0:2]  
**Prefix-3**       - Word[0:3]  
**Suffix-1**       - Word[-1:]  
**Suffix-2**       - Word[-2:]  
**Suffix-3**       - Word[-3:]  
**has_Hyphen**     - Whether word has hyphen in it  

**BOS**               - If Word is the Beginning of the Sentence  
**-1:Word.Lower()**   - Previous Word reduced to LowerCase   
**-1:Word.isTitle()** - Boolean True if first character of the Previous Word is in UpperCase  
**-1:Word.isUpper()** - Boolean True if all characters of the Previous word are UpperCase  

**EOS**               - If Word is the End of the Sentence  
**+1:Word.Lower()**   - Next Word reduced to LowerCase  
**+1:Word.isTitle()** - Boolean True if first character of the Next Word is in UpperCase  
**+1:Word.isUpper()** - Boolean True if all characters of the Next word are UpperCase  

### WordToFeatures(sentence, index_of_word),   SentenceToFeatures(sentence),    SentenceToLabels(sentence)


In [2]:
def WordToFeatures(sentence, index):
    """Build the feature dictionary for a single word of a sentence.

    Args:
        sentence: list of (Word, POS_Tag) tuples.
        index: position in ``sentence`` of the word to describe.

    Returns:
        dict of {"feature": value} holding the word's own features plus the
        features of its left and right neighbours. 'BOS' is set when there is
        no left neighbour and 'EOS' when there is no right neighbour.
    """
    token = sentence[index][0]
    n_chars = len(token)

    # Features of the word itself. A prefix/suffix of length k is only
    # emitted when the word has at least k characters, otherwise ''.
    features = {}
    features['Word'] = token
    features['Word.Lower()'] = token.lower()
    features['Word.isTitle()'] = token.istitle()
    features['Word.isUpper()'] = token.isupper()
    features['Word.isDigit()'] = token.isdigit()
    features['Prefix-1'] = token[:1]
    features['Prefix-2'] = token[:2] if n_chars >= 2 else ''
    features['Prefix-3'] = token[:3] if n_chars >= 3 else ''
    features['Suffix-1'] = token[-1:]
    features['Suffix-2'] = token[-2:] if n_chars >= 2 else ''
    features['Suffix-3'] = token[-3:] if n_chars >= 3 else ''
    features['has_Hyphen'] = '-' in token

    # Left context: previous word, or the beginning-of-sentence flag.
    if index <= 0:
        features['BOS'] = True
    else:
        before = sentence[index - 1][0]
        features['-1:Word.Lower()'] = before.lower()
        features['-1:Word.isTitle()'] = before.istitle()
        features['-1:Word.isUpper()'] = before.isupper()

    # Right context: next word, or the end-of-sentence flag.
    if index >= len(sentence) - 1:
        features['EOS'] = True
    else:
        after = sentence[index + 1][0]
        features['+1:Word.Lower()'] = after.lower()
        features['+1:Word.isTitle()'] = after.istitle()
        features['+1:Word.isUpper()'] = after.isupper()

    return features


def SentenceToFeatures(sentence):
    """Return the feature dictionaries of every word in a sentence.

    Args:
        sentence: list of (Word, POS_tag) tuples.

    Returns:
        list of {"feature": value} dicts, one per word, in sentence order.
    """
    per_word_features = []
    for position in range(len(sentence)):
        per_word_features.append(WordToFeatures(sentence, position))
    return per_word_features


def SentenceToLabels(sentence):
    """Return the POS tag of every word in a sentence.

    Args:
        sentence: list of (Word, POS_tag) tuples.

    Returns:
        list of POS tags, in sentence order.
    """
    labels = []
    for pair in sentence:
        labels.append(pair[1])
    return labels

In [3]:
# Load both corpora. The training file is read as comma-delimited and the
# test file as tab-delimited.
Train_Set = ReadFile('hi-ud-train.conllu', ',')
Test_Set = ReadFile('hi-ud-test.conllu', '\t')

# Per-sentence feature sequences (X) and gold tag sequences (Y).
X_Train = list(map(SentenceToFeatures, Train_Set))
Y_TrainTrue = list(map(SentenceToLabels, Train_Set))

X_Test = list(map(SentenceToFeatures, Test_Set))
Y_TestTrue = list(map(SentenceToLabels, Test_Set))

## Fitting the Model on the Training Data

In [4]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics

# Hyper-parameters of the CRF, collected in one place so they are easy to tune.
crf_params = {
    'algorithm': 'lbfgs',
    'c1': 0.1,
    'c2': 0.1,
    'max_iterations': 300,
    'all_possible_transitions': True,
}

Model = sklearn_crfsuite.CRF(**crf_params)

# Kept as the last expression so the notebook displays the fitted estimator.
Model.fit(X_Train, Y_TrainTrue)

CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=300,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

## Running the Model on the Training Data

In [5]:
# Banner for the training-set report.
banner_width = 53
print("MODEL PREDICTION ON TRAINING DATA".center(banner_width))
print("-" * banner_width)

# Predict on the same sentences the model was fitted on.
Y_TrainPredicted = Model.predict(X_Train)

# Per-tag breakdown.
print(metrics.flat_classification_report(Y_TrainTrue, Y_TrainPredicted))

# Weighted-average summary scores, followed by plain accuracy.
for caption, weighted_scorer in (
    ('precision: ', metrics.flat_precision_score),
    ('recall:    ', metrics.flat_recall_score),
    ('f1-score:  ', metrics.flat_f1_score),
):
    print(caption, weighted_scorer(Y_TrainTrue, Y_TrainPredicted, average='weighted'))
print('accuracy:  ', metrics.flat_accuracy_score(Y_TrainTrue, Y_TrainPredicted))

          MODEL PREDICTION ON TRAINING DATA          
-----------------------------------------------------
              precision    recall  f1-score   support

         ADJ       1.00      1.00      1.00       570
         ADP       1.00      1.00      1.00      1387
         ADV       0.97      0.98      0.98       111
         AUX       0.98      1.00      0.99       730
       CCONJ       0.99      1.00      1.00       150
       COMMA       1.00      1.00      1.00       114
         DET       1.00      0.99      0.99       231
        NOUN       1.00      1.00      1.00      1597
         NUM       1.00      1.00      1.00       152
        PART       1.00      1.00      1.00       163
        PRON       1.00      1.00      1.00       431
       PROPN       1.00      1.00      1.00       708
       PUNCT       1.00      1.00      1.00       564
       SCONJ       0.98      1.00      0.99        61
        VERB       1.00      0.98      0.99       640
           X       1.00    

## Running the Model on the Testing Data

In [6]:
# Banner for the test-set report.
banner_width = 53
print("MODEL PREDICTION ON TESTING DATA".center(banner_width))
print("-" * banner_width)

# Predict on the held-out test sentences.
Y_TestPredicted = Model.predict(X_Test)

# Per-tag breakdown.
print(metrics.flat_classification_report(Y_TestTrue, Y_TestPredicted))

# Weighted-average summary scores, followed by plain accuracy.
for caption, weighted_scorer in (
    ('precision: ', metrics.flat_precision_score),
    ('recall:    ', metrics.flat_recall_score),
    ('f1-score:  ', metrics.flat_f1_score),
):
    print(caption, weighted_scorer(Y_TestTrue, Y_TestPredicted, average='weighted'))
print('accuracy:  ', metrics.flat_accuracy_score(Y_TestTrue, Y_TestPredicted))

           MODEL PREDICTION ON TESTING DATA          
-----------------------------------------------------
              precision    recall  f1-score   support

         ADJ       0.69      0.74      0.71        94
         ADP       0.96      0.98      0.97       309
         ADV       0.71      0.48      0.57        21
         AUX       0.98      0.96      0.97       139
       CCONJ       1.00      1.00      1.00        25
         DET       0.82      0.89      0.85        36
        NOUN       0.78      0.90      0.83       329
         NUM       0.92      0.92      0.92        25
        PART       0.97      1.00      0.99        33
        PRON       0.92      0.83      0.87        65
       PROPN       0.71      0.46      0.56       145
       PUNCT       1.00      1.00      1.00       135
       SCONJ       0.75      1.00      0.86         3
        VERB       0.89      0.87      0.88        99

    accuracy                           0.87      1458
   macro avg       0.86   

### printTransitions(transition)

In [7]:
def printTransitions(transitions):
    """Print one "POS_Tag1 => POS_Tag2 weight" line per transition.

    Args:
        transitions: iterable of (edge, weight) pairs, where ``edge[0]`` is
            the source tag and ``edge[1]`` the destination tag.
    """
    for item in transitions:
        edge, weight = item
        source_tag = edge[0]
        target_tag = edge[1]
        print("%-6s =>  %-9s %0.5f" % (source_tag, target_tag, weight))

## Printing the 10 Most Common and Least Common Transition Features

In [8]:
from collections import Counter

# Rank every learned transition once, from highest to lowest weight.
ranked_transitions = Counter(Model.transition_features_).most_common()

# Head of the ranking: the ten highest-weighted transitions.
print("Top 10 Most Common POS Transition Features:")
print("-" * 43)
printTransitions(ranked_transitions[:10])
print("\n")

# Tail of the ranking: the ten lowest-weighted transitions.
print("Top 10 Least Common POS Transition Features:")
print("-" * 44)
printTransitions(ranked_transitions[-10:])
print("\n")

Top 10 Most Common POS Transition Features:
-------------------------------------------
VERB   =>  AUX       4.24992
PROPN  =>  PROPN     3.52655
ADJ    =>  NOUN      3.06800
NUM    =>  NOUN      2.47544
DET    =>  NOUN      2.22345
AUX    =>  AUX       2.13554
NOUN   =>  ADP       2.09563
PROPN  =>  ADP       2.05048
NOUN   =>  VERB      1.77167
VERB   =>  SCONJ     1.69359


Top 10 Least Common POS Transition Features:
--------------------------------------------
ADP    =>  CCONJ     -1.24472
ADP    =>  AUX       -1.24605
AUX    =>  ADP       -1.27069
PROPN  =>  AUX       -1.31203
DET    =>  CCONJ     -1.34330
ADP    =>  COMMA     -1.40946
CCONJ  =>  AUX       -1.74072
ADJ    =>  PRON      -1.92173
ADJ    =>  ADP       -2.21294
DET    =>  ADP       -2.41440


