In [None]:
!python --version

Python 2.7.17


In [None]:
# Verbosity of the diagnostic printing done by the scoring code:
#   > 0  pre-processed sentence pairs and the scaled LM probabilities
#   > 1  additionally the sentences wrapped in <s> ... </s>
#   > 2  additionally the score of every n-gram
VERBOSE_LEVEL = 0

In [None]:
# Mount Google Drive at /gdrive (needs the google.colab package) and make the
# mount point the working directory.
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive

Mounted at /gdrive
/gdrive


In [None]:
import os
# Work from the project folder: the models directory and the output file are
# resolved relative to the current working directory.
os.chdir('/gdrive/MyDrive/AMFM')

In [None]:
# Common python modules
import os
import sys
import string
import json
import argparse
import signal
import pickle as pickle
from lm import ArpaLM
import unicodedata
import joblib as jlib
from functools import partial

try:
    import numpy as np
except:
    print("Error: Requires numpy from http://www.numpy.org/. Have you installed numpy?")
    sys.exit()


try:
    from sklearn.externals import joblib
except:
    print("Error: Requires sklearn from http://scikit-learn.org/. Have you installed scikit?")
    sys.exit()


try:
    from scipy.spatial.distance import cosine
except:
    print("Error: Requires scipy from http://scipy.org/. Have you installed scipy?")
    sys.exit()

In [None]:
# Root folder of the trained models, located under the current working
# directory. The trailing slash is part of the value.
models_dir = '{0}/models/'.format(os.getcwd())
print(models_dir)

/gdrive/MyDrive/AMFM/models/


In [None]:
# Global variables
# Languages whose sentences are tokenized on whitespace. Sentences in any
# other language are split into single characters before scoring.
WORD_TOKENS = ['en', 'in', 'ko', 'hi', 'my', 'bn', 'ml', 'si', 'ta', 'te', 'ur', 'ru', 'km']

In [None]:
def _lang_settings(num_train_sent, opt_am_size, ngram_order):
    """Build the settings dict of one language inside a dataset entry.

    Only the three arguments differ between the configured languages; the
    remaining values are the same for all of them.
    """
    return {
        'NUM_TRAIN_SENT': num_train_sent,  # Number of sentences used during training
        'FULL_AM_SIZE': 2500,              # Max size of the trained AM model
        'OPT_AM_SIZE': opt_am_size,        # Optimal value for the trained AM model
        'NGRAM_ORDER': ngram_order,        # Order the FM score calculation
        'NUM_FOLD': 0,                     # Fold used to train the models
        'PREFIX_AM_FM': 'train',           # Prefix for the AM-FM models
        'ALPHA': 0.5,                      # Interpolation value for AM-FM
    }


# Per-dataset configuration: the model folder (relative to the models
# directory) plus one settings dict per supported language.
CONF_VALUES = {
    'EN_HI_MULTIMODAL': {
        'ROOT_DIR': 'EN_HI_MULTIMODAL/en_hi_multimodal',
        'en': _lang_settings(num_train_sent=20000, opt_am_size=1500, ngram_order=1),
        'hi': _lang_settings(num_train_sent=20000, opt_am_size=150, ngram_order=1),
    },
    'WAT2019_EN_HI': {
        'ROOT_DIR': 'WAT2019_EN_HI/en_hi',
        'hi': _lang_settings(num_train_sent=20000, opt_am_size=2000, ngram_order=3),
        'en': _lang_settings(num_train_sent=20000, opt_am_size=500, ngram_order=2),
    },
    'WAT2019_EN_TA': {
        'ROOT_DIR': 'WAT2019_EN_TA/en_ta',
        'ta': _lang_settings(num_train_sent=15000, opt_am_size=2000, ngram_order=3),
        'en': _lang_settings(num_train_sent=20000, opt_am_size=250, ngram_order=1),
    },
}

In [None]:
# First table: every ASCII punctuation character except - ' % is replaced by
# a space. (The characters are joined with a space, so the space character
# itself is also a key and maps to a space.)
sc = {'-', "'", '%'}
to_remove = ' '.join(ch for ch in string.punctuation if ch not in sc)
table = {ord(char): u' ' for char in to_remove}


# Second table: every ASCII punctuation character except , ! ? . gets a space
# inserted in front of it.
sc = {',', '!', '?', '.'}
to_separate = ''.join(ch for ch in string.punctuation if ch not in sc)
table_separate = {ord(char): u' ' + char for char in to_separate}

In [None]:
#tbl = dict((char, u' ') for char in range(sys.maxunicode) if unicodedata.category(chr(char)).startswith('P'))


In [None]:
# Implementation of the vector space model
class VSM:
    """Vector space model backing the AM score.

    A matrix is loaded from ``<model_file>.h5`` and a vectorizer from
    ``<model_file>.dic``. Sentences are vectorized, projected onto the first
    ``size_am`` columns of the matrix and compared by cosine similarity.
    """

    def __init__(self, model_file, size_am):
        """Load the model files and select the matrix columns to use.

        :param model_file: path prefix of the model; ``.h5`` and ``.dic`` are
            appended to it.
        :param size_am: number of leading columns of the loaded matrix that
            take part in the projection.
        """
        self.am = None
        self.vectorizer = None
        self.load(model_file)
        self.am_components = self.am[:, 0:size_am]
        # Projected vectors keyed by sentence; shared by reference and
        # candidate sentences.
        self.cache_refvectors = dict()

    def _project(self, sentence):
        """Return the projection of ``sentence``, computing it at most once."""
        try:
            return self.cache_refvectors[sentence]
        except KeyError:
            # Vectorize only on a cache miss; a cached sentence needs no work.
            projected = self.vectorizer.transform([sentence]).dot(self.am_components)
            self.cache_refvectors[sentence] = projected
            return projected

    # Function to get the distance between a set of reference and test sentences
    def search(self, ref_sentence, test_sentence):
        """Return the cosine similarity of the two projected sentences.

        :param ref_sentence: reference sentence (already pre-processed).
        :param test_sentence: candidate sentence (already pre-processed).
        :return: ``1 - cosine distance``, floored at 0.0.
        """
        ref = self._project(ref_sentence)
        tgt = self._project(test_sentence)

        # scipy documents cosine() for 1-D vectors, while projecting a single
        # sentence gives a 1 x N row; flatten before measuring the distance.
        distance = cosine(np.asarray(ref).ravel(), np.asarray(tgt).ravel())
        return max(0.0, 1.0 - distance)  # Avoid sending negative distances

    # Load models
    def load(self, name_model):
        """Load ``self.am`` and ``self.vectorizer`` from ``name_model``.

        Three loaders are tried in turn for each file: ``jlib``, then the
        ``joblib`` bundled with sklearn, then plain ``pickle``. An exception
        raised by the last loader propagates to the caller.

        NOTE: all three loaders unpickle the file and unpickling can run
        arbitrary code, so only trusted model files must be loaded.

        :param name_model: path prefix; ``.h5`` and ``.dic`` are appended.
        """
        am_path = name_model + '.h5'
        dic_path = name_model + '.dic'

        # WAT2019: added because incompatibilities when reading old files created using python2
        try:
            self.am = jlib.load(am_path)
            print("Loading using jlib")
            print("Data type of AM model is ")
            print(type(self.am))
            # Kept inside the try on purpose: a loaded object without .shape
            # (the recorded run got a ZNDArrayWrapper from this loader) raises
            # here and thereby triggers the fallback loader below.
            print(self.am.shape)
        except Exception:
            try:
                print("Loading Python 2.7 compatible model using joblib/sklearn 0.17.1")
                self.am = joblib.load(am_path)
                print("Data type of AM model is ")
                print(type(self.am))
            except Exception:
                # pickle.load() needs the open file object, not the path.
                with open(am_path, "rb") as file_h:
                    self.am = pickle.load(file_h)
                print("Loading using pickle")
                print("Data type of AM model is ")
                print(type(self.am))

        # WAT2019: added because incompatibilities when reading old files created using python2
        try:
            self.vectorizer = jlib.load(dic_path)
        except Exception:
            try:
                self.vectorizer = joblib.load(dic_path)
            except Exception:
                with open(dic_path, "rb") as file_h:
                    self.vectorizer = pickle.load(file_h)

In [None]:
class calcScoresAMFM:
    """Scorer combining the AM (vector space) and FM (language model) metrics.

    The settings of the dataset/language pair are read from ``CONF_VALUES``
    and the model files are looked up below ``models_dir``.
    """

    def __init__(self, dataset, lang='en', am=True, fm=True):
        """Read the configuration and load the requested models.

        :param dataset: key of ``CONF_VALUES`` selecting the dataset.
        :param lang: language key inside that dataset entry.
        :param am: the AM model is checked and loaded only when this is
            exactly ``True``.
        :param fm: the FM model is checked and loaded only when this is
            exactly ``True``.

        Prints an error and terminates through ``sys.exit()`` when a required
        model file is missing or empty.
        """
        # Load configuration variables for language
        self.DATASET_DIR = CONF_VALUES[dataset]['ROOT_DIR']
        lang_conf = CONF_VALUES[dataset][lang]
        self.FULL_AM_SIZE = lang_conf['FULL_AM_SIZE']
        self.OPT_AM_SIZE = lang_conf['OPT_AM_SIZE']
        self.NUM_TRAINING_SIZE = lang_conf['NUM_TRAIN_SENT']
        self.PREFIX_AM_FM = lang_conf['PREFIX_AM_FM']
        self.NGRAM_ORDER = lang_conf['NGRAM_ORDER']
        self.NUM_FOLD = lang_conf['NUM_FOLD']
        self.alpha = lang_conf['ALPHA']
        self.lang = lang
        self.am = am
        self.fm = fm
        self.cache_lm = dict()  # Store previously calculated n-gram values for speed

        model_root = os.path.join(models_dir, self.DATASET_DIR)

        if self.am is True:
            # Check that the AM models exist
            am_name = '.'.join([self.PREFIX_AM_FM, lang, str(self.NUM_TRAINING_SIZE),
                                str(self.FULL_AM_SIZE), str(self.NUM_FOLD)])
            am_full_matrix = os.path.join(model_root, am_name)
            am_files = (am_full_matrix + '.h5', am_full_matrix + '.dic')
            # sys.exit() is used rather than the interactive exit() helper,
            # matching the import cell of this file.
            if not all(os.path.isfile(path) for path in am_files):
                print('******* ERROR: files: ' + am_full_matrix + '.h5 or ' + am_full_matrix + '.dic does not exists.')
                sys.exit()
            elif any(os.path.getsize(path) == 0 for path in am_files):
                print('******* ERROR: Check if files: ' + am_full_matrix + '.h5 or ' + am_full_matrix +
                      '.dic are not empty.')
                sys.exit()

        print('Starting loading models for language %s ...' % (lang))
        if self.am is True:
            # Load the models
            print('Loading AM model... for size' + str(self.OPT_AM_SIZE))
            self.vs = VSM(am_full_matrix, self.OPT_AM_SIZE)

        if self.fm is True:
            # Check that the LM model exists
            lm_name = '.'.join([self.PREFIX_AM_FM, lang, str(self.NGRAM_ORDER), 'lm'])
            lm_model = os.path.join(model_root, lm_name)
            if not os.path.exists(lm_model):
                print("******* ERROR: LM file " + lm_model + ' does not exists.')
                sys.exit()
            elif os.path.getsize(lm_model) == 0:
                print("******* ERROR: LM file " + lm_model + ' is empty.')
                sys.exit()
            print('Loading FM model...' + str(self.NGRAM_ORDER))
            print(lm_model)
            self.lm = ArpaLM(lm_model)

        print('Finished loading models for language %s ...' % (lang))

    # Perform basic pre-processing applied during training
    def doProcessFromStrings(self, ref, pred):
        """Pre-process a reference/prediction pair for this scorer's language.

        :return: tuple ``(ref, pred)`` of pre-processed strings.
        """
        ref = self.preProcess(ref, self.lang)
        pred = self.preProcess(pred, self.lang)
        return ref, pred

    def remove_punctuation(self, word):
        """Return ``word`` without the characters whose Unicode category starts with 'P'."""
        return "".join(char for char in word if not unicodedata.category(char).startswith('P'))

    # Pre-Processing for each sentence. In the case of languages different to English we perform tokenization
    # per character
    def preProcess(self, s, lang):
        """Normalize, strip punctuation from, tokenize and lower-case ``s``.

        :param s: sentence (unicode text).
        :param lang: language code; languages outside ``WORD_TOKENS`` are
            tokenized per character.
        :return: space-joined lower-cased tokens, or ``'_EMPTY_'`` when ``s``
            has length zero.
        """
        if len(s) == 0:  # To avoid empty lines
            return '_EMPTY_'

        # Perform some normalization for UTF-8
        s = unicodedata.normalize('NFKC', s)

        # Remove some punctuation
        s = s.translate(table)
        s = s.translate(table_separate)

        # Tokenization by characters except for those in the list
        if lang not in WORD_TOKENS:
            tokens = [' '.join(word.strip()) for word in s.split()]
        else:
            tokens = s.split()

        return ' '.join(tokens).lower()

    def _sentence_logprob(self, sentence, label):
        """Sum the LM scores of all n-grams of ``sentence``.

        The sentence is wrapped in ``<s>`` / ``</s>`` and every n-gram ending
        at positions 1..end is scored; scores are cached per n-gram string.

        :param sentence: space-separated tokens.
        :param label: prefix used for the verbose printout of the sentence.
        :return: tuple ``(summed score, number of tokens without the markers)``.
        """
        sent = '<s> ' + sentence.strip() + ' </s>'
        if VERBOSE_LEVEL > 1:
            print(label + ': ' + sent)
        aWords = sent.split()
        total = 0.0
        # Calculates the log-prob for the different n-grams
        for i in range(1, len(aWords)):
            words = aWords[max(0, i - self.NGRAM_ORDER + 1):i + 1]
            ngram = ' '.join(words)
            # Try to speed calculation by using cache values
            if ngram in self.cache_lm:
                val = self.cache_lm[ngram]
            else:
                val = self.lm.score(tuple(words))
                self.cache_lm[ngram] = val
            total += val
            if VERBOSE_LEVEL > 2:
                print('words: ' + ngram + ' value: ' + str(val))
        return total, len(aWords) - 2

    # Function to calculate the FM metric using language models
    def calculateFMMetric(self, ref, tst):
        """Return the FM score of ``tst`` against ``ref``.

        Each summed LM score is divided by the sentence length and passed
        through ``exp``; the result is the smaller of the two values divided
        by the larger one.

        :param ref: pre-processed reference sentence.
        :param tst: pre-processed candidate sentence.
        :return: the ratio described above, floored at 0.0; 0.0 when either
            sentence contains no tokens.
        """
        if self.lang not in WORD_TOKENS:
            ref = ' '.join(list(ref.strip()))
            tst = ' '.join(list(tst.strip()))

        prob_ref, num_words_ref = self._sentence_logprob(ref, 'REF')
        prob_tst, num_words_tst = self._sentence_logprob(tst, 'SUB')

        # A sentence without tokens cannot be length-normalized (it would
        # divide by zero); score it 0.0, the value processSubmission also
        # reports for empty input.
        if num_words_ref == 0 or num_words_tst == 0:
            return 0.0

        # Calculate the scaled probability
        prob_ref = np.exp(prob_ref / num_words_ref)
        prob_tst = np.exp(prob_tst / num_words_tst)
        if VERBOSE_LEVEL > 0:
            print('LM -> REF: ' + str(prob_ref) + ' SUB: ' + str(prob_tst))
        return max(0.0, min(prob_tst, prob_ref)/max(prob_tst, prob_ref))

    # Functionality to calculate the AM score using the vector space model
    def calculateAMMetric(self, ref, pred):
        """Return the AM score: the similarity reported by ``self.vs.search``."""
        return self.vs.search(ref, pred)


In [None]:
# Build the scorer for language 'ta' of the 'WAT2019_EN_TA' dataset. Both the
# AM and the FM model are loaded, so the model files must exist under the
# models directory.
targetlang = 'ta'
targetdataset= 'WAT2019_EN_TA'
cs = calcScoresAMFM(dataset=targetdataset, lang=targetlang, am=True, fm=True)

Starting loading models for language ta ...
Loading AM model... for size2000
<type 'str'>
Loading using jlib
Data type of AM model is 
<class 'sklearn.externals.joblib.numpy_pickle_compat.ZNDArrayWrapper'>
Loading Python 2.7 compatible model using joblib/sklearn 0.17.1
Data type of AM model is 
<class 'numpy.matrix'>




Loading FM model...3
/gdrive/MyDrive/AMFM/models//WAT2019_EN_TA/en_ta/train.ta.3.lm
/gdrive/MyDrive/AMFM/models//WAT2019_EN_TA/en_ta/train.ta.3.lm
Finished loading models for language ta ...


In [None]:
def processSubmission(target, submission, cs, fm, am):
    """Score one candidate sentence against its reference.

    :param target: reference sentence.
    :param submission: candidate sentence.
    :param cs: scorer providing ``doProcessFromStrings``, ``calculateFMMetric``,
        ``calculateAMMetric`` and ``alpha``.
    :param fm: the FM score is computed only when this is exactly ``True``.
    :param am: the AM score is computed only when this is exactly ``True``.
    :return: tuple ``(res_am_fm, res_am, res_fm, alpha)``. A score that was
        not computed is ``-1.0``; all three scores are ``0.0`` when either
        pre-processed sentence is empty.
    """
    target, submission = cs.doProcessFromStrings(ref=target, pred=submission)
    if VERBOSE_LEVEL > 0:
        print('POST_REF: ' + target + ' SUB: ' + submission)

    # Nothing to score when pre-processing left one side empty.
    if not (len(target) > 0 and len(submission) > 0):
        return (0.0, 0.0, 0.0, cs.alpha)

    want_fm = fm is True
    want_am = am is True

    res_fm = cs.calculateFMMetric(target, submission) if want_fm else -1.0
    # The AM score is capped at 1.0.
    res_am = min(1.0, cs.calculateAMMetric(target, submission)) if want_am else -1.0

    # The interpolated score needs both components.
    if want_am and want_fm:
        res_am_fm = cs.alpha * res_am + (1.0 - cs.alpha) * res_fm
    else:
        res_am_fm = -1.0

    return (res_am_fm, res_am, res_fm, cs.alpha)


In [None]:
# Sanity check on a single sentence pair; the result tuple is shown as the
# cell output.
sample_reference = unicode("காலை வணக்கம்", "utf-8")
sample_candidate = unicode("மாலை  வணக்கம்", "utf-8")
processSubmission(sample_reference, sample_candidate, cs, True, True)

(0.6564248247632654, 0.7500000000000002, 0.5628496495265305, 0.5)

In [None]:
# Input files, read in parallel line by line by the scoring loop: the
# candidate sentences to score and the reference sentences they are compared
# against.
candidateFile = "/gdrive/MyDrive/AMFM/EnTa/microsoft_en_ta_results.ta"
referenceFile ="/gdrive/MyDrive/AMFM/EnTa/corpus.bcn.dev.ta"

In [None]:
import pandas as pd

# Columns of the results table, in the order in which the scoring loop
# supplies the values of a row. The order is given explicitly because a
# DataFrame built from a plain dict does not keep insertion order on every
# Python/pandas version.
# The table starts empty: no placeholder row ends up in the exported results,
# and the builtin name `dict` is no longer overwritten (other cells call
# dict()).
RESULT_COLUMNS = ['ReferenceTxt', 'CandidateTxt', 'res_am_fm', 'res_am', 'res_fm', 'res_alpha']
resultDf = pd.DataFrame(columns=RESULT_COLUMNS)

In [None]:
# Score every candidate line against the reference line at the same position
# and append the results to resultDf.
import io

score_rows = []
# io.open decodes the UTF-8 text while reading (on Python 2 and 3), so the
# lines can be passed to the scorer directly.
with io.open(referenceFile, encoding='utf-8') as refFile, \
        io.open(candidateFile, encoding='utf-8') as canFile:
    for s_candidate in canFile:
        # readline() yields '' once the reference file is exhausted, so the
        # remaining candidates are still scored (against an empty reference).
        s_reference = refFile.readline()
        res_am_fm, res_am, res_fm, res_alpha = processSubmission(s_reference, s_candidate, cs, True, True)
        score_rows.append({
            'ReferenceTxt': s_reference,
            'CandidateTxt': s_candidate,
            'res_am_fm': res_am_fm,
            'res_am': res_am,
            'res_fm': res_fm,
            'res_alpha': res_alpha,
        })

# Rows are matched to the columns by name (not by position) and appended in
# one step instead of growing the frame row by row.
resultDf = pd.concat(
    [resultDf, pd.DataFrame(score_rows, columns=resultDf.columns)],
    ignore_index=True)

In [None]:
# Export the score table; the relative path places the file in the current
# working directory.
resultDf.to_excel("output_ta.xlsx")