In [None]:
# Installing the Required Libraries
!pip -q install h5py
!pip -q install typing-extensions
!pip -q install wheel
!pip -q install imgaug==0.2.5
!pip -q install malaya
!pip -q install tensorflow==2.9.0
!pip -q install tensorflow_addons
!pip -q install xlsxwriter

In [None]:
import regex as re
import malaya
import math
import pandas as pd
import nltk
import xlsxwriter
from nltk import word_tokenize
# Download NLTK data packages when this cell runs. word_tokenize and
# nltk.pos_tag are used later in this notebook - presumably 'punkt' and
# 'averaged_perceptron_tagger' back them (verify against the NLTK docs).
# NOTE(review): newer NLTK releases may look for differently named packages
# (e.g. 'punkt_tab'); confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# NOTE(review): nothing visible in this notebook reads 'tagsets' - confirm it is still needed.
nltk.download('tagsets')

In [None]:
# Malay POS tagger (Malaya transformer, 'bert' variant). malay_pos() calls
# model.analyze() on this module-level object.
model = malaya.pos.transformer(model = 'bert')

In [None]:
# Malay tag groups, compared against each tagged item's "type" in malay_pos().
# The groups are listed in the same order as the 12 counter slots
# (slot 0 = noun, 1 = pronoun, ... 10 = symbol, 11 = everything else).
mal_noun = ["NOUN","PROPN"]
mal_pron = ["PRON"]
# NOTE(review): "ADX" is presumably the tagger's auxiliary-verb label - confirm
# against the tag list of the Malaya version in use.
mal_verb = ["ADX","VERB"]
mal_adv = ["ADV"]
mal_adj = ["ADJ"]
mal_adp = ["ADP"]
mal_conj = ["CCONJ","SCONJ"]
mal_det = ["DET"]
mal_par = ["PART"]
mal_num = ["NUM"]
mal_sym = ["SYM"]
# Not read by malay_pos(): any tag outside the groups above lands in slot 11.
mal_oth = ["X"]

# English tag groups, compared against the tag returned by nltk.pos_tag() in
# english_pos(). Same slot order as the Malay groups.
# NOTE(review): tags that appear in none of these lists (e.g. WRB, WDT) are
# counted in the catch-all slot - confirm that is intended.
eng_noun = ["NN","NNP","NNPS","NNS"]
eng_pron = ["PRP","PRP$","WP","WP$"]
eng_verb = ["VB","VBD","VBG","VBN","VBP","VBZ","MD"]
eng_adv = ["RB","RBR","RBS"]
eng_adj = ["JJ","JJR","JJS"]
eng_adp = ["IN"]
eng_conj = ["CC"]
eng_det = ["DT","PDT","TO","EX"]
eng_par = ["RP"]
eng_num = ["CD"]
eng_sym = ["$","(",")",",","--",".",":","SYM","``","''"]
# Not read by english_pos(): any tag outside the groups above lands in slot 11.
eng_oth = ["FW","LS","POS","UH"]


In [None]:
def english_pos(phrase,english_arr):
    """Tally the part-of-speech categories of an English phrase.

    Every character that is neither a word character nor whitespace is removed
    from ``phrase``; the remainder is tokenized with ``word_tokenize`` and
    tagged with ``nltk.pos_tag``. Each tag increments one slot of
    ``english_arr``: slots 0-10 follow the ``eng_*`` groups (noun, pronoun,
    verb, adverb, adjective, adposition, conjunction, determiner, particle,
    number, symbol) and slot 11 collects every tag found in none of them.

    Args:
        phrase: text to tag.
        english_arr: list of 12 counters; it is updated in place.

    Returns:
        The same ``english_arr`` object that was passed in.
    """
    # Order matters: position in this tuple is the counter slot.
    tag_groups = (eng_noun, eng_pron, eng_verb, eng_adv, eng_adj, eng_adp,
                  eng_conj, eng_det, eng_par, eng_num, eng_sym)
    cleaned = re.sub(r'[^\w\s]', '', phrase)
    tagged_tokens = nltk.pos_tag(word_tokenize(cleaned))
    for tagged in tagged_tokens:
        tag = tagged[1]
        # First group containing the tag wins; 11 is the catch-all slot.
        slot = next((idx for idx, group in enumerate(tag_groups) if tag in group), 11)
        english_arr[slot] += 1
    return english_arr

In [None]:
def malay_pos(phrase, malay_arr):
    """Tally the part-of-speech categories of a Malay phrase.

    Every character that is neither a word character nor whitespace is removed
    from ``phrase`` and the remainder is passed to ``model.analyze``. The
    ``"type"`` of each returned item increments one slot of ``malay_arr``:
    slots 0-10 follow the ``mal_*`` groups (noun, pronoun, verb, adverb,
    adjective, adposition, conjunction, determiner, particle, number, symbol)
    and slot 11 collects every tag found in none of them.

    Args:
        phrase: text to tag.
        malay_arr: list of 12 counters; it is updated in place.

    Returns:
        The same ``malay_arr`` object that was passed in.
    """
    cleaned = re.sub(r'[^\w\s]', '', phrase)
    if not cleaned.strip():
        # Nothing is left to tag (the phrase was empty, only punctuation, or
        # only a markup tag that the caller already sliced off). Skip the
        # model call instead of feeding it an empty string.
        return malay_arr

    # Order matters: position in this tuple is the counter slot.
    tag_groups = (mal_noun, mal_pron, mal_verb, mal_adv, mal_adj, mal_adp,
                  mal_conj, mal_det, mal_par, mal_num, mal_sym)
    for item in model.analyze(cleaned):
        tag = item["type"]
        # First group containing the tag wins; 11 is the catch-all slot.
        slot = next((idx for idx, group in enumerate(tag_groups) if tag in group), 11)
        malay_arr[slot] += 1
    return malay_arr

In [None]:
def single_pos(filename):
    """Count English and Malay POS categories in a markup-tagged text file.

    The file is read line by line and every line is split on whitespace. Each
    resulting token is routed by the inline markup around it:

    * ``<english>...</english>`` and ``<interjection>...</interjection>``
      spans are counted with ``english_pos``;
    * ``<malay>...</malay>`` spans are counted with ``malay_pos``;
    * tokens outside any span are ignored.

    Tokens are tagged one at a time (the markup is sliced off first), so a
    multi-word span results in one tagger call per word. Interjection spans
    share the "inside English" flag with English spans.

    Relies on the module-level ``n`` (number of report rows including the
    total) to size the counters.

    Args:
        filename: path of the text file to analyze; it is read as UTF-8.

    Returns:
        ``(english_ar, malay_ar)``: two lists of ``n - 1`` counters, one slot
        per POS category.
    """
    eng_open, eng_close = "<english>", "</english>"
    mal_open, mal_close = "<malay>", "</malay>"
    itj_open, itj_close = "<interjection>", "</interjection>"

    english_ar = [0]*(n-1)
    malay_ar = [0]*(n-1)
    eng_flag = False  # inside an open <english> or <interjection> span
    mal_flag = False  # inside an open <malay> span

    with open(filename, encoding="utf-8") as f:
        for line in f:
            for word in line.split():
                # Branch order is significant: an open English span captures
                # every following token until its closing tag is seen.
                if word.startswith(eng_open) and word.endswith(eng_close):
                    english_ar = english_pos(word[len(eng_open):-len(eng_close)], english_ar)
                elif word.startswith(eng_open):
                    english_ar = english_pos(word[len(eng_open):], english_ar)
                    eng_flag = True
                elif word.endswith(eng_close):
                    eng_flag = False
                    english_ar = english_pos(word[:-len(eng_close)], english_ar)
                elif eng_flag:
                    english_ar = english_pos(word, english_ar)
                elif word.startswith(mal_open) and word.endswith(mal_close):
                    malay_ar = malay_pos(word[len(mal_open):-len(mal_close)], malay_ar)
                elif word.startswith(mal_open):
                    malay_ar = malay_pos(word[len(mal_open):], malay_ar)
                    mal_flag = True
                elif word.endswith(mal_close):
                    mal_flag = False
                    malay_ar = malay_pos(word[:-len(mal_close)], malay_ar)
                elif mal_flag:
                    malay_ar = malay_pos(word, malay_ar)
                elif word.startswith(itj_open) and word.endswith(itj_close):
                    english_ar = english_pos(word[len(itj_open):-len(itj_close)], english_ar)
                elif word.startswith(itj_open):
                    english_ar = english_pos(word[len(itj_open):], english_ar)
                    eng_flag = True
                elif word.endswith(itj_close):
                    eng_flag = False
                    english_ar = english_pos(word[:-len(itj_close)], english_ar)
    return english_ar, malay_ar

In [None]:
# Row labels of the report: the 12 POS categories (one per counter slot, in
# slot order) followed by the "Total" row.
order = ["Noun", "Pronoun", "Verb", "Adverb", "Adjective", "Adposition", "Conjunction", "Determiner", "Particle", "Number", "Symbol", "Other", "Total"]
# Column headers of the table built in analyzer(); the first one becomes the index.
head = ["Part of Speech", "No of English Words", "No of Malay Words","Total no of Words", "Substitution Rate", "CMI"]
# Number of report rows including "Total"; the counter lists hold n-1 slots.
n = len(order)

In [None]:
import math
def _pos_report_row(label, eng_count, mal_count):
    """Build one report row: label, both counts, their sum, rate and CMI."""
    combined = eng_count + mal_count
    if eng_count == 0 and mal_count == 0:
        # No words at all in this category: the ratios are undefined. The row
        # must still have exactly one cell per entry of ``head``.
        return [label, eng_count, mal_count, combined, "NA", "NA"]
    P_eng = eng_count / combined
    P_mal = 1 - P_eng
    eng_substitution_rate = "{:.2f}".format(P_eng * 100)
    CMI = "{:.2f}".format(100 * (1 - max(P_eng, P_mal)))
    return [label, eng_count, mal_count, combined, eng_substitution_rate, CMI]


def analyzer(english_arr,malay_arr):
    """Turn the per-category counters into the report table.

    For each of the first ``n - 1`` categories, and for the overall totals, a
    row is produced with the English count, the Malay count, their sum, the
    substitution rate (English share, in percent) and the CMI computed as
    ``100 * (1 - max(P_eng, P_mal))``. Both figures are formatted as strings
    with two decimals, or ``"NA"`` when both counts are zero.

    Relies on the module-level ``order`` (row labels), ``head`` (column
    headers) and ``n``.

    Args:
        english_arr: per-category English counters.
        malay_arr: per-category Malay counters.

    Returns:
        A pandas DataFrame indexed by 'Part of Speech' with one row per
        category plus a final total row.
    """
    rows = [
        _pos_report_row(order[i], english_arr[i], malay_arr[i])
        for i in range(n-1)
    ]
    rows.append(_pos_report_row(order[-1], sum(english_arr), sum(malay_arr)))
    df = pd.DataFrame(rows, columns = head).set_index('Part of Speech')
    return df

In [None]:
# Run the pipeline: count POS categories in the tagged text file, build the
# report table, and export it.
english_ar, malay_ar = single_pos("CS_Lang_Data.txt")
df = analyzer(english_ar,malay_ar)
# Relative path: the workbook is written next to wherever the kernel is running.
df.to_excel("POS_results.xlsx")