In [1]:
import nltk
import spacy
import random
import pickle
import numpy as np
import pandas as pd

In [None]:
# !pip install deep_translator flair contractions language_tool_python

# Expands English contractions ("it's" -> "it is") during preprocessing.
import contractions

# English -> Irish ("ga") translator used to translate the augmented sentences.
from deep_translator import GoogleTranslator
gt = GoogleTranslator(source='en', target='ga')

# Flair POS tagger ("pos-fast" model) used to tag and re-tag English sentences.
from flair.nn import Classifier
from flair.data import Sentence
tagger = Classifier.load('pos-fast')

# Pre-trained GloVe vectors (200-dimensional, per the model name) used to
# propose similar words; note the variable is called `w2v` for brevity.
from gensim.models import Word2Vec
import gensim.downloader as api
w2v = api.load('glove-wiki-gigaword-200')

# Grammar checker applied to augmented English sentences.
# NOTE(review): 'en-UK' is not a LanguageTool language code (British English
# is 'en-GB'), so the intended variant was not being selected.
import language_tool_python
lt = language_tool_python.LanguageTool('en-GB')

In [4]:
# CGPT corpus: keep the 2nd and 3rd CSV columns (English, Gaeilge) as strings.
cgpt = pd.read_csv(
    "data/CGPT-corpus.csv",
    dtype=str,
    usecols=[1, 2],
)

# Tatoeba export: headerless TSV; keep the 2nd and 4th columns and name them
# to match the CGPT frame so the two can be stacked.
tatoeba = pd.read_csv(
    "data/tatoeba.tsv",
    sep="\t",
    dtype=str,
    usecols=[1, 3],
    names=["English", "Gaeilge"],
)

# One parallel corpus with a fresh 0..n-1 index.
pairs = pd.concat((cgpt, tatoeba), ignore_index=True)
pairs

Unnamed: 0,English,Gaeilge
0,The papers and the machine are sitting there.,Tá na páipéir agus an mheaisín ina suí.
1,It's cold in winter.,Bíonn sé fuar sa Gheimhreadh.
2,I am looking for a new job.,Tá mé ag lorg post nua.
3,That song was on the radio yesterday.,Bhí an t-amhrán sin ar an raidió inné.
4,The water is coming down the wall.,Tá an t-uisce ag teacht ar an mballa.
...,...,...
3383,She has received a good education.,Tá oideachas maith faighte aici.
3384,I'm going to go back to Australia.,Táim ag dul ar ais go dtí an Astráil.
3385,Tom lives in a small house not far from the ri...,Tá Tomás ina chónaí i dteach beag nach bhfuil ...
3386,I won the fight.,Bhuaigh mé an troid.


In [5]:
def preproc_en(x):
    """Normalise an English sentence into a space-separated token string.

    The input is coerced to ``str``, lowercased, has its contractions expanded
    by ``contractions.fix`` and is tokenised with ``nltk.word_tokenize``; the
    tokens are then joined back together with single spaces.
    """
    lowered = str(x).lower()
    expanded = contractions.fix(lowered)
    pieces = nltk.word_tokenize(expanded)
    return " ".join(pieces)

def preproc_ga(x):
    """Lowercase an Irish sentence and return its tokens as a list.

    The input is coerced to ``str`` first; tokenisation is done by
    ``nltk.word_tokenize``. Unlike ``preproc_en`` the tokens are not re-joined.
    """
    lowered = str(x).lower()
    return nltk.word_tokenize(lowered)


# (English, Irish) raw sentence pairs, row by row.
zipped = list(zip(pairs["English"], pairs["Gaeilge"]))

# Each entry: [normalised English string, list of Irish tokens].
tokens = [[preproc_en(en_raw), preproc_ga(ga_raw)] for en_raw, ga_raw in zipped]

# Each entry of `tagged`: [[(english_word, pos_tag), ...], irish_tokens].
tagged = []
for en, ga in tokens:
    en_words = en.split()

    s = Sentence(en)
    tagger.predict(s)
    flair_tokens = list(s)

    # The tag is read from the token's string form (second-to-last field).
    # Indexing by position assumes flair's tokens line up with the
    # whitespace-separated words of `en`.
    tags = [str(flair_tokens[pos]).split()[-2] for pos in range(len(en_words))]
    tagged.append([list(zip(en_words, tags)), ga])

# Spot-check one random entry.
tagged[random.randrange(len(tagged))]

[[('okay', 'UH'), ('.', '.')], ['ceart', 'go', 'leor', '.']]

In [6]:
# Cache the tagged corpus so the slow tagging cell need not be re-run.
# The context manager guarantees the file is flushed and closed.
with open("tagged.pkl", "wb") as fh:
    pickle.dump(tagged, fh)

In [7]:
# Reload the cached tagged corpus from the same relative path the dump cell
# writes ("tagged.pkl"); the absolute /kaggle/working path only resolved on
# Kaggle. Only unpickle files you produced yourself: pickle can run arbitrary
# code on load.
with open("tagged.pkl", "rb") as fh:
    tagged = pickle.load(fh)

In [8]:
# English side of every pair: one list of (word, tag) tuples per sentence.
en_list = [pair_en for pair_en, _pair_ga in tagged]

# Flatten to (lowercased word, tag) pairs, keeping purely alphabetic words only.
en_flat = [
    (lowered, pos)
    for sent in en_list
    for lowered, pos in ((token.lower(), token_tag) for token, token_tag in sent)
    if lowered.isalpha()
]

# word -> tag lookup; when a word occurs with several tags the last one wins.
word_corpus = dict(en_flat)

In [9]:
# NOTE(review): configured identically to `gt`; not used in the cells visible here.
t = GoogleTranslator(source='en', target='ga')

# Flat list of every token in the English corpus (lowercased, contractions
# expanded), fed to NLTK's ContextIndex for context-based similar-word lookup.
corpus_words = []
for english_text in pairs["English"]:
    corpus_words.extend(nltk.word_tokenize(contractions.fix(english_text.lower())))

idx = nltk.text.ContextIndex(corpus_words)

Take a random POS-tagged sentence pair from the corpus.

Iterate through the English sentence; for each word whose POS tag is in `pos_list`:

Find words similar to the chosen word and check each candidate for a matching POS tag.

Get the Irish translation of the chosen word and check that it exists in the Irish sentence.

For the first candidate found with a matching tag, get its translation and replace the Irish word with it.

In [10]:
# Demo: augment one randomly chosen sentence pair, printing every step.
r = random.choice(tagged)

# Mutable [word, tag] copy of the English side. Accepted substitutions are
# written into it so later words are re-tagged in the already-modified sentence.
lr = [[word, tag] for word, tag in r[0]]

print(" ".join([word[0] for word in r[0]]), "->", " ".join(r[1]))

# Penn-Treebank-style tags whose words are eligible for substitution.
pos_list = ["NN", "VBG", "NNP", "NNS", "JJ", 'PRP', 'PRP$', 'RB']

en_new = []
for i, (word, tag) in enumerate(lr):
    if tag in pos_list:
        print("in at word", i, word, tag)

        # Candidates: words sharing contexts in the corpus plus GloVe neighbours.
        # most_similar raises KeyError for out-of-vocabulary words, so only
        # query the embedding when the word is known to it.
        glove_neighbours = [w for w, sim in w2v.most_similar(word)] if word in w2v else []
        similar_words = idx.similar_words(word) + glove_neighbours
        random.shuffle(similar_words)
        print(similar_words)

        for similar in similar_words:
            # Try the candidate in place and re-tag the whole sentence.
            lr[i][0] = similar
            s = Sentence(" ".join([word for word, tag in lr]))
            print(s)
            tagger.predict(s)
            # The tag is the second-to-last field of the token's string form.
            new_tag = str(list(s)[i]).split()[-2]
            print(tag, new_tag)

            if tag == new_tag:
                # Corpus-derived candidates may be missing from the embedding.
                if word in w2v and similar in w2v:
                    print(w2v.similarity(word, similar))
                en_new.append(similar)
                break

        else:
            # No candidate kept the tag: undo the last trial substitution so the
            # following words are not tagged in a corrupted sentence.
            lr[i][0] = word
            en_new.append(word)
    else:
        en_new.append(word)

# Clean up the augmented sentence, then translate it to Irish.
en = " ".join(en_new)
if len(lt.check(en)) > 0:
    en = lt.correct(en)

ga = gt.translate(en)

print(en, "->", ga)

have you tried it yet ? -> an ndearna tú iarracht é fós ?
in at word 1 you PRP
['why', 'me', 'not', 'i', '?', "n't", 'we', 'her', 'it', 'she', 'really', 'those', 'who', 'i', 'where', 'do', 'school', 'sure', 'tom', "'ll", 'what', 'sleep', 'know', 'ireland', 'want', 'how', 'do', 'they', 'there', 'me']
Sentence[6]: "have why tried it yet ?"
PRP WRB
Sentence[6]: "have me tried it yet ?"
PRP PRP
0.8585071
in at word 3 it PRP
['he', 'peige', 'way', 'everything', 'cáit', 'i', 'colm', 'that', 'everyone', 'english', 'which', 'this', 'irish', 'here', 'tom', 'this', 'so', 'now', 'john', 'what', 'but', '.', 'what', 'she', 'there', 'how', 'just', 'where', 'that', 'life']
Sentence[6]: "have me tried he yet ?"
PRP PRP
0.7161806
in at word 4 yet RB
['exactly', 'why', 'so', 'anything', 'indeed', 'now', 'them', 'that', 'that', 'fact', 'one', 'done', 'her', 'still', 'french', 'but', 'german', 'you', 'though', 'although', 'not', 'tom', 'even', 'him', 'russian', 'english']
Sentence[6]: "have me tried he ex

In [11]:
# Augment every sentence pair (in random order): substitute eligible English
# words with same-tag alternatives, grammar-correct the result and translate it.
augmented_list = []
failed_substitutions = 0  # words kept unchanged because candidate search raised

for r in random.sample(tagged, len(tagged)):

    # Mutable [word, tag] copy; accepted substitutions are written into it so
    # later words are re-tagged in the already-modified sentence.
    lr = [[word, tag] for word, tag in r[0]]
    en_new = []

    for i, (word, tag) in enumerate(lr):
        chosen = word  # fall back to the original word unless a candidate fits
        if tag in pos_list:
            try:
                similar_words = idx.similar_words(word) + [w for w, sim in w2v.most_similar(word)]
                random.shuffle(similar_words)
                for similar in similar_words:
                    # Try the candidate in place and re-tag the whole sentence.
                    lr[i][0] = similar
                    s = Sentence(" ".join([w for w, _tag in lr]))
                    tagger.predict(s)
                    # The tag is the second-to-last field of the token's string form.
                    new_tag = str(list(s)[i]).split()[-2]

                    if tag == new_tag:
                        chosen = similar
                        break
            except Exception:
                # Best effort (e.g. the word is missing from the embedding):
                # keep the original word instead of dropping it from the
                # sentence, and count the failure so it stays visible.
                failed_substitutions += 1

        # Commit the final choice; this also undoes a rejected trial
        # substitution so it cannot leak into the tagging of later words.
        lr[i][0] = chosen
        en_new.append(chosen)

    if en_new:
        en = " ".join(en_new)
        if len(lt.check(en)) > 0:
            en = lt.correct(en)

        ga = gt.translate(en)
        augmented_list.append([en, ga])

print("words kept unchanged after an error:", failed_substitutions)

# Shuffle the rows and renumber them from 0. `.reindex(range(n))` would re-sort
# the rows by their original labels and undo the shuffle, so use
# `reset_index(drop=True)`.
augmented = (
    pd.DataFrame(augmented_list, columns=["English", "Gaeilge"])
    .sample(frac=1)
    .reset_index(drop=True)
)
augmented.to_csv("data/augmented.csv")
augmented

KeyboardInterrupt: 

In [None]:
# Tagger comparison: NLTK's POS tagger on a whitespace-tokenised question.
sample_tokens = "What does a Sovietologist study ?".split()
nltk.pos_tag(sample_tokens)

In [None]:
# Tagger comparison: spaCy's large English pipeline on a sample question.
nlp = spacy.load('en_core_web_lg')
doc = nlp("What does a Biologist study?")

# One "text POS" line per token.
for tok in doc:
    print(f"{tok.text} {tok.pos_}")

In [None]:
from flair.data import Sentence
from flair.nn import Classifier

# Tagger comparison: flair's fast POS model on the same kind of question.
tagger = Classifier.load('pos-fast')

# Build the sentence and annotate it in place with POS tags.
sentence = Sentence('What does a Biologist study?')
tagger.predict(sentence)

# Show the sentence together with its predicted tags.
print(sentence)

# Tag of the first token: second-to-last field of the token's string form.
first_token = list(sentence)[0]
str(first_token).split()[-2]

In [None]:
# POS tag of the first token, read from the token's string form.
sentence_tokens = list(sentence)
str(sentence_tokens[0]).split()[-2]