In [1]:
import random
import pandas as pd
import numpy as np
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet as wn
import spacy
from spacy.lang.fr.examples import sentences
from spacy.lang.en.examples import sentences
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop

In [2]:
# Training questions, read from a CSV file given by a path relative to the working directory.
df = pd.read_csv('questions_fr_train.csv')
# spaCy pipeline "fr_core_news_md"; get_tokens_lemmas() calls it to obtain tokens and lemmas.
fr = spacy.load("fr_core_news_md")

In [10]:
def get_tokens_lemmas(sentence):
    """Run the module-level ``fr`` spaCy pipeline on ``sentence``.

    Args:
        sentence: Text to analyse.

    Returns:
        A ``(tokens, lemmas)`` pair of equally long lists: the surface text and
        the lemma of every token, in document order, so that ``tokens[i]`` and
        ``lemmas[i]`` describe the same token.
    """
    doc = fr(sentence)
    surface_forms = [tok.text for tok in doc]
    base_forms = [tok.lemma_ for tok in doc]
    return surface_forms, base_forms

In [12]:
def get_synonyms(word):
    """Return the French WordNet synonyms of ``word``.

    Every French (``'fra'``) lemma name of every synset that ``word`` belongs
    to is collected; ``word`` itself is excluded.

    Args:
        word: The word to look up (callers in this file pass a lower-cased lemma).

    Returns:
        A sorted list of distinct synonyms; empty when WordNet knows none.
        Sorting gives a stable order: a plain ``list(set)`` of strings can
        change order between interpreter runs, which would make index-based
        random picks irreproducible even with a fixed seed.
    """
    synonyms = {
        # WordNet joins multiword lemma names with "_"; turn them back into
        # ordinary text so they can be dropped into a sentence as-is.
        name.replace('_', ' ')
        for synset in wn.synsets(word, lang='fra')
        for name in synset.lemma_names('fra')
    }
    # Drop the query word in both its raw and its de-underscored spelling.
    synonyms.discard(word)
    synonyms.discard(word.replace('_', ' '))
    return sorted(synonyms)

In [3]:
def synonym_replacement(sentence, alpha):
    """Replace some non-stop-word tokens of ``sentence`` with a random synonym.

    ``n = round(alpha * number_of_tokens)`` positions are drawn (with
    replacement, so fewer than ``n`` distinct positions may result) among the
    tokens whose lower-cased lemma is not in ``fr_stop``. Each drawn token that
    has at least one synonym is replaced by one picked at random.

    Args:
        sentence: The text to augment.
        alpha: Fraction of the token count used to compute ``n``.

    Returns:
        The augmented sentence, re-joined from tokens with a light clean-up of
        the spaces around ``-``, ``'`` and ``)``; or ``sentence`` unchanged
        when nothing was replaced.
    """
    tokens, lemmas = get_tokens_lemmas(sentence)
    original_tokens = tokens.copy()
    n = round(alpha * len(lemmas))

    # Positions that may be replaced. Building this list up front (instead of
    # rejection-sampling inside a ``while`` loop) avoids looping forever when
    # every token of the sentence is a stop word.
    eligible = [i for i, lemma in enumerate(lemmas) if lemma.lower() not in fr_stop]
    if n <= 0 or not eligible:
        return sentence

    chosen = {random.choice(eligible) for _ in range(n)}
    # Sorted so that the sequence of random draws below is stable for a given seed.
    for i in sorted(chosen):
        synonyms = get_synonyms(lemmas[i].lower())
        if synonyms:
            tokens[i] = random.choice(synonyms)

    if tokens == original_tokens:
        return sentence
    return ' '.join(tokens).replace(' -', '-').replace('\' ', '\'').replace(' )', ')')

In [4]:
def random_insertion(sentence, alpha):
    """Insert synonyms of random non-stop-word tokens at random positions.

    The following is attempted ``n = round(alpha * number_of_tokens)`` times:
    pick a token whose lower-cased lemma is not in ``fr_stop``, look up its
    synonyms and, if there are any, insert one of them at a random position
    (anywhere from the start to the end of the current token list).

    Args:
        sentence: The text to augment.
        alpha: Fraction of the token count used to compute ``n``.

    Returns:
        The augmented sentence, re-joined from tokens with a light clean-up of
        the spaces around ``-``, ``'`` and ``)``; or ``sentence`` unchanged
        when nothing was inserted.
    """
    tokens, lemmas = get_tokens_lemmas(sentence)
    original_tokens = tokens.copy()
    n = round(alpha * len(lemmas))

    candidates = [i for i, lemma in enumerate(lemmas) if lemma.lower() not in fr_stop]
    # A sentence made only of stop words has nothing to draw from;
    # random.choice() on an empty list would raise IndexError.
    if not candidates:
        return sentence

    for _ in range(n):
        source = random.choice(candidates)
        synonyms = get_synonyms(lemmas[source].lower())
        if synonyms:
            synonym = random.choice(synonyms)
            # randint is inclusive, so len(tokens) means "append at the end".
            position = random.randint(0, len(tokens))
            tokens.insert(position, synonym)

    if tokens == original_tokens:
        return sentence
    return ' '.join(tokens).replace(' -', '-').replace('\' ', '\'').replace(' )', ')')

In [5]:
def random_swap(sentence, alpha):
    """Swap the tokens at two random positions, ``n`` times.

    ``n = round(alpha * number_of_tokens)``, where tokens come from NLTK's
    ``word_tokenize``.

    Args:
        sentence: The text to augment.
        alpha: Fraction of the token count used to compute ``n``.

    Returns:
        The tokens re-joined with spaces, with a light clean-up of the spaces
        around ``-``, ``'`` and ``)``. The result is rebuilt from tokens even
        when no swap happens.
    """
    wt = word_tokenize(sentence)
    n = round(alpha * len(wt))
    # A swap needs two distinct positions.
    if len(wt) >= 2:
        positions = range(len(wt))
        for _ in range(n):
            # Draw positions directly. Looking a randomly chosen *token* up
            # with list.index() always lands on its first occurrence, so
            # repeated tokens could never be swapped from a later position.
            first, second = random.sample(positions, 2)
            wt[first], wt[second] = wt[second], wt[first]
    return ' '.join(wt).replace(' -', '-').replace('\' ', '\'').replace(' )', ')')

In [6]:
def random_deletion(sentence, alpha):
    """Drop each token of ``sentence`` independently with probability ~``alpha``.

    Tokens come from NLTK's ``word_tokenize``; a token is kept when a fresh
    ``random.random()`` draw is greater than ``alpha``.

    Args:
        sentence: The text to augment.
        alpha: Deletion probability applied to every token.

    Returns:
        The surviving tokens re-joined with spaces, with a light clean-up of
        the spaces around ``-``, ``'`` and ``)``. If every token would have
        been deleted, one randomly chosen token is kept so that a non-empty
        sentence never turns into an empty string.
    """
    wt = word_tokenize(sentence)
    kept = [token for token in wt if random.random() > alpha]
    if not kept and wt:
        # Everything was deleted: fall back to a single random token.
        kept = [random.choice(wt)]
    return ' '.join(kept).replace(' -', '-').replace('\' ', '\'').replace(' )', ')')

In [7]:
def create_augmentation(data, alpha, operation):
    """Return a copy of ``data`` whose ``'question'`` column has been augmented.

    Args:
        data: DataFrame with a ``'question'`` column of sentences. It is not
            modified.
        alpha: Strength parameter forwarded to the augmentation function.
        operation: Case-insensitive operation code: ``'sr'`` (synonym
            replacement), ``'ri'`` (random insertion), ``'rs'`` (random swap)
            or ``'rd'`` (random deletion).

    Returns:
        A new DataFrame with the same index and columns as ``data``, where each
        question has been passed through the selected operation.

    Raises:
        ValueError: If ``operation`` is not one of the four known codes
            (previously this silently returned an unmodified copy).
    """
    op = operation.lower()
    if op not in ('sr', 'ri', 'rs', 'rd'):
        raise ValueError(
            "unknown operation %r: expected one of 'sr', 'ri', 'rs', 'rd'" % (operation,)
        )
    augmenters = {
        'sr': synonym_replacement,
        'ri': random_insertion,
        'rs': random_swap,
        'rd': random_deletion,
    }
    augment = augmenters[op]

    data_copy = data.copy()
    # Walk the column positionally instead of addressing rows by the labels
    # 0..n-1, so frames with a filtered, shuffled or otherwise non-default
    # index are handled correctly.
    data_copy['question'] = [augment(question, alpha) for question in data['question']]
    return data_copy

In [8]:
def augment_data(data, k, alpha, operation):
    """Stack ``data`` with ``k`` augmented copies of itself and drop duplicate rows.

    Args:
        data: The original DataFrame; it is not modified.
        k: Number of augmented copies to generate.
        alpha: Strength parameter forwarded to ``create_augmentation``.
        operation: Operation code forwarded to ``create_augmentation``.

    Returns:
        The original rows followed by the rows of each augmented copy, with
        exact duplicate rows removed. The index is renumbered whenever at
        least one augmented copy was appended.
    """
    frames = [data.copy()]
    frames.extend(create_augmentation(data, alpha, operation) for _ in range(k))
    if len(frames) == 1:
        # No augmented copy was produced: keep the original index untouched.
        stacked = frames[0]
    else:
        stacked = pd.concat(frames, ignore_index=True)
    return stacked.drop_duplicates()

In [13]:
# augment the data 2 times using synonym replacement operations
# Seed Python's `random` module first so the random choices made by the
# augmentation functions start from the same state every time this cell runs.
random.seed(42)
sr = augment_data(df, 2, 0.2, 'sr')

In [15]:
# Question with id 1102: the original text and its variants after synonym replacement.
sr.loc[sr['id'] == 1102, 'question'].values

array(['Je suis travailleur salarié(e). Puis-je refuser de faire des heures supplémentaires ou de travailler de nuit ?',
       'Je suis travailleur salarié(e) . Puis-je refuser de faire des moment autre ou de travailler de soir ?',
       'Je suis travailleur salarié(e) . Puis-je refuser de ouvrir des heures autre ou de travailler de nuit ?'],
      dtype=object)