In [None]:
# Data Augmentation 


In [1]:
import pandas as pd
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
from nlpaug.flow.sometimes import Sometimes
import os
path_models = os.getenv('MODELS_PATH')

In [2]:
path_models='/Users/philippedesaintchamas/ML_models/'

In [3]:
text = 'The quick brown fox jumps over the lazy dog .'

## Keyboard distance
#aug = nac.KeyboardAug()
#augmented_text = aug.augment(text)
print("Original:")
print(text)

Original:
The quick brown fox jumps over the lazy dog .


In [None]:
#BERT Augmentator
TOPK=20 #default=100
ACT = 'insert' #"substitute"

aug_bert = naw.ContextualWordEmbsAug(
    model_path='distilbert-base-uncased', 
    device='cuda',
    action=ACT, top_k=TOPK)
print("Original:")
print(text)
print("Augmented Text:")
for ii in range(5):
    augmented_text = aug_bert.augment(text)
    print(augmented_text)

In [4]:
synonym_aug = naw.SynonymAug(aug_src='wordnet')
glove_aug = naw.WordEmbsAug(
    #model_type='word2vec', model_path=path_models + 'GoogleNews-vectors-negative300',
    model_type='glove', model_path=path_models + 'glove.6B.300d.txt',
    action="substitute")
w2v_aug = naw.WordEmbsAug(
    model_type='word2vec', model_path=path_models + 'GoogleNews-vectors-negative300',
    #model_type='glove', model_path=path_models + 'glove.6B.300d.txt',
    action="substitute")

In [25]:
pipelines= [Sometimes([nac.KeyboardAug(),synonym_aug, glove_aug], pipeline_p=0.1, aug_p=0.1),
            Sometimes([nac.KeyboardAug(),synonym_aug, w2v_aug], pipeline_p=0.1, aug_p=0.1),
            Sometimes([nac.KeyboardAug(),synonym_aug, glove_aug], pipeline_p=0.3, aug_p=0.1),
            Sometimes([nac.KeyboardAug(),synonym_aug, glove_aug], pipeline_p=0.1, aug_p=0.3),
            Sometimes([nac.KeyboardAug(),synonym_aug, w2v_aug], pipeline_p=0.2, aug_p=0.2)]

In [24]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional,GlobalMaxPooling1D
from keras.optimizers import Adam

maxlen=300

# Read train, validation and test data, and train TF-IDF matrix
def prepare_dataset_imdb(which_data, for_sklearn=True, augmenter=None, number_copy=1) :
    for dataset in ['train','dev'] :
        url = f'https://raw.githubusercontent.com/acmi-lab/counterfactually-augmented-data/master/sentiment/combined/paired/{dataset}_paired.tsv'
        df = pd.read_csv(url, error_bad_lines=False, sep='\t')
        
        # Split data on original/counterfactual
        if which_data == 'counterfactual' :
            _df = df[df.index%2 == 0] 
            df = df[df.index%2 == 1] # keep only the revised reviews
        else :
            df = df[df.index%2 == 0] # keep only the original reviews
            _df = df.copy(deep=True)

        # Data augmentation
        if which_data =='augmented':
            for text,label,_id in tqdm(zip(_df['Text'].tolist(),_df['Sentiment'].tolist(),_df['batch_id'].tolist())) :
                augmented_texts = augmenter.augment(text,n=number_copy)
                if type(augmented_texts) != list :
                    augmented_texts = [augmented_texts]
                for new_text in augmented_texts :
                    if new_text != text :
                        df = df.append({'Text' : new_text , 'Sentiment' : label, 'batch_id': _id} , ignore_index=True)
        
        if dataset == 'train':
            y_train = df['Sentiment'].tolist()
            if for_sklearn :
                vec = TfidfVectorizer(decode_error='ignore', strip_accents='unicode', encoding='utf-8', min_df=10, max_df=500)
                vec.fit(_df['Text'])
                X_train = vec.transform(df['Text']) # fit and transform are made on different datasets
            else :
                tokenizer = Tokenizer(num_words=20001,oov_token='UNK')
                tokenizer.fit_on_texts(_df['Text'].tolist())
                print('Found %s unique tokens.' % len(tokenizer.word_index))
                sequences = tokenizer.texts_to_sequences(df['Text'].tolist())
                X_train = pad_sequences(sequences, maxlen=maxlen)
                y_train = np.array([int(y == 'Positive') for y in y_train]).reshape((-1,1))
            print("Train matrix dimensionality: ", X_train.shape)
        elif dataset == 'dev': 
            y_dev = df['Sentiment'].tolist()
            if for_sklearn :
                X_dev = vec.transform(df['Text'])
            if not for_sklearn :
                sequences = tokenizer.texts_to_sequences(df['Text'].tolist())
                X_dev = pad_sequences(sequences, maxlen=maxlen)
                y_dev = np.array([int(y == 'Positive') for y in y_dev]).reshape((-1))
            print("Dev matrix dimensionality: ", X_dev.shape)

    return({'X_train':X_train, 'y_train':y_train, 'X_dev':X_dev,'y_dev':y_dev})

def print_results(original, augmented) :
    print(f"Train Accuracy augmented: {accuracy_score(augmented['y_train'], augmented['model'].predict(augmented['X_train']))}")
    print(f"Train Accuracy original : {accuracy_score(original['y_train'], original['model'].predict(original['X_train']))}")
    print(f"Train Accuracy augmented on original data : {accuracy_score(original['y_train'], augmented['model'].predict(original['X_train']))}")
    print(f"Dev Accuracy augmented : {accuracy_score(augmented['y_dev'], augmented['model'].predict(augmented['X_dev']))}")
    print(f"Dev Accuracy original: {accuracy_score(original['y_dev'], original['model'].predict(original['X_dev']))}")
    print(f"Dev Accuracy augmented on original data: {accuracy_score(original['y_dev'], augmented['model'].predict(original['X_dev']))}")
    return True

def print_results_nn(original, augmented) :
    print(f"Train Accuracy augmented: {accuracy_score(augmented['y_train'], np.round(augmented['model'].predict(augmented['X_train'])))}")
    print(f"Train Accuracy original : {accuracy_score(original['y_train'], np.round(original['model'].predict(original['X_train'])))}")
    print(f"Train Accuracy augmented on original data : {accuracy_score(original['y_train'], np.round(augmented['model'].predict(original['X_train'])))}")
    print(f"Dev Accuracy augmented : {accuracy_score(augmented['y_dev'], np.round(augmented['model'].predict(augmented['X_dev'])))}")
    print(f"Dev Accuracy original: {accuracy_score(original['y_dev'], np.round(original['model'].predict(original['X_dev'])))}")
    print(f"Dev Accuracy augmented on original data: {accuracy_score(original['y_dev'], np.round(augmented['model'].predict(original['X_dev'])))}")
    return True

def train_models(model,list_dict_data, percentage_train=1) :
    """
    Train, and print learning curve if NN
    """
    list_dict_data_models = []
    for dataset in list_dict_data :
        if model == 'biLSTM' :
            bi_LSTM = Sequential()
            bi_LSTM.add(Embedding(20000, 50, input_length=300))
            #model.add(GlobalMaxPooling1D())
            bi_LSTM.add(Bidirectional(LSTM(50,recurrent_dropout=0.5,recurrent_activation='tanh')))
            bi_LSTM.add(Dense(50, activation='relu'))
            bi_LSTM.add(Dense(1, activation='sigmoid'))

            opt = Adam(learning_rate=1e-3)
            bi_LSTM.compile(opt, 'binary_crossentropy', metrics=['accuracy'])

            callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
            print('Train...')
            history = bi_LSTM.fit(dataset['X_train'], dataset['y_train'],batch_size=32, callbacks=[callback],
          epochs=20,validation_data=[dataset['X_dev'], dataset['y_dev']])
            dataset['model'] = bi_LSTM
            dataset['history'] = history
        else :
            history = model.fit(dataset['X_train'], dataset['y_train'])
            dataset['model'] = model
            dataset['history'] = history
        list_dict_data_models.append(dataset)
    return list_dict_data_models


In [26]:
original = prepare_dataset_imdb('original', for_sklearn=True)
counterfactual = prepare_dataset_imdb('counterfactual', for_sklearn=True)
[original,counterfactual] = train_models(SVC(),[original,counterfactual], percentage_train=1)
print_results(original, counterfactual)
for pipeline in pipelines :
    augmented = prepare_dataset_imdb('augmented', for_sklearn=True, augmenter= pipeline)
    [augmented] = train_models(SVC(),[augmented], percentage_train=1)
    print_results(original, augmented)

Train matrix dimensionality:  (1707, 2173)
Dev matrix dimensionality:  (245, 2173)
Train matrix dimensionality:  (1707, 2173)
Dev matrix dimensionality:  (245, 2173)
Train Accuracy augmented: 0.9988283538371412
Train Accuracy original : 0.2032806092560047
Train Accuracy augmented on original data : 0.2032806092560047
Dev Accuracy augmented : 0.889795918367347
Dev Accuracy original: 0.5142857142857142
Dev Accuracy augmented on original data: 0.5142857142857142


1707it [05:22,  5.29it/s]


Train matrix dimensionality:  (2966, 2173)


245it [00:52,  4.70it/s]


Dev matrix dimensionality:  (426, 2173)
Train Accuracy augmented: 0.9996628455832771
Train Accuracy original : 0.2032806092560047
Train Accuracy augmented on original data : 0.9994141769185706
Dev Accuracy augmented : 0.8661971830985915
Dev Accuracy original: 0.5142857142857142
Dev Accuracy augmented on original data: 0.8653061224489796


1707it [25:06,  1.13it/s]


Train matrix dimensionality:  (2930, 2173)


245it [03:11,  1.28it/s]


Dev matrix dimensionality:  (419, 2173)
Train Accuracy augmented: 0.9989761092150171
Train Accuracy original : 0.2032806092560047
Train Accuracy augmented on original data : 0.9982425307557118
Dev Accuracy augmented : 0.8568019093078759
Dev Accuracy original: 0.5142857142857142
Dev Accuracy augmented on original data: 0.8612244897959184


1707it [09:23,  3.03it/s]


Train matrix dimensionality:  (3387, 2173)


245it [01:16,  3.22it/s]


Dev matrix dimensionality:  (487, 2173)
Train Accuracy augmented: 1.0
Train Accuracy original : 0.2032806092560047
Train Accuracy augmented on original data : 1.0
Dev Accuracy augmented : 0.864476386036961
Dev Accuracy original: 0.5142857142857142
Dev Accuracy augmented on original data: 0.8612244897959184


1707it [05:07,  5.55it/s]


Train matrix dimensionality:  (2940, 2173)


245it [00:37,  6.50it/s]


Dev matrix dimensionality:  (431, 2173)
Train Accuracy augmented: 0.9996598639455783
Train Accuracy original : 0.2032806092560047
Train Accuracy augmented on original data : 0.9994141769185706
Dev Accuracy augmented : 0.877030162412993
Dev Accuracy original: 0.5142857142857142
Dev Accuracy augmented on original data: 0.8653061224489796


1707it [1:42:36,  3.61s/it]


Train matrix dimensionality:  (3303, 2173)


245it [08:32,  2.09s/it]


Dev matrix dimensionality:  (477, 2173)
Train Accuracy augmented: 1.0
Train Accuracy original : 0.2032806092560047
Train Accuracy augmented on original data : 1.0
Dev Accuracy augmented : 0.8553459119496856
Dev Accuracy original: 0.5142857142857142
Dev Accuracy augmented on original data: 0.8571428571428571


In [None]:
original = prepare_dataset_imdb('original', for_sklearn=False)
counterfactual = prepare_dataset_imdb('counterfactual', for_sklearn=False)
[original,counterfactual] = train_models(SVC(),[original,counterfactual], percentage_train=1)
print_results_nn(original, counterfactual)
for pipeline in pipelines :
    augmented = prepare_dataset_imdb('augmented', for_sklearn=False, augmenter= pipeline)
    print(pipeline)
    [augmented] = train_models(SVC(),[augmented], percentage_train=1)
    print_results_nn(original, augmented)

Found 19981 unique tokens.
Train matrix dimensionality:  (1707, 300)
Dev matrix dimensionality:  (245, 300)
Found 19981 unique tokens.
Train matrix dimensionality:  (1707, 300)
Dev matrix dimensionality:  (245, 300)


  return f(**kwargs)
  return f(**kwargs)


Train Accuracy augmented: 0.8705330990041008
Train Accuracy original : 0.29349736379613356
Train Accuracy augmented on original data : 0.29349736379613356
Dev Accuracy augmented : 0.5183673469387755
Dev Accuracy original: 0.4816326530612245
Dev Accuracy augmented on original data: 0.4816326530612245


1707it [07:13,  3.94it/s]


Found 19981 unique tokens.
Train matrix dimensionality:  (2961, 300)


245it [00:55,  4.45it/s]
  return f(**kwargs)


Dev matrix dimensionality:  (406, 300)
Name:Sometimes_Pipeline, Action:sometimes, Method:flow
Train Accuracy augmented: 0.8682877406281662
Train Accuracy original : 0.29349736379613356
Train Accuracy augmented on original data : 0.8711189220855302
Dev Accuracy augmented : 0.5640394088669951
Dev Accuracy original: 0.4816326530612245
Dev Accuracy augmented on original data: 0.5469387755102041


1707it [38:09,  1.34s/it]


Found 19981 unique tokens.
Train matrix dimensionality:  (2936, 300)


245it [12:21,  3.03s/it]


Dev matrix dimensionality:  (417, 300)
Name:Sometimes_Pipeline, Action:sometimes, Method:flow


  return f(**kwargs)


Train Accuracy augmented: 0.8671662125340599
Train Accuracy original : 0.29349736379613356
Train Accuracy augmented on original data : 0.872290568248389
Dev Accuracy augmented : 0.5539568345323741
Dev Accuracy original: 0.4816326530612245
Dev Accuracy augmented on original data: 0.5346938775510204


1587it [09:13,  2.33it/s]

In [None]:
import pickle
pickle.dump( new, open( "save.p", "wb" ))

In [None]:
augmented = prepare_dataset_imdb('original', for_sklearn=False, create_variation=True, augmenter= pipeline)
counterfactual = prepare_dataset_imdb('revised', for_sklearn=False, create_variation=False)

In [None]:
original = prepare_dataset_imdb('original', for_sklearn=False, create_variation=False)

1

In [None]:
train_models('biLSTM',original,augmented, percentage_train=1)
train_models('biLSTM',original,augmented, percentage_train=1)
print_results(original, augmented)