In [7]:
import pandas as pd
import numpy as np
import data_loader 
import sklearn 
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt


In [8]:
import torch
import torch.nn as nn

# Generate synthetic data
def generate_synthetic_data(batch_size=4, input_dim=768):
    # Create random tensors to simulate input data
    data1 = torch.randn(batch_size, input_dim)
    data2 = torch.randn(batch_size, input_dim)
    return data1, data2

# Define a simple linear layer for processing the data
class SimpleLinearModel(nn.Module):
    def __init__(self, input_dim=768, output_dim=4):
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)
    
    def forward(self, x):
        return self.linear(x)

# Main function to process synthetic data
def main():
    batch_size = 4
    input_dim = 768
    output_dim = 4

    # Generate synthetic data
    data1, data2 = generate_synthetic_data(batch_size, input_dim)

    # Print the shapes of the data tensors
    print("Shape of data1:", data1.shape)
    print("Shape of data2:", data2.shape)

    # Initialize the model
    model = SimpleLinearModel(input_dim=input_dim, output_dim=output_dim)

    # Process the data through the linear layer
    logits1 = model(data1)
    logits2 = model(data2)

    # Add the two logits
    combined_logits = logits1 + logits2

    # Print the shapes and values for verification
    print("logits1 shape:", logits1.shape)
    print("logits1 values:\n", logits1)
    print("logits2 shape:", logits2.shape)
    print("logits2 values:\n", logits2)
    print("combined_logits shape:", combined_logits.shape)
    print("combined_logits values:\n", combined_logits)


main()


Shape of data1: torch.Size([4, 768])
Shape of data2: torch.Size([4, 768])
logits1 shape: torch.Size([4, 4])
logits1 values:
 tensor([[ 0.3866, -0.3396,  0.2579,  0.2788],
        [-0.1393, -0.0660,  0.9027, -0.6021],
        [-0.4014,  0.8838,  0.1307,  1.0273],
        [-0.0736,  0.1333, -1.3827, -0.0546]], grad_fn=<AddmmBackward0>)
logits2 shape: torch.Size([4, 4])
logits2 values:
 tensor([[ 0.5090,  0.1629, -0.6491, -0.2242],
        [-0.6609,  0.4547,  0.4353, -0.0064],
        [-0.1772, -1.3830,  0.3625,  0.6326],
        [-0.9981,  0.8459, -0.3374,  0.4102]], grad_fn=<AddmmBackward0>)
combined_logits shape: torch.Size([4, 4])
combined_logits values:
 tensor([[ 0.8956, -0.1767, -0.3912,  0.0546],
        [-0.8002,  0.3887,  1.3380, -0.6085],
        [-0.5786, -0.4992,  0.4933,  1.6599],
        [-1.0717,  0.9792, -1.7201,  0.3555]], grad_fn=<AddBackward0>)


In [9]:
dataset = data_loader.ViTBERT(data_path="/media/data3/home/khiemdd/ViTBERT/dataset/data500/donedataset_after.csv",
                                      stop_words_file= "/media/data3/home/khiemdd/ViTBERT/dataset/needed_files/vietnamese-stopwords.txt",
                                      wordnet_file= "/media/data3/home/khiemdd/ViTBERT/dataset/needed_files/word_net_vi.json",
                                      indices= None,
                                      type = "test",
                                      tokenizer="demdecuong/vihealthbert-base-word" ) # Ensure ViTBERTDataset is implemented

In [10]:
kfold = KFold(n_splits=5, shuffle=True)
for fold, (train_ids, test_ids) in enumerate(kfold.split(dataset)):
    print(f"fold {fold}:\n  ")
    print("train ids: ")
    print(type(train_ids))
    print("test ids: ")
    print(type(test_ids))

fold 0:
  
train ids: 
<class 'numpy.ndarray'>
test ids: 
<class 'numpy.ndarray'>
fold 1:
  
train ids: 
<class 'numpy.ndarray'>
test ids: 
<class 'numpy.ndarray'>
fold 2:
  
train ids: 
<class 'numpy.ndarray'>
test ids: 
<class 'numpy.ndarray'>
fold 3:
  
train ids: 
<class 'numpy.ndarray'>
test ids: 
<class 'numpy.ndarray'>
fold 4:
  
train ids: 
<class 'numpy.ndarray'>
test ids: 
<class 'numpy.ndarray'>


In [13]:
import pandas as pd
import random
import json
from random import shuffle
from mtranslate import translate
import re
import seaborn as sns

class DataAugmentation:
    def __init__(self, stop_words_file, wordnet_file, seed=1):
        self.seed = seed
        random.seed(self.seed)
        
        self.stop_words = self.load_stop_words(stop_words_file)
        self.wordnet_data = self.load_wordnet(wordnet_file)
    
    @staticmethod
    def load_stop_words(file_path):
        stop_words = []
        with open(file_path, "r", encoding='utf-8') as f:
            for line in f:
                stop_words.append(line.strip())
        return stop_words
    
    @staticmethod
    def load_wordnet(file_path):
        try:
            with open(file_path, "r", encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            print("WordNet file not found.")
            return {}
    
    @staticmethod
    def back_translation(sentence, intermediate_langs=['en', 'fr', 'ru']):
        intermediate_lang = random.choice(intermediate_langs)
        translated_sentence = translate(sentence, intermediate_lang)
        back_translated_sentence = translate(translated_sentence, 'vi')
        return back_translated_sentence
    
    def get_synonyms(self, word):
        synonyms = set()
        for key, value in self.wordnet_data.items():
            if key.strip() == word:
                synonyms.update([v.strip() for v in value])
        synonyms.discard(word)  # Remove the word itself if present
        return list(synonyms)
    
    @staticmethod
    def random_deletion(words, p):
        if len(words) == 1:
            return words
        new_words = [word for word in words if random.uniform(0, 1) > p]
        return new_words if new_words else [random.choice(words)]
    
    @staticmethod
    def random_swap(words, n):
        for _ in range(n):
            if len(words) > 1:
                idx1, idx2 = random.sample(range(len(words)), 2)
                words[idx1], words[idx2] = words[idx2], words[idx1]
        return words
    
    def random_insertion(self, words, n):
        for _ in range(n):
            synonyms = []
            while not synonyms:
                random_word = random.choice(words)
                synonyms = self.get_synonyms(random_word)
            random_synonym = random.choice(synonyms)
            random_idx = random.randint(0, len(words))
            words.insert(random_idx, random_synonym)
        return words
    
    def synonym_replacement(self, words, n):
        new_words = words.copy()
        random_word_list = [word for word in words if word not in self.stop_words]
        shuffle(random_word_list)
        num_replaced = 0
        for random_word in random_word_list:
            synonyms = self.get_synonyms(random_word)
            if synonyms:
                synonym = random.choice(synonyms)
                new_words = [synonym if word == random_word else word for word in new_words]
                num_replaced += 1
                if num_replaced >= n:
                    break
        return new_words
    
    def eda(self, sentence, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=9, bt_langues=['en', 'fr', 'ru']):
        words = sentence.split(' ')
        num_words = len(words)
        augmented_sentences = [sentence]
    
        if num_words > 1:
            num_new_per_technique = int(num_aug / 5) + 1  # Updated to account for back_translation too
            n_sr = max(1, int(alpha_sr * num_words))
            n_ri = max(1, int(alpha_ri * num_words))
            n_rs = max(1, int(alpha_rs * num_words))
    
            for _ in range(num_new_per_technique):
                augmented_sentences.append(' '.join(self.synonym_replacement(words, n_sr)))
            for _ in range(num_new_per_technique):
                augmented_sentences.append(' '.join(self.random_insertion(words, n_ri)))
            for _ in range(num_new_per_technique):
                augmented_sentences.append(' '.join(self.random_swap(words, n_rs)))
            for _ in range(num_new_per_technique):
                augmented_sentences.append(' '.join(self.random_deletion(words, p_rd)))
            for _ in range(num_new_per_technique):
                augmented_sentences.append(self.back_translation(sentence, intermediate_langs=bt_langues))
        
        return list(set(augmented_sentences))
    
    def edafor3(self, sentence, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=9, bt_langues=['en', 'fr', 'ru']):
        words = sentence.split(' ')
        num_words = len(words)
        augmented_sentences = [sentence]
    
        if num_words > 1:
            num_new_per_technique = int(num_aug / 5) + 1  # Updated to account for back_translation too
            n_sr = max(1, int(alpha_sr * num_words))
            n_ri = max(1, int(alpha_ri * num_words))
            n_rs = max(1, int(alpha_rs * num_words))
    
            augmented_sentences.append(self.back_translation(sentence, intermediate_langs=bt_langues))
        
        return list(set(augmented_sentences))
    
    @staticmethod
    def clear_punctuation(sentence):
        return re.sub(r'[^\w\s]', '', sentence)
    
    def augment_dataframe(self, df, num_aug=9, alpha=0.1, max_aug_for_3=40):
        augmented_rows = []
        augment_count_3 = 0
        
        for _, row in df.iterrows():
            original_col1 = row[df.columns[0]].strip()
            label = str(row[df.columns[1]]).strip()
    
            if label == "1.0" or label == "4.0":
                augmented_col1 = self.eda(original_col1, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=num_aug)
                for sent1 in augmented_col1:
                    augmented_rows.append({df.columns[0]: self.clear_punctuation(sent1), df.columns[1]: float(label)})
            elif label == "3.0" and augment_count_3 < max_aug_for_3:
                augmented_col1 = self.edafor3(original_col1, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=1)
                for sent1 in augmented_col1:
                    augmented_rows.append({df.columns[0]: self.clear_punctuation(sent1), df.columns[1]: float(label)})
                augment_count_3 += 1
            else:
                augmented_rows.append(row.to_dict())
    
        return pd.DataFrame(augmented_rows)

# Example Usage
stop_words_file = r"/media/data3/home/khiemdd/ViTBERT/dataset/needed_files/vietnamese-stopwords.txt"
wordnet_file = r"/media/data3/home/khiemdd/ViTBERT/dataset/needed_files/word_net_vi.json"

data_augmentor = DataAugmentation(stop_words_file, wordnet_file)

df = pd.read_csv('/media/data3/home/khiemdd/ViTBERT/dataset/datakfold/after/fold1_train.csv')
augmented_df = data_augmentor.augment_dataframe(df, num_aug=2, alpha=0.1, max_aug_for_3= 40)
augmented_df.to_csv('DATA_TRAINING_AUGMENT.csv', index=False)

# Load the augmented dataset
dataset_final_train = pd.read_csv("DATA_TRAINING_AUGMENT.csv")
# Function to plot label distribution
def plot_label_distribution(df, title):
    plt.figure(figsize=(10, 6))
    sns.countplot(data=df, x='label')
    plt.title(title)
    plt.xlabel('Label')
    plt.ylabel('Count')
    plt.show()

# Load the original and augmented datasets
original_df = pd.read_csv('/media/data3/home/khiemdd/ViTBERT/dataset/datakfold/after/fold1_train.csv')
augmented_df = pd.read_csv("DATA_TRAINING_AUGMENT.csv")

# Plot distributions
plot_label_distribution(original_df, 'Label Distribution Before Augmentation')
plot_label_distribution(augmented_df, 'Label Distribution After Augmentation')


NameError: name 'plt' is not defined