In [1]:
import pandas as pd
import re
from bs4 import BeautifulSoup
import unicodedata
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources used later by tokenisation, stop-word removal and lemmatisation.
for resource in ('stopwords', 'punkt', 'wordnet', 'punkt_tab'):
    nltk.download(resource)

# Load the training data and derive a binary target: 1 where `class` equals 1, otherwise 0.
df = pd.read_csv('train.csv')
df['label'] = df['class'].eq(1).astype('int64')

# Keep only the raw text and the binary target.
df = df[['tweet', 'label']]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\katarina.stanojkovic\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\katarina.stanojkovic\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\katarina.stanojkovic\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\katarina.stanojkovic\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
def preprocess_text(df, text_column):
    """
    Clean a text column and store the result in a new 'clean_tweet' column.

    Each value goes through the following steps, in this order:
    URL / mailto removal, HTML tag stripping, emoji removal, replacement of
    every character that is not a Latin letter (including š, đ, č, ć, ž) or a
    space with a space, lower-casing, diacritic stripping, whitespace
    collapsing, tokenisation, English stop-word removal and WordNet
    lemmatisation. The surviving tokens are joined back with single spaces.

    Args:
        df: pandas DataFrame that contains the text column.
        text_column: Name of the column holding the raw text.

    Returns:
        A copy of `df` with an additional 'clean_tweet' column. The input
        frame is not modified. Missing values in `text_column` produce an
        empty string.
    """
    # Compile every pattern once instead of once per row.
    # The dot after "www" is escaped so that ordinary words which merely
    # contain "www" (e.g. "awwwsome") are not treated as URLs.
    url_pattern = re.compile(r'http\S+|www\.\S+|mailto:\S+')
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols and pictographs
        "\U0001F680-\U0001F6FF"  # transport and map symbols
        "\U0001F1E0-\U0001F1FF"  # flags
        "]+",
        flags=re.UNICODE,
    )
    non_letter_pattern = re.compile(r'[^A-Za-zšđčćžŠĐČĆŽ ]+')
    whitespace_pattern = re.compile(r'\s+')

    # English stop words; swap in a Serbian list here if one is available.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def strip_diacritics(text):
        # Decompose characters and drop the combining marks (č -> c, š -> s, ...).
        # NOTE: 'đ' has no Unicode decomposition, so it passes through unchanged.
        decomposed = unicodedata.normalize('NFKD', text)
        return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

    def clean(text):
        text = url_pattern.sub('', text)
        text = BeautifulSoup(text, "html.parser").get_text()
        # Emojis are deleted outright (no space inserted), other symbols become a space.
        text = emoji_pattern.sub('', text)
        text = non_letter_pattern.sub(' ', text)
        text = strip_diacritics(text.lower())
        text = whitespace_pattern.sub(' ', text).strip()
        tokens = [tok for tok in nltk.word_tokenize(text) if tok not in stop_words]
        return ' '.join(lemmatizer.lemmatize(tok) for tok in tokens)

    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()
    # Missing / non-string cells would make the regex calls raise, so normalise first.
    df['clean_tweet'] = df[text_column].fillna('').astype(str).apply(clean)
    return df


In [3]:
# Run the cleaning pipeline on the raw tweet text; the result gains a 'clean_tweet' column.
data = preprocess_text(df, text_column='tweet')


  df['clean_tweet'] = df['clean_tweet'].apply(lambda x: BeautifulSoup(x, "html.parser").get_text())


In [4]:
from textattack.augmentation import EmbeddingAugmenter
from textattack.augmentation import Augmenter, BackTranslationAugmenter
from textattack.transformations import WordInsertionMaskedLM, WordSwapWordNet
from textattack.augmentation.recipes import CLAREAugmenter

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import numpy as np
from textattack.augmentation import (
    EmbeddingAugmenter,
    CLAREAugmenter
)

import random

# Build the TextAttack augmenters once so the wrapper functions below can reuse them.
CLARE_CHECKPOINT = 'distilroberta-base'  # used for both the CLARE model and its tokenizer
embedding_augmenter = EmbeddingAugmenter()
clare_augmenter = CLAREAugmenter(model=CLARE_CHECKPOINT, tokenizer=CLARE_CHECKPOINT)

# Wrapper functions that give every augmenter the same call signature
def embedding_augment(text, **kwargs):
    """Return one randomly chosen output of `embedding_augmenter` for `text`.

    Falls back to `text` itself when the augmenter returns nothing.
    Extra keyword arguments are accepted but ignored.
    """
    candidates = embedding_augmenter.augment(text)
    if not candidates:
        return text
    return random.choice(candidates)

def clare_augment(text, **kwargs):
    """Return one randomly chosen output of `clare_augmenter` for `text`.

    Falls back to `text` itself when the augmenter returns nothing.
    Extra keyword arguments are accepted but ignored.
    """
    candidates = clare_augmenter.augment(text)
    if not candidates:
        return text
    return random.choice(candidates)

# TextAttack augmenters exposed as plain functions; the order matters because
# the methods are later paired positionally with the data splits.
textattack_methods = [clare_augment, embedding_augment]


  return self.fget.__get__(instance, owner)()
BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model t

In [7]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
def augment_dataset_parallel(df, text_column, method, num_augmented_instances, max_workers=4, random_state=None, **kwargs):
    """
    Apply an augmentation method to randomly sampled rows, using a thread pool.

    Rows are sampled by position, so the function also works when the index
    of `df` contains duplicate labels. The returned texts are ordered by
    sampling order, not by thread completion order.

    Args:
        df: Source DataFrame.
        text_column: Name of the column holding the text to augment.
        method: Callable invoked as `method(text, **kwargs)`; returns the augmented text.
        num_augmented_instances: Number of augmented texts to generate.
        max_workers: Number of worker threads.
        random_state: Optional seed for the row sampling. When None (default)
            the global NumPy random state is used, as before.
        **kwargs: Extra keyword arguments forwarded to `method`.

    Returns:
        DataFrame with a single column `text_column` holding the augmented
        texts. When `method` raises for a row, the error is printed and the
        original text is kept for that row.

    Raises:
        ValueError: If `df` is empty while `num_augmented_instances` is positive.
    """
    texts = df[text_column].tolist()
    labels = df.index.tolist()
    num_samples = len(texts)

    if num_samples == 0 and num_augmented_instances > 0:
        raise ValueError("Cannot sample rows for augmentation from an empty DataFrame")

    # Sample with replacement only when more instances are requested than rows exist.
    replace = num_augmented_instances > num_samples
    sampler = np.random if random_state is None else np.random.RandomState(random_state)
    sampled_positions = sampler.choice(num_samples, size=num_augmented_instances, replace=replace)

    def augment_text(position):
        original_text = texts[position]
        try:
            return method(original_text, **kwargs)
        except Exception as e:
            # Best effort: report the failure and fall back to the original text.
            print(f"Greška pri augmentaciji teksta na indeksu {labels[position]}: {e}")
            return original_text

    # Not every callable (e.g. functools.partial) has a __name__.
    method_name = getattr(method, '__name__', repr(method))

    augmented_texts = [None] * num_augmented_instances
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future to its output slot so the result order is deterministic.
        futures = {
            executor.submit(augment_text, int(position)): slot
            for slot, position in enumerate(sampled_positions)
        }
        # tqdm shows progress as the futures complete.
        for future in tqdm(as_completed(futures), total=num_augmented_instances, desc=f'Augmenting with {method_name}'):
            augmented_texts[futures[future]] = future.result()

    return pd.DataFrame({text_column: augmented_texts})

In [8]:
# Rows carrying label 0; this is the class that gets augmented.
negative_class_df = data[data['label'] == 0]

# Class sizes and the gap between them. The gap is kept for reference only:
# the per-method budgets below are fixed by hand.
num_class_0 = len(negative_class_df)
num_class_1 = len(data[data['label'] == 1])
class_difference = abs(num_class_0 - num_class_1)

# Number of augmentation methods
num_methods = len(textattack_methods)

# Give every method its own slice of the label-0 rows. The split is done on
# row positions because np.array_split on a DataFrame goes through the
# deprecated DataFrame.swapaxes; the resulting partition is the same.
position_chunks = np.array_split(np.arange(num_class_0), num_methods)
split_dataframes = [negative_class_df.iloc[chunk] for chunk in position_chunks]

# Number of augmented instances per method. CLARE gets a much smaller budget
# (in the recorded run it took roughly 17 s per text).
CLARE_INSTANCES = 2000
DEFAULT_INSTANCES = 11000

augmented_dfs = []
for method, split_df in zip(textattack_methods, split_dataframes):
    num_instances_per_method = CLARE_INSTANCES if method is clare_augment else DEFAULT_INSTANCES
    augmented_df = augment_dataset_parallel(
        split_df,
        text_column='clean_tweet',
        method=method,
        num_augmented_instances=num_instances_per_method
    )
    augmented_df['label'] = 0
    augmented_dfs.append(augmented_df)

final_augmented_df = pd.concat(augmented_dfs, ignore_index=True)

# Combine the original and the augmented data.
# NOTE: the augmented frame only has 'clean_tweet' and 'label', so the
# 'tweet' column is NaN for every augmented row.
final = pd.concat([data, final_augmented_df], ignore_index=True)

print("Originalni dataset:", data.shape)
print("Augmentisani dataset:", final_augmented_df.shape)
print("Konačni dataset:", final.shape)

final.to_csv('augmented_train.csv', index=False, encoding='utf-8')
print("Augmentirani dataset je sačuvan u 'augmented_train.csv'")

  return bound(*args, **kwds)
Augmenting with clare_augment:   0%|          | 0/2000 [00:00<?, ?it/s]

2024-11-15 19:08:40,007 SequenceTagger predicts: Dictionary with 19 tags: <unk>, NOUN, VERB, PUNCT, ADP, DET, PROPN, PRON, ADJ, ADV, CCONJ, PART, NUM, AUX, INTJ, SYM, X, <START>, <STOP>
2024-11-15 19:08:40,109 SequenceTagger predicts: Dictionary with 19 tags: <unk>, NOUN, VERB, PUNCT, ADP, DET, PROPN, PRON, ADJ, ADV, CCONJ, PART, NUM, AUX, INTJ, SYM, X, <START>, <STOP>



From c:\Users\katarina.stanojkovic\AppData\Local\Programs\Python\Python311\Lib\site-packages\tf_keras\src\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.






From c:\Users\katarina.stanojkovic\AppData\Local\Programs\Python\Python311\Lib\site-packages\tensorflow_hub\resolver.py:369: The name tf.gfile.Exists is deprecated. Please use tf.io.gfile.exists instead.






From c:\Users\katarina.stanojkovic\AppData\Local\Programs\Python\Python311\Lib\site-packages\tensorflow_hub\resolver.py:120: The name tf.gfile.MakeDirs is deprecated. Please use tf.io.gfile.makedirs instead.






From c:\Users\katarina.stanojkovic\AppData\Local\Programs\Python\Python311\Lib\site-packages\tensorflow_hub\module_v2.py:126: The name tf.saved_model.load_v2 is deprecated. Please use tf.compat.v2.saved_model.load instead.

Augmenting with clare_augment: 100%|██████████| 2000/2000 [9:19:27<00:00, 16.78s/it]  
Augmenting with embedding_augment: 100%|██████████| 11000/11000 [14:15<00:00, 12.86it/s] 

Originalni dataset: (24783, 3)
Augmentisani dataset: (13000, 2)
Konačni dataset: (37783, 3)
Augmentirani dataset je sačuvan u 'augmented_train.csv'



