In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to the project's .py files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

Dependencies

In [11]:
import pathlib as pb

# --- Filesystem layout -------------------------------------------------------
# Every path is relative, so it resolves against the notebook's working directory.
ROOT_PATH = pb.Path('..')

# Input data
DATA_DIR_PATH = ROOT_PATH.joinpath('data')
TRAIN_DATA_FILE = DATA_DIR_PATH.joinpath('train_data.csv')
TEST_DATA_FILE = DATA_DIR_PATH.joinpath('test_data.csv')

# Cache folders (one sub-folder per library)
CACHE_DIR_PATH = ROOT_PATH.joinpath('.cache')
TRANSFORMERS_CACHE_DIR_PATH = CACHE_DIR_PATH.joinpath('transformers')
DATASETS_CACHE_DIR_PATH = CACHE_DIR_PATH.joinpath('datasets')

# --- Model repository identifiers --------------------------------------------
BERT_RO_MODEL_REPO = 'dumitrescustefan/bert-base-romanian-cased-v1'

In [6]:
import os

# Export the project-local cache folders through environment variables.
# This cell runs before the `transformers` import so the settings are already
# in place when the library is loaded.
# NOTE(review): recent transformers releases reportedly deprecate
# TRANSFORMERS_CACHE in favour of HF_HOME — confirm against the pinned version.
os.environ.update({
    'TRANSFORMERS_CACHE': str(TRANSFORMERS_CACHE_DIR_PATH),
    'HF_DATASETS_CACHE': str(DATASETS_CACHE_DIR_PATH),
})

In [7]:
from transformers import AutoTokenizer, AutoModel
import torch
import torchdata
import torchtext
from torch import backends
import typing
import pathlib as pb
import os
from typing import List, Tuple, Dict, Set
import random
import numpy as np
import pandas as pd
import platform
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [44]:
from utils import get_available_device, read_data
from preprocess import BertPreprocessor
from data import SexismDataset

Environment Setup

In [9]:
# cuBLAS needs a fixed workspace configuration to behave deterministically on
# CUDA >= 10.2; without it, torch.use_deterministic_algorithms(True) makes CUDA
# matrix multiplications raise a RuntimeError. It is set before any device work
# happens, and setdefault keeps a value the user has already exported.
os.environ.setdefault('CUBLAS_WORKSPACE_CONFIG', ':4096:8')

# Device chosen by the project helper (a GPU when one is available, per the
# helper's name — see utils.get_available_device).
DEVICE: torch.device = get_available_device()

# Deterministic experiments: seed each global RNG used by the notebook.
# The former `np.random.RandomState(SEED)` line was dropped: it only built a
# throwaway generator object and never touched NumPy's global state, which
# `np.random.seed(SEED)` already seeds.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Ask PyTorch to use deterministic kernels and to raise on operations that
# have no deterministic implementation.
torch.use_deterministic_algorithms(True)

In [27]:
# Load the raw data through the project helper, which returns two objects that
# are unpacked as the train and test sets.
# NOTE(review): read_data is defined in utils and not visible here; presumably
# it reads train_data.csv / test_data.csv from DATA_DIR_PATH — confirm.
train_data_raw, test_data_raw = read_data(DATA_DIR_PATH)

In [50]:
# Wrap each raw split in a SexismDataset with its own BertPreprocessor.
# The generator is consumed by the unpacking, so the train dataset is built
# first and the test dataset second.
# NOTE(review): each split gets a preprocessor constructed from that split's
# own data; if BertPreprocessor derives anything from the frame it receives,
# the test preprocessor would not match the train one — confirm.
train_dataset, test_dataset = (
    SexismDataset(raw_split, BertPreprocessor(raw_split, BERT_RO_MODEL_REPO))
    for raw_split in (train_data_raw, test_data_raw)
)

In [48]:
# List the distinct values found in the 'label' column of the training data.
train_data_raw['label'].unique()

array(['non-offensive', 'offensive', 'direct', 'descriptive', 'reporting'],
      dtype=object)

In [56]:
# Display the raw training data for a visual sanity check.
train_data_raw

Unnamed: 0,text,label
0,@CorinaTomescu05 Încă nu ...dar am trecut prin...,non-offensive
1,@emosaphicbitch sau rosu ca mine,non-offensive
2,@DanaMinodora Ce frumoasa ești.. Arăți foarte ...,non-offensive
3,Din fericire în extaz!Ai dus covoareleeee?? Pu...,offensive
4,cand aveam vreo 5 ani credeam ca romana e sing...,non-offensive
...,...,...
39003,"@violetAndro @CorinaCg3 Așa e, știu atâtea exe...",non-offensive
39004,"Directorul medical SAJ Galați, despre dispecer...",non-offensive
39005,"Au ""invatat"" prostii de balta de la Satan ce e...",non-offensive
39006,@_seriosul_ E o parte de adevar. Dar pana la u...,non-offensive


In [66]:
# NOTE(review): looks like a leftover scratch cell — the value is only
# displayed, never assigned; consider removing it from the final notebook.
torch.tensor([2, 3])

tensor([2, 3])

In [68]:
# Inspect the preprocessed output for the slice 2000:2003 of the training dataset.
train_dataset[2000:2003] # 94 chars, average 120 chars - 20 tokens

{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1,