In [1]:
import pandas as pd
import json
import os
import shutil
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_from_huggingface(directory: str, dataset: str, task: str) -> None:
    os.makedirs(directory, exist_ok = True)
    load_dataset(dataset, task, cache_dir = directory, trust_remote_code = True)



def load_necessery_files(source: str, destination: str, new_name: str) -> None:
    shutil.copy(source, os.path.join(destination, new_name))



def clear_dir(dir_to_clear: str) -> None:
    for dir in [f for f in os.listdir(dir_to_clear) if os.path.isdir(os.path.join(dir_to_clear, f))]:
        shutil.rmtree(dir_to_clear + '/' + dir)



def make_sample(file: str, 
                n: int = 500, 
                random_state: int = 2137) -> pd.DataFrame:
    df = pd.read_csv(file, delimiter = r'\n', header = None)
    df = df.replace('@anonymized_account', '@USER', regex=True)
    sample = df.sample(n = n, random_state = random_state).reset_index(drop = True)
    return sample



def df_to_jsonl(df: pd.DataFrame, output_file: str) -> None:
    TEMP = 'temp.json'
    df.to_json(TEMP, orient = 'records', lines = True, force_ascii = False)
    with open(TEMP, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
            for line in infile:
                data = json.loads(line)
                new_format = {"text": data["0"], "label": ""}
                json.dump(new_format, outfile, ensure_ascii = False)
                outfile.write('\n')
    os.remove(TEMP)

In [5]:
# Ładowanie zbioru z huggingface
DIR = 'data'
DATASET = 'poleval/poleval2019_cyberbullying'
TASK = 'task01'

load_from_huggingface(DIR, DATASET, TASK)

Downloading data: 100%|██████████| 340k/340k [00:00<00:00, 2.52MB/s]
Downloading data: 100%|██████████| 70.1k/70.1k [00:00<00:00, 70.2MB/s]
Generating train split: 100%|██████████| 10041/10041 [00:00<00:00, 83446.62 examples/s]
Generating test split: 100%|██████████| 1000/1000 [00:00<00:00, 36890.84 examples/s]


In [4]:
# Przeniesienie pożądanych danych
ROOT = DIR + '/downloads/extracted/'
DIRS = os.listdir(ROOT)
DIRS = [f for f in DIRS if os.path.isdir(os.path.join(ROOT, f))]
TRAIN_SRC = ROOT + DIRS[0] + '/training_set_clean_only_text.txt'
TEST_SRC = ROOT + DIRS[1] + '/Task6/task 01/test_set_clean_only_text.txt'

load_necessery_files(TRAIN_SRC, DIR, new_name = 'train.txt')
load_necessery_files(TEST_SRC, DIR, new_name = 'test.txt')

In [5]:
# Wyczyszczenie zbioru
clear_dir(DIR)

In [6]:
# Samplowanie zbioru i zapisanie go w formacie jsonl
OUTPUT_NAME = 'data/first_iter.jsonl'
FILE = 'data/train.txt'
N = 500 # Liczba obserwacji do anotacji w pierwszej iteracji

sample = make_sample(file = FILE)
df_to_jsonl(sample, OUTPUT_NAME)

  df = pd.read_csv(file, delimiter = r'\n', header = None)


In [47]:
df = pd.read_csv('data/train.txt', delimiter = r'\n', header = None)
df = df.drop_duplicates()
df = df.replace('@anonymized_account', '@USER', regex=True)
df["label"] = ''
df2 = pd.read_json('data/first_iter_fragments.jsonl', lines=True)
df = df[~df[0].isin(df2['text'])]

  df = pd.read_csv('data/train.txt', delimiter = r'\n', header = None)


In [48]:
sample = df.sample(n = 100, random_state = 2137).reset_index(drop = True)
df_to_jsonl(sample, 'data/second_iter_fragments.jsonl')

In [49]:
df = df[~df[0].isin(sample[0])]

In [50]:
len(df) / 4

2331.75

In [51]:
klaudia = df.sample(n = 2331, random_state = 2137).reset_index(drop = True)
df = df[~df[0].isin(klaudia[0])]
michal = df.sample(n = 2331, random_state = 2137).reset_index(drop = True)
df = df[~df[0].isin(michal[0])]
kajetan = df.sample(n = 2331, random_state = 2137).reset_index(drop = True)
df = df[~df[0].isin(kajetan[0])]

df_to_jsonl(klaudia, 'data/klaudia_all.jsonl')
df_to_jsonl(michal, 'data/michal_all.jsonl')
df_to_jsonl(kajetan, 'data/kajetan_all.jsonl')
df_to_jsonl(df, 'data/wiktor_all.jsonl')

In [4]:
df = pd.read_csv('data/train.txt', delimiter = r'\n', header = None)
df = df.drop_duplicates()
df = df.replace('@anonymized_account', '@USER', regex=True)
df["label"] = ''
df = df[~df[0].str.startswith('RT ')]
first = df.sample(n = 100, random_state = 2137).reset_index(drop = True)
df = df[~df[0].isin(first[0])]
second = df.sample(n = 100, random_state = 2137).reset_index(drop = True)
df = df[~df[0].isin(second[0])]
michal = df.sample(n = 200, random_state = 2137).reset_index(drop = True)
df = df[~df[0].isin(michal[0])]
kajetan = df.sample(n = 200, random_state = 2137).reset_index(drop = True)
df = df[~df[0].isin(kajetan[0])]
wiktor = df.sample(n = 200, random_state = 2137).reset_index(drop = True)
df = df[~df[0].isin(wiktor[0])]
klaudia = df.sample(n = 200, random_state = 2137).reset_index(drop = True)

df_to_jsonl(first, 'data/first_iter_fragments.jsonl')
df_to_jsonl(second, 'data/second_iter_fragments.jsonl')
df_to_jsonl(klaudia, 'data/klaudia_fragments.jsonl')
df_to_jsonl(michal, 'data/michal_fragments.jsonl')
df_to_jsonl(kajetan, 'data/kajetan_fragments.jsonl')
df_to_jsonl(wiktor, 'data/wiktor_fragments.jsonl')



  df = pd.read_csv('data/train.txt', delimiter = r'\n', header = None)
