# Dataset Preparing

# 01 Setup

In [1]:
import re
import preprocessor as p
import pandas as pd
from datasets import Dataset
from datasets import load_dataset
from tqdm.auto import tqdm
from pathlib import Path

# 02 Dataset List

## OSCAR Indonesia 2019 Deduplicated

OSCAR or Open Super-large Crawled Aggregated coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the Ungoliant architecture.


https://huggingface.co/datasets/oscar<br>
https://oscar-corpus.com

In [2]:
oscard_id_dataset = load_dataset('oscar', 'unshuffled_deduplicated_id')

Reusing dataset oscar (/home/tel-user/.cache/huggingface/datasets/oscar/unshuffled_deduplicated_id/1.0.0/84838bd49d2295f62008383b05620571535451d84545037bb94d6f3501651df2)


  0%|          | 0/1 [00:00<?, ?it/s]

In [3]:
oscard_id_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text'],
        num_rows: 9948521
    })
})

In [4]:
oscard_id_dataset['train'].features

{'id': Value(dtype='int64', id=None), 'text': Value(dtype='string', id=None)}

In [5]:
# sample data
oscard_id_dataset['train'][1]

{'id': 1,
 'text': 'Perihal dari itu, kalau kunci hal yang demikian hilang, pemilik wajib melapor ke bengkel sah untuk dibuatkan kunci baru dengan kode baru sekalian pendaftaran. Biayanya tidak murah, melainkan rupanya dapat lebih rendah daripada membikin ke spesialis duplikat kunci. Rata-rata biaya pembuatan satu kunci immobilizer sekitar Rp 1.000.000-an. Itu termasuk kunci, jasa, dan tarif registrasi, serta mencetak mata kuncinya.\nCara pasang immobilizer mt25 di Madiun Catat saja no ponsel kami mungkin sewaktu waktu anda memerlukan jasa pakar kunci dengan biaya yang terjangkau dan dengan pelayanan memuaskan. Kami siap 24 jam online melewati medsos juga bisa dihubungi. Pokoknya soal keadaan sulit kunci apapun percayakan terhadap kami.\nBagaimana membuka kunci mobil tanpa merusak ? – Melainkan kali saat kendaraan beroda empat kita kehilangan kuncinya kita mengundang jasa tukang kunci mobil dan mempercayakan kendaraan beroda empat kita terhadap mereka, Hakekatnya apa yang terjadi keper

In [6]:
%%time

text_data = []
file_count = 0

for sample in tqdm(oscard_id_dataset['train']):
    sample = sample['text'].lower().replace('\n', '')
    text_data.append(sample)
    
    if len(text_data) == 10_000:
        with open(f'data/id_oscar/text_{file_count}.txt', 'w', encoding='utf-8') as fp:     # once 10K mark, save to file
            fp.write('\n'.join(text_data))
        text_data = []
        file_count += 1
        
with open(f'data/id_oscar/text_{file_count}.txt', 'w', encoding='utf-8') as fp:             # save leftover samples
        fp.write('\n'.join(text_data))

  0%|          | 0/9948521 [00:00<?, ?it/s]

CPU times: user 19min 48s, sys: 2min 48s, total: 22min 37s
Wall time: 23min 44s


In [7]:
oscard_id_dataset_path = [str(x) for x in Path('data/id_oscar').glob('**/*.txt')]

# see file in path
oscard_id_dataset_path[0:5]

['data/id_oscar/text_543.txt',
 'data/id_oscar/text_155.txt',
 'data/id_oscar/text_528.txt',
 'data/id_oscar/text_582.txt',
 'data/id_oscar/text_983.txt']

## IndoNLU: EmoT
An emotion classification dataset collected from the social media platform Twitter [(Saputri et al., 2018)](https://ieeexplore.ieee.org/document/8629262)

https://huggingface.co/datasets/indonlu

In [8]:
emot_id_dataset = load_dataset('indonlu', 'emot', split='train+test+validation')

Reusing dataset indonlu (/home/tel-user/.cache/huggingface/datasets/indonlu/emot/1.0.0/0a83b181cd831cd5d9c15ffe39f3be76af23407eba2c902bccca53fa905d68af)


In [9]:
emot_id_dataset

Dataset({
    features: ['tweet', 'label'],
    num_rows: 4401
})

In [10]:
emot_id_dataset.features

{'tweet': Value(dtype='string', id=None),
 'label': ClassLabel(num_classes=5, names=['sadness', 'anger', 'love', 'fear', 'happy'], id=None)}

In [11]:
# sample data
emot_id_dataset[1]

{'tweet': '[USERNAME] [USERNAME] Dari pertama [USERNAME] menduduki bangku jabatan anda, rakyat belum pernah mendengar dan melihat hasil kerja dan prestasi nyata yang anda berikan semasa menduduki bangku jabatan.Coba tanya Kenapa [USERNAME] ? Abdi rakyat butuh seoran',
 'label': 1}

In [12]:
%%time

text_data = []
file_count = 0

for sample in tqdm(emot_id_dataset):
    sample = sample['tweet'].lower().replace('\n', '')
    text_data.append(sample)
    
    if len(text_data) == 10_000:
        with open(f'data/id_emot/text_{file_count}.txt', 'w', encoding='utf-8') as fp:     # once 10K mark, save to file
            fp.write('\n'.join(text_data))
        text_data = []
        file_count += 1
        
with open(f'data/id_emot/text_{file_count}.txt', 'w', encoding='utf-8') as fp:             # save leftover samples
        fp.write('\n'.join(text_data))

  0%|          | 0/4401 [00:00<?, ?it/s]

CPU times: user 473 ms, sys: 12.2 ms, total: 485 ms
Wall time: 479 ms


In [13]:
emot_id_dataset_path = [str(x) for x in Path('data/id_emot').glob('**/*.txt')] 

# see file in path
emot_id_dataset_path[0:5]

['data/id_emot/text_0.txt']

## IndoNLU: CASA
An aspect-based sentiment analysis dataset consisting of around a thousand car reviews collected from multiple Indonesian online automobile platforms [(Ilmania et al., 2018)](https://ieeexplore.ieee.org/document/8629181)

https://huggingface.co/datasets/indonlu

In [14]:
casa_id_dataset = load_dataset('indonlu', 'casa', split='train+test+validation')

Reusing dataset indonlu (/home/tel-user/.cache/huggingface/datasets/indonlu/casa/1.0.0/0a83b181cd831cd5d9c15ffe39f3be76af23407eba2c902bccca53fa905d68af)


In [15]:
casa_id_dataset

Dataset({
    features: ['sentence', 'fuel', 'machine', 'others', 'part', 'price', 'service'],
    num_rows: 1080
})

In [16]:
casa_id_dataset.features

{'sentence': Value(dtype='string', id=None),
 'fuel': ClassLabel(num_classes=3, names=['negative', 'neutral', 'positive'], id=None),
 'machine': ClassLabel(num_classes=3, names=['negative', 'neutral', 'positive'], id=None),
 'others': ClassLabel(num_classes=3, names=['negative', 'neutral', 'positive'], id=None),
 'part': ClassLabel(num_classes=3, names=['negative', 'neutral', 'positive'], id=None),
 'price': ClassLabel(num_classes=3, names=['negative', 'neutral', 'positive'], id=None),
 'service': ClassLabel(num_classes=3, names=['negative', 'neutral', 'positive'], id=None)}

In [17]:
# sample data
casa_id_dataset[1]

{'sentence': 'Avanza kenapa jadi boros bensin begini dah ah. Baru diisi sudah mau setengah saja .',
 'fuel': 0,
 'machine': 1,
 'others': 1,
 'part': 1,
 'price': 1,
 'service': 1}

In [18]:
%%time

text_data = []
file_count = 0

for sample in tqdm(casa_id_dataset):
    sample = sample['sentence'].lower().replace('\n', '')
    text_data.append(sample)
    
    if len(text_data) == 10_000:
        with open(f'data/id_casa/text_{file_count}.txt', 'w', encoding='utf-8') as fp:     # once 10K mark, save to file
            fp.write('\n'.join(text_data))
        text_data = []
        file_count += 1
        
with open(f'data/id_casa/text_{file_count}.txt', 'w', encoding='utf-8') as fp:             # save leftover samples
        fp.write('\n'.join(text_data))

  0%|          | 0/1080 [00:00<?, ?it/s]

CPU times: user 222 ms, sys: 4.96 ms, total: 227 ms
Wall time: 239 ms


In [19]:
casa_id_dataset_path = [str(x) for x in Path('data/id_casa').glob('**/*.txt')]

# see file in path
casa_id_dataset_path[0:5]

['data/id_casa/text_0.txt']

## IndoNLU: SmSA
This sentence-level sentiment analysis dataset is a collection of comments and reviews in Indonesian obtained from multiple online platforms [(Purwarianti and Crisdayanti, 2019)](https://ieeexplore.ieee.org/document/8904199)

https://huggingface.co/datasets/indonlu

In [20]:
smsa_id_dataset = load_dataset('indonlu', 'smsa', split='train+test+validation')

Reusing dataset indonlu (/home/tel-user/.cache/huggingface/datasets/indonlu/smsa/1.0.0/0a83b181cd831cd5d9c15ffe39f3be76af23407eba2c902bccca53fa905d68af)


In [21]:
smsa_id_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 12760
})

In [22]:
# sample data
smsa_id_dataset[10]

{'text': 'simcard indosat inaktiv gara-gara lupa isi pulsa dan kabar nya aktif jika pinda ke pasca bayar , ribet banget',
 'label': 2}

In [23]:
%%time

text_data = []
file_count = 0

for sample in tqdm(smsa_id_dataset):
    sample = sample['text'].lower().replace('\n', '')
    text_data.append(sample)
    
    if len(text_data) == 10_000:
        with open(f'data/id_smsa/text_{file_count}.txt', 'w', encoding='utf-8') as fp:     # once 10K mark, save to file
            fp.write('\n'.join(text_data))
        text_data = []
        file_count += 1
        
with open(f'data/id_smsa/text_{file_count}.txt', 'w', encoding='utf-8') as fp:             # save leftover samples
        fp.write('\n'.join(text_data))

  0%|          | 0/12760 [00:00<?, ?it/s]

CPU times: user 1.31 s, sys: 31.5 ms, total: 1.35 s
Wall time: 1.44 s


In [24]:
smsa_id_dataset_path = [str(x) for x in Path('data/id_smsa').glob('**/*.txt')]

# see file in path
smsa_id_dataset_path[0:5]

['data/id_smsa/text_1.txt', 'data/id_smsa/text_0.txt']

## IndoNLU: HoASA
An aspect-based sentiment analysis dataset consisting of hotel reviews collected from the hotel aggregator platform, AiryRooms [(Azhar et al., 2019)](https://ieeexplore.ieee.org/document/8988898)

https://huggingface.co/datasets/indonlu

In [25]:
hoasa_id_dataset = load_dataset('indonlu', 'hoasa', split='train+test+validation')

Reusing dataset indonlu (/home/tel-user/.cache/huggingface/datasets/indonlu/hoasa/1.0.0/0a83b181cd831cd5d9c15ffe39f3be76af23407eba2c902bccca53fa905d68af)


In [26]:
hoasa_id_dataset

Dataset({
    features: ['sentence', 'ac', 'air_panas', 'bau', 'general', 'kebersihan', 'linen', 'service', 'sunrise_meal', 'tv', 'wifi'],
    num_rows: 2854
})

In [27]:
hoasa_id_dataset.features

{'sentence': Value(dtype='string', id=None),
 'ac': ClassLabel(num_classes=4, names=['neg', 'neut', 'pos', 'neg_pos'], id=None),
 'air_panas': ClassLabel(num_classes=4, names=['neg', 'neut', 'pos', 'neg_pos'], id=None),
 'bau': ClassLabel(num_classes=4, names=['neg', 'neut', 'pos', 'neg_pos'], id=None),
 'general': ClassLabel(num_classes=4, names=['neg', 'neut', 'pos', 'neg_pos'], id=None),
 'kebersihan': ClassLabel(num_classes=4, names=['neg', 'neut', 'pos', 'neg_pos'], id=None),
 'linen': ClassLabel(num_classes=4, names=['neg', 'neut', 'pos', 'neg_pos'], id=None),
 'service': ClassLabel(num_classes=4, names=['neg', 'neut', 'pos', 'neg_pos'], id=None),
 'sunrise_meal': ClassLabel(num_classes=4, names=['neg', 'neut', 'pos', 'neg_pos'], id=None),
 'tv': ClassLabel(num_classes=4, names=['neg', 'neut', 'pos', 'neg_pos'], id=None),
 'wifi': ClassLabel(num_classes=4, names=['neg', 'neut', 'pos', 'neg_pos'], id=None)}

In [28]:
# sample data
hoasa_id_dataset[1]

{'sentence': 'sangat mengecewakan... hotel bad image, kebersihan kurang, berisik',
 'ac': 1,
 'air_panas': 1,
 'bau': 1,
 'general': 1,
 'kebersihan': 0,
 'linen': 1,
 'service': 1,
 'sunrise_meal': 1,
 'tv': 1,
 'wifi': 1}

In [29]:
%%time

text_data = []
file_count = 0

for sample in tqdm(casa_id_dataset):
    sample = sample['sentence'].lower().replace('\n', '')
    text_data.append(sample)
    
    if len(text_data) == 10_000:
        with open(f'data/id_hoasa/text_{file_count}.txt', 'w', encoding='utf-8') as fp:     # once 10K mark, save to file
            fp.write('\n'.join(text_data))
        text_data = []
        file_count += 1
        
with open(f'data/id_hoasa/text_{file_count}.txt', 'w', encoding='utf-8') as fp:             # save leftover samples
        fp.write('\n'.join(text_data))

  0%|          | 0/1080 [00:00<?, ?it/s]

CPU times: user 216 ms, sys: 16.3 ms, total: 233 ms
Wall time: 236 ms


In [30]:
hoasa_id_dataset_path = [str(x) for x in Path('data/id_hoasa').glob('**/*.txt')]

# see file in path
hoasa_id_dataset_path[0:5]

['data/id_hoasa/text_0.txt']

## Indonesia Wikipedia Dump

https://dumps.wikimedia.org/

In [31]:
wiki_id_dataset = load_dataset('text', data_files='data/raw/id-wiki-dump/id-wiki-dump-lower.txt')

Using custom data configuration default-3107b47b1c96581c
Reusing dataset text (/home/tel-user/.cache/huggingface/datasets/text/default-3107b47b1c96581c/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08)


  0%|          | 0/1 [00:00<?, ?it/s]

In [32]:
wiki_id_dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 463225
    })
})

In [33]:
# sample data
wiki_id_dataset['train'][1]

{'text': 'muhammad anwar el sadat adalah seorang politikus mesir yang menjabat sebagai presiden mesir ketiga dari oktober hingga pembunuhannya oleh perwira tentara fundamentalis pada oktober anwar adalah seorang senior anggota perwira bebas yang menggulingkan raja farouk dalam revolusi mesir dan orang kepercayaan dekat presiden gamal abdel nasser di mana dia menjabat sebagai wakil presiden dua kali dan dia menggantikannya sebagai presiden pada tahun pada tahun sadat dan menachem begin perdana menteri israel menandatangani perjanjian damai bekerja sama dengan presiden amerika serikat jimmy carter di mana mereka diakui dengan hadiah nobel perdamaian dalam sebelas tahun sebagai presiden ia mengubah lintasan mesir berangkat dari banyak prinsip politik dan ekonomi nasserisme melembagakan kembali sistem multi partai dan meluncurkan kebijakan ekonomi infitah sebagai presiden ia memimpin mesir dalam perang yom kippur tahun untuk merebut kembali semenanjung sinai mesir yang telah diduduki israe

In [34]:
%%time

text_data = []
file_count = 0

for sample in tqdm(wiki_id_dataset['train']):
    sample = sample['text'].lower().replace('\n', '')
    text_data.append(sample)
    
    if len(text_data) == 10_000:
        with open(f'data/id_wiki/text_{file_count}.txt', 'w', encoding='utf-8') as fp:     # once 10K mark, save to file
            fp.write('\n'.join(text_data))
        text_data = []
        file_count += 1
        
with open(f'data/id_wiki/text_{file_count}.txt', 'w', encoding='utf-8') as fp:             # save leftover samples
        fp.write('\n'.join(text_data))

  0%|          | 0/463225 [00:00<?, ?it/s]

CPU times: user 54.8 s, sys: 8.7 s, total: 1min 3s
Wall time: 1min 7s


In [35]:
wiki_id_dataset_path = [str(x) for x in Path('data/id_wiki').glob('**/*.txt')]

# see file in path
wiki_id_dataset_path[0:5]

['data/id_wiki/text_34.txt',
 'data/id_wiki/text_22.txt',
 'data/id_wiki/text_25.txt',
 'data/id_wiki/text_26.txt',
 'data/id_wiki/text_16.txt']

## Indonesian Hate Speech & Abusive Language Dataset

https://github.com/okkyibrohim/id-multi-label-hate-speech-and-abusive-language-detection (Ibrohim and Budi, 2018)

In [36]:
# !wget https://raw.githubusercontent.com/okkyibrohim/id-multi-label-hate-speech-and-abusive-language-detection/master/re_dataset.csv -O data/raw/abusive-id.csv

In [37]:
abusive_id_df = pd.read_csv('data/raw/abusive-id.csv', on_bad_lines='skip', encoding='latin1')
abusive_id_dataset = Dataset.from_pandas(abusive_id_df)

In [38]:
abusive_id_dataset

Dataset({
    features: ['Tweet', 'HS', 'Abusive', 'HS_Individual', 'HS_Group', 'HS_Religion', 'HS_Race', 'HS_Physical', 'HS_Gender', 'HS_Other', 'HS_Weak', 'HS_Moderate', 'HS_Strong'],
    num_rows: 13169
})

In [39]:
abusive_id_dataset.features

{'Tweet': Value(dtype='string', id=None),
 'HS': Value(dtype='int64', id=None),
 'Abusive': Value(dtype='int64', id=None),
 'HS_Individual': Value(dtype='int64', id=None),
 'HS_Group': Value(dtype='int64', id=None),
 'HS_Religion': Value(dtype='int64', id=None),
 'HS_Race': Value(dtype='int64', id=None),
 'HS_Physical': Value(dtype='int64', id=None),
 'HS_Gender': Value(dtype='int64', id=None),
 'HS_Other': Value(dtype='int64', id=None),
 'HS_Weak': Value(dtype='int64', id=None),
 'HS_Moderate': Value(dtype='int64', id=None),
 'HS_Strong': Value(dtype='int64', id=None)}

In [40]:
# sample data
abusive_id_dataset[1]

{'Tweet': "RT USER: USER siapa yang telat ngasih tau elu?edan sarap gue bergaul dengan cigax jifla calis sama siapa noh licew juga'",
 'HS': 0,
 'Abusive': 1,
 'HS_Individual': 0,
 'HS_Group': 0,
 'HS_Religion': 0,
 'HS_Race': 0,
 'HS_Physical': 0,
 'HS_Gender': 0,
 'HS_Other': 0,
 'HS_Weak': 0,
 'HS_Moderate': 0,
 'HS_Strong': 0}

In [41]:
%%time

text_data = []
file_count = 0

for sample in tqdm(abusive_id_dataset):
    sample = sample['Tweet'].lower().replace('\n', '')
    text_data.append(sample)
    
    if len(text_data) == 10_000:
        with open(f'data/id_abusive/text_{file_count}.txt', 'w', encoding='utf-8') as fp:     # once 10K mark, save to file
            fp.write('\n'.join(text_data))
        text_data = []
        file_count += 1
        
with open(f'data/id_abusive/text_{file_count}.txt', 'w', encoding='utf-8') as fp:             # save leftover samples
        fp.write('\n'.join(text_data))

  0%|          | 0/13169 [00:00<?, ?it/s]

CPU times: user 3.13 s, sys: 30.4 ms, total: 3.16 s
Wall time: 3.22 s


In [42]:
abusive_id_dataset_path = [str(x) for x in Path('data/id_abusive').glob('**/*.txt')]

# see file in path
abusive_id_dataset_path[0:5]

['data/id_abusive/text_1.txt', 'data/id_abusive/text_0.txt']

## Indonesian Hate Speech Language Dataset

https://github.com/ialfina/id-hatespeech-detection (Alfina et al., 2018)

In [43]:
# !wget https://raw.githubusercontent.com/ialfina/id-hatespeech-detection/master/IDHSD_RIO_unbalanced_713_2017.txt -O data/raw/hatespeech-id.csv

In [44]:
hatespeech_id_df = pd.read_csv('data/raw/hatespeech-id.csv', sep='\t', on_bad_lines='skip', encoding='latin1')
hatespeech_id_dataset = Dataset.from_pandas(hatespeech_id_df)

In [45]:
hatespeech_id_dataset

Dataset({
    features: ['Label', 'Tweet'],
    num_rows: 713
})

In [46]:
hatespeech_id_dataset.features

{'Label': Value(dtype='string', id=None),
 'Tweet': Value(dtype='string', id=None)}

In [48]:
# sample data
hatespeech_id_dataset[1]

{'Label': 'Non_HS',
 'Tweet': 'RT @baguscondromowo: Mereka terus melukai aksi dalam rangka memenjarakan Ahok atau Ahok gagal dalam Pilkada.'}

In [50]:
hatespeech_id_dataset['Tweet'][1]

'RT @baguscondromowo: Mereka terus melukai aksi dalam rangka memenjarakan Ahok atau Ahok gagal dalam Pilkada.'

In [52]:
%%time

text_data = []
file_count = 0

for sample in tqdm(hatespeech_id_dataset):
    sample = sample['Tweet'].lower().replace('\n', '')
    sample = re.sub(r'http\S+', '', sample) 
    sample = re.sub(r'[-+]?[0-9]+', '', sample)       
    p.set_options(p.OPT.MENTION, p.OPT.RESERVED, p.OPT.HASHTAG, p.OPT.URL)
    sample = p.clean(sample)
    
    text_data.append(sample)
    
    if len(text_data) == 10_000:
        with open(f'data/id_hatespeech/text_{file_count}.txt', 'w', encoding='utf-8') as fp:     # once 10K mark, save to file
            fp.write('\n'.join(text_data))
        text_data = []
        file_count += 1
        
with open(f'data/id_hatespeech/text_{file_count}.txt', 'w', encoding='utf-8') as fp:             # save leftover samples
        fp.write('\n'.join(text_data))

  0%|          | 0/713 [00:00<?, ?it/s]

CPU times: user 221 ms, sys: 3.25 ms, total: 224 ms
Wall time: 272 ms


In [53]:
hatespeech_id_dataset_path = [str(x) for x in Path('data/id_hatespeech').glob('**/*.txt')]

# see file in path
hatespeech_id_dataset_path[0:5]

['data/id_hatespeech/text_0.txt']

## Scraping Twitter
Jan 1, 2020 - Dec 31, 2022 (4.500.000 indonesian tweet)

In [54]:
tweet_id_dataset = load_dataset('csv', data_files='data/raw/id-tweet-dump/id-tweet-dump-clean.csv')

Using custom data configuration default-8224581112fba585
Reusing dataset csv (/home/tel-user/.cache/huggingface/datasets/csv/default-8224581112fba585/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58)


  0%|          | 0/1 [00:00<?, ?it/s]

In [55]:
tweet_id_dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'tweet'],
        num_rows: 3126987
    })
})

In [56]:
# sample data
tweet_id_dataset['train'][1]

{'Unnamed: 0': 1, 'tweet': 'Lihat, kehidupanku jauh lebih baik kan. Maaf ya.'}

In [57]:
%%time

text_data = []
file_count = 0

for sample in tqdm(tweet_id_dataset['train']):
    sample = sample['tweet'].lower().replace('\n', '')
    text_data.append(sample)
    
    if len(text_data) == 10_000:
        with open(f'data/id_tweet/text_{file_count}.txt', 'w', encoding='utf-8') as fp:     # once 10K mark, save to file
            fp.write('\n'.join(text_data))
        text_data = []
        file_count += 1
        
with open(f'data/id_tweet/text_{file_count}.txt', 'w', encoding='utf-8') as fp:             # save leftover samples
        fp.write('\n'.join(text_data))

  0%|          | 0/3126987 [00:00<?, ?it/s]

CPU times: user 4min 43s, sys: 3.35 s, total: 4min 47s
Wall time: 5min 2s


In [58]:
tweet_id_dataset_dataset_path = [str(x) for x in Path('data/id_tweet').glob('**/*.txt')]

# see file in path
tweet_id_dataset_dataset_path[0:5]

['data/id_tweet/text_155.txt',
 'data/id_tweet/text_246.txt',
 'data/id_tweet/text_222.txt',
 'data/id_tweet/text_225.txt',
 'data/id_tweet/text_34.txt']

# Merge All Dataset Folder Path

In [59]:
all_dataset_path = oscard_id_dataset_path + emot_id_dataset_path + casa_id_dataset_path + smsa_id_dataset_path + hoasa_id_dataset_path + wiki_id_dataset_path + abusive_id_dataset_path + hatespeech_id_dataset_path + tweet_id_dataset_dataset_path

# see merge path
all_dataset_path[0:10]

['data/id_oscar/text_543.txt',
 'data/id_oscar/text_155.txt',
 'data/id_oscar/text_528.txt',
 'data/id_oscar/text_582.txt',
 'data/id_oscar/text_983.txt',
 'data/id_oscar/text_919.txt',
 'data/id_oscar/text_729.txt',
 'data/id_oscar/text_857.txt',
 'data/id_oscar/text_246.txt',
 'data/id_oscar/text_222.txt']

In [60]:
len(all_dataset_path)

1363

In [61]:
import pickle

# dump folder path
with open('data/all_dataset_path', 'wb') as fp:
    pickle.dump(all_dataset_path, fp)