# Load data

In [1]:
from datasets import load_dataset

data = load_dataset('reddit')['train']

Using custom data configuration default
Reusing dataset reddit (/home/jhuertas/.cache/huggingface/datasets/reddit/default/1.0.0/98ba5abea674d3178f7588aa6518a5510dc0c6fa8176d9653a3546d5afcb3969)


  0%|          | 0/1 [00:00<?, ?it/s]

In [2]:
import pandas as pd
import numpy as np

big_dataset = pd.DataFrame({'id': data['author'], 'text': data['body'], 'subreddit': data['subreddit']})

In [43]:
# Remove duplicate texts
clean_dataset = big_dataset.drop_duplicates(subset=["text"], keep=False)

# Remove deleted accounts (no author info)
clean_dataset = clean_dataset[clean_dataset.id != '[deleted]'] 

In [44]:
# Remove throwaways
throwaways_text = clean_dataset.text.apply(lambda x: 'throwaway' in x.lower()) 
throwaways_id = clean_dataset.id.apply(lambda x: 'throwaway' in x.lower())
lurker_dataset = clean_dataset[throwaways_text | throwaways_id]
throwaways = lurker_dataset.id.unique()

clean_dataset = clean_dataset[~clean_dataset.id.isin(throwaways)] 

In [48]:
value_counts = clean_dataset.id.value_counts()
value_counts

DejaBoo        825
Shaper_pmp     649
rand486        573
kuvter         549
Lots42         450
              ... 
Rucksalot        1
G1G4H3RTZ        1
aconitine-       1
iamknowhere      1
an00bisX         1
Name: id, Length: 1386057, dtype: int64

In [53]:
N_AUTHORS = 16

valid_authors = value_counts[value_counts > N_AUTHORS].index.tolist()
minimal_dataset = clean_dataset[clean_dataset.id.isin(valid_authors)]
minimal_dataset.id.unique().shape

(16187,)

In [55]:
from multiprocessing import Pool
from tqdm.notebook import tqdm
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('roberta-large')
CHUNK_SIZE = 512

def split_data(row):
    eid, values = row
    input_ids = tokenizer(values.text).input_ids
    chunked = [input_ids[chunk: chunk + CHUNK_SIZE] for chunk in range(0, len(input_ids), CHUNK_SIZE)]
    decoded_chunked = tokenizer.batch_decode(chunked)
    return pd.DataFrame({'id': [values.id]*len(chunked),
                         'decoded_text': decoded_chunked})
                         
with Pool(20) as p:
    chunks = list(tqdm(p.imap_unordered(split_data, minimal_dataset.iterrows()),
                       total=len(minimal_dataset)))


reddit_chunked = pd.concat(chunks)

  0%|          | 0/486904 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (566 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (525 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (938 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (676 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (867 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi

In [56]:
reddit_valid = reddit_chunked[reddit_chunked.decoded_text.apply(len) > 100].drop_duplicates(subset=["decoded_text"], keep=False)
value_counts = reddit_valid.id.value_counts()
value_counts

DejaBoo                 1017
Shaper_pmp               861
kuvter                   600
rand486                  583
herman_gill              550
                        ... 
1541drive                 13
dkmdlb                    13
ThatsItGuysShowsOver      12
RandomPrecision1          11
USAF503                    8
Name: id, Length: 16187, dtype: int64

In [57]:
N_AUTHORS = 16 - 1

valid_authors = value_counts[value_counts > N_AUTHORS].index.tolist()
big_dataset_valid = reddit_valid[reddit_valid.id.isin(valid_authors)]
in_test = pd.Series(big_dataset_valid.id.unique()).sample(frac=.1).tolist()

In [58]:
big_dataset_train = big_dataset_valid[~big_dataset_valid.id.isin(in_test)]
big_dataset_test = minimal_dataset[minimal_dataset.id.isin(in_test)].drop_duplicates(subset=["text"], keep=False)


In [59]:
test_value_counts = big_dataset_test.id.value_counts()
valid_test = test_value_counts[test_value_counts > N_AUTHORS].index.tolist()
big_dataset_test_v2 = big_dataset_test[big_dataset_test.id.isin(valid_test)]
big_dataset_test_v2

Unnamed: 0,id,text,subreddit
15,Perservere,Didn't they lose 6 games in a row? Just becaus...,leagueoflegends
41,Duckylicious,"If this Plan B is the same as the ""morning aft...",TwoXChromosomes
84,BIllyBrooks,"This on no way helps, but when I lived in camp...",melbourne
99,masasin,I am in mechatronics too. Graduating in a few ...,uwaterloo
134,Azurphax,"Well, I suppose you are getting great color re...",gaming
...,...,...,...
3847669,Tangerine_Dreams,"Hey, awesometacular folks of r/wiiu!\n\nI'm so...",wiiu
3847672,Snow_Cub,My old computer was stepped on by a rhino (lon...,techsupport
3847948,themooseexperience,So I just got the game recently and built myse...,starbound
3848009,P2000Camaro,"I am a phone salesmen, and I discovered this t...",Android


In [60]:
sum(big_dataset_train.id.value_counts() < 16)

0

In [61]:
sum(big_dataset_test_v2.id.value_counts() < 16)

0

In [62]:
big_dataset_train.id.value_counts()


DejaBoo               1017
Shaper_pmp             861
kuvter                 600
herman_gill            550
redweasel              494
                      ... 
estrangedeskimo         16
Artegan                 16
hired_goon              16
Assbutt_Winchester      16
sec713                  16
Name: id, Length: 14548, dtype: int64

In [63]:
import csv
big_dataset_train.to_csv('local_data/reddit_train.csv', index=False, quoting=csv.QUOTE_ALL)
big_dataset_test_v2.to_csv('local_data/reddit_test.csv', index=False, quoting=csv.QUOTE_ALL)

In [42]:
big_dataset_train

Unnamed: 0,id,decoded_text
0,NightlyReaper,"<s>In Mechwarrior Online, I have begun to use ..."
0,leep420,<s>I take a beta blocker for my heart conditio...
0,Wheelman,<s>As an entrepreneur/freelancer (especially a...
0,FrankManic,"<s>And that is, hands down, the coolest aspect..."
0,chrom_ed,"<s>So you're saying ""try it, I might not mind ..."
...,...,...
0,avboden,<s>I completely understand supporting students...
1,avboden,the exam. \n\n\nEdit 3: THANK YOU! I forgot t...
0,sosuhme,"<s>Firstly, I agree with everyone, that hit wa..."
0,iamtotalcrap,<s>\nauthor: [Foxcy]( (*1 days*) ``|`` author ...


# Load data (Local)

In [17]:
import pandas as pd
from data import build_dataset
from transformers import AutoTokenizer

train = pd.read_csv('local_data/reddit_train.csv').sample(frac=1.)
test = pd.read_csv('local_data/reddit_test.csv')

train['unique_id'] = train.index.astype(str)
test['unique_id'] = test.index.astype(str)

BATCH_SIZE = 16384
VALID_BATCH_SIZE = 1000
CHUNK_SIZE = 512
TRAINING_STEPS = 3000
VALIDATION_STEPS = 500
WARMUP_STEPS = 0

train_data = build_dataset(train,
                           steps=TRAINING_STEPS*BATCH_SIZE,
                           batch_size=BATCH_SIZE,
                           num_workers=8, 
                           prefetch_factor=8,
                           max_len=CHUNK_SIZE,
                           tokenizer = AutoTokenizer.from_pretrained('roberta-base'),
                           mode='text')
test_data = build_dataset(test, 
                          steps=VALIDATION_STEPS*VALID_BATCH_SIZE, 
                          batch_size=VALID_BATCH_SIZE, 
                          num_workers=4, 
                          prefetch_factor=4, 
                          max_len=CHUNK_SIZE,
                          tokenizer = AutoTokenizer.from_pretrained('roberta-base'),
                          mode='text')

  train = pd.read_csv('local_data/reddit_train.csv').sample(frac=1.)
  test = pd.read_csv('local_data/reddit_test.csv')


Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [43]:
print(train.text.sample(1).tolist()[0])

TBH, I doubt the difficulty rises are going to taper off any time soon. BUT, it could if the price begins to stagnate or fall. Currently there are a number of ASIC producers, as well as a number of up and coming companies with even more efficient designs. This is going to mean the fight to be even more efficient is definitely going to continue to drive up the difficulty, even if the price stays where it is.

This is bad news for people wanting to enter the mining market, and existing miners, their slice of the block reward is going to probably get progressively smaller and may drive many miners into the red and out of the game. However conversely, this is fantastic news for Bitcoin in general, multiple chips designs means there is no single point of failure regarding chip design, manufacture, and distribution. It also is fantastic news in the sense that there is incredibly fierce mining competition and this makes it all the harder to disrupt, or try to control the network. If this keep

In [18]:
import wandb

from datetime import datetime
from transformers import AutoTokenizer, AutoModel
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
from pytorch_lightning import Trainer

from model_experimental import (ContrastiveLSTMTransformer,
                                )

# Name model
date_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
save_name = f'final_{date_time}'
print(f'Saving model to {save_name}')

wandb.login()
wandb_logger = WandbLogger(name=save_name, project="author_profiling_reddit")
checkpoint_callback = ModelCheckpoint('model',
                                      filename=save_name,
                                      monitor=None,
                                      every_n_val_epochs=1,
                                      )
lr_monitor = LearningRateMonitor('step')

# Define training arguments
trainer = Trainer(devices=0,
                  max_steps=3000,
                  accelerator='gpu',
                  log_every_n_steps=1,
                  flush_logs_every_n_steps=500,
                  logger=wandb_logger,
                  precision=16,
                  val_check_interval=250,
                  callbacks=[checkpoint_callback, lr_monitor],
                  )

# Define model
base_transformer = AutoModel.from_pretrained('roberta-large')
train_model = ContrastiveLSTMTransformer(base_transformer,
                                         learning_rate=1e-2,
                                         weight_decay=.01,
                                         num_warmup_steps=0,
                                         num_training_steps=3000,
                                         enable_scheduler=True,
                                         minibatch_size=256,)

trainer.fit(train_model, train_data, test_data)
wandb.finish()

Saving model to final_2022-06-08_15-49-38


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjahuerta92[0m (use `wandb login --relogin` to force relogin)
  rank_zero_deprecation(


ValueError: Mismatch between the requested accelerator type (GPU) and assigned device type (CPU).

In [19]:
!nvidia-smi

Wed Jun  8 15:50:01 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.73.05    Driver Version: 510.73.05    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro RTX 8000     Off  | 00000000:37:00.0 Off |                  Off |
| 33%   29C    P8    15W / 260W |   1631MiB / 49152MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Quadro RTX 8000     Off  | 00000000:86:00.0 Off |                  Off |
| 59%   79C    P2   240W / 260W |  41421MiB / 49152MiB |    100%      Default |
|       

In [1]:
import pandas as pd

reddit_train = pd.read_csv('local_data/reddit_train.csv')
reddit_test = pd.read_csv('local_data/reddit_test.csv')

reddit_train

Unnamed: 0,id,decoded_text
0,------x------,<s>Just found out my boyfriend of 3 years has ...
1,------x------,rying I just thought he was playing a game. We...
2,------x------,ve made). I am staying with a family friend an...
3,---annon---,<s>Ok so this is my only real badass moment ev...
4,---annon---,a retreat focused on healing from sexual abus...
...,...,...
1629244,zzzzzzzzzzzzzzzzspaf,\n\nCom on public security? Like the EU battl...
1629245,zzzzzzzzzzzzzzzzspaf,with every country to have the authorisation ...
1629246,zzzzzzzzzzzzzzzzspaf,of errors (stuff like 1/2 +3/4 = (1+3)/(2+4) ...
1629247,zzzzzzzzzzzzzzzzspaf,.<\s>Actually from my understanding (read out ...


In [2]:
included = reddit_train.decoded_text.apply(len) > 500

In [4]:
reddit_train_long = reddit_train[included]

In [13]:
reddit_train_long.id.value_counts() > 10

iamtotalcrap          True
Death_Star_           True
RamsesThePigeon       True
DejaBoo               True
Shaper_pmp            True
                     ...  
Ariadenus            False
misap                False
qs12                 False
ffsidonotonlylurk    False
Laspimon             False
Name: id, Length: 289987, dtype: bool

In [20]:
value_counts = reddit_train_long.id.value_counts()
valid_authors = value_counts[value_counts >= 10].index.tolist()
small_reddit_data = reddit_train_long[reddit_train_long.id.isin(valid_authors)]


In [23]:
import csv
small_reddit_data.to_csv('local_data/reddit_train_clean.csv', index=False, quoting=csv.QUOTE_ALL)