In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import BertModel
from datasets import load_dataset

from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm


Download here: https://www.kaggle.com/datasets/yelp-dataset/yelp-dataset

# Make splits (only run once)

In [5]:
import json
data_path = f'{PATHS["root"]}/data/yelp/'
f1 = open(data_path+'yelp_academic_dataset_business.json') #150346
f2 = open(data_path+'yelp_academic_dataset_review.json') #6990280

business = []
for line in f1:
    business.append(json.loads(line))

review = []
for line in f2:
    review.append(json.loads(line))

f1.close()
f2.close()

In [2]:
american_business_ids = []
japanese_business_ids = []
chinese_business_ids = []
italian_business_ids = []

for example in business:
    if (not example['categories'] is None) and 'American' in example['categories']:
        american_business_ids.append(example['business_id'])
    if (not example['categories'] is None) and 'Japanese' in example['categories']:
        japanese_business_ids.append(example['business_id'])
    if (not example['categories'] is None) and 'Chinese' in example['categories']:
        chinese_business_ids.append(example['business_id'])
    if (not example['categories'] is None) and 'Italian' in example['categories']:
        italian_business_ids.append(example['business_id'])

In [5]:
print(len(american_business_ids))
print(len(japanese_business_ids))
print(len(italian_business_ids))
print(len(chinese_business_ids))

13066
1830
4573
3343


In [6]:
#takes two hours

import time

american = []
japanese = []
chinese = []
italian = []

start = time.time()

for idx, example in enumerate(review):
    if example['business_id'] in american_business_ids:
        american.append(example)
    if example['business_id'] in japanese_business_ids:
        japanese.append(example)
    if example['business_id'] in chinese_business_ids:
        chinese.append(example)
    if example['business_id'] in italian_business_ids:
        italian.append(example)

        
with open('american.json', 'w') as f3:
    json.dump(american, f3)
with open('japanese.json', 'w') as f4:
    json.dump(japanese, f4)
with open('chinese.json', 'w') as f5:
    json.dump(chinese, f5)
with open('italian.json', 'w') as f6:
    json.dump(italian, f6)

0
3.5595893859863284e-05
250000
3.8109967788060506
500000
7.7034997542699175
750000
11.659009126822154
1000000
15.495580875873566
1250000
19.41890575091044
1500000
23.406390301386516
1750000
27.311221476395925
2000000
31.324174058437347
2250000
35.39449222485224
2500000
39.39674288034439
2750000
43.51244024435679
3000000
47.61593271493912
3250000
51.7338965177536
3500000
55.86985433101654
3750000
59.9607965985934
4000000
64.13195392688115
4250000
68.31887435118357
4500000
72.4751049598058
4750000
76.7526405374209
5000000
80.98935148715972
5250000
85.24175108671189
5500000
89.59655721187592
5750000
93.96100403865178
6000000
98.38862371047338
6250000
102.86650851567586
6500000
107.35190171003342
6750000
111.7896115342776


In [3]:
!mv american.json ~/NLP-brain-biased-robustness/data/yelp
!mv italian.json ~/NLP-brain-biased-robustness/data/yelp
!mv japanese.json ~/NLP-brain-biased-robustness/data/yelp
!mv chinese.json ~/NLP-brain-biased-robustness/data/yelp

# Pre-processing and training code

In [2]:
import json
data_path = '/home/ubuntu/NLP-brain-biased-robustness/data/yelp/'
f1 = open(data_path+'american.json')
f2 = open(data_path+'italian.json')
f3 = open(data_path+'japanese.json')
f4 = open(data_path+'chinese.json')

american = []
for line in f1:
    american.append(json.loads(line))

italian = []
for line in f2:
    italian.append(json.loads(line))
    
japanese = []
for line in f3:
    japanese.append(json.loads(line))
    
chinese = []
for line in f4:
    chinese.append(json.loads(line))

f1.close()
f2.close()
f3.close()
f4.close()


american = american[0]
italian = italian[0]
japanese = japanese[0]
chinese = chinese[0]

In [3]:
american = american[:10000]
italian = italian[:10000]
japanese = japanese[:10000]
chinese = chinese[:10000]

In [4]:
import torch
import torch.nn.functional as F

na = []
for i in american:
    na.append({'text': i['text'], 'labels': F.one_hot((torch.tensor(i['stars']-1)).to(torch.int64), num_classes=5)})

ni = []
for i in italian:
    ni.append({'text': i['text'], 'labels': F.one_hot((torch.tensor(i['stars']-1)).to(torch.int64), num_classes=5)})

nj = []
for i in japanese:
    nj.append({'text': i['text'], 'labels': F.one_hot((torch.tensor(i['stars']-1)).to(torch.int64), num_classes=5)})

nc = []
for i in chinese:
    nc.append({'text': i['text'], 'labels': F.one_hot((torch.tensor(i['stars']-1)).to(torch.int64), num_classes=5)})

In [5]:
american_dataloader = DataLoader(na, shuffle=True, batch_size=8)
italian_dataloader = DataLoader(ni, shuffle=True, batch_size=8)
japanese_dataloader = DataLoader(nj, shuffle=True, batch_size=8)
chinese_dataloader = DataLoader(nc, shuffle=True, batch_size=8)

In [8]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import BertModel

from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm

#!WANDB_START_METHOD = "thread"

import wandb

def change_all_keys(pre_odict):
    def change_key(odict, old, new):
        for _ in range(len(odict)):
            k, v = odict.popitem(False)
            odict[new if old == k else k] = v
            return odict
    for key in pre_odict.keys():
        if key[:5] == 'bert.':
            post_odict = change_key(pre_odict, key, key[5:])
            return change_all_keys(post_odict)
        if key[:7] == 'linear.':
            del pre_odict[key]
            return change_all_keys(pre_odict)
    return pre_odict

class PlaceHolderBERT(nn.Module):
    def __init__(self, num_out=5, sigmoid=False, return_CLS_representation=False, brain=True):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
        self.bert = BertModel.from_pretrained('bert-base-cased')
        if brain:
            state_path = '/home/ubuntu/NLP-brain-biased-robustness/state_dicts/fine_tuned_model'
            pre_odict = torch.load(state_path)
            filtered_odict = change_all_keys(pre_odict)
            self.bert.load_state_dict(filtered_odict, strict=True)
        self.linear = nn.Linear(768,num_out)
        self.return_CLS_representation = return_CLS_representation
        self.sigmoid_bool = sigmoid
        self.sigmoid = nn.Sigmoid()
        self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    def forward(self, x):
        embeddings = self.tokenizer(x, return_tensors='pt', padding=True, truncation=True)
        embeddings.to(self.device)
        representations = self.bert(**embeddings).last_hidden_state
        cls_representation = representations[:,0,:]
        pred = self.linear(cls_representation)
        if self.return_CLS_representation:
            return cls_representation
        if self.sigmoid_bool:
            return self.sigmoid(pred)
        return pred
    
    
def train(model, dataloader, num_epochs=10): #can scrap keyword
    wandb.require(experiment="service")
    wandb.init(project="preliminary results just in case", entity="nlp-brain-biased-robustness")
    wandb.run.name = 'yelp bert 10 epochs'
    wandb.config = {
      "learning_rate": 5e-5,
      "epochs": 10,
      "batch_size": 8
    }
    
    
    #optimizer as usual
    optimizer = AdamW(model.parameters(), lr=5e-5)
    loss_function = torch.nn.MSELoss()
    #learning rate scheduler
    num_training_steps = num_epochs * len(dataloader)
    lr_scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)

    #auto logging; progress bar
    progress_bar = tqdm(range(num_training_steps))

    #training loop
    model.train()
    for epoch in range(num_epochs):
        for batch in dataloader: #tryin unpacking text from 'labels' as in model development
            #batch = {k: v.to(device) for k, v in batch.items()}
            #features = {k: v for k, v in batch.items() if k != 'labels'}
            preds = model(batch['text'])
            targets = batch['labels'].float().to(device)
            loss = loss_function(preds, targets) #replace .loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
        american_score = evaluate(model, american_dataloader)
        wandb.log({"american": american_score})
        italian_score = evaluate(model, italian_dataloader)
        wandb.log({"italian": italian_score})
        japanese_score = evaluate(model, japanese_dataloader)
        wandb.log({"japanese": japanese_score})
        chinese_score = evaluate(model, chinese_dataloader)
        wandb.log({"chinese": chinese_score})

def evaluate(model, dataloader):
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)
    model.eval()
    num_correct = 0
    num_samples = 0
    for batch in dataloader:
        #batch = {k: v.to(device) for k, v in batch.items()}
        #features = {k: v for k, v in batch.items() if k != 'labels'}
        with torch.no_grad():
            preds = model(batch['text'])
            preds = torch.argmax(preds, axis=1)
            labels = torch.argmax(batch['labels'], axis=1).to(device)
            num_correct += (preds==labels).sum()
            num_samples += preds.size(0)
    return float(num_correct)/float(num_samples)*100 

#tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

#tokenize function
#def tokenize_dataset(examples):
#    return tokenizer(examples['text'], padding="max_length", truncation=True)



#pre-tokenize entire dataset
#tokenized_american = na.map(tokenize_dataset, batched=True)
#tokenized_italian = ni.map(tokenize_dataset, batched=True)
#tokenized_japanese = nj.map(tokenize_dataset, batched=True)
#tokenized_chinese = nc.map(tokenize_dataset, batched=True)

#tokenized_american = tokenized_american.remove_columns(["text"])
#tokenized_american.set_format("torch")
#tokenized_italian = tokenized_italian.remove_columns(["text"])
#tokenized_italian.set_format("torch")
#tokenized_japanese = tokenized_japanese.remove_columns(["text"])
#tokenized_japanese.set_format("torch")
#tokenized_chinese = tokenized_chinese.remove_columns(["text"])
#tokenized_chinese.set_format("torch")

### Only for practice
#american_small = tokenized_american.shuffle(seed=42).select(range(10000))
#italian_small = tokenized_italian.shuffle(seed=42).select(range(10000))
#japanese_small = tokenized_japanese.shuffle(seed=42).select(range(10000))
#chinese_small = tokenized_chinese.shuffle(seed=42).select(range(10000))
###
#american_dataloader = DataLoader(na, shuffle=True, batch_size=8)
#italian_dataloader = DataLoader(ni, shuffle=True, batch_size=8)
#japanese_dataloader = DataLoader(nj, shuffle=True, batch_size=8)
#chinese_dataloader = DataLoader(nc, shuffle=True, batch_size=8)

In [9]:
model = PlaceHolderBERT()
train(model, italian_dataloader)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[34m[1mwandb[0m: Currently logged in as: [33mjgc239[0m ([33mnlp-brain-biased-robustness[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/12500 [00:00<?, ?it/s]

In [10]:
print(evaluate(model, italian_dataloader))
print(evaluate(model, american_dataloader))
print(evaluate(model, japanese_dataloader))
print(evaluate(model, chinese_dataloader))

40.400000000000006
40.39
41.61
35.49
