In [1]:
!pip install transformers -q

[K     |████████████████████████████████| 5.5 MB 30.4 MB/s 
[K     |████████████████████████████████| 182 kB 88.4 MB/s 
[K     |████████████████████████████████| 7.6 MB 76.7 MB/s 
[?25h

In [2]:
!pip install datasets -q

[K     |████████████████████████████████| 451 kB 31.7 MB/s 
[K     |████████████████████████████████| 212 kB 90.8 MB/s 
[K     |████████████████████████████████| 115 kB 80.6 MB/s 
[K     |████████████████████████████████| 127 kB 88.8 MB/s 
[?25h

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, DataCollatorWithPadding, AdamW
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments

import datasets
from datasets import load_dataset
from datasets.dataset_dict import DatasetDict
from datasets import Dataset

checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint, do_lower_case = True)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [4]:
from google.colab import drive
drive.mount('/content/drive')
%cd drive/MyDrive/data

Mounted at /content/drive
/content/drive/MyDrive/data


In [5]:
train = pd.read_csv("train.csv")

speak = train['Speaker'].tolist()
text = train['Utterance'].tolist()

conv_text = []

for i in range(len(train)):
  conv_text.append(speak[i] + " : "+ text[i])

train['Utterance'] = conv_text

sentences = train.Utterance.values
sentences = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]

train['labels'] = train['Target'].map({'neutral':0,
                                 'joy':1,
                                 'surprise':2,
                                 'anger':3,
                                 'sadness':4,
                                 'disgust':5,
                                 'fear':6})

labels = train.labels.values

In [6]:
train = train.drop(columns = ['ID', 'Speaker', 'Dialogue_ID','Target'])
t_data, v_data = train_test_split(train, test_size = 0.2, random_state = 42, shuffle = True)

from datasets import Dataset

train_dataset = Dataset.from_pandas(t_data)
val_dataset = Dataset.from_pandas(v_data)

from datasets.dataset_dict import DatasetDict

raw_data = {
     'train':Dataset.from_dict({'Utterance':train_dataset['Utterance'], 'labels' : train_dataset['labels']}),
     'val':Dataset.from_dict({'Utterance':val_dataset['Utterance'], 'labels' : val_dataset['labels']})
     }

raw_data = DatasetDict(raw_data)
raw_data

DatasetDict({
    train: Dataset({
        features: ['Utterance', 'labels'],
        num_rows: 7991
    })
    val: Dataset({
        features: ['Utterance', 'labels'],
        num_rows: 1998
    })
})

In [7]:
def tokenizer_function(sample):
     return tokenizer(sample['Utterance'], truncation=True)

In [8]:
tokenized_data = raw_data.map(tokenizer_function, batched=True)

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [9]:
labels = set([l['labels'] for l in tokenized_data['train']])
labels2idx = {k: i for i, k in enumerate(labels)}

In [10]:
def convert_labels(sample):
     label = [
         torch.tensor(labels2idx[l], dtype=torch.long).unsqueeze(0)
         for l in sample['labels']
     ]
     return {'labels': label}

tokenized_data = tokenized_data.map(convert_labels, batched=True).shuffle()

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [11]:
tokenized_data = tokenized_data.remove_columns(['Utterance'])

In [12]:
def compute_metrics(preds):
    metric = datasets.load_metric('f1')
    logits, labels = preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels, average = "macro")

In [13]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=7)

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [16]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)



In [30]:
args = TrainingArguments('checkpoints/bert-base-uncased-sentiment-classifier',
                         load_best_model_at_end=True,
                         num_train_epochs=10,
                         evaluation_strategy='epoch',
                         save_strategy='epoch',
                         learning_rate = 1e-5,
                         per_device_train_batch_size=16,
                         per_device_eval_batch_size=4
                         )

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [31]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

## test

In [33]:
test = pd.read_csv("test.csv")
test= test.drop(columns = ['ID', 'Speaker', 'Dialogue_ID'])

test_dataset = Dataset.from_pandas(test)

test_data = {
     'test':Dataset.from_dict({'Utterance':test_dataset['Utterance']})
     }

test_data = DatasetDict(test_data)
test_preprocessed_data = test_data.map(tokenizer_function, batched = True)
test_tokenized_data = test_preprocessed_data.remove_columns(['Utterance'])
test_tokenized_data["test"].column_names

  0%|          | 0/3 [00:00<?, ?ba/s]

['input_ids', 'token_type_ids', 'attention_mask']

In [34]:
outputs = trainer.predict(test_tokenized_data["test"])
y_pred = outputs.predictions.argmax(1)

sample = pd.read_csv("sample_submission.csv")
sample['Target'] = y_pred

sample['Target'] = sample['Target'].map({0:'neutral',
                                 1:'joy',
                                 2:'surprise',
                                 3:'anger',
                                 4:'sadness',
                                 5:'disgust',
                                 6:'fear'})
sample.head()

***** Running Prediction *****
  Num examples = 2610
  Batch size = 4


Unnamed: 0,ID,Target
0,TEST_0000,neutral
1,TEST_0001,neutral
2,TEST_0002,neutral
3,TEST_0003,neutral
4,TEST_0004,anger


In [35]:
sample['Target'].value_counts()

neutral     1544
surprise     456
joy          307
anger        230
sadness       73
Name: Target, dtype: int64