In [1]:
!pip install transformers[sentencepiece]
!pip install datasets

Collecting transformers[sentencepiece]
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 12.8 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 47.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 37.9 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 45.4 MB/s 
Collecting sentencepiece!=0.1.92,>=0.1.91
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.many

In [6]:
import pandas as pd

from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import Dataset

from sklearn.model_selection import train_test_split

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_path = '/data/train.csv'
df = pd.read_csv(train_path)
df = df.fillna("NONE")

# Here I opt for a 85-15 split betwen train and development sets
train_df, dev_df = train_test_split(df,test_size=0.15,random_state=42)

train_data = Dataset.from_pandas(train_df)
dev_data = Dataset.from_pandas(dev_df)

train_data = train_data.rename_columns({'TEXT':'text','LABEL':'label'})
dev_data = dev_data.rename_columns({'TEXT':'text','LABEL':'label'})

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

train_data = train_data.map(preprocess_function, batched=True)
dev_data = dev_data.map(preprocess_function, batched=True)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.18.0",
  "vocab_size": 30522
}

loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/0e1bbfda7f63a99bb52e3915dcf10

  0%|          | 0/60 [00:00<?, ?ba/s]

  0%|          | 0/11 [00:00<?, ?ba/s]

In [9]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

model = AutoModelForSequenceClassification.from_pretrained("/submission_checkpoint", local_files_only=True)
trainer = Trainer(model=model, data_collator = data_collator)
trainer.model = model.cuda()
y = trainer.predict(dev_data)

loading configuration file /content/drive/MyDrive/Ling539/submission_checkpoint/config.json
Model config DistilBertConfig {
  "_name_or_path": "/content/drive/MyDrive/Ling539/submission_checkpoint",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "vocab_size": 30522
}

loading weights file /content/drive/MyDrive/Ling539/submission_checkpoint/pytor

In [10]:
from sklearn.metrics import classification_report
import numpy as np

preds = np.argmax(y.predictions,axis=1)

def metric_printout(modelname:str, labels, preds):

    print(modelname,": -------------------")

    print(classification_report(labels,preds,target_names=['Not Movie','Positive','Negative']))

metric_printout("Transformer Evaluation", preds, dev_data['label'])

Transformer Evaluation : -------------------
              precision    recall  f1-score   support

   Not Movie       1.00      0.99      0.99      5267
    Positive       0.93      0.91      0.92      2768
    Negative       0.90      0.93      0.92      2494

    accuracy                           0.95     10529
   macro avg       0.94      0.94      0.94     10529
weighted avg       0.95      0.95      0.95     10529

