In [None]:
!pip install transformers[sentencepiece]



In [None]:
!pip install datasets



In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
# Location of the labelled training data.
train_path = '/data/train.csv'

# Read the CSV and replace every missing value with the placeholder string "NONE".
df = pd.read_csv(train_path).fillna("NONE")

# 85-15 split between the train and development sets; the fixed random_state keeps
# the split reproducible across runs.
train_df, dev_df = train_test_split(
    df,
    test_size=0.15,
    random_state=42,
)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
import torch
from datasets import Dataset

# Wrap the pandas splits as Hugging Face Datasets.
train_data = Dataset.from_pandas(train_df)
dev_data = Dataset.from_pandas(dev_df)

# Tokenizer, padding collator and a 3-class classification head on top of DistilBERT.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

# Use the first GPU when one is available; otherwise stay on the CPU so this cell
# does not raise on CPU-only runtimes.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.to(device)

# The case for the TEXT column doesn't matter all that much, but distilbert is expecting a lowercase 'label' column
train_data = train_data.rename_columns({'TEXT':'text','LABEL':'label'})
dev_data = dev_data.rename_columns({'TEXT':'text','LABEL':'label'})

def preprocess_function(examples):
    """Tokenize the ``text`` entries of ``examples`` with the notebook-level tokenizer.

    Truncation is enabled; padding is not applied here.
    Returns whatever the tokenizer call returns for the given texts.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

# Tokenize both splits in batches (train first, then dev).
train_data, dev_data = [
    split.map(preprocess_function, batched=True)
    for split in (train_data, dev_data)
]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

  0%|          | 0/60 [00:00<?, ?ba/s]

  0%|          | 0/11 [00:00<?, ?ba/s]

In [None]:
# Display the tokenized training set to check its columns and row count.
train_data

Dataset({
    features: ['ID', 'text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 59658
})

In [None]:
from transformers import TrainingArguments, Trainer

# Fine-tuning hyperparameters; checkpoints are written under ./results.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# The Trainer receives the model, both tokenized splits and the padding collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=dev_data,
    data_collator=data_collator,
)

trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: ID, __index_level_0__, text. If ID, __index_level_0__, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 59658
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 22374


Step,Training Loss
500,0.3332
1000,0.2132
1500,0.1998
2000,0.193
2500,0.1926
3000,0.1899
3500,0.1876
4000,0.1931
4500,0.1897
5000,0.1738


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000/config.json
Model weights saved in ./results/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-2500
Configuration saved in ./results/checkpoint-2500/config.json
Model weights saved in ./results/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-3000
Configuration saved in ./results/checkpoint-3

KeyboardInterrupt: ignored

In [None]:
# Location of the unlabelled test data.
test_path = '/data/test.csv'

# Same missing-value handling as the training data: fill with the string "NONE".
test_df = pd.read_csv(test_path).fillna("NONE")

# Convert to a Dataset, rename the text column to what preprocess_function reads,
# then tokenize in batches.
test_data = (
    Dataset.from_pandas(test_df)
    .rename_columns({'TEXT':'text'})
    .map(preprocess_function, batched=True)
)

  0%|          | 0/31 [00:00<?, ?ba/s]

In [None]:
# Number of rows in the test frame.
len(test_df)

30078

In [None]:
# Run the trainer's prediction loop over the tokenized test set; the raw outputs are
# read from `preds.predictions` in the following cells.
preds = trainer.predict(test_data)

The following columns in the test set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: ID, text. If ID, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 30078
  Batch size = 8


Step,Training Loss
500,0.3332
1000,0.2132
1500,0.1998
2000,0.193
2500,0.1926
3000,0.1899
3500,0.1876
4000,0.1931
4500,0.1897
5000,0.1738


In [None]:
# Sanity check: number of prediction rows, to compare against len(test_df).
len(preds.predictions)

30078

In [None]:
# Pick, for every row, the index of the highest-scoring column as the predicted class.
predictions = preds.predictions.argmax(axis=1)

In [None]:
# Pair every test ID with its predicted class.
submission = pd.DataFrame({
    'ID': test_df['ID'],
    'Predicted': predictions,
})

# Submission csv (written without the index column)
submission.to_csv("submission.csv", index=False)

In [None]:
# Persist the fine-tuned model. The Google Drive destination used previously was:
#   /content/drive/MyDrive/Ling539/submission_checkpoint
# NOTE(review): the path below sits at the filesystem root - confirm that is intended.
checkpoint_dir = '/submission_checkpoint'
trainer.save_model(checkpoint_dir)

Saving model checkpoint to /content/drive/MyDrive/Ling539/submission_checkpoint
Configuration saved in /content/drive/MyDrive/Ling539/submission_checkpoint/config.json
Model weights saved in /content/drive/MyDrive/Ling539/submission_checkpoint/pytorch_model.bin
