In [1]:
!pip install transformers &> /dev/null
!pip install datasets &> /dev/null
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request
import pandas as pd
from datasets import Dataset, load_dataset, load_metric

task='fine-tuning'
MODEL = "tner/twitter-roberta-base-dec2021-tweetner7-random"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [2]:
text_df = pd.read_json("train_text.json",lines=True)
label_df = pd.read_json("train_truth.json", lines=True)

df = pd.concat([text_df,label_df],axis=1)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160 entries, 0 to 159
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   twitter user id  160 non-null    object
 1   texts            160 non-null    object
 2   tweet ids        160 non-null    object
 3   twitter user id  160 non-null    object
 4   class            160 non-null    object
dtypes: object(5)
memory usage: 6.4+ KB


In [4]:
df = df.drop(df.columns[[0,2,3]], axis=1)

In [5]:
tweet = []
label = []
for index, row in df.iterrows():
    for text in row['texts']:
        tweet.append(text['text'])
        label.append(row['class'])

In [6]:
tweet = pd.DataFrame(tweet)
label = pd.DataFrame(label)

In [7]:
sent_df = pd.concat([tweet,label],axis=1)

In [8]:
sent_df.columns = ["text","label"]

In [9]:
print(set(sent_df["label"]))

{'nano', 'macro', 'micro', 'no influencer', 'mega'}


In [10]:
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

sent_df["text"] = sent_df["text"].apply(preprocess)

In [11]:
train_dataset = sent_df[:600]
eval_dataset = sent_df[600:750]
test_dataset = sent_df[750:]

In [12]:
def label_encoder(label):
  labels = {"mega":1, "micro":2, "macro":3, "nano":4, "no influencer":5}
  return labels[label]

def encode_labels(dataframe):
      encoded_labels = []
      for r in dataframe["label"]:
        label = label_encoder(r)
        encoded_labels.append(label)
      dataframe.insert(2, "labels", encoded_labels, True)
      # converting pandas df back to dataset object
      return Dataset.from_pandas(dataframe)

train_dataset = encode_labels(train_dataset)
eval_dataset = encode_labels(eval_dataset)
test_dataset = encode_labels(test_dataset)

In [13]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at tner/twitter-roberta-base-dec2021-tweetner7-random were not used when initializing RobertaForSequenceClassification: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at tner/twitter-roberta-base-dec2021-tweetner7-random and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should prob

In [14]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
train_dataset = train_dataset.remove_columns(["label"])
train_dataset.set_format("torch", columns=["input_ids","attention_mask","labels"])

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

In [15]:
eval_dataset = eval_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.remove_columns(["label"])
eval_dataset.set_format("torch", columns=["input_ids","attention_mask","labels"])

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

In [16]:
test_dataset = test_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.remove_columns(["label"])
test_dataset.set_format("torch", columns=["input_ids","attention_mask","labels"])

Map:   0%|          | 0/179 [00:00<?, ? examples/s]

In [17]:
from transformers import TrainingArguments

training_args = TrainingArguments(output_dir="test_trainer")

In [18]:
!pip install evaluate
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [19]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [20]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

In [21]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [22]:
train_dataset

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 600
})

In [23]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.222456,0.453333
2,No log,1.05071,0.62
3,No log,1.076559,0.586667


TrainOutput(global_step=225, training_loss=1.0618190511067709, metrics={'train_runtime': 180.6337, 'train_samples_per_second': 9.965, 'train_steps_per_second': 1.246, 'total_flos': 473655179059200.0, 'train_loss': 1.0618190511067709, 'epoch': 3.0})