In [1]:
from datasets import load_from_disk
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, pipeline





In [2]:
ds = load_from_disk('/Users/kailiu/LLMProject/data/imdb_dataset')
train_ds = ds['train']
test_ds = ds['test']



In [8]:

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")


pipe = pipeline(
    "text-classification", 
    model="distilbert-base-uncased-finetuned-sst-2-english",
    tokenizer=tokenizer,
    device='mps',
    truncation=True,  # Truncate sequences longer than the model's max length
    padding=True      # Pad sequences to the model's max length
)



In [9]:
train_preds = pipe(train_ds['text'])
print(train_preds)


[{'label': 'POSITIVE', 'score': 0.7872871160507202}, {'label': 'NEGATIVE', 'score': 0.9991909861564636}, {'label': 'NEGATIVE', 'score': 0.998217761516571}, {'label': 'POSITIVE', 'score': 0.814460277557373}, {'label': 'NEGATIVE', 'score': 0.9993877410888672}, {'label': 'NEGATIVE', 'score': 0.9989563226699829}, {'label': 'NEGATIVE', 'score': 0.9994860887527466}, {'label': 'NEGATIVE', 'score': 0.9961752891540527}, {'label': 'NEGATIVE', 'score': 0.989343523979187}, {'label': 'NEGATIVE', 'score': 0.9909103512763977}, {'label': 'POSITIVE', 'score': 0.6470481753349304}, {'label': 'NEGATIVE', 'score': 0.9972400665283203}, {'label': 'NEGATIVE', 'score': 0.9925944209098816}, {'label': 'NEGATIVE', 'score': 0.996019184589386}, {'label': 'NEGATIVE', 'score': 0.9810841679573059}, {'label': 'NEGATIVE', 'score': 0.9997406601905823}, {'label': 'NEGATIVE', 'score': 0.9994814991950989}, {'label': 'NEGATIVE', 'score': 0.9997113347053528}, {'label': 'NEGATIVE', 'score': 0.9996881484985352}, {'label': 'NEGA

In [10]:
# convert the train_preds to a dataframe
import pandas as pd
train_preds_df = pd.DataFrame(train_preds)
print(train_preds_df)


          label     score
0      POSITIVE  0.787287
1      NEGATIVE  0.999191
2      NEGATIVE  0.998218
3      POSITIVE  0.814460
4      NEGATIVE  0.999388
...         ...       ...
24995  NEGATIVE  0.998632
24996  POSITIVE  0.994772
24997  POSITIVE  0.998151
24998  NEGATIVE  0.968497
24999  POSITIVE  0.999594

[25000 rows x 2 columns]


In [11]:
# label the train_preds_df
train_preds_df['label'] = train_preds_df['label'].apply(lambda x: 1 if x == 'POSITIVE' else 0)
print(train_preds_df)



       label     score
0          1  0.787287
1          0  0.999191
2          0  0.998218
3          1  0.814460
4          0  0.999388
...      ...       ...
24995      0  0.998632
24996      1  0.994772
24997      1  0.998151
24998      0  0.968497
24999      1  0.999594

[25000 rows x 2 columns]


In [12]:
# compute the accuracy
metric = evaluate.load("accuracy")
accuracy = metric.compute(predictions=train_preds_df['label'], references=train_ds['label'])
print(accuracy)

{'accuracy': 0.88852}


In [13]:
# test data for model
test_preds = pipe(test_ds['text'])
test_preds_df = pd.DataFrame(test_preds)
test_preds_df['label'] = test_preds_df['label'].apply(lambda x: 1 if x == 'POSITIVE' else 0)
accuracy = metric.compute(predictions=test_preds_df['label'], references=test_ds['label'])
print(accuracy)

{'accuracy': 0.89072}
