In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

from openai import OpenAI
import os

# Load the Data

In [None]:
df = pd.read_csv("interiority_gold_final.csv")
df

Unnamed: 0,title,paragraph,gold_label
0,A Room with a View,"“I have been a failure,” said Miss Bartlett, a...",high
1,A Room with a View,Lucy paused. “Cecil said one day—and I thought...,low
2,A Room with a View,"Miss Bartlett, who was poor at figures, became...",high
3,A Room with a View,An engagement is so potent a thing that sooner...,low
4,A Room with a View,“In the course of conversation they said that ...,low
...,...,...,...
592,The murder of Roger Ackroyd,Caroline does not care a hang for woods at any...,high
593,The murder of Roger Ackroyd,"After the evening talk I have just chronicled,...",low
594,The murder of Roger Ackroyd,“Now I have made it my business to find out mo...,none
595,The murder of Roger Ackroyd,Raymond pushed his chair away from the table v...,low


In [None]:
df['gold_label'].value_counts()

Unnamed: 0_level_0,count
gold_label,Unnamed: 1_level_1
none,237
high,204
low,156


In [None]:
mapping = {"none":0, "low":1, "high":2}
df["y"] = df["gold_label"].map(mapping)
X = df["paragraph"].tolist()
y = df["y"].tolist()


In [None]:
df.head(10)

Unnamed: 0,title,paragraph,gold_label,y
0,A Room with a View,"“I have been a failure,” said Miss Bartlett, a...",high,2
1,A Room with a View,Lucy paused. “Cecil said one day—and I thought...,low,1
2,A Room with a View,"Miss Bartlett, who was poor at figures, became...",high,2
3,A Room with a View,An engagement is so potent a thing that sooner...,low,1
4,A Room with a View,“In the course of conversation they said that ...,low,1
5,A Room with a View,"Miss Bartlett only sighed, and enveloped her i...",high,2
6,A Room with a View,"“The point is, we have warred with it. Look.” ...",none,0
7,A Room with a View,The young man named George glanced at the clev...,low,1
8,A Room with a View,“But my feelings are of no importance. I know ...,high,2
9,A Room with a View,“Indeed you may!” he cried. “Here we are with ...,low,1


## train-test split randomly

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


## train-test split based on books

In [None]:
df.title.value_counts()

Unnamed: 0_level_0,count
title,Unnamed: 1_level_1
A farewell to arms,40
Dubliners,40
Martin Eden,40
My Ántonia,40
Mrs. Dalloway,40
The Age of Innocence,40
Swann's Way,40
The Picture of Dorian Gray,40
The Great Gatsby,40
The Dunwich horror,40


In [None]:
test_books = [
    "Dubliners",
    "The Picture of Dorian Gray",
    "My Ántonia",
    "The murder of Roger Ackroyd",
    "A farewell to arms",
    "The Garden Party, and Other Stories"
]
test_df2 = df[df["title"].isin(test_books)]
train_df2 = df[~df["title"].isin(test_books)]
X_train2 = train_df2["paragraph"]
y_train2 = train_df2["gold_label"]

X_test2 = test_df2["paragraph"]
y_test2 = test_df2["gold_label"]


In [None]:
print("Train size:", len(train_df2))
print("Test size:", len(test_df2))


Train size: 357
Test size: 240


# BASELINE + BERT (use random train-test split)

## CountVectorizer

In [None]:
## logistic regression
vectorizer = CountVectorizer(ngram_range=(1,2), min_df=2)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

clf = LogisticRegression(max_iter=3000, class_weight="balanced")
clf.fit(X_train_vec, y_train)

pred = clf.predict(X_test_vec)
print(classification_report(y_test, pred, target_names=mapping.keys()))


              precision    recall  f1-score   support

        none       0.62      0.67      0.64        48
         low       0.32      0.32      0.32        31
        high       0.62      0.56      0.59        41

    accuracy                           0.54       120
   macro avg       0.52      0.52      0.52       120
weighted avg       0.54      0.54      0.54       120



In [None]:
## naive bayes
nb = MultinomialNB()
nb.fit(X_train_vec, y_train)
pred = nb.predict(X_test_vec)
print(classification_report(y_test, pred, target_names=mapping.keys()))


              precision    recall  f1-score   support

        none       0.59      0.48      0.53        48
         low       0.55      0.19      0.29        31
        high       0.46      0.78      0.58        41

    accuracy                           0.51       120
   macro avg       0.53      0.48      0.46       120
weighted avg       0.53      0.51      0.48       120



## TF-IDF

In [None]:
## Logistic Regression
tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=2)
X_train_t = tfidf.fit_transform(X_train)
X_test_t = tfidf.transform(X_test)

clf_tfidf = LogisticRegression(max_iter=3000, class_weight="balanced")
clf_tfidf.fit(X_train_t, y_train)

pred = clf_tfidf.predict(X_test_t)
print(classification_report(y_test, pred, target_names=mapping.keys()))


              precision    recall  f1-score   support

        none       0.56      0.50      0.53        48
         low       0.38      0.29      0.33        31
        high       0.57      0.73      0.64        41

    accuracy                           0.53       120
   macro avg       0.50      0.51      0.50       120
weighted avg       0.51      0.53      0.51       120



In [None]:
## Naive Bayes
nb_tfidf = MultinomialNB()
nb_tfidf.fit(X_train_t, y_train)
pred = nb_tfidf.predict(X_test_t)
print(classification_report(y_test, pred, target_names=mapping.keys()))


              precision    recall  f1-score   support

        none       0.56      0.73      0.63        48
         low       0.00      0.00      0.00        31
        high       0.51      0.71      0.59        41

    accuracy                           0.53       120
   macro avg       0.35      0.48      0.41       120
weighted avg       0.40      0.53      0.45       120



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Small Fine-tuned Transformers

In [None]:
# pip install transformers datasets accelerate


In [None]:
from datasets import Dataset

train_df = pd.DataFrame({"text": X_train, "label": y_train})
test_df = pd.DataFrame({"text": X_test, "label": y_test})

train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)


In [None]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=3)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### RoBERTa-base

In [None]:
## RoBERTa-base
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification

model_name = "roberta-base"
tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=3)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=256)

train_tok = train_ds.map(tokenize, batched=True)
test_tok = test_ds.map(tokenize, batched=True)

train_tok = train_tok.remove_columns(["text"])
test_tok = test_tok.remove_columns(["text"])

train_tok.set_format("torch")
test_tok.set_format("torch")


Map:   0%|          | 0/477 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

In [None]:
from transformers import TrainingArguments, Trainer
import os
os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=test_tok,
)
trainer.train()


Epoch,Training Loss,Validation Loss
1,No log,1.079372
2,No log,0.760943
3,No log,0.773891
4,No log,0.812624
5,No log,0.774407


TrainOutput(global_step=300, training_loss=0.730302988688151, metrics={'train_runtime': 185.4864, 'train_samples_per_second': 12.858, 'train_steps_per_second': 1.617, 'total_flos': 313762750640640.0, 'train_loss': 0.730302988688151, 'epoch': 5.0})

In [None]:
pred = trainer.predict(test_tok)
import numpy as np
y_pred = np.argmax(pred.predictions, axis=1)
print(classification_report(y_test, y_pred, target_names=mapping.keys()))


              precision    recall  f1-score   support

        none       0.71      0.83      0.77        48
         low       0.48      0.35      0.41        31
        high       0.76      0.76      0.76        41

    accuracy                           0.68       120
   macro avg       0.65      0.65      0.64       120
weighted avg       0.67      0.68      0.67       120



### DistilBERT

In [None]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

model_name = "distilbert-base-uncased"

tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=256
    )

train_tok = train_ds.map(tokenize, batched=True)
test_tok = test_ds.map(tokenize, batched=True)

train_tok = train_tok.remove_columns(["text"])
test_tok = test_tok.remove_columns(["text"])

train_tok.set_format("torch")
test_tok.set_format("torch")


Map:   0%|          | 0/477 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./distilbert_results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    logging_strategy="epoch"
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=test_tok,
)

trainer.train()


Epoch,Training Loss,Validation Loss
1,1.0572,0.984552
2,0.8548,0.851566
3,0.6366,0.768127
4,0.4747,0.768711
5,0.3822,0.749108


TrainOutput(global_step=300, training_loss=0.6810756238301595, metrics={'train_runtime': 78.6746, 'train_samples_per_second': 30.315, 'train_steps_per_second': 3.813, 'total_flos': 157970190021120.0, 'train_loss': 0.6810756238301595, 'epoch': 5.0})

In [None]:
pred = trainer.predict(test_tok)

import numpy as np
y_pred = np.argmax(pred.predictions, axis=1)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=mapping.keys()))


              precision    recall  f1-score   support

        none       0.77      0.77      0.77        48
         low       0.52      0.48      0.50        31
        high       0.77      0.80      0.79        41

    accuracy                           0.71       120
   macro avg       0.69      0.69      0.69       120
weighted avg       0.70      0.71      0.71       120



### BERT-large

In [None]:
from transformers import BertTokenizerFast, BertForSequenceClassification

model_name = "bert-large-uncased"

tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3
)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=256
    )

train_tok = train_ds.map(tokenize, batched=True)
test_tok = test_ds.map(tokenize, batched=True)

train_tok = train_tok.remove_columns(["text"])
test_tok = test_tok.remove_columns(["text"])

train_tok.set_format("torch")
test_tok.set_format("torch")


Map:   0%|          | 0/477 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./bert_large_results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=4,
    weight_decay=0.01,
    load_best_model_at_end=True,
    logging_strategy="epoch"
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=test_tok,
)

trainer.train()


Epoch,Training Loss,Validation Loss
1,1.1185,1.092403
2,0.9508,0.848686
3,0.7257,1.203365
4,0.3977,1.193404


TrainOutput(global_step=956, training_loss=0.7981764103079441, metrics={'train_runtime': 576.3634, 'train_samples_per_second': 3.31, 'train_steps_per_second': 1.659, 'total_flos': 889065524570112.0, 'train_loss': 0.7981764103079441, 'epoch': 4.0})

In [None]:
pred = trainer.predict(test_tok)

import numpy as np
y_pred = np.argmax(pred.predictions, axis=1)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=mapping.keys()))


              precision    recall  f1-score   support

        none       0.73      0.62      0.67        48
         low       0.37      0.32      0.34        31
        high       0.65      0.83      0.73        41

    accuracy                           0.62       120
   macro avg       0.59      0.59      0.58       120
weighted avg       0.61      0.62      0.61       120



# BASELINE + BERT (use new train-test split)

## CountVectorizer

In [None]:
## logistic regression
vectorizer = CountVectorizer(ngram_range=(1,2), min_df=2)
X_train_vec = vectorizer.fit_transform(X_train2)
X_test_vec = vectorizer.transform(X_test2)

clf = LogisticRegression(max_iter=3000, class_weight="balanced")
clf.fit(X_train_vec, y_train2)

pred = clf.predict(X_test_vec)
print(classification_report(y_test2, pred, target_names=mapping.keys()))


              precision    recall  f1-score   support

        none       0.53      0.55      0.54        71
         low       0.30      0.28      0.29        60
        high       0.59      0.60      0.59       109

    accuracy                           0.50       240
   macro avg       0.47      0.48      0.48       240
weighted avg       0.50      0.50      0.50       240



In [None]:
## naive bayes
nb = MultinomialNB()
nb.fit(X_train_vec, y_train2)
pred = nb.predict(X_test_vec)
print(classification_report(y_test2, pred, target_names=mapping.keys()))


              precision    recall  f1-score   support

        none       0.40      0.80      0.53        71
         low       0.30      0.17      0.22        60
        high       0.63      0.37      0.47       109

    accuracy                           0.45       240
   macro avg       0.44      0.45      0.40       240
weighted avg       0.48      0.45      0.42       240



## TF-IDF

In [None]:
## Logistic Regression
tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=2)
X_train_t = tfidf.fit_transform(X_train2)
X_test_t = tfidf.transform(X_test2)

clf_tfidf = LogisticRegression(max_iter=3000, class_weight="balanced")
clf_tfidf.fit(X_train_t, y_train2)

pred = clf_tfidf.predict(X_test_t)
print(classification_report(y_test2, pred, target_names=mapping.keys()))


              precision    recall  f1-score   support

        none       0.51      0.66      0.58        71
         low       0.20      0.18      0.19        60
        high       0.56      0.49      0.52       109

    accuracy                           0.46       240
   macro avg       0.43      0.44      0.43       240
weighted avg       0.46      0.46      0.46       240



In [None]:
## Naive Bayes
nb_tfidf = MultinomialNB()
nb_tfidf.fit(X_train_t, y_train2)
pred = nb_tfidf.predict(X_test_t)
print(classification_report(y_test2, pred, target_names=mapping.keys()))


              precision    recall  f1-score   support

        none       0.32      0.93      0.48        71
         low       0.00      0.00      0.00        60
        high       0.69      0.23      0.34       109

    accuracy                           0.38       240
   macro avg       0.34      0.39      0.27       240
weighted avg       0.41      0.38      0.30       240



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Small Fine-tuned Transformers

In [None]:
# pip install transformers datasets accelerate


In [None]:
from datasets import Dataset

# Map string labels to integers before creating the DataFrame
train_df = pd.DataFrame({"text": X_train2, "label": y_train2.map(mapping)})
test_df = pd.DataFrame({"text": X_test2, "label": y_test2.map(mapping)})

train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)


In [None]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=3)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### RoBERTa-base

In [None]:
## RoBERTa-base
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification

model_name = "roberta-base"
tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=3)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=256)

train_tok = train_ds.map(tokenize, batched=True)
test_tok = test_ds.map(tokenize, batched=True)

train_tok = train_tok.remove_columns(["text"])
test_tok = test_tok.remove_columns(["text"])

train_tok.set_format("torch")
test_tok.set_format("torch")


Map:   0%|          | 0/357 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

In [None]:
from transformers import TrainingArguments, Trainer
import os
os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=test_tok,
)
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,1.057651
2,No log,0.837321
3,No log,0.848584
4,No log,0.971197
5,No log,1.00038


TrainOutput(global_step=225, training_loss=0.6986124674479167, metrics={'train_runtime': 164.3428, 'train_samples_per_second': 10.861, 'train_steps_per_second': 1.369, 'total_flos': 234828725322240.0, 'train_loss': 0.6986124674479167, 'epoch': 5.0})

In [None]:
pred = trainer.predict(test_tok)
import numpy as np
y_pred = np.argmax(pred.predictions, axis=1)

y_test2_int = y_test2.map(mapping)

print(classification_report(y_test2_int, y_pred, target_names=mapping.keys()))

              precision    recall  f1-score   support

        none       0.61      0.88      0.72       109
         low       0.25      0.08      0.12        60
        high       0.73      0.63      0.68        71

    accuracy                           0.61       240
   macro avg       0.53      0.53      0.51       240
weighted avg       0.55      0.61      0.56       240



### DistilBERT

In [None]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

model_name = "distilbert-base-uncased"

tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=256
    )

train_tok = train_ds.map(tokenize, batched=True)
test_tok = test_ds.map(tokenize, batched=True)

train_tok = train_tok.remove_columns(["text"])
test_tok = test_tok.remove_columns(["text"])

train_tok.set_format("torch")
test_tok.set_format("torch")

Map:   0%|          | 0/357 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./distilbert_results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    logging_strategy="epoch"
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=test_tok,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,1.0897,1.054303
2,0.9548,0.931263
3,0.7383,0.864027
4,0.5582,0.879548
5,0.4705,0.863584


TrainOutput(global_step=225, training_loss=0.7622845628526476, metrics={'train_runtime': 181.6301, 'train_samples_per_second': 9.828, 'train_steps_per_second': 1.239, 'total_flos': 118229261713920.0, 'train_loss': 0.7622845628526476, 'epoch': 5.0})

In [None]:
from transformers import Trainer
import accelerate.state

# Explicitly reset AcceleratorState before re-instantiating Trainer
accelerate.state.AcceleratorState._reset_state()

# Re-instantiate the Trainer to ensure a fresh Accelerator state
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=test_tok,
)

pred = trainer.predict(test_tok)

import numpy as np
y_pred = np.argmax(pred.predictions, axis=1)

from sklearn.metrics import classification_report

y_test2_int = y_test2.map(mapping)

print(classification_report(y_test2_int, y_pred, target_names=mapping.keys()))

              precision    recall  f1-score   support

        none       0.69      0.64      0.67       109
         low       0.25      0.25      0.25        60
        high       0.65      0.72      0.68        71

    accuracy                           0.57       240
   macro avg       0.53      0.54      0.53       240
weighted avg       0.57      0.57      0.57       240



### BERT-large

In [None]:
from transformers import BertTokenizerFast, BertForSequenceClassification

model_name = "bert-large-uncased"

tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3
)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=256
    )

train_tok = train_ds.map(tokenize, batched=True)
test_tok = test_ds.map(tokenize, batched=True)

train_tok = train_tok.remove_columns(["text"])
test_tok = test_tok.remove_columns(["text"])

train_tok.set_format("torch")
test_tok.set_format("torch")

Map:   0%|          | 0/357 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./bert_large_results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=4,
    weight_decay=0.01,
    load_best_model_at_end=True,
    logging_strategy="epoch"
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=test_tok,
)

trainer.train()


Epoch,Training Loss,Validation Loss
1,1.0923,1.003079
2,0.8303,1.007034
3,0.5823,1.354937
4,0.4029,1.352136


TrainOutput(global_step=716, training_loss=0.7269533572916212, metrics={'train_runtime': 939.6154, 'train_samples_per_second': 1.52, 'train_steps_per_second': 0.762, 'total_flos': 665401241659392.0, 'train_loss': 0.7269533572916212, 'epoch': 4.0})

In [None]:
pred = trainer.predict(test_tok)

import numpy as np
y_pred = np.argmax(pred.predictions, axis=1)

from sklearn.metrics import classification_report

y_test2_int = y_test2.map(mapping)
print(classification_report(y_test2_int, y_pred, target_names=mapping.keys()))

              precision    recall  f1-score   support

        none       0.63      0.70      0.66       109
         low       0.00      0.00      0.00        60
        high       0.46      0.77      0.58        71

    accuracy                           0.55       240
   macro avg       0.36      0.49      0.41       240
weighted avg       0.42      0.55      0.47       240



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# GPT (use new train-test split)

In [None]:
def classify_with_gpt(text, model, api_key, system_prompt, user_prompt_template, temperature=0):
    client = OpenAI(api_key=api_key)

    # Format the user prompt with the text
    user_prompt = user_prompt_template.format(text=text)

    # Prepare arguments for client.chat.completions.create
    completion_kwargs = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
    }
    # Only add temperature if it's not None, allowing model default if None is passed
    if temperature is not None:
        completion_kwargs["temperature"] = temperature

    response = client.chat.completions.create(**completion_kwargs)

    return response.choices[0].message.content.strip().lower()

In [None]:
def evaluate_gpt_model(X_test, y_test, model, api_key, system_prompt, user_prompt_template, temperature=0):

    label_map = {"none": 0, "low": 1, "high": 2}
    preds = []

    # Convert y_test to integers
    y_test_int = [label_map[label.lower().strip()] for label in y_test]

    for text in X_test:
        raw = classify_with_gpt(
            text=text,
            model=model,
            api_key=api_key,
            system_prompt=system_prompt,
            user_prompt_template=user_prompt_template,
            temperature=temperature
        )
        preds.append(label_map.get(raw, -1))

    report = classification_report(y_test_int, preds, target_names=["none", "low", "high"], digits=2)
    return report, preds

## GPT 3.5 turbo

### zero-shot

In [None]:
system_prompt_zero = (
    "You are a classifier for literary interiority in fiction. "
    "Interiority refers to moments when the text gives access to a character’s inner thoughts, "
    "feelings, or perceptions, rather than only external actions or events. "
    "Label each paragraph as exactly one of: "
    "high (explicit access to inner experience), "
    "low (indirect or ambiguous hints), "
    "none (only external description, actions, or spoken dialogue). "
    "Spoken dialogue alone does not count as interiority unless the text also explicitly reveals inner thoughts or feelings. "
    "Output only one word in lowercase: high, low, or none."
)

user_prompt_zero = """
Classify the interiority level of the following paragraph as high, low, or none:

\"\"\"{text}\"\"\"
""".strip()


In [None]:
api_key = "YOUR API KEY"

report, preds = evaluate_gpt_model(
    X_test=X_test2,
    y_test=y_test2,
    model="gpt-3.5-turbo",
    api_key=api_key,
    system_prompt=system_prompt_zero,
    user_prompt_template=user_prompt_zero
)

print(report)


              precision    recall  f1-score   support

        none       0.92      0.21      0.34       109
         low       0.28      0.68      0.40        60
        high       0.62      0.59      0.60        71

    accuracy                           0.44       240
   macro avg       0.61      0.50      0.45       240
weighted avg       0.67      0.44      0.43       240



### Few-shot

In [None]:
system_prompt_few = (
    "You are a classifier for literary interiority in fiction. "
    "Interiority refers to moments when the text gives access to a character’s inner thoughts, "
    "feelings, or perceptions. "
    "Label each paragraph as exactly one of: high, low, none. "
    "Spoken dialogue alone does not count as interiority.\n\n"

    "Examples:\n"
    "“So, thought Septimus, looking up, they are signalling to me.” → high\n"
    "“At first, he stood there still, looking at the ground as if the contents of his head were rearranging themselves into new positions.” → low\n"
    "“The wind rose in the night and rain came in sheets as the Croatians crossed the mountain meadows and fought in the dark.” → none\n"
    "“Come on, I said. Get in.” → none\n\n"

    "Output only one word in lowercase: high, low, or none."
)

user_prompt_few = """
Classify the interiority level of the following paragraph as high, low, or none:

\"\"\"{text}\"\"\"
""".strip()


In [None]:
report, preds = evaluate_gpt_model(
    X_test=X_test2,
    y_test=y_test2,
    model="gpt-3.5-turbo",
    api_key=api_key,
    system_prompt=system_prompt_few,
    user_prompt_template=user_prompt_few
)

print(report)

              precision    recall  f1-score   support

        none       0.83      0.05      0.09       109
         low       0.20      0.33      0.25        60
        high       0.42      0.80      0.55        71

    accuracy                           0.34       240
   macro avg       0.49      0.39      0.30       240
weighted avg       0.55      0.34      0.27       240



## GPT 4.0


### zero-shot

In [None]:
report_4o_zero, preds_4o_zero = evaluate_gpt_model(
    X_test=X_test2,
    y_test=y_test2,
    model="gpt-4o",
    api_key=api_key,
    system_prompt=system_prompt_zero,
    user_prompt_template=user_prompt_zero
)
print(report_4o_zero)


              precision    recall  f1-score   support

        none       0.69      0.78      0.73       109
         low       0.37      0.33      0.35        60
        high       0.76      0.68      0.72        71

    accuracy                           0.64       240
   macro avg       0.61      0.60      0.60       240
weighted avg       0.63      0.64      0.63       240



### few-shot

In [None]:
report_4o_few, preds_4o_few = evaluate_gpt_model(
    X_test=X_test2,
    y_test=y_test2,
    model="gpt-4o",
    api_key=api_key,
    system_prompt=system_prompt_few,
    user_prompt_template=user_prompt_few
)
print(report_4o_few)


              precision    recall  f1-score   support

        none       0.67      0.77      0.72       109
         low       0.38      0.32      0.35        60
        high       0.72      0.66      0.69        71

    accuracy                           0.62       240
   macro avg       0.59      0.58      0.58       240
weighted avg       0.61      0.62      0.62       240



## GPT 4.1 mini

### zero-shot

In [None]:
report_41mini_zero, preds_41mini_zero = evaluate_gpt_model(
    X_test=X_test2,
    y_test=y_test2,
    model="gpt-4.1-mini",
    api_key=api_key,
    system_prompt=system_prompt_zero,
    user_prompt_template=user_prompt_zero
)
print(report_41mini_zero)


              precision    recall  f1-score   support

        none       0.76      0.58      0.66       109
         low       0.28      0.28      0.28        60
        high       0.60      0.82      0.69        71

    accuracy                           0.57       240
   macro avg       0.55      0.56      0.54       240
weighted avg       0.59      0.57      0.57       240



### few-shot

In [None]:
report_41mini_few, preds_41mini_few = evaluate_gpt_model(
    X_test=X_test2,
    y_test=y_test2,
    model="gpt-4.1-mini",
    api_key=api_key,
    system_prompt=system_prompt_few,
    user_prompt_template=user_prompt_few
)
print(report_41mini_few)


              precision    recall  f1-score   support

        none       0.75      0.57      0.65       109
         low       0.38      0.48      0.42        60
        high       0.69      0.77      0.73        71

    accuracy                           0.61       240
   macro avg       0.60      0.61      0.60       240
weighted avg       0.64      0.61      0.61       240



## GPT 4.1

### zero-shot

In [None]:
report_41_zero, preds_41_zero = evaluate_gpt_model(
    X_test=X_test2,
    y_test=y_test2,
    model="gpt-4.1",
    api_key=api_key,
    system_prompt=system_prompt_zero,
    user_prompt_template=user_prompt_zero
)
print(report_41_zero)


              precision    recall  f1-score   support

        none       0.82      0.56      0.67       109
         low       0.32      0.27      0.29        60
        high       0.56      0.92      0.70        71

    accuracy                           0.59       240
   macro avg       0.57      0.58      0.55       240
weighted avg       0.62      0.59      0.58       240



### few-shot

In [None]:
report_41_few, preds_41_few = evaluate_gpt_model(
    X_test=X_test2,
    y_test=y_test2,
    model="gpt-4.1",
    api_key=api_key,
    system_prompt=system_prompt_few,
    user_prompt_template=user_prompt_few
)
print(report_41_few)


              precision    recall  f1-score   support

        none       0.82      0.53      0.64       109
         low       0.29      0.33      0.31        60
        high       0.61      0.85      0.71        71

    accuracy                           0.57       240
   macro avg       0.57      0.57      0.55       240
weighted avg       0.62      0.57      0.58       240



## GPT 5

### zero-shot


In [None]:
report_5_zero, preds_5_zero = evaluate_gpt_model(
    X_test=X_test2,
    y_test=y_test2,
    model="gpt-5-nano",
    api_key=api_key,
    system_prompt=system_prompt_zero,
    user_prompt_template=user_prompt_zero,
    temperature=1
)
print(report_5_zero)

              precision    recall  f1-score   support

        none       0.89      0.50      0.64       109
         low       0.29      0.18      0.22        60
        high       0.46      0.92      0.61        71

    accuracy                           0.54       240
   macro avg       0.55      0.53      0.49       240
weighted avg       0.61      0.54      0.53       240



### few-shot

In [None]:
report_5_few, preds_5_few = evaluate_gpt_model(
    X_test=X_test2,
    y_test=y_test2,
    model="gpt-5-nano",
    api_key=api_key,
    system_prompt=system_prompt_few,
    user_prompt_template=user_prompt_few,
    temperature=1
)
print(report_5_few)

              precision    recall  f1-score   support

        none       0.79      0.53      0.64       109
         low       0.30      0.27      0.28        60
        high       0.54      0.87      0.67        71

    accuracy                           0.57       240
   macro avg       0.55      0.56      0.53       240
weighted avg       0.60      0.57      0.56       240

