In [None]:
#!pip install --upgrade transformers


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

## Preprocess data

In [None]:
df = pd.read_csv("interiority_gold_final.csv")
df

Unnamed: 0,title,paragraph,gold_label
0,A Room with a View,"“I have been a failure,” said Miss Bartlett, a...",high
1,A Room with a View,Lucy paused. “Cecil said one day—and I thought...,low
2,A Room with a View,"Miss Bartlett, who was poor at figures, became...",high
3,A Room with a View,An engagement is so potent a thing that sooner...,low
4,A Room with a View,“In the course of conversation they said that ...,low
...,...,...,...
592,The murder of Roger Ackroyd,Caroline does not care a hang for woods at any...,high
593,The murder of Roger Ackroyd,"After the evening talk I have just chronicled,...",low
594,The murder of Roger Ackroyd,“Now I have made it my business to find out mo...,none
595,The murder of Roger Ackroyd,Raymond pushed his chair away from the table v...,low


In [None]:
df['gold_label'].value_counts()

Unnamed: 0_level_0,count
gold_label,Unnamed: 1_level_1
none,237
high,204
low,156


In [None]:
df['title'].unique()

array(['A Room with a View', 'A farewell to arms', 'Dubliners',
       'Martin Eden', 'Metamorphosis', 'Mrs. Dalloway', 'My Ántonia',
       "Swann's Way", 'The Age of Innocence',
       'The Awakening, and Selected Short Stories', 'The Dunwich horror',
       'The Garden Party, and Other Stories', 'The Great Gatsby',
       'The Picture of Dorian Gray', 'The murder of Roger Ackroyd'],
      dtype=object)

In [None]:
# Map the string gold labels to ordinal integers (none < low < high).
mapping = {"none": 0, "low": 1, "high": 2}
df["y"] = df["gold_label"].map(mapping)

# Hold out 6 whole books as the test set so that no book contributes
# paragraphs to both splits.
test_books = [
    "Dubliners",
    "The Picture of Dorian Gray",
    "My Ántonia",
    "The murder of Roger Ackroyd",
    "A farewell to arms",
    "The Garden Party, and Other Stories",
]
is_test_book = df["title"].isin(test_books)

# .copy() makes both splits independent frames. Without it they are filtered
# slices of df, and adding prediction columns to them later triggers pandas'
# SettingWithCopyWarning.
test_df = df[is_test_book].copy()
train_df = df[~is_test_book].copy()
print("Train size:", len(train_df))
print("Test size:", len(test_df))

Train size: 357
Test size: 240


In [None]:
# Plain Python lists of paragraph texts and string gold labels for each split.
# These read from train_df / test_df, the frames produced by the book-level
# split; the previous names (training_data / test_data) were never defined.
X_train = train_df["paragraph"].tolist()
y_train = train_df["gold_label"].tolist()

X_test = test_df["paragraph"].tolist()
y_test = test_df["gold_label"].tolist()


## Llama 3.3 70B (Groq)


In [None]:
!pip install groq

Collecting groq
  Downloading groq-0.37.1-py3-none-any.whl.metadata (16 kB)
Downloading groq-0.37.1-py3-none-any.whl (137 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.37.1


In [None]:
import os
from getpass import getpass

from groq import Groq

# Never store the API key in the notebook: read it from the GROQ_API_KEY
# environment variable, or prompt for it interactively when it is not set.
# NOTE(review): a key was previously committed in this cell and should be
# treated as leaked -- revoke it in the Groq console.
groq_api_key = os.environ.get("GROQ_API_KEY") or getpass("Groq API key: ")
client = Groq(api_key=groq_api_key)

# Smoke test: one short completion to confirm the client and model are reachable.
response = client.chat.completions.create(
    model="llama-3.3-70b-versatile",
    messages=[
        {"role": "user", "content": "Explain large language models in two sentences"}
    ]
)

print(response.choices[0].message.content)


Large language models are artificial intelligence systems that use complex algorithms to process and generate human-like language, trained on vast amounts of text data to learn patterns and relationships between words, phrases, and ideas. By leveraging this training, these models can perform a range of tasks, including language translation, text summarization, and conversation generation, with applications in areas such as customer service, content creation, and language learning.


##




### Zero-Shot

In [None]:
def classify_interiority(paragraph: str, model: str = "llama-3.3-70b-versatile") -> str:
    """
    Zero-shot classification of a fiction paragraph's literary interiority.

    Sends the paragraph to a Groq-hosted chat model through the module-level
    `client` and parses the reply into a label.

    Args:
        paragraph: Text of the paragraph to classify.
        model: Groq model identifier to query.

    Returns:
        One of "high", "low" or "none". Replies that contain none of these
        words (or an empty reply) fall back to "none".
    """
    system_prompt = (
        "You are a classifier for literary interiority in fiction. "
        "Interiority refers to moments when the text gives access to a character’s inner thoughts, "
        "feelings, or perceptions, rather than only external actions or events. "
        "Label each paragraph as exactly one of: "
        "high (explicit access to inner experience), "
        "low (indirect or ambiguous hints), "
        "none (only external description, actions, or spoken dialogue). "
        "Spoken dialogue alone does not count as interiority unless the text also explicitly "
        "reveals inner thoughts or feelings. "
        "Output only one word in lowercase: high, low, or none."
    )

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": f"Paragraph:\n{paragraph}\n\nLabel this paragraph:",
            },
        ],
        max_tokens=5,
        temperature=0.0,
    )

    # The content can be None; treat that like an empty reply.
    reply = response.choices[0].message.content or ""

    # Turn every non-letter into a space so that replies such as "High.",
    # "low\n" or "Label: none" still parse, instead of silently becoming "none".
    words = "".join(ch if ch.isalpha() else " " for ch in reply.lower()).split()
    valid_labels = ("high", "low", "none")
    return next((word for word in words if word in valid_labels), "none")

In [None]:
# Try the zero-shot classifier on the paragraph stored under index label 6.
test_text = df.loc[6, "paragraph"]
print(f"Paragraph: {test_text}")
print(f"Prediction: {classify_interiority(test_text)}")

Paragraph: “The point is, we have warred with it. Look.” He pointed to the Val d’Arno, which was visible far below them, through the budding trees. “Fifty miles of Spring, and we’ve come up to admire them. Do you suppose there’s any difference between Spring in nature and Spring in man? But there we go, praising the one and condemning the other as improper, ashamed that the same laws work eternally through both.”
Prediction: none


In [None]:
# assign() returns a new frame instead of writing into test_df, which is a
# filtered slice of df; the in-place write raised SettingWithCopyWarning.
# The column name contains a dot, so it is passed through dict unpacking.
test_df = test_df.assign(
    **{"llama3.3_pred": test_df["paragraph"].apply(classify_interiority)}
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["llama3.3_pred"] = test_df["paragraph"].apply(classify_interiority)


In [None]:
# Per-class precision / recall / F1 for the Llama 3.3 zero-shot predictions.
print(
    classification_report(
        y_true=test_df["gold_label"],
        y_pred=test_df["llama3.3_pred"],
        labels=["high", "low", "none"],
    )
)

              precision    recall  f1-score   support

        high       0.78      0.66      0.72        71
         low       0.42      0.50      0.46        60
        none       0.73      0.73      0.73       109

    accuracy                           0.65       240
   macro avg       0.65      0.63      0.64       240
weighted avg       0.67      0.65      0.66       240



In [None]:
test_df["llama3.3_pred"].value_counts()

Unnamed: 0_level_0,count
llama3.3_pred,Unnamed: 1_level_1
none,109
low,71
high,60


### Few-Shot

In [None]:
def classify_interiority_few(paragraph: str, model: str = "llama-3.3-70b-versatile") -> str:
    """
    Few-shot classification of a fiction paragraph's literary interiority.

    Same task as the zero-shot variant, but the system prompt also carries four
    labelled example sentences. The request goes through the module-level
    Groq `client`.

    Args:
        paragraph: Text of the paragraph to classify.
        model: Groq model identifier to query.

    Returns:
        One of "high", "low" or "none". Replies that contain none of these
        words (or an empty reply) fall back to "none".
    """
    system_prompt = (
        "You are a classifier for literary interiority in fiction. "
        "Interiority refers to moments when the text gives access to a character’s inner thoughts, "
        "feelings, or perceptions, rather than only external actions or events. "
        "Label each paragraph as exactly one of: "
        "high (explicit access to inner experience), "
        "low (indirect or ambiguous hints), "
        "none (only external description, actions, or spoken dialogue). "
        "Spoken dialogue alone does not count as interiority unless the text also explicitly "
        "reveals inner thoughts or feelings. "

        "Examples:\n"
        "“So, thought Septimus, looking up, they are signalling to me.” → high\n"
        "“At first, he stood there still, looking at the ground as if the contents of his head were rearranging themselves into new positions.” → low\n"
        # Each example needs its own line; without the trailing newline here the
        # last two examples were fused into a single line of the prompt.
        "“The wind rose in the night and rain came in sheets as the Croatians crossed the mountain meadows and fought in the dark.” → none\n"
        "“Come on, I said. Get in.” → none\n\n"

        "Output only one word in lowercase: high, low, or none."
    )

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": f"Paragraph:\n{paragraph}\n\nLabel this paragraph:",
            },
        ],
        max_tokens=5,
        temperature=0.0,
    )

    # The content can be None; treat that like an empty reply.
    reply = response.choices[0].message.content or ""

    # Turn every non-letter into a space so that replies such as "High.",
    # "low\n" or "Label: none" still parse, instead of silently becoming "none".
    words = "".join(ch if ch.isalpha() else " " for ch in reply.lower()).split()
    valid_labels = ("high", "low", "none")
    return next((word for word in words if word in valid_labels), "none")

In [None]:
# assign() returns a new frame instead of writing into test_df, which is a
# filtered slice of df; the in-place write raised SettingWithCopyWarning.
# The column name contains a dot, so it is passed through dict unpacking.
test_df = test_df.assign(
    **{"llama3.3_pred_2": test_df["paragraph"].apply(classify_interiority_few)}
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["llama3.3_pred_2"] = test_df["paragraph"].apply(classify_interiority_few)


In [None]:
# Per-class precision / recall / F1 for the Llama 3.3 few-shot predictions.
print(
    classification_report(
        y_true=test_df["gold_label"],
        y_pred=test_df["llama3.3_pred_2"],
        labels=["high", "low", "none"],
    )
)

              precision    recall  f1-score   support

        high       0.81      0.59      0.68        71
         low       0.49      0.55      0.52        60
        none       0.75      0.83      0.79       109

    accuracy                           0.69       240
   macro avg       0.68      0.66      0.66       240
weighted avg       0.70      0.69      0.69       240



In [None]:
# Distribution of the few-shot predictions. This cell previously inspected the
# zero-shot column "llama3.3_pred" by mistake.
test_df["llama3.3_pred_2"].value_counts()

Unnamed: 0_level_0,count
llama3.3_pred,Unnamed: 1_level_1
none,109
low,71
high,60


## Llama 3.1 - 8B

### Zero Shot

In [None]:
# Smoke test: confirm the 8B model answers through the same Groq client.
response = client.chat.completions.create(
    messages=[
        dict(role="user", content="Explain large language models in two sentences"),
    ],
    model="llama-3.1-8b-instant",
)

print(response.choices[0].message.content)


Large language models, such as those powered by deep learning algorithms like transformer architecture, are artificial intelligence systems that use massive amounts of text data to generate human-like language outputs. These models, composed of complex neural networks, learn patterns and relationships in language, enabling them to understand context, generate text, respond to queries, and even translate languages in a way that closely mimics human communication.


In [None]:
def classify_interiority_2(paragraph: str, model: str = "llama-3.1-8b-instant") -> str:
    """
    Zero-shot classification of a fiction paragraph's literary interiority
    with the smaller Groq-hosted model.

    Sends the paragraph through the module-level `client` and parses the reply
    into a label.

    Args:
        paragraph: Text of the paragraph to classify.
        model: Groq model identifier to query.

    Returns:
        One of "high", "low" or "none". Replies that contain none of these
        words (or an empty reply) fall back to "none".
    """
    system_prompt = (
        "You are a classifier for literary interiority in fiction. "
        "Interiority refers to moments when the text gives access to a character’s inner thoughts, "
        "feelings, or perceptions, rather than only external actions or events. "
        "Label each paragraph as exactly one of: "
        "high (explicit access to inner experience), "
        "low (indirect or ambiguous hints), "
        "none (only external description, actions, or spoken dialogue). "
        "Spoken dialogue alone does not count as interiority unless the text also explicitly "
        "reveals inner thoughts or feelings. "
        "Output only one word in lowercase: high, low, or none."
    )

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": f"Paragraph:\n{paragraph}\n\nLabel this paragraph:",
            },
        ],
        max_tokens=5,
        temperature=0.0,
    )

    # The content can be None; treat that like an empty reply.
    reply = response.choices[0].message.content or ""

    # Turn every non-letter into a space so that replies such as "High.",
    # "low\n" or "Label: none" still parse, instead of silently becoming "none".
    words = "".join(ch if ch.isalpha() else " " for ch in reply.lower()).split()
    valid_labels = ("high", "low", "none")
    return next((word for word in words if word in valid_labels), "none")

In [None]:
# assign() returns a new frame instead of writing into test_df, which is a
# filtered slice of df; the in-place write raised SettingWithCopyWarning.
# The column name contains a dot, so it is passed through dict unpacking.
test_df = test_df.assign(
    **{"llama3.1_pred_zero": test_df["paragraph"].apply(classify_interiority_2)}
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["llama3.1_pred_zero"] = test_df["paragraph"].apply(classify_interiority_2)


In [None]:
# Per-class precision / recall / F1 for the Llama 3.1 8B zero-shot predictions.
print(
    classification_report(
        y_true=test_df["gold_label"],
        y_pred=test_df["llama3.1_pred_zero"],
        labels=["high", "low", "none"],
    )
)

              precision    recall  f1-score   support

        high       0.63      0.76      0.69        71
         low       0.26      0.37      0.30        60
        none       0.65      0.40      0.50       109

    accuracy                           0.50       240
   macro avg       0.51      0.51      0.50       240
weighted avg       0.54      0.50      0.50       240



In [None]:
test_df["llama3.1_pred_zero"].value_counts()

Unnamed: 0_level_0,count
llama3.1_pred_zero,Unnamed: 1_level_1
low,86
high,86
none,68


### Few Shot

In [None]:
def classify_interiority_few2(paragraph: str, model: str = "llama-3.1-8b-instant") -> str:
    """
    Few-shot classification of a fiction paragraph's literary interiority
    with the smaller Groq-hosted model.

    The system prompt carries four labelled example sentences. The request
    goes through the module-level Groq `client`.

    Args:
        paragraph: Text of the paragraph to classify.
        model: Groq model identifier to query.

    Returns:
        One of "high", "low" or "none". Replies that contain none of these
        words (or an empty reply) fall back to "none".
    """
    system_prompt = (
        "You are a classifier for literary interiority in fiction. "
        "Interiority refers to moments when the text gives access to a character’s inner thoughts, "
        "feelings, or perceptions, rather than only external actions or events. "
        "Label each paragraph as exactly one of: "
        "high (explicit access to inner experience), "
        "low (indirect or ambiguous hints), "
        "none (only external description, actions, or spoken dialogue). "
        "Spoken dialogue alone does not count as interiority unless the text also explicitly "
        "reveals inner thoughts or feelings. "

        "Examples:\n"
        "“So, thought Septimus, looking up, they are signalling to me.” → high\n"
        "“At first, he stood there still, looking at the ground as if the contents of his head were rearranging themselves into new positions.” → low\n"
        # Each example needs its own line; without the trailing newline here the
        # last two examples were fused into a single line of the prompt.
        "“The wind rose in the night and rain came in sheets as the Croatians crossed the mountain meadows and fought in the dark.” → none\n"
        "“Come on, I said. Get in.” → none\n\n"

        "Output only one word in lowercase: high, low, or none."
    )

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": f"Paragraph:\n{paragraph}\n\nLabel this paragraph:",
            },
        ],
        max_tokens=5,
        temperature=0.0,
    )

    # The content can be None; treat that like an empty reply.
    reply = response.choices[0].message.content or ""

    # Turn every non-letter into a space so that replies such as "High.",
    # "low\n" or "Label: none" still parse, instead of silently becoming "none".
    words = "".join(ch if ch.isalpha() else " " for ch in reply.lower()).split()
    valid_labels = ("high", "low", "none")
    return next((word for word in words if word in valid_labels), "none")

In [None]:
# assign() returns a new frame instead of writing into test_df, which is a
# filtered slice of df; the in-place write raised SettingWithCopyWarning.
# The column name contains a dot, so it is passed through dict unpacking.
test_df = test_df.assign(
    **{"llama3.1_pred_few": test_df["paragraph"].apply(classify_interiority_few2)}
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["llama3.1_pred_few"] = test_df["paragraph"].apply(classify_interiority_few2)


In [None]:
# Per-class precision / recall / F1 for the Llama 3.1 8B few-shot predictions.
print(
    classification_report(
        y_true=test_df["gold_label"],
        y_pred=test_df["llama3.1_pred_few"],
        labels=["high", "low", "none"],
    )
)

              precision    recall  f1-score   support

        high       0.70      0.52      0.60        71
         low       0.31      0.62      0.41        60
        none       0.70      0.42      0.53       109

    accuracy                           0.50       240
   macro avg       0.57      0.52      0.51       240
weighted avg       0.60      0.50      0.52       240



In [None]:
test_df["llama3.1_pred_few"].value_counts()

Unnamed: 0_level_0,count
llama3.1_pred_few,Unnamed: 1_level_1
low,121
none,66
high,53


## HF

In [None]:
from huggingface_hub import login
# Authenticate this session with the Hugging Face Hub before downloading models.
# NOTE(review): new_session=False presumably reuses an already stored token
# instead of prompting again -- confirm against the huggingface_hub docs.
login(new_session=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
#!pip install --upgrade transformers
#!pip install -q transformers accelerate sentencepiece
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline


In [None]:
# Load the Llama 3.1 8B Instruct tokenizer and weights from the Hugging Face Hub.
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # `torch_dtype` is deprecated in the installed transformers release (it logged
    # "`torch_dtype` is deprecated! Use `dtype` instead!"), so use `dtype`.
    # bfloat16 when a GPU is available, float32 otherwise.
    dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    device_map="auto"
)

`torch_dtype` is deprecated! Use `dtype` instead!


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]



### V1

In [None]:
def build_prompt(paragraph: str) -> str:
    """Render the V1 classification prompt for one paragraph.

    Builds a system turn (task definition) and a user turn (the paragraph
    wrapped in triple double-quotes) and passes them to the module-level
    `tokenizer`'s chat template.

    Args:
        paragraph: Text of the paragraph to classify.

    Returns:
        Whatever `tokenizer.apply_chat_template` returns for these messages
        with tokenize=False and add_generation_prompt=True.
    """
    system_text = (
        "You are a classifier for literary interiority in fiction. "
        "Interiority is as any moment in the narrative where the text gives access "
        "to a character’s inner experience, including their thoughts, feelings, or perceptions. "
        "In other words, a passage shows interiority when it represents what is going on inside "
        "the character’s mind, rather than describing only external actions or events. "
        "Your task is to label each paragraph as exactly one of: high, low, or none. "
        "Output only the label word in lowercase, with no explanation."
    )
    user_text = (
        "Classify the interiority of the following paragraph as high, low, or none:\n\n"
        f'"""{paragraph}"""'
    )
    conversation = [
        {"role": "system", "content": system_text},
        {"role": "user", "content": user_text},
    ]

    # Ask for the rendered string (not token ids), ending with the assistant header.
    return tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

# Preview the rendered V1 prompt for the paragraph stored under index label 0.
test_text = df.loc[0, "paragraph"]
prompt = build_prompt(paragraph=test_text)
prompt

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\nYou are a classifier for literary interiority in fiction. Interiority is as any moment in the narrative where the text gives access to a character’s inner experience, including their thoughts, feelings, or perceptions. In other words, a passage shows interiority when it represents what is going on inside the character’s mind, rather than describing only external actions or events. Your task is to label each paragraph as exactly one of: high, low, or none. Output only the label word in lowercase, with no explanation.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nClassify the interiority of the following paragraph as high, low, or none:\n\n"""“I have been a failure,” said Miss Bartlett, as she struggled with the straps of Lucy’s trunk instead of strapping her own. “Failed to make you happy; failed in my duty to your mother. She has been so generous to m

In [None]:
#import torch

def classify_paragraph(paragraph: str) -> str:
    """Label one paragraph with the locally loaded model using the V1 prompt.

    Relies on the module-level `tokenizer`, `model` and `build_prompt`.

    Args:
        paragraph: Text of the paragraph to classify.

    Returns:
        The newly generated text, stripped and lower-cased. The value is not
        checked against the label set, so strings other than
        "high" / "low" / "none" can be returned.
    """
    prompt = build_prompt(paragraph)

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Prompt length in tokens; everything after this offset is newly generated.
    input_len = inputs["input_ids"].shape[1]

    # Passing pad_token_id=None made generate() log "Setting `pad_token_id` to
    # `eos_token_id`" on every call, so fall back to the tokenizer's EOS id
    # explicitly when no pad token is defined.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=5,
            temperature=0.0,   # kept from the original call; not what selects greedy decoding
            do_sample=False,   # greedy decoding: more stable classification
            pad_token_id=pad_token_id
        )

    # Keep only the tokens produced after the prompt.
    new_generated_tokens = outputs[0][input_len:]
    output_text = tokenizer.decode(new_generated_tokens, skip_special_tokens=True)

    final_label = output_text.strip().lower()

    return final_label

In [None]:
# Try the V1 local classifier on the paragraph stored under index label 0.
test_text = df.loc[0, "paragraph"]
print(f"Paragraph: {test_text}")
print(f"Prediction: {classify_paragraph(test_text)}")

Paragraph: “I have been a failure,” said Miss Bartlett, as she struggled with the straps of Lucy’s trunk instead of strapping her own. “Failed to make you happy; failed in my duty to your mother. She has been so generous to me; I shall never face her again after this disaster.”
Prediction: high


In [None]:
# Run the V1 classifier over every paragraph and store the result next to the gold label.
df["llama_pred"] = df["paragraph"].map(classify_paragraph)
df.head(5)

Unnamed: 0,title,paragraph,gold_label,y,llama_pred
0,A Room with a View,"“I have been a failure,” said Miss Bartlett, a...",high,2,high
1,A Room with a View,Lucy paused. “Cecil said one day—and I thought...,low,1,low
2,A Room with a View,"Miss Bartlett, who was poor at figures, became...",high,2,high
3,A Room with a View,An engagement is so potent a thing that sooner...,low,1,high
4,A Room with a View,“In the course of conversation they said that ...,low,1,low


In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the V1 local predictions over the whole corpus.
print(
    classification_report(
        y_true=df["gold_label"],
        y_pred=df["llama_pred"],
        labels=["high", "low", "none"],
    )
)


              precision    recall  f1-score   support

        high       0.56      0.87      0.68       204
         low       0.27      0.48      0.35       156
        none       0.83      0.02      0.04       237

    accuracy                           0.43       597
   macro avg       0.56      0.46      0.36       597
weighted avg       0.59      0.43      0.34       597



### V2


In [None]:
def build_prompt2(paragraph: str) -> str:
    """Render the V2 classification prompt for one paragraph.

    Compared with V1, the system turn spells out what each label means and
    states that spoken dialogue alone does not count as interiority.

    Args:
        paragraph: Text of the paragraph to classify.

    Returns:
        Whatever the module-level `tokenizer.apply_chat_template` returns for
        the system + user messages with tokenize=False and
        add_generation_prompt=True.
    """
    instructions = (
        "You are a classifier for literary interiority in fiction. "
        "Interiority refers to moments when the text gives access to a character’s inner thoughts, "
        "feelings, or perceptions, rather than only external actions or events. "
        "Label each paragraph as exactly one of: "
        # per-label rules
        "high (explicit access to inner experience), "
        "low (indirect or ambiguous hints), "
        "none (only external description, actions, or spoken dialogue). "
        # how to treat dialogue
        "Spoken dialogue alone does not count as interiority unless the text also explicitly reveals inner thoughts or feelings. "
        "Output only one word in lowercase: high, low, or none."
    )
    request = (
        "Classify the interiority level of the following paragraph as high, low, or none:\n\n"
        f'"""{paragraph}"""'
    )

    turns = (("system", instructions), ("user", request))
    chat = [{"role": role, "content": content} for role, content in turns]

    # Ask for the rendered string (not token ids), ending with the assistant header.
    return tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
    )

# Preview the rendered V2 prompt for the paragraph stored under index label 0.
test_text = df.loc[0, "paragraph"]
prompt = build_prompt2(paragraph=test_text)
prompt

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\nYou are a classifier for literary interiority in fiction. Interiority refers to moments when the text gives access to a character’s inner thoughts, feelings, or perceptions, rather than only external actions or events. Label each paragraph as exactly one of: high (explicit access to inner experience), low (indirect or ambiguous hints), none (only external description, actions, or spoken dialogue). Spoken dialogue alone does not count as interiority unless the text also explicitly reveals inner thoughts or feelings. Output only one word in lowercase: high, low, or none.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nClassify the interiority level of the following paragraph as high, low, or none:\n\n"""“I have been a failure,” said Miss Bartlett, as she struggled with the straps of Lucy’s trunk instead of strapping her own. “Failed to make you happy; fai

In [None]:
#import torch

def classify_paragraph2(paragraph: str) -> str:
    """Label one paragraph with the locally loaded model using the V2 prompt.

    Relies on the module-level `tokenizer`, `model` and `build_prompt2`.

    Args:
        paragraph: Text of the paragraph to classify.

    Returns:
        The newly generated text (at most 2 tokens), stripped and lower-cased.
        The value is not checked against the label set, so strings other than
        "high" / "low" / "none" can be returned.
    """
    prompt = build_prompt2(paragraph)

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Prompt length in tokens; everything after this offset is newly generated.
    input_len = inputs["input_ids"].shape[1]

    # Passing pad_token_id=None made generate() log "Setting `pad_token_id` to
    # `eos_token_id`" on every call, so fall back to the tokenizer's EOS id
    # explicitly when no pad token is defined.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=2,  # reduced from V1's 5
            temperature=0.0,   # kept from the original call; not what selects greedy decoding
            do_sample=False,   # greedy decoding: more stable classification
            pad_token_id=pad_token_id
        )

    # Keep only the tokens produced after the prompt.
    new_generated_tokens = outputs[0][input_len:]
    output_text = tokenizer.decode(new_generated_tokens, skip_special_tokens=True)

    final_label = output_text.strip().lower()

    return final_label

### Create a mini sample dataset


In [None]:
# Try the V2 local classifier on the paragraph stored under index label 2.
test_text = df.loc[2, "paragraph"]
print(f"Paragraph: {test_text}")
print(f"Prediction: {classify_paragraph2(test_text)}")

Paragraph: Miss Bartlett, who was poor at figures, became bewildered and rendered up the sovereign, amidst the suppressed gurgles of the other youths. For a moment Cecil was happy. He was playing at nonsense among his peers. Then he glanced at Lucy, in whose face petty anxieties had marred the smiles. In January he would rescue his Leonardo from this stupefying twaddle.


NameError: name 'classify_paragraph2' is not defined

In [None]:
# Draw 4 paragraphs per title with a fixed seed, then renumber the rows from 0.
mini_df = (
    df.groupby("title")
    .sample(n=4, random_state=2025)
    .reset_index(drop=True)
)


In [None]:
# Run the V2 classifier over the mini sample and store the predictions.
mini_df["llama_pred"] = mini_df["paragraph"].map(classify_paragraph2)
mini_df.head(5)

In [None]:
# Per-class precision / recall / F1 for the local predictions on the mini sample.
print(
    classification_report(
        y_true=mini_df["gold_label"],
        y_pred=mini_df["llama_pred"],
        labels=["high", "low", "none"],
    )
)

              precision    recall  f1-score   support

        high       0.60      0.91      0.72        23
         low       0.21      0.38      0.27        13
        none       1.00      0.04      0.08        24

    accuracy                           0.45        60
   macro avg       0.60      0.45      0.36        60
weighted avg       0.68      0.45      0.37        60



In [None]:
mini_df['gold_label'].value_counts()


Unnamed: 0_level_0,count
gold_label,Unnamed: 1_level_1
none,24
high,23
low,13


In [None]:
mini_df['llama_pred'].value_counts()

Unnamed: 0_level_0,count
llama_pred,Unnamed: 1_level_1
high,35
low,24
none,1


### V3 - few shot

In [None]:
def build_prompt3(paragraph: str) -> str:
    """Render the V3 (few-shot) classification prompt for one paragraph.

    Same label definitions as V2, plus four labelled example sentences in the
    system turn.

    Args:
        paragraph: Text of the paragraph to classify.

    Returns:
        Whatever the module-level `tokenizer.apply_chat_template` returns for
        the system + user messages with tokenize=False and
        add_generation_prompt=True.
    """
    system_prompt = (
        "You are a classifier for literary interiority in fiction. "
        "Interiority refers to moments when the text gives access to a character’s inner thoughts, "
        "feelings, or perceptions, rather than only external actions or events. "
        "Label each paragraph as exactly one of: "
        "high (explicit access to inner experience), "
        "low (indirect or ambiguous hints), "
        "none (only external description, actions, or spoken dialogue). "
        "Spoken dialogue alone does not count as interiority unless the text also explicitly reveals inner thoughts or feelings. "

        "Examples:\n"
        "“So, thought Septimus, looking up, they are signalling to me.” → high\n"
        "“At first, he stood there still, looking at the ground as if the contents of his head were rearranging themselves into new positions.” → low\n"
        # Each example needs its own line; without the trailing newline here the
        # last two examples were fused into a single line of the prompt.
        "“The wind rose in the night and rain came in sheets as the Croatians crossed the mountain meadows and fought in the dark.” → none\n"
        "“Come on, I said. Get in.” → none\n\n"

        "Output only one word in lowercase: high, low, or none."
    )

    user_prompt = (
        "Classify the interiority level of the following paragraph as high, low, or none:\n\n"
        f"\"\"\"{paragraph}\"\"\""
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # Render the chat template to a string (not token ids), ending with the
    # assistant header so the model continues with its answer.
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    return prompt

# Preview the rendered V3 (few-shot) prompt. This cell previously called
# build_prompt2, so it displayed the V2 prompt without the examples.
test_text = df["paragraph"][0]
prompt = build_prompt3(test_text)
prompt

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\nYou are a classifier for literary interiority in fiction. Interiority refers to moments when the text gives access to a character’s inner thoughts, feelings, or perceptions, rather than only external actions or events. Label each paragraph as exactly one of: high (explicit access to inner experience), low (indirect or ambiguous hints), none (only external description, actions, or spoken dialogue). Spoken dialogue alone does not count as interiority unless the text also explicitly reveals inner thoughts or feelings. Output only one word in lowercase: high, low, or none.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nClassify the interiority level of the following paragraph as high, low, or none:\n\n"""“I have been a failure,” said Miss Bartlett, as she struggled with the straps of Lucy’s trunk instead of strapping her own. “Failed to make you happy; fai

In [None]:

def classify_paragraph3(paragraph: str) -> str:
    """Classify one paragraph with the V3 prompt and return the raw label text.

    The paragraph is wrapped by ``build_prompt3``, passed to
    ``model.generate`` with greedy decoding, and only the newly generated
    tokens are decoded.

    Args:
        paragraph: Text of the paragraph to classify.

    Returns:
        The decoded continuation, stripped and lower-cased. The value is not
        validated against the label set, so anything other than "high",
        "low" or "none" is returned unchanged.
    """
    prompt = build_prompt3(paragraph)

    # The chat template has already written the special tokens (including
    # BOS) into the prompt string; add_special_tokens=False keeps the
    # tokenizer from prepending a second BOS.
    inputs = tokenizer(
        prompt, return_tensors="pt", add_special_tokens=False
    ).to(model.device)
    input_len = inputs["input_ids"].shape[1]

    # The tokenizer may define no pad token; fall back to EOS explicitly so
    # generate() does not have to choose one (and warn) on every call.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=2,  # the answer is a single short word
            do_sample=False,   # greedy decoding; temperature is unused in this mode
            pad_token_id=pad_token_id,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    new_generated_tokens = outputs[0][input_len:]
    output_text = tokenizer.decode(new_generated_tokens, skip_special_tokens=True)

    return output_text.strip().lower()

In [None]:
# Display the first ten rows of df.
df.head(10)

Unnamed: 0,title,paragraph,gold_label,y
0,A Room with a View,"“I have been a failure,” said Miss Bartlett, a...",high,2
1,A Room with a View,Lucy paused. “Cecil said one day—and I thought...,low,1
2,A Room with a View,"Miss Bartlett, who was poor at figures, became...",high,2
3,A Room with a View,An engagement is so potent a thing that sooner...,low,1
4,A Room with a View,“In the course of conversation they said that ...,low,1
5,A Room with a View,"Miss Bartlett only sighed, and enveloped her i...",high,2
6,A Room with a View,"“The point is, we have warred with it. Look.” ...",none,0
7,A Room with a View,The young man named George glanced at the clev...,low,1
8,A Room with a View,“But my feelings are of no importance. I know ...,high,2
9,A Room with a View,“Indeed you may!” he cried. “Here we are with ...,low,1


In [None]:
# Spot-check the V3 classifier on the row with index label 6.
test_text = df.loc[6, "paragraph"]
print("Paragraph:", test_text)
print("Prediction:", classify_paragraph3(test_text))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Paragraph: “The point is, we have warred with it. Look.” He pointed to the Val d’Arno, which was visible far below them, through the budding trees. “Fifty miles of Spring, and we’ve come up to admire them. Do you suppose there’s any difference between Spring in nature and Spring in man? But there we go, praising the one and condemning the other as improper, ashamed that the same laws work eternally through both.”
Prediction: low


In [None]:
# Classify every paragraph in mini_df with the V3 prompt and store the raw
# predictions; classify_paragraph3 makes one model.generate call per row.
mini_df["llama_pred_v3"] = mini_df["paragraph"].apply(classify_paragraph3)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

In [None]:
# Per-class precision / recall / F1 of the V3 predictions on mini_df.
# `labels` fixes which label strings get a row and in what order.
print(
    classification_report(
        y_true=mini_df["gold_label"],
        y_pred=mini_df["llama_pred_v3"],
        labels=["high", "low", "none"],
    )
)

              precision    recall  f1-score   support

        high       0.79      0.65      0.71        23
         low       0.29      0.85      0.43        13
        none       1.00      0.12      0.22        24

    accuracy                           0.48        60
   macro avg       0.69      0.54      0.46        60
weighted avg       0.77      0.48      0.46        60



In [None]:
# Count of each gold label in mini_df.
mini_df['gold_label'].value_counts()


Unnamed: 0_level_0,count
gold_label,Unnamed: 1_level_1
none,24
high,23
low,13


In [None]:
# Count of each V3 prediction string in mini_df.
mini_df['llama_pred_v3'].value_counts()


Unnamed: 0_level_0,count
llama_pred_v3,Unnamed: 1_level_1
low,38
high,19
none,3


In [None]:
# Classify every paragraph in df with the V3 prompt and store the raw
# predictions; classify_paragraph3 makes one model.generate call per row.
df["llama_pred_v3"] = df["paragraph"].apply(classify_paragraph3)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

In [None]:
# Per-class precision / recall / F1 of the V3 predictions on all of df.
# `labels` fixes which label strings get a row and in what order.
print(
    classification_report(
        y_true=df["gold_label"],
        y_pred=df["llama_pred_v3"],
        labels=["high", "low", "none"],
    )
)

              precision    recall  f1-score   support

        high       0.70      0.59      0.64       204
         low       0.30      0.74      0.43       156
        none       0.86      0.16      0.26       237

    accuracy                           0.46       597
   macro avg       0.62      0.49      0.44       597
weighted avg       0.66      0.46      0.43       597



In [None]:
# Count of each V3 prediction string in df.
df['llama_pred_v3'].value_counts()


Unnamed: 0_level_0,count
llama_pred_v3,Unnamed: 1_level_1
low,382
high,172
none,43


In [None]:
# Count of each gold label in df.
df['gold_label'].value_counts()


Unnamed: 0_level_0,count
gold_label,Unnamed: 1_level_1
none,237
high,204
low,156


## V4

In [None]:
def build_prompt4(paragraph: str) -> str:
    """Render the V4 classification prompt for one paragraph.

    The system message defines interiority, lists three narrative techniques
    that signal it, describes the high / low / none labels and gives four
    labelled examples. The user message carries the paragraph wrapped in
    triple double quotes. Both messages are rendered through the tokenizer's
    chat template with the generation prompt appended.

    Args:
        paragraph: Text of the paragraph to classify.

    Returns:
        The rendered prompt as a string (the template is applied with
        ``tokenize=False``).
    """
    # Adjacent literals are concatenated verbatim, so every sentence needs its
    # own trailing space and every list item / example its own newline.
    system_prompt = (
        "You are a classifier for literary interiority in fiction. "
        "Interiority refers to moments when the text gives access to a character’s inner thoughts, "
        "feelings, or perceptions, rather than only external actions or events. "

        "Interiority can appear through the following techniques:\n"
        "1. Psycho-narration: narrator explicitly states thoughts/feelings (e.g., She thought that ...).\n"
        "2. Quoted interior monologue: unspoken thoughts in quotation marks (e.g., “…” she thought).\n"
        "3. Narrated monologue / free indirect discourse: blended narrator-character voice expressing inner perspective (e.g. She walked in. What a disaster this would be.).\n"
        # Disabled techniques, kept for reference:
        #"4. Retrospective narration (1st person): narrator reflects on past mental states.\n"
        #"5. Direct interior monologue (1st person): ongoing thoughts expressed directly.\n\n"
        "\n"  # blank line that the disabled item 5 used to provide

        "Label each paragraph as exactly one of:\n"
        "- high: clear evidence of any one of the 3 interiority types.\n"
        "- low: ambiguous, indirect, or minimal hints of inner experience.\n"
        "- none: external description, actions, setting, or spoken dialogue.\n\n"

        "Only assign “high” or “low” if there is clear textual evidence of interiority. "
        "If interiority is not explicitly or implicitly present, label the passage as “none”. "
        "Descriptive narration, physical actions, setting, or dialogue alone do NOT indicate interiority.\n\n"

        "Examples:\n"
        "“So, thought Septimus, looking up, they are signalling to me.” → high\n"
        "“At first, he stood there still, looking at the ground as if the contents of his head were rearranging themselves into new positions.” → low\n"
        "“The wind rose in the night and rain came in sheets as the Croatians crossed the mountain meadows and fought in the dark.” → none\n"
        "“Come on, I said. Get in.” → none\n\n"

        "Output only one word in lowercase: high, low, or none."
    )

    user_prompt = (
        "Classify the interiority level of the following paragraph as high, low, or none:\n\n"
        f"\"\"\"{paragraph}\"\"\""
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # Render to text (not token ids) and append the assistant header so the
    # model continues with its answer.
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    return prompt


def classify_paragraph4(paragraph: str) -> str:
    """Classify one paragraph with the V4 prompt and return the raw label text.

    The paragraph is wrapped by ``build_prompt4``, passed to
    ``model.generate`` with greedy decoding, and only the newly generated
    tokens are decoded.

    Args:
        paragraph: Text of the paragraph to classify.

    Returns:
        The decoded continuation, stripped and lower-cased. The value is not
        validated against the label set, so anything other than "high",
        "low" or "none" is returned unchanged.
    """
    prompt = build_prompt4(paragraph)

    # The chat template has already written the special tokens (including
    # BOS) into the prompt string; add_special_tokens=False keeps the
    # tokenizer from prepending a second BOS.
    inputs = tokenizer(
        prompt, return_tensors="pt", add_special_tokens=False
    ).to(model.device)
    input_len = inputs["input_ids"].shape[1]

    # The tokenizer may define no pad token; fall back to EOS explicitly so
    # generate() does not have to choose one (and warn) on every call.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=2,  # the answer is a single short word
            do_sample=False,   # greedy decoding; temperature is unused in this mode
            pad_token_id=pad_token_id,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    new_generated_tokens = outputs[0][input_len:]
    output_text = tokenizer.decode(new_generated_tokens, skip_special_tokens=True)

    return output_text.strip().lower()


In [None]:
# Display the first ten rows of df.
df.head(10)

Unnamed: 0,title,paragraph,gold_label,y
0,A Room with a View,"“I have been a failure,” said Miss Bartlett, a...",high,2
1,A Room with a View,Lucy paused. “Cecil said one day—and I thought...,low,1
2,A Room with a View,"Miss Bartlett, who was poor at figures, became...",high,2
3,A Room with a View,An engagement is so potent a thing that sooner...,low,1
4,A Room with a View,“In the course of conversation they said that ...,low,1
5,A Room with a View,"Miss Bartlett only sighed, and enveloped her i...",high,2
6,A Room with a View,"“The point is, we have warred with it. Look.” ...",none,0
7,A Room with a View,The young man named George glanced at the clev...,low,1
8,A Room with a View,“But my feelings are of no importance. I know ...,high,2
9,A Room with a View,“Indeed you may!” he cried. “Here we are with ...,low,1


In [None]:
# Spot-check the V4 classifier on the row with index label 6.
test_text = df.loc[6, "paragraph"]
print("Paragraph:", test_text)
print("Prediction:", classify_paragraph4(test_text))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Paragraph: “The point is, we have warred with it. Look.” He pointed to the Val d’Arno, which was visible far below them, through the budding trees. “Fifty miles of Spring, and we’ve come up to admire them. Do you suppose there’s any difference between Spring in nature and Spring in man? But there we go, praising the one and condemning the other as improper, ashamed that the same laws work eternally through both.”
Prediction: low


In [None]:
# Classify every paragraph in mini_df with the V4 prompt and store the raw
# predictions; classify_paragraph4 makes one model.generate call per row.
mini_df["llama_pred_v4"] = mini_df["paragraph"].apply(classify_paragraph4)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

In [None]:
# Per-class precision / recall / F1 of the V4 predictions on mini_df.
# `labels` fixes which label strings get a row and in what order.
print(
    classification_report(
        y_true=mini_df["gold_label"],
        y_pred=mini_df["llama_pred_v4"],
        labels=["high", "low", "none"],
    )
)

              precision    recall  f1-score   support

        high       0.67      0.70      0.68        23
         low       0.26      0.69      0.38        13
        none       1.00      0.08      0.15        24

    accuracy                           0.45        60
   macro avg       0.64      0.49      0.41        60
weighted avg       0.71      0.45      0.41        60



In [None]:
# Count of each V4 prediction string in mini_df.
mini_df['llama_pred_v4'].value_counts()


Unnamed: 0_level_0,count
llama_pred_v4,Unnamed: 1_level_1
low,34
high,24
none,2


## CountVectorizer

In [None]:
## Logistic regression on unigram + bigram counts
vectorizer = CountVectorizer(min_df=2, ngram_range=(1, 2))
X_train_vec = vectorizer.fit_transform(X_train)  # vocabulary is learned from the training texts only
X_test_vec = vectorizer.transform(X_test)

clf = LogisticRegression(class_weight="balanced", max_iter=3000).fit(X_train_vec, y_train)

pred = clf.predict(X_test_vec)
# target_names follows the key order of `mapping`.
print(classification_report(y_true=y_test, y_pred=pred, target_names=list(mapping)))


              precision    recall  f1-score   support

        none       0.58      0.62      0.60        73
         low       0.29      0.26      0.27        38
        high       0.54      0.53      0.54        49

    accuracy                           0.51       160
   macro avg       0.47      0.47      0.47       160
weighted avg       0.50      0.51      0.50       160



In [None]:
## Multinomial naive Bayes on the same count features
nb = MultinomialNB().fit(X_train_vec, y_train)
pred = nb.predict(X_test_vec)
# target_names follows the key order of `mapping`.
print(classification_report(y_true=y_test, y_pred=pred, target_names=list(mapping)))


              precision    recall  f1-score   support

        none       0.54      0.34      0.42        73
         low       0.32      0.18      0.23        38
        high       0.41      0.78      0.54        49

    accuracy                           0.44       160
   macro avg       0.42      0.43      0.40       160
weighted avg       0.45      0.44      0.41       160



## TF-IDF

In [None]:
## Logistic regression on unigram + bigram TF-IDF weights
tfidf = TfidfVectorizer(min_df=2, ngram_range=(1, 2))
X_train_t = tfidf.fit_transform(X_train)  # vocabulary and IDF are fitted on the training texts only
X_test_t = tfidf.transform(X_test)

clf_tfidf = LogisticRegression(class_weight="balanced", max_iter=3000).fit(X_train_t, y_train)

pred = clf_tfidf.predict(X_test_t)
# target_names follows the key order of `mapping`.
print(classification_report(y_true=y_test, y_pred=pred, target_names=list(mapping)))


              precision    recall  f1-score   support

        none       0.56      0.47      0.51        73
         low       0.26      0.24      0.25        38
        high       0.52      0.67      0.58        49

    accuracy                           0.47       160
   macro avg       0.44      0.46      0.45       160
weighted avg       0.47      0.47      0.47       160



In [None]:
## Multinomial naive Bayes on the TF-IDF features
nb_tfidf = MultinomialNB()
nb_tfidf.fit(X_train_t, y_train)
pred = nb_tfidf.predict(X_test_t)
# A class that is never predicted has undefined precision. zero_division=0
# reports it as 0.0 explicitly instead of relying on sklearn's fallback,
# which emits an undefined-metric warning for each affected average.
print(classification_report(y_test, pred, target_names=mapping.keys(), zero_division=0))


              precision    recall  f1-score   support

        none       0.55      0.38      0.45        73
         low       0.00      0.00      0.00        38
        high       0.38      0.84      0.52        49

    accuracy                           0.43       160
   macro avg       0.31      0.41      0.32       160
weighted avg       0.37      0.43      0.36       160



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Small Fine-tuned Transformers

In [None]:
pip install transformers datasets accelerate




In [None]:
from datasets import Dataset

# Wrap the train / test texts and labels as Hugging Face Datasets with
# "text" and "label" columns.
train_df = pd.DataFrame({"text": X_train, "label": y_train})
test_df = pd.DataFrame({"text": X_test, "label": y_test})

# preserve_index=False: do not carry the pandas index into the Dataset as an
# extra `__index_level_0__` column.
train_ds = Dataset.from_pandas(train_df, preserve_index=False)
test_ds = Dataset.from_pandas(test_df, preserve_index=False)


In [None]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

# NOTE(review): the `tokenizer` and `model` bound here are rebound by the
# RoBERTa cell that follows before anything uses them, and DistilBERT is
# loaded again in its own section further down, so this cell looks redundant.
# Rebinding these two globals also replaces the `tokenizer` / `model` that the
# classify_paragraph* functions above read at call time.
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
# num_labels=3: one classifier output per label.
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=3)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### RoBERTa-base

In [None]:
## RoBERTa-base
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification

# Rebinds the notebook-global `tokenizer` and `model` to RoBERTa; the
# tokenize() helper and the Trainer cell below pick them up by name.
model_name = "roberta-base"
tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
# num_labels=3: one classifier output per label.
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=3)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def tokenize(batch):
    """Encode a batch's "text" column, truncated and padded to 256 tokens."""
    return tokenizer(
        batch["text"],
        max_length=256,
        padding="max_length",
        truncation=True,
    )

# Tokenize each split in batches and drop the raw text column.
train_tok = train_ds.map(tokenize, batched=True).remove_columns(["text"])
test_tok = test_ds.map(tokenize, batched=True).remove_columns(["text"])

# Return torch tensors when the datasets are indexed.
train_tok.set_format("torch")
test_tok.set_format("torch")


Map:   0%|          | 0/507 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

In [None]:
from transformers import TrainingArguments, Trainer
import os

# Legacy switch that turns Weights & Biases logging off. transformers reports
# it as deprecated, so reporting is also disabled through `report_to` below;
# the variable is kept for any cell that builds TrainingArguments without it.
os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",       # evaluate after every epoch
    save_strategy="epoch",       # checkpoint on the same schedule as evaluation
    logging_strategy="epoch",    # log the training loss once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,  # reload the best checkpoint after training
    report_to="none",             # no external experiment trackers
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# Fine-tune the currently bound `model` on the tokenized training split.
# NOTE(review): eval_dataset is the test split and training_args sets
# load_best_model_at_end=True, so the checkpoint is selected on the same data
# that the next cell scores. Consider holding out a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=test_tok,
)
trainer.train()


Epoch,Training Loss,Validation Loss
1,No log,1.014034
2,No log,0.758653
3,No log,0.615499
4,No log,0.671503
5,No log,0.663426


TrainOutput(global_step=320, training_loss=0.6815510749816894, metrics={'train_runtime': 192.4823, 'train_samples_per_second': 13.17, 'train_steps_per_second': 1.662, 'total_flos': 333496256970240.0, 'train_loss': 0.6815510749816894, 'epoch': 5.0})

In [None]:
import numpy as np

# Predict on the tokenized test split and take the arg-max over axis 1 of the
# returned predictions as the predicted class id.
pred = trainer.predict(test_tok)
y_pred = np.argmax(pred.predictions, axis=1)

# Score against the labels stored with test_tok (pred.label_ids) rather than
# the notebook-global y_test, which can be out of step with test_tok if the
# split cell was re-run. `labels` pins each target name to its id in `mapping`.
print(
    classification_report(
        pred.label_ids,
        y_pred,
        labels=list(mapping.values()),
        target_names=list(mapping.keys()),
    )
)


              precision    recall  f1-score   support

        none       0.81      0.72      0.76        36
         low       0.48      0.65      0.56        23
        high       0.89      0.77      0.83        31

    accuracy                           0.72        90
   macro avg       0.73      0.72      0.72        90
weighted avg       0.75      0.72      0.73        90



### DistilBERT

In [None]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

# Rebinds the notebook-global `tokenizer` and `model` to DistilBERT; the
# tokenize() helper and the Trainer cell below pick them up by name.
model_name = "distilbert-base-uncased"

tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3  # one classifier output per label
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def tokenize(batch):
    """Encode a batch's "text" column, truncated and padded to 256 tokens."""
    return tokenizer(batch["text"], max_length=256, padding="max_length", truncation=True)

# Tokenize each split in batches and drop the raw text column.
train_tok = train_ds.map(tokenize, batched=True).remove_columns(["text"])
test_tok = test_ds.map(tokenize, batched=True).remove_columns(["text"])

# Return torch tensors when the datasets are indexed.
train_tok.set_format("torch")
test_tok.set_format("torch")


Map:   0%|          | 0/507 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./distilbert_results",
    eval_strategy="epoch",       # evaluate after every epoch
    save_strategy="epoch",       # checkpoint on the same schedule as evaluation
    logging_strategy="epoch",    # log the training loss once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,  # reload the best checkpoint after training
    # Disable external experiment trackers here instead of depending on the
    # deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
from transformers import Trainer

# Fine-tune the currently bound `model` on the tokenized training split.
# NOTE(review): eval_dataset is the test split and training_args sets
# load_best_model_at_end=True, so the checkpoint is selected on the same data
# that the next cell scores. Consider holding out a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=test_tok,
)

trainer.train()


Epoch,Training Loss,Validation Loss
1,1.0591,0.94906
2,0.8483,0.764055
3,0.6265,0.769756
4,0.481,0.738287
5,0.3734,0.737301


TrainOutput(global_step=320, training_loss=0.6776483476161956, metrics={'train_runtime': 85.9781, 'train_samples_per_second': 29.484, 'train_steps_per_second': 3.722, 'total_flos': 167905422097920.0, 'train_loss': 0.6776483476161956, 'epoch': 5.0})

In [None]:
import numpy as np
from sklearn.metrics import classification_report

# Predict on the tokenized test split and take the arg-max over axis 1 of the
# returned predictions as the predicted class id.
pred = trainer.predict(test_tok)
y_pred = np.argmax(pred.predictions, axis=1)

# Score against the labels stored with test_tok (pred.label_ids) rather than
# the notebook-global y_test, which can be out of step with test_tok if the
# split cell was re-run. `labels` pins each target name to its id in `mapping`.
print(
    classification_report(
        pred.label_ids,
        y_pred,
        labels=list(mapping.values()),
        target_names=list(mapping.keys()),
    )
)


              precision    recall  f1-score   support

        none       0.76      0.78      0.77        36
         low       0.48      0.43      0.45        23
        high       0.78      0.81      0.79        31

    accuracy                           0.70        90
   macro avg       0.67      0.67      0.67        90
weighted avg       0.69      0.70      0.70        90



## BERT-large

In [None]:
from transformers import BertTokenizerFast, BertForSequenceClassification

# Rebinds the notebook-global `tokenizer` and `model` to BERT-large; the
# tokenize() helper and the Trainer cell below pick them up by name.
model_name = "bert-large-uncased"

tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3   # three classes: none, low, high
)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def tokenize(batch):
    """Encode a batch's "text" column, truncated and padded to 256 tokens."""
    return tokenizer(batch["text"], max_length=256, padding="max_length", truncation=True)

# Tokenize each split in batches and drop the raw text column.
train_tok = train_ds.map(tokenize, batched=True).remove_columns(["text"])
test_tok = test_ds.map(tokenize, batched=True).remove_columns(["text"])

# Return torch tensors when the datasets are indexed.
train_tok.set_format("torch")
test_tok.set_format("torch")


Map:   0%|          | 0/507 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./bert_large_results",
    eval_strategy="epoch",       # evaluate after every epoch
    save_strategy="epoch",       # checkpoint on the same schedule as evaluation
    logging_strategy="epoch",    # log the training loss once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=4,
    weight_decay=0.01,
    load_best_model_at_end=True,  # reload the best checkpoint after training
    # Disable external experiment trackers here instead of depending on the
    # deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
from transformers import Trainer

# Fine-tune the currently bound `model` on the tokenized training split.
# NOTE(review): eval_dataset is the test split and training_args sets
# load_best_model_at_end=True, so the checkpoint is selected on the same data
# that the next cell scores. Consider holding out a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=test_tok,
)

trainer.train()


Epoch,Training Loss,Validation Loss
1,1.03,0.750081
2,0.9295,1.126505
3,0.6983,0.783959
4,0.3024,1.251573


TrainOutput(global_step=1016, training_loss=0.7400447139589805, metrics={'train_runtime': 639.7415, 'train_samples_per_second': 3.17, 'train_steps_per_second': 1.588, 'total_flos': 944981595297792.0, 'train_loss': 0.7400447139589805, 'epoch': 4.0})

In [None]:
import numpy as np
from sklearn.metrics import classification_report

# Predict on the tokenized test split and take the arg-max over axis 1 of the
# returned predictions as the predicted class id.
pred = trainer.predict(test_tok)
y_pred = np.argmax(pred.predictions, axis=1)

# Score against the labels stored with test_tok (pred.label_ids) rather than
# the notebook-global y_test, which can be out of step with test_tok if the
# split cell was re-run. `labels` pins each target name to its id in `mapping`.
print(
    classification_report(
        pred.label_ids,
        y_pred,
        labels=list(mapping.values()),
        target_names=list(mapping.keys()),
    )
)


              precision    recall  f1-score   support

        none       0.68      0.69      0.68        36
         low       0.36      0.35      0.36        23
        high       0.81      0.81      0.81        31

    accuracy                           0.64        90
   macro avg       0.62      0.62      0.62        90
weighted avg       0.64      0.64      0.64        90

