In [1]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


# Example llm

## Classification

### Training

In [2]:
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained( model_name, num_labels=2)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

data = load_dataset("imdb")

tokenized_data = data.map(tokenize_function, batched=True)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 50000/50000 [00:15<00:00, 3193.00 examples/s]


In [3]:
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir="./smaller_bert_finetuned",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    logging_dir="./logs",)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None)
Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mlinfeng-wang[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
500,0.439,0.285558
1000,0.3295,0.295979
1500,0.3148,0.265527
2000,0.3195,0.254542
2500,0.2813,0.263708
3000,0.2967,0.279517
3500,0.1841,0.302228
4000,0.169,0.329028
4500,0.1632,0.315049
5000,0.1869,0.291457


TrainOutput(global_step=9375, training_loss=0.18951775309244792, metrics={'train_runtime': 4103.4332, 'train_samples_per_second': 18.277, 'train_steps_per_second': 2.285, 'total_flos': 9935054899200000.0, 'train_loss': 0.18951775309244792, 'epoch': 3.0})

In [121]:
# Assuming you have a CUDA-capable GPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Move your model to the chosen device
model.to(device)

# Ensure your input tensor is also moved to the same device
# example_input = tokenizer("I am absolutely amazed with this new and revolutionary AI device", return_tensors="pt").to(device)
example_input = tokenizer("This thing is absolutely pointless and I don't get why people are wasting their time on it.", return_tensors="pt").to(device)

# Forward pass
output = model(**example_input)

# Get the predicted label
predicted_label = torch.argmax(output.logits, dim=1).item()

# If you need to save your fine-tuned model
# model.save_pretrained("./my_bert_finetuned")
# tokenizer.save_pretrained("./my_bert_finetuned")


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


### obtainig class proabbabity

In [71]:
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
# model_name = "bert-base-uncased"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModel.from_pretrained(model_name)

text = "I am an example sequence for text classification."
text ="This thing is absolutely pointless and I don't get why people are wasting their time on it."
class SimpleClassifier(nn.Module):
    def __init__(self, input_size, num_classes):
        super(SimpleClassifier, self).__init__()
        self.fc = nn.Linear(input_size, num_classes)
    def forward(self, x):
        return self.fc(x)
    
inputs = tokenizer(
    text,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=64)

outputs = model(**inputs)

pooled_output = outputs.pooler_output
print("Hidden states size: ", outputs.last_hidden_state.shape)
print("Pooled output size: ", pooled_output.shape)

classifier_head = SimpleClassifier(
    pooled_output.size(-1),
    num_classes=2) 

logits = classifier_head(pooled_output)
probs = torch.softmax(logits, dim=1)
print("Predicted Class Probabilities:", probs)

Hidden states size:  torch.Size([1, 22, 768])
Pooled output size:  torch.Size([1, 768])
Predicted Class Probabilities: tensor([[0.4616, 0.5384]], grad_fn=<SoftmaxBackward0>)


## Text generation

In [85]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
prompt = "what is your capability n,"

inputs = tokenizer.encode(prompt, return_tensors="pt")
output = model.generate(inputs, max_length=26)
generated_text = tokenizer.decode(
output[0], skip_special_tokens=True)
print("Generated Text:")
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text:
what is your capability n, what is your ability n, what is your ability n, what is your ability n, what is


In [83]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Replace 'your-model-name' with the actual model you're using.
tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2')

# If your tokenizer does not have a pad token, set it to one that exists (e.g., eos_token).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Your input text
input_texts = ["Hello, I'm a model.", "This is another text."]

# Tokenizing inputs with attention mask and padding
inputs = tokenizer(input_texts, padding=True, truncation=True, return_tensors="pt", max_length=512)

# Extracting input_ids and attention_mask
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']

# Generating outputs using the model
outputs = model.generate(input_ids, attention_mask=attention_mask, pad_token_id=tokenizer.pad_token_id)

# Decoding the generated text
generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(generated_text)


Using pad_token, but it is not set yet.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


what is your capability?

I'm not sure. I'm not sure if I'm going to be able to do it. I'm not sure if I'm going to be able to do it. I'm not sure if I'm going


### Chatbot

In [88]:
from transformers import DistilBertTokenizer, DistilBertModel
# Load the DistilBert tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
# Import the DistilBert pretrained model
bert = DistilBertModel.from_pretrained("distilbert-base-uncased")

In [None]:
text = ["this is a distil bert model.","data is oil"]
# Encode the text
encoded_input = tokenizer(text, padding=True,truncation=True, return_tensors='pt')
print(encoded_input)

In [90]:
text = ["this is a distil bert model.","data is oil"]
# Encode the text
encoded_input = tokenizer(text, padding=True,truncation=True, return_tensors='pt')
print(encoded_input)


{'input_ids': tensor([[  101,  2023,  2003,  1037,  4487, 16643,  2140, 14324,  2944,  1012,
           102],
        [  101,  2951,  2003,  3514,   102,     0,     0,     0,     0,     0,
             0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]])}


In [91]:
seq_len = [len(i.split()) for i in train_text]
pd.Series(seq_len).hist(bins = 10)

tokens_train = tokenizer(
    train_text.tolist(),
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)

train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels.tolist())

NameError: name 'train_text' is not defined

In [96]:
df.iloc[0,0]

"The patient is Male, born in Kazakhstan and currently employed as Unemployed, diagnosed with TB at the age of 26, presents a case with Sensitive resistance. The patient has 0 children and interact with 0 individuals daily. The patient’s BMI is recorded at 15.9, and TB primarily affects the Pulmonary part of the lung(s), with Greater than or equal to 50% of lung volume showing abnormalities. Pleural effusion involves Less than 50% of the hemithorax, and bilateral effusion is No. Additional non-TB abnormalities include Yes, and mediastinal lymph nodes presence is No. The Timika score is 96.0, with noted lung collapse Lower Left Sextant-No, Lower Right Sextant-No, Middle Left Sextant-No, Middle Right Sextant-No, Upper Left Sextant-No, Upper Right Sextant-No, and cavity sizes categorized as small (Lower Left Sextant-No, Lower Right Sextant-No, Middle Left Sextant-No, Middle Right Sextant-No, Upper Left Sextant-No, Upper Right Sextant-No), medium (Lower Left Sextant-No, Lower Right Sextant

### Paraphrasing

In [None]:
import torch
from transformers import BartForConditionalGeneration, BartTokenizer

input_sentence = "They were there to enjoy us and they were there to pray for us."

model = BartForConditionalGeneration.from_pretrained('eugenesiow/bart-paraphrase')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
tokenizer = BartTokenizer.from_pretrained('eugenesiow/bart-paraphrase')
batch = tokenizer(input_sentence, return_tensors='pt')
generated_ids = model.generate(batch['input_ids'])
generated_sentence = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

print(generated_sentence)


In [102]:
import torch
from transformers import BartForConditionalGeneration, BartTokenizer
import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

input_sentence = "They were there to enjoy us and they were there to pray for us."
# input_sentence = "The patient is Male, born in Kazakhstan and currently employed as Unemployed, diagnosed with TB at the age of 26, presents a case with Sensitive resistance. The patient has 0 children and interact with 0 individuals daily. The patient’s BMI is recorded at 15.9, and TB primarily affects the Pulmonary part of the lung(s), with Greater than or equal to 50% of lung volume showing abnormalities. Pleural effusion involves Less than 50% of the hemithorax, and bilateral effusion is No. Additional non-TB abnormalities include Yes, and mediastinal lymph nodes presence is No. The Timika score is 96.0, with noted lung collapse Lower Left Sextant-No, Lower Right Sextant-No, Middle Left Sextant-No, Middle Right Sextant-No, Upper Left Sextant-No, Upper Right Sextant-No, and cavity sizes categorized as small (Lower Left Sextant-No, Lower Right Sextant-No, Middle Left Sextant-No, Middle Right Sextant-No, Upper Left Sextant-No, Upper Right Sextant-No), medium (Lower Left Sextant-No, Lower Right Sextant-No, Middle Left Sextant-No, Middle Right Sextant-No, Upper Left Sextant-No, Upper Right Sextant-No), and large (Lower Left Sextant-No, Lower Right Sextant-No, Middle Left Sextant-No, Middle Right Sextant-No, Upper Left Sextant-No, Upper Right Sextant-No), with Lower Left Sextant-No, Lower Right Sextant-No, Middle Left Sextant-No, Middle Right Sextant-No, Upper Left Sextant-No, Upper Right Sextant-No large cavities being part of a multisextant cavity and multiple cavities visible Lower Left Sextant-No, Lower Right Sextant-No, Middle Left Sextant-No, Middle Right Sextant-No, Upper Left Sextant-No, Upper Right Sextant-No. Lung infiltrates are observed with low (Lower Left Sextant-No, Lower Right Sextant-No, Middle Left Sextant-No, Middle Right Sextant-No, Upper Left Sextant-No, Upper Right Sextant-No), medium (Lower Left Sextant-No, Lower Right Sextant-No, Middle Left Sextant-No, Middle Right Sextant-No, Upper Left Sextant-No, Upper Right Sextant-No), and high (Lower Left Sextant-No, Lower Right Sextant-No, Middle Left Sextant-No, Middle Right Sextant-No, Upper Left Sextant-No, Upper Right Sextant-No) density alongside nodules of various sizes: small (Lower Left Sextant-Yes, Lower Right Sextant-Yes, Middle Left Sextant-Yes, Middle Right Sextant-Yes, Upper Left Sextant-Yes, Upper Right Sextant-Yes), medium (Lower Left Sextant-Yes, Lower Right Sextant-Yes, Middle Left Sextant-Yes, Middle Right Sextant-Yes, Upper Left Sextant-Yes, Upper Right Sextant-Yes), large (Lower Left Sextant-No, Lower Right Sextant-No, Middle Left Sextant-No, Middle Right Sextant-No, Upper Left Sextant-No, Upper Right Sextant-No), and huge (Lower Left Sextant-No, Lower Right Sextant-No, Middle Left Sextant-No, Middle Right Sextant-No, Upper Left Sextant-No, Upper Right Sextant-No). Calcified or partially calcified nodules exist: Lower Left Sextant-No, Lower Right Sextant-No, Middle Left Sextant-No, Middle Right Sextant-No, Upper Left Sextant-Yes, Upper Right Sextant-No, with non-calcified Lower Left Sextant-Yes, Lower Right Sextant-Yes, Middle Left Sextant-Yes, Middle Right Sextant-Yes, Upper Left Sextant-Yes, Upper Right Sextant-Yes and clustered Lower Left Sextant-Yes, Lower Right Sextant-Yes, Middle Left Sextant-Yes, Middle Right Sextant-Yes, Upper Left Sextant-Yes, Upper Right Sextant-Yes nodules noted. Multiple nodules are present Lower Left Sextant-Yes, Lower Right Sextant-Yes, Middle Left Sextant-Yes, Middle Right Sextant-Yes, Upper Left Sextant-Yes, Upper Right Sextant-Yes, with nodules' characteristics detailed as low-ground glass density active/fresh (Lower Left Sextant-Yes, Lower Right Sextant-Yes, Middle Left Sextant-Yes, Middle Right Sextant-Yes, Upper Left Sextant-Yes, Upper Right Sextant-Yes), medium density stabilized/fibrotic (Lower Left Sextant-Yes, Lower Right Sextant-Yes, Middle Left Sextant-Yes, Middle Right Sextant-Yes, Upper Left Sextant-Yes, Upper Right Sextant-Yes), and high density calcified/typical sequela (Lower Left Sextant-No, Lower Right Sextant-No, Middle Left Sextant-No, Middle Right Sextant-No, Upper Left Sextant-Yes, Upper Right Sextant-No). Over a period of 678.0 days, the patient has undergone 2 treatment regimens with the current status being New drugs available, Treatment ended and receiving Bedaquiline, Clofazimine, Cycloserine, Ethambutol, Isoniazid, Linezolid, Moxifloxacin, Pyrazinamide, Rifampicin. Comorbidities include Anemia, HIV, Others. Laboratory results are as follows: Total Protein no information g/l, Potassium no information mmol/l, Aspartate Aminotransferase no information u/l, Total Bilirubin no information umol/l, Glucose no information mmol/l, Creatinine no information umol/l, Alanine Aminotransferase no information u/l, and Erythrocyte Sedimentation Rate no information mm/hr."
# input_sentence = "The patient is Male, born in Kazakhstan and currently employed as Unemployed, diagnosed with TB at the age of 26, asdc sd."
model = BartForConditionalGeneration.from_pretrained('eugenesiow/bart-paraphrase')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
tokenizer = BartTokenizer.from_pretrained('eugenesiow/bart-paraphrase')

batch = tokenizer(input_sentence, return_tensors='pt').to(device)
generated_ids = model.generate(batch['input_ids'])
generated_sentence = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
print(generated_sentence)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


# My data training

### Data

In [3]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

df = pd.read_csv('/mnt/storageG1/lwang/Projects/TBpt/llm/serialized1.csv')
df = df.reset_index(drop=True)
df.reset_index(drop=True, inplace=True)

# Assuming `df` is your pandas DataFrame with 'text' and 'label' columns.
df = df.drop(columns = ['Unnamed: 0'])
df.columns = ['text', 'label']

# merged_data = merged_data[~merged_data['outcome_cd'].isin(['unknown', 'stillOnTreatment'])]
df = df[~df['label'].isin(['Lost to follow up', 'Still on treatment','Unknown'])]


In [4]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(['Died', 'Failure', 'Completed', 'Palliative Care', 'Cured'])
output = label_encoder.transform(df['label'] )
df['label'] = output

texts_train, texts_test, labels_train, labels_test = train_test_split(df['text'], df['label'], test_size=0.3)

# Create a training Dataset
train_dataset = Dataset.from_dict({
    "text": texts_train,
    "label": labels_train
})

# Create a test Dataset
test_dataset = Dataset.from_dict({
    "text": texts_test,
    "label": labels_test
})

# Combine them into a DatasetDict
dataset_dict = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

# Now, dataset_dict is what you wanted
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8487
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3638
    })
})


### LLM - encoder

In [6]:
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained( model_name, num_labels=5)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# data = load_dataset("imdb")

tokenized_data = dataset_dict.map(tokenize_function, batched=True)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 8487/8487 [00:03<00:00, 2135.06 examples/s]
Map: 100%|██████████| 3638/3638 [00:01<00:00, 1964.60 examples/s]


In [7]:
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir="./smaller_bert_finetuned",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    logging_dir="./logs",)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None)
Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mlinfeng-wang[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
500,1.0153,0.953905
1000,0.9475,0.936803
1500,0.9279,0.935906
2000,0.9299,0.91529
2500,0.9108,0.900777
3000,0.8893,0.894208


TrainOutput(global_step=3183, training_loss=0.9351722892260275, metrics={'train_runtime': 597.6756, 'train_samples_per_second': 42.6, 'train_steps_per_second': 5.326, 'total_flos': 3372932881935360.0, 'train_loss': 0.9351722892260275, 'epoch': 3.0})

In [17]:
# # Assuming you have a CUDA-capable GPU
# device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Move your model to the chosen device
model.to(device)

# Ensure your input tensor is also moved to the same device
# example_input = tokenizer("I am absolutely amazed with this new and revolutionary AI device", return_tensors="pt").to(device)
example_input = tokenizer("The patient is Male, born in Belarus and currently employed as Student, diagnosed with TB at the age of 18, presents a case with XDR resistance. The patient has 0 children and interact with 2 individuals daily. The patient’s BMI is recorded at 20.9, and TB primarily affects the Pulmonary part of the lung(s), with Less than 50% of lung volume showing abnormalities. Pleural effusion involves Less than 50% of the hemithorax, and bilateral effusion is No. Additional non-TB abnormalities include No, and mediastinal lymph nodes presence is No. The Timika score is 4.0, with noted lung collapse Upper Right Sextant-No, and cavity sizes categorized as small (Upper Right Sextant-No), medium (Upper Right Sextant-No), and large (Upper Right Sextant-No), with Upper Right Sextant-No large cavities being part of a multisextant cavity and multiple cavities visible Upper Right Sextant-No. Lung infiltrates are observed with low (Upper Right Sextant-No), medium (Upper Right Sextant-No), and high (Upper Right Sextant-No) density alongside nodules of various sizes: small (Upper Right Sextant-Yes), medium (Upper Right Sextant-Yes), large (Upper Right Sextant-No), and huge (Upper Right Sextant-No). Calcified or partially calcified nodules exist: Upper Right Sextant-No, with non-calcified Upper Right Sextant-Yes and clustered Upper Right Sextant-Yes nodules noted. Multiple nodules are present Upper Right Sextant-Yes, with nodules' characteristics detailed as low-ground glass density active/fresh (Upper Right Sextant-Yes), medium density stabilized/fibrotic (Upper Right Sextant-Yes), and high density calcified/typical sequela (Upper Right Sextant-No). Over a period of 635.0 days, the patient has undergone 3 treatment regimens with the current status being Continuation of treatment, Treatment ended and receiving Capreomycin, Clofazimine, Cycloserine, Delamanid, Levofloxacin, Linezolid, Moxifloxacin. Comorbidities include no information. Laboratory results are as follows: Total Protein no information g/l, Potassium no information mmol/l, Aspartate Aminotransferase no information u/l, Total Bilirubin no information umol/l, Glucose no information mmol/l, Creatinine no information umol/l, Alanine Aminotransferase no information u/l, and Erythrocyte Sedimentation Rate no information mm/hr.", return_tensors="pt").to(device)

# Forward pass
output = model(**example_input)

# Get the predicted label
predicted_label = torch.argmax(output.logits, dim=1).item()

# If you need to save your fine-tuned model
# model.save_pretrained("./my_bert_finetuned")
# tokenizer.save_pretrained("./my_bert_finetuned")


RuntimeError: The size of tensor a (607) must match the size of tensor b (512) at non-singleton dimension 1

In [16]:
tokenized_data["test"]['text'][0]

"The patient is Male, born in Belarus and currently employed as Student, diagnosed with TB at the age of 18, presents a case with XDR resistance. The patient has 0 children and interact with 2 individuals daily. The patient’s BMI is recorded at 20.9, and TB primarily affects the Pulmonary part of the lung(s), with Less than 50% of lung volume showing abnormalities. Pleural effusion involves Less than 50% of the hemithorax, and bilateral effusion is No. Additional non-TB abnormalities include No, and mediastinal lymph nodes presence is No. The Timika score is 4.0, with noted lung collapse Upper Right Sextant-No, and cavity sizes categorized as small (Upper Right Sextant-No), medium (Upper Right Sextant-No), and large (Upper Right Sextant-No), with Upper Right Sextant-No large cavities being part of a multisextant cavity and multiple cavities visible Upper Right Sextant-No. Lung infiltrates are observed with low (Upper Right Sextant-No), medium (Upper Right Sextant-No), and high (Upper R

In [14]:
predicted_label

1

prediction test

In [12]:
pred = []
label = []
device = 'cuda' if torch.cuda.is_available() else 'cpu'

for x in tokenized_data["test"]:
    model.to(device)
    # Ensure your input tensor is also moved to the same device
    # example_input = tokenizer("I am absolutely amazed with this new and revolutionary AI device", return_tensors="pt").to(device)
    example_input = tokenizer(x['text'], return_tensors="pt").to(device)
    # Forward pass
    output = model(**example_input)
    # Get the predicted label
    predicted_label = torch.argmax(output.logits, dim=1).item()
    pred.append(predicted_label)
    label.append(x['label'])
    break

Token indices sequence length is longer than the specified maximum sequence length for this model (607 > 512). Running this sequence through the model will result in indexing errors


RuntimeError: The size of tensor a (607) must match the size of tensor b (512) at non-singleton dimension 1

In [None]:
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
# model_name = "bert-base-uncased"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModel.from_pretrained(model_name)

text = "I am an example sequence for text classification."
text ="This thing is absolutely pointless and I don't get why people are wasting their time on it."
class SimpleClassifier(nn.Module):
    def __init__(self, input_size, num_classes):
        super(SimpleClassifier, self).__init__()
        self.fc = nn.Linear(input_size, num_classes)
    def forward(self, x):
        return self.fc(x)
    
inputs = tokenizer(
    text,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=64)

outputs = model(**inputs)

pooled_output = outputs.pooler_output
print("Hidden states size: ", outputs.last_hidden_state.shape)
print("Pooled output size: ", pooled_output.shape)
s
classifier_head = SimpleClassifier(
    pooled_output.size(-1),
    num_classes=2) 

logits = classifier_head(pooled_output)
probs = torch.softmax(logits, dim=1)
print("Predicted Class Probabilities:", probs)