In [1]:
import pandas as pd
from transformers import BertTokenizer
from transformers import AutoTokenizer
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_train = pd.read_csv(r"C:\Users\johnk\LLM project\datasets\clean_train.csv")
df_test = pd.read_csv(r"C:\Users\johnk\LLM project\datasets\clean_test.csv")
df_unsupervised = pd.read_csv(r"C:\Users\johnk\LLM project\datasets\unsupervised_clean.csv")

In [3]:
df_train.head()

Unnamed: 0,text,label,clean_text
0,I rented I AM CURIOUS-YELLOW from my video sto...,0,rented curiousyellow video store controversy s...
1,"""I Am Curious: Yellow"" is a risible and preten...",0,curious yellow risible pretentious steaming pi...
2,If only to avoid making this type of film in t...,0,avoid making type film future film interesting...
3,This film was probably inspired by Godard's Ma...,0,film probably inspired godards masculin f√©mini...
4,"Oh, brother...after hearing about this ridicul...",0,oh brotherafter hearing ridiculous film umptee...


In [4]:
df_unsupervised.head()

Unnamed: 0,text,label,clean_text
0,This is just a precious little diamond. The pl...,-1,precious little diamond play script excellent ...
1,When I say this is my favourite film of all ti...,-1,say favourite film time comment taken lightly ...
2,I saw this movie because I am a huge fan of th...,-1,saw movie huge fan tv series name starring roy...
3,Being that the only foreign films I usually li...,-1,foreign films usually like star japanese perso...
4,After seeing Point of No Return (a great movie...,-1,seeing point return great movie told original ...


In [None]:
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2) 

def tokenize_data(examples):
    return tokenizer(examples["clean_text"], padding=True, truncation=True, max_length=512)

 


In [None]:
texts = df_train['clean_text'].tolist()  # The list of reviews (texts)
labels = df_train['label'].tolist()  # The list of sentiment labels (0 or 1)


In [19]:
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.1, random_state=42
)


In [20]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)


In [21]:

# Convert tokenized data into Hugging Face Dataset format
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels
})

val_dataset = Dataset.from_dict({
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask'],
    'labels': val_labels
})


In [6]:

# Convert train and test DataFrames to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

# Apply tokenization
tokenized_train_dataset = train_dataset.map(tokenize_data, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_data, batched=True)


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 25000/25000 [00:07<00:00, 3469.42 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 25000/25000 [00:07<00:00, 3543.42 examples/s]


In [7]:
import torch

# Set format for PyTorch
tokenized_train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
tokenized_test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


In [8]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

# Load the DistilBERT model and tokenizer
model_name = "distilbert-base-uncased"
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Load the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained(model_name)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from transformers import TrainingArguments
from accelerate import Accelerator


In [11]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",           # Output directory for saved models
    num_train_epochs=3,               # Number of epochs
    per_device_train_batch_size=8,    # Batch size per device during training
    per_device_eval_batch_size=16,    # Batch size for evaluation
    warmup_steps=500,                 # Number of warmup steps
    weight_decay=0.01,                # Strength of weight decay
    logging_dir="./logs",             # Directory for storing logs
    logging_steps=10,                 # Log every 10 steps
    evaluation_strategy="epoch",      # Evaluation strategy (evaluate every epoch)
)




In [22]:
from transformers import Trainer, TrainingArguments

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Your tokenized training dataset
    eval_dataset=val_dataset,    # Your tokenized evaluation dataset
    tokenizer=tokenizer,          # Tokenizer for preprocessing
)


  trainer = Trainer(
