#Classification

# In [1]:
# Import Libraries
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

# Read the labelled CSV and hold out 20% of the rows for evaluation.
data = pd.read_csv("path_to_your_classification_dataset.csv")
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['label'],
    test_size=0.2,
    random_state=42,  # fixed seed so the split is repeatable
)

# One checkpoint name drives both the tokenizer and the classifier.
model_name = "distilbert-base-uncased"  # Replace with a smaller LLM if needed
tokenizer = AutoTokenizer.from_pretrained(model_name)
# num_labels is the count of distinct values in the 'label' column.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(data['label'].unique()),
)

# Tokenize Dataset
def tokenize_function(text):
    """Run the module-level `tokenizer` on `text`.

    The call requests `padding="max_length"` and `truncation=True`, and the
    tokenizer's result is returned unchanged.
    """
    encoded = tokenizer(text, truncation=True, padding="max_length")
    return encoded

# Encode each split in a single batched tokenizer call, with padding and
# truncation enabled.
train_encodings = tokenizer(X_train.tolist(), padding=True, truncation=True)
test_encodings = tokenizer(X_test.tolist(), padding=True, truncation=True)

# Convert Data to PyTorch Format
class Dataset(torch.utils.data.Dataset):
    """Dataset that pairs per-example encodings with their labels.

    `encodings` is a mapping from field name to an indexable sequence with one
    entry per example; `labels` is an indexable sequence of labels.
    """

    def __init__(self, encodings, labels):
        """Store the encodings mapping and the label sequence as given."""
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors.

        Every field in `encodings` is converted with `torch.tensor`; the label
        is stored under the key 'labels'.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the encoded splits (plus their labels) in the Dataset class above so
# the Trainer can index them.
train_dataset = Dataset(train_encodings, y_train.values)
test_dataset = Dataset(test_encodings, y_test.values)

# Define Training Arguments and Trainer
# The "evaluate every epoch" option was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases, and the old keyword is
# rejected once it is removed. Use whichever field the installed
# TrainingArguments dataclass actually declares so this cell runs on both.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    **{eval_strategy_key: "epoch"},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train Model
trainer.train()


# Output of the cell above (the CSV path is still a placeholder):
# FileNotFoundError: [Errno 2] No such file or directory: 'path_to_your_classification_dataset.csv'

#Conversational Chat

# In [None]:
# Import Libraries
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the causal-LM checkpoint and its matching tokenizer.
# Note: this rebinds the module-level `model_name`, `tokenizer` and `model`
# names that the classification cell assigned earlier.
model_name = "gpt2"  # TinyLlama might need adaptation
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Chat Function
def generate_response(input_text, max_length=50):
    """Generate a continuation of `input_text` with the module-level model.

    Args:
        input_text: Prompt string to feed to the model.
        max_length: Upper bound passed to `generate()` as `max_length`
            (defaults to 50, the value previously hard-coded).

    Returns:
        The first generated sequence decoded to a string with special tokens
        stripped. For a decoder-only model such as GPT-2 the generated
        sequence starts with the prompt tokens, so the text normally begins
        with `input_text`.
    """
    # Calling the tokenizer (rather than .encode) also yields the attention
    # mask, which generate() otherwise has to guess.
    encoded = tokenizer(input_text, return_tensors='pt')
    output = model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=max_length,
        num_return_sequences=1,
        # GPT-2 defines no pad token; state the EOS fallback explicitly
        # instead of relying on generate()'s implicit default and warning.
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Demo: push one greeting through the chat helper and print the reply.
user_input = "Hello! How are you today?"
response = generate_response(input_text=user_input)
print(response)


#Extending Max Context Size of TinyLlama

# In [None]:
# Extending Context Size (Conceptual)
# Import Necessary Libraries
import torch
from transformers import GPT2Config, GPT2LMHeadModel
from transformers import AutoConfig, AutoModelForCausalLM

# The Auto* classes resolve the architecture recorded in the checkpoint.
# Forcing GPT2Config / GPT2LMHeadModel onto a Llama-family checkpoint would
# load the wrong architecture, and enlarging `n_positions` on a real GPT-2
# checkpoint fails because its learned position-embedding table has a fixed
# size.
model_name = "tiny-llama-checkpoint"  # placeholder: point at a real checkpoint
target_context = 2048  # desired maximum sequence length

config = AutoConfig.from_pretrained(model_name)
original_context = config.max_position_embeddings

# This path applies to rotary-embedding (RoPE) models such as the Llama
# family, which have no position table to resize: the positions are
# interpolated by `factor` and the advertised maximum is raised to match.
# NOTE(review): the dict key is "type" on older transformers releases and
# "rope_type" on newer ones - confirm against the installed version.
if target_context > original_context:
    config.rope_scaling = {
        "type": "linear",
        "factor": target_context / original_context,
    }
    config.max_position_embeddings = target_context

# Load Model with New Configuration
model = AutoModelForCausalLM.from_pretrained(model_name, config=config)

# Save Extended Model
model.save_pretrained("extended_tiny_llama")


# Multiple Datasets for Single Fine-Tuning

# In [None]:
# Load and Merge Multiple Datasets
dataset1 = pd.read_csv("dataset1.csv")
dataset2 = pd.read_csv("dataset2.csv")

# Assuming similar structures (e.g., columns 'text' and 'label')
combined_dataset = pd.concat([dataset1, dataset2], axis=0).reset_index(drop=True)

# Tokenization and Splitting
X_train, X_test, y_train, y_test = train_test_split(
    combined_dataset['text'],
    combined_dataset['label'],
    test_size=0.2,
    random_state=42,
)

# The module-level `tokenizer` / `model` names are rebound to causal-LM
# objects by the cells above (the GPT-2 tokenizer has no pad token, so
# padding=True would fail). Load classification objects under their own names
# so those cells keep working.
clf_name = "distilbert-base-uncased"
clf_tokenizer = AutoTokenizer.from_pretrained(clf_name)
clf_model = AutoModelForSequenceClassification.from_pretrained(
    clf_name,
    num_labels=len(combined_dataset['label'].unique()),
)

train_encodings = clf_tokenizer(list(X_train), truncation=True, padding=True)
test_encodings = clf_tokenizer(list(X_test), truncation=True, padding=True)

# Dataset Creation for PyTorch (reuse Dataset class defined above)
train_dataset = Dataset(train_encodings, y_train.values)
test_dataset = Dataset(test_encodings, y_test.values)

# Fine-Tuning: the Trainer built earlier still references the first dataset's
# splits, and rebinding `train_dataset` does not change what it trains on.
# Build a new Trainer over the merged data, reusing the earlier
# `training_args`.
trainer = Trainer(
    model=clf_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
trainer.train()
