In [1]:
from transformers import AutoModelForCausalLM
from huggingface_hub.hf_api import HfApi
from huggingface_hub.utils import logging
import torch
import os  
import pandas as pd  
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling  
from datasets import Dataset  

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# Set environment variable for cache directory to a local permitted directory 

os.environ['TRANSFORMERS_CACHE'] = './transformers_cache'  
os.environ['HF_HOME'] = './hf_home'

In [3]:
# Load CSV data  

print("Loading CSV file...")  
df = pd.read_csv('./data/Combined Data.csv') 

Loading CSV file...


In [4]:
# Load model directly

tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

In [5]:
# Check for missing statements and filter them out  
df = df.dropna(subset=['statement'])  
print("Data after removing missing values: " + str(df.shape))  

Data after removing missing values: (52681, 3)


In [6]:
# Create a Hugging Face Dataset from the DataFrame (using just the 'statement' column)  
dataset = Dataset.from_pandas(df[['statement']])  

In [7]:
# Load tokenizer and model (using your working code)  
print("Loading tokenizer and model...")  
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")  
model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")  
print("Tokenizer and model loaded successfully.")  

Loading tokenizer and model...
Tokenizer and model loaded successfully.


In [8]:
# Define tokenization function using the "statement" column  
def tokenize_function(examples):  
    return tokenizer(examples["statement"], padding="max_length", truncation=True, max_length=128) 

In [9]:
# Tokenize the dataset  
print("Tokenizing dataset...")  
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)  
print("Dataset tokenized successfully.")  

Tokenizing dataset...


Map: 100%|██████████| 52681/52681 [00:06<00:00, 8449.35 examples/s] 

Dataset tokenized successfully.





In [10]:
# Setup data collator (for causal LM, mlm is False)  
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)  

In [11]:
# Set up training arguments (adjust hyperparameters as needed)  
training_args = TrainingArguments(  
    output_dir="./happy_brain",  
    overwrite_output_dir=True,  
    num_train_epochs=1,  # Adjust the number of epochs as needed  
    per_device_train_batch_size=4,  
    save_steps=1000,  
    save_total_limit=2,  
    logging_steps=100,  
    evaluation_strategy="no"  
) 



In [12]:
# Initialize the Trainer  
trainer = Trainer(  
    model=model,  
    args=training_args,  
    train_dataset=tokenized_dataset,  
    data_collator=data_collator,  
)

print("Trainer initialized. Ready to start training.")  
print("Note: Full training requires significant time and resources.") 

# Uncomment the following line to start training:  
# trainer.train()  
  
# Save the model and tokenizer after training (uncomment when trainer.train() is executed)  
# model.save_pretrained("./happy_brain")  
# tokenizer.save_pretrained("./happy_brain")  

Trainer initialized. Ready to start training.
Note: Full training requires significant time and resources.


In [13]:
# For demonstration, here's a sample inference function using the base model:  
def generate_response(input_text, max_length=50):  
    inputs = tokenizer(input_text, return_tensors="pt")  
    outputs = model.generate(  
        inputs.input_ids,   
        max_length=max_length,  
        num_return_sequences=1,  
        temperature=0.7  
    )  
    return tokenizer.decode(outputs[0], skip_special_tokens=True)  

print("\nExample inference (using base model):")  
sample_input = "I've been feeling really anxious lately and can't sleep."  
print("Input: " + sample_input)  
print("Response: " + generate_response(sample_input))  


Example inference (using base model):
Input: I've been feeling really anxious lately and can't sleep.




Response: I've been feeling really anxious lately and can't sleep. I'm worried about my future and what's going to happen to me. I'm scared of the unknown and don't know how to co
