In [1]:
from transformers import AutoModelForCausalLM
from huggingface_hub.hf_api import HfApi
from huggingface_hub.utils import logging
import torch
import os  
import pandas as pd  
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling  
from datasets import Dataset  

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# Set environment variable for cache directory to a local permitted directory 

os.environ['TRANSFORMERS_CACHE'] = './transformers_cache'  
os.environ['HF_HOME'] = './hf_home'

In [3]:
# Load CSV data  

print("Loading survey.csv...")  
df_survey = pd.read_csv('./data/survey.csv')  
print("Survey data shape:", df_survey.shape)  
print("Survey data columns:", df_survey.columns.tolist()[:10], "...") 

Loading survey.csv...
Survey data shape: (1259, 27)
Survey data columns: ['Timestamp', 'Age', 'Gender', 'Country', 'state', 'self_employed', 'family_history', 'treatment', 'work_interfere', 'no_employees'] ...


In [4]:
# Filter out rows with missing comments  
df_survey = df_survey.dropna(subset=['comments'])  
print("Survey data shape after filtering for non-null comments:", df_survey.shape) 

Survey data shape after filtering for non-null comments: (164, 27)


In [5]:
# Create a DataFrame with just the 'comments' column, renamed to 'statement'  
df_train = df_survey[['comments']].rename(columns={'comments': 'statement'})  
print("Training data shape:", df_train.shape)  

Training data shape: (164, 1)


In [6]:
# Display a sample of the training data (first 3 rows)  
print("\nSample of training data (first 3 rows):")  
for i, row in df_train.head(3).iterrows():  
    sample_text = row['statement']  
    print("Statement " + str(i+1) + ": " + (sample_text[:100] + '...' if len(sample_text) > 100 else sample_text))  
      


Sample of training data (first 3 rows):
Statement 14: I'm not on my company's health insurance which could be part of the reason I answered Don't know to ...
Statement 16: I have chronic low-level neurological issues that have mental health side effects. One of my supervi...
Statement 17: My company does provide healthcare but not to me as I'm on a fixed-term contract. The mental healthc...


In [7]:
# Create a Hugging Face Dataset from the DataFrame  
dataset = Dataset.from_pandas(df_train)

In [8]:
# Load model directly

print("\nLoading tokenizer and model...")  
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", force_download=True)  
model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", force_download=True)  
print("Tokenizer and model loaded successfully.")  


Loading tokenizer and model...
Tokenizer and model loaded successfully.


In [9]:
# Define tokenization function  
def tokenize_function(examples):  
    return tokenizer(examples["statement"], padding="max_length", truncation=True, max_length=128) 

In [10]:
# Tokenize the dataset  
print("\nTokenizing dataset...")  
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)  
print("Dataset tokenized successfully.") 


Tokenizing dataset...


Map: 100%|██████████| 164/164 [00:00<00:00, 5751.43 examples/s]

Dataset tokenized successfully.





In [11]:
# Setup data collator  
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)  

In [12]:
# Set up training arguments (adjust hyperparameters as needed)  
training_args = TrainingArguments(  
    output_dir="./happy_brain",  
    overwrite_output_dir=True,  
    num_train_epochs=3,  
    per_device_train_batch_size=4,  
    save_steps=50,  
    save_total_limit=2,  
    logging_steps=10,  
    evaluation_strategy="no"  
)   



In [13]:
# Initialize the Trainer  
trainer = Trainer(  
    model=model,  
    args=training_args,  
    train_dataset=tokenized_dataset,  
    data_collator=data_collator,  
) 

print("Trainer initialized. Ready to start training.")  
print("Note: Full training requires significant time and resources.")  
  
# Uncomment the following lines to start training:  
trainer.train()  
model.save_pretrained('./happy_brain')  
tokenizer.save_pretrained('./happy_brain')    

Trainer initialized. Ready to start training.
Note: Full training requires significant time and resources.


Step,Training Loss
10,3.0228
20,3.0602
30,3.0444
40,3.0378


KeyboardInterrupt: 

In [None]:
def generate_response(input_text, max_length=50):  
    inputs = tokenizer(input_text, return_tensors="pt")  
    outputs = model.generate(  
        inputs.input_ids,  
        max_length=max_length,  
        num_return_sequences=1,  
        temperature=0.7,  
        do_sample=True  
    )  
    return tokenizer.decode(outputs[0], skip_special_tokens=True)  
  
print("\nExample inference (using base model):")  
sample_input = "I've been feeling really anxious lately and can't sleep."  
print("Input: " + sample_input)  
print("Response: " + generate_response(sample_input))  


Example inference (using base model):
Input: I've been feeling really anxious lately and can't sleep.




Response: I've been feeling really anxious lately and can't sleep. I'm worried about my future and what's going to happen to me. I'm scared of the unknown and don't know how to co
