# Install all libraries

In [1]:
!pip install wordcloud
!pip install keras
!pip install tensorflow
!pip install transformers datasets
!pip install sentencepiece



In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Word Cloud
from wordcloud import WordCloud
# from textacy import preprocessing
from nltk.stem.snowball import SnowballStemmer
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import re

# Feed data

In [3]:
file_path1 = "5vec2.csv"
file_path2 = "watch_closely.csv"
df1 = pd.read_csv(file_path1)
df2 = pd.read_csv(file_path2)
df1.head()

Unnamed: 0,Q,cog_self,aff_self,int_self,cog_target,aff_target,int_target
0,two children are stealing cookies from the jar...,0,0,0,0,0,1
1,This house seems chaotic. The boy is on a stoo...,0,0,0,1,0,0
2,"(Presumably)Mother daydreaming, distracted by ...",0,0,0,2,0,0
3,a woman is washing the dishes\nwater is spilli...,0,0,0,0,0,0
4,"A woman is washing the plates.However, the wat...",0,0,0,1,0,0


In [4]:
df2.head()

Unnamed: 0,Q,aff_tar,cog_tar,int_tar,aff_oth,cog_oth,int_oth
0,"A child and a man, presumably the father, are ...",1,0,0,0,0,0
1,Scene opens with a father and daughter riding ...,1,1,0,0,0,0
2,A father and daughter rode their bikes along a...,0,0,0,0,0,0
3,The clip commences with a cross view of the gi...,0,3,1,1,0,0
4,A girl cycles with her father to see him off a...,0,1,1,0,0,0


# Split data into train, val, test

In [55]:
# Assuming your dataset is in a pandas DataFrame called 'data'
train_df, temp_data = train_test_split(df1, test_size=0.25, random_state=42)
# Split the temp_data into validation (15%) and test (10%)
valid_df, test_df = train_test_split(temp_data, test_size=0.4, random_state=42)  # 0.4 * 25% = 10%

In [56]:
print(f"Training set size: {len(train_df)} rows")
print(f"Validation set size: {len(valid_df)} rows")
print(f"Test set size: {len(test_df)} rows")


Training set size: 236 rows
Validation set size: 47 rows
Test set size: 32 rows


# Prepare dataset for training

In [16]:
from transformers import T5ForConditionalGeneration, get_linear_schedule_with_warmup
from torch.utils.data import Dataset 
from transformers import AdamW
import torch

In [54]:
#dataset Class
class ToMDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=1775):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        # Add task-specific instructions for training
        text = row['Q']
        target = f"{row['cog_self']} {row['aff_self']} {row['int_self']} {row['cog_target']} {row['aff_target']} {row['int_target']}"

        # Tokenize input and target
        inputs = self.tokenizer(
            text, 
            padding="max_length", 
            truncation=True, 
            max_length=self.max_length, 
            return_tensors="pt"
        )
        targets = self.tokenizer(
            target, 
            padding="max_length", 
            truncation=True, 
            max_length=6, 
            return_tensors="pt"
        )
        
        return {
            'input_ids': inputs.input_ids.squeeze(),
            'attention_mask': inputs.attention_mask.squeeze(),
            'target_ids': targets.input_ids.squeeze()
        }


In [59]:
# Load tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Create dataset instance
dataset = ToMDataset(train_df, tokenizer)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

In [60]:
# Set the model to training mode
model.train()

# Use a GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

# Train

In [61]:
# Training
# Define the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Number of training steps
num_epochs = 1 # I start with 1 coz it takes some time and firstly we need to make it work, then we can do more epochs
total_steps = len(dataloader) * num_epochs

# Create the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0,  # Default value
    num_training_steps=total_steps
)

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")

    for batch in dataloader:
        # Move data to device (GPU or CPU)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        target_ids = batch['target_ids'].to(device)

        # Forward pass
        outputs = model(
            input_ids=input_ids, 
            attention_mask=attention_mask, 
            labels=target_ids
        )
        loss = outputs.loss

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()  # Update learning rate

        # Print loss for monitoring
        print(f"Loss: {loss.item()}")




Epoch 1/1
Loss: 7.077762126922607
Loss: 6.716832637786865
Loss: 5.908145427703857
Loss: 6.1862053871154785
Loss: 4.862710952758789
Loss: 5.549628734588623
Loss: 4.801024913787842
Loss: 5.030360221862793
Loss: 4.753163814544678
Loss: 4.195201396942139
Loss: 4.325935363769531
Loss: 4.023024082183838
Loss: 4.245853900909424
Loss: 3.8592069149017334
Loss: 3.91497802734375
Loss: 3.340871810913086
Loss: 3.647738218307495
Loss: 3.699476480484009
Loss: 4.416460990905762
Loss: 3.814748764038086
Loss: 3.2606146335601807
Loss: 3.2982637882232666
Loss: 3.4785473346710205
Loss: 3.6264684200286865
Loss: 3.0236940383911133
Loss: 3.5930445194244385
Loss: 2.970031499862671
Loss: 3.417034149169922
Loss: 3.306366205215454
Loss: 4.284363269805908


In [47]:
# Save the fine-tuned model
model.save_pretrained("./t5_toM_count_model")
tokenizer.save_pretrained("./t5_toM_count_model")

('./t5_toM_count_model/tokenizer_config.json',
 './t5_toM_count_model/special_tokens_map.json',
 './t5_toM_count_model/spiece.model',
 './t5_toM_count_model/added_tokens.json')

# Evaluate training

In [48]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

# Load the fine-tuned model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("./t5_toM_count_model")
tokenizer = T5Tokenizer.from_pretrained("./t5_toM_count_model")

# Ensure the model is in evaluation mode
model.eval()

# Use a GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [57]:
# Now create the ToMDataset instance
test_data = ToMDataset(test_df, tokenizer)

# Create a DataLoader for the test data
test_loader = DataLoader(test_data, batch_size=8, shuffle=False)

# Check the first batch
for batch in test_loader:
    print(batch)  # This should work without an error
    break


{'input_ids': tensor([[   27,   816,    34,  ...,     0,     0,     0],
        [  255,    47, 11237,  ...,     0,     0,     0],
        [ 1029,   160,  3503,  ...,     0,     0,     0],
        ...,
        [  499,  4284,    19,  ...,     0,     0,     0],
        [   27,    47,   578,  ...,     0,     0,     0],
        [ 1029,  4496,    31,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'target_ids': tensor([[  3, 632,   3, 632,   3,   1],
        [  3, 632,   3, 632,   3,   1],
        [  3, 632,   3, 632,   3,   1],
        [209,   3, 632,   3, 632,   1],
        [314,   3, 632,   3, 632,   1],
        [  3, 632,   3, 632,   3,   1],
        [  3, 632,   3, 632,   3,   1],
        [  3, 632,   3, 632,   3,   1]])}


# Check predicted and true values
## Here we have some **issues**:  
firstly - some sentences have true labels as only 2 numbers, not 3;  
secondly - our predicted categories are some random pieces of text....

In [58]:
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        target_ids = batch['target_ids'].to(device)

        # Generate predictions
        outputs = model.generate(input_ids, max_length=6)

        # Decode the predictions and targets
        predicted_text = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
        true_text = [tokenizer.decode(target, skip_special_tokens=True) for target in target_ids]

        # Print the predictions and true values for debugging
        print(f"Predicted: {predicted_text}")
        print(f"True: {true_text}")
        break  # Exit after one batch to examine the output


Predicted: ['I thought it was very', 'Elle ne', 'perspective, she was', 'Sarah. Sarah had', 'the scenario, I', 'is sneaking me', '. She would look', 'Jack was sitting directly across']
True: ['0 0', '0 0', '0 0', '1 0 0', '4 0 0', '0 0', '0 0', '0 0']
