In [None]:
# Colab-only: mount Google Drive at /content/drive so files stored there can be
# read by path from the cells below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import torch
import random
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import numpy as np
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve
import pandas as pd


In [None]:
# Load the token-level prediction table from the mounted Drive.
df = pd.read_csv(
    '/content/drive/MyDrive/IU Prediction/predict_df.csv',
    encoding='UTF-8',
    low_memory=False,
)
df

Unnamed: 0,corpusSeq,docId,unitId,unitStart,unitStartSkipPause,unitStartSkipPauseAlt,unitEnd,unitDur,unitDurSkipPause,unitBil,...,init4,init5,prevEndnote,currLength,totalLength,currLengthSq,preBoundary,preClosure,preContinue,preAppeal
0,2,sbc001,1,0.000,5.294,6.820,9.210,9.210,3.916,I,...,,,noBoundary,0,11,0,False,False,False,False
1,3,sbc001,1,0.000,5.294,6.820,9.210,9.210,3.916,I,...,,,noBoundary,1,11,1,False,False,False,False
2,4,sbc001,1,0.000,5.294,6.820,9.210,9.210,3.916,I,...,,,noBoundary,2,11,4,False,False,False,False
3,5,sbc001,1,0.000,5.294,6.820,9.210,9.210,3.916,I,...,so,,noBoundary,3,11,9,False,False,False,False
4,6,sbc001,1,0.000,5.294,6.820,9.210,9.210,3.916,I,...,so,so,noBoundary,4,11,16,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263361,450292,sbc060,1010,1484.999,1484.999,1484.999,1486.347,1.348,1.348,B,...,,,continuing,0,2,0,False,False,False,False
263362,450293,sbc060,1010,1484.999,1484.999,1484.999,1486.347,1.348,1.348,L,...,,,continuing,1,2,1,True,True,False,False
263363,450299,sbc060,1011,1486.347,1486.847,1487.817,1488.519,2.172,1.672,I,...,,,final,0,1,0,True,False,False,False
263364,450303,sbc060,1012,1488.519,1488.519,1488.519,1489.281,0.762,0.762,B,...,,,break,0,2,0,False,False,False,False


In [None]:
# Keep only the columns used downstream. The explicit .copy() makes `df` an
# independent frame, so the masked assignment below writes into it directly
# rather than into a slice of the loaded table (this is what triggers pandas'
# SettingWithCopyWarning).
# NOTE: this cell rebinds `df` and removes `has_hyphen`, so it can only be run
# once per load of the CSV.
df = df[['docId', 'unitId', 'has_hyphen', 'text_trimmed', 'preBoundary']].copy()

# Rows flagged by `has_hyphen` get the single placeholder token '<INC>'.
df.loc[df['has_hyphen'] == True, 'text_trimmed'] = '<INC>'

# The flag was only needed for the replacement above.
df = df.drop(columns=['has_hyphen'])
df


Unnamed: 0,docId,unitId,text_trimmed,preBoundary
0,sbc001,1,so,False
1,sbc001,1,you,False
2,sbc001,1,do,False
3,sbc001,1,n't,False
4,sbc001,1,need,False
...,...,...,...,...
263361,sbc060,1010,maybe,False
263362,sbc060,1010,<UNK>,True
263363,sbc060,1011,in,True
263364,sbc060,1012,in,False


In [None]:
# Build the vocabulary in sorted order. Iterating a bare set of strings can
# yield a different order in every Python process (string hashing is salted per
# process), which would silently change the word -> index mapping, and with it
# the one-hot layout, from one run to the next.
# key=str keeps the sort well-defined even if a non-string value (e.g. NaN) is
# present in the column.
VOCAB = sorted(set(df['text_trimmed']), key=str) + ['<PAD>']
VOCAB_SIZE = len(VOCAB)

# Map every vocabulary entry to its position; '<PAD>' is appended last.
WORD_INDEX = {word: i for i, word in enumerate(VOCAB)}

print(VOCAB_SIZE)
print(WORD_INDEX)

203
{'wan': 0, 'right': 1, 'i': 2, 'where': 3, 'an': 4, 'hm': 5, 'his': 6, 'had': 7, 'went': 8, 'wanted': 9, 'think': 10, 'things': 11, 'unhunh': 12, 'which': 13, 'what': 14, 'alright': 15, 'those': 16, "n't": 17, 'another': 18, 'everything': 19, 'two': 20, 'not': 21, 'came': 22, 'when': 23, 'whole': 24, 'there': 25, 'oh': 26, 'five': 27, 'then': 28, 'with': 29, 'so': 30, 'she': 31, "'m": 32, 'cause': 33, 'that': 34, 'mm': 35, 'new': 36, 'just': 37, 'did': 38, 'put': 39, 'into': 40, 'the': 41, 'like': 42, 'my': 43, 'a': 44, 'some': 45, 'still': 46, 'it': 47, 'who': 48, 'sure': 49, 'say': 50, 'should': 51, 'people': 52, 'too': 53, 'told': 54, 'man': 55, 'mean': 56, 'long': 57, 'have': 58, 'him': 59, 'well': 60, 'no': 61, 'can': 62, 'their': 63, 'will': 64, 'year': 65, 'last': 66, 'or': 67, 'about': 68, "'re": 69, 'got': 70, 'em': 71, 'does': 72, 'even': 73, 'great': 74, 'way': 75, 'would': 76, 'kind': 77, 'of': 78, 'get': 79, 'very': 80, 'okay': 81, 'he': 82, 'never': 83, 'goes': 84, 't

In [None]:
# One seed shared by every random number generator used in this notebook.
SEED = 42

# Seed PyTorch, NumPy's global generator and Python's `random`, in that order.
for seed_rng in (torch.manual_seed, np.random.seed, random.seed):
    seed_rng(SEED)

In [None]:
# Hyperparameters for windowing, model size and training.
SEQUENCE_LENGTH = 13  # number of consecutive table rows (tokens) in one window
SLIDING_WINDOW = 3  # stride, in rows, between the starts of consecutive windows
HIDDEN_SIZE = 64  # hidden size passed to the recurrent model
BATCH_SIZE = 32  # windows per DataLoader batch
EPOCHS = 30  # number of passes over the training loader

In [None]:
class IntonationDataset(Dataset):
    """Fixed-length sliding windows over a token table for boundary prediction.

    Window ``i`` covers rows ``i * sliding_window`` up to (but excluding)
    ``i * sliding_window + seq_length`` of ``df``. Windows are cut purely by
    row position; ``docId`` / ``unitId`` are not consulted, so a window may
    span several units or documents.

    Each item is a triple ``(inputs, targets, mask)``:

    * ``inputs``  -- float tensor ``(seq_length, len(WORD_INDEX))``; row ``t``
      is the one-hot encoding of token ``t``.
    * ``targets`` -- float tensor ``(seq_length,)``; ``preBoundary`` as 0.0/1.0.
    * ``mask``    -- float tensor ``(seq_length,)``; 1.0 for real rows and 0.0
      for padded positions.

    Token ids and targets are computed once at construction time, so later
    changes to ``df`` are not reflected in the items.

    Args:
        df: DataFrame with a ``text_trimmed`` and a ``preBoundary`` column.
        WORD_INDEX: mapping from token to column index of the one-hot vector.
            A token missing from the mapping is encoded as ``'<UNK>'`` when
            that key exists.
        seq_length: number of rows per window (positive).
        sliding_window: stride between window starts (positive).

    Raises:
        ValueError: if ``seq_length`` or ``sliding_window`` is not positive.
        KeyError: if a token is missing from ``WORD_INDEX`` and there is no
            ``'<UNK>'`` entry to fall back to.
    """

    def __init__(self, df, WORD_INDEX, seq_length, sliding_window):
        if seq_length < 1 or sliding_window < 1:
            raise ValueError("seq_length and sliding_window must be positive integers")

        self.df = df
        self.WORD_INDEX = WORD_INDEX
        self.seq_length = seq_length
        self.sliding_window = sliding_window

        # Resolve every token to its index once. A plain ``dict.get`` would
        # return None for an unknown token, and indexing a tensor row with
        # None silently sets the *whole* row to 1 instead of failing.
        unk_index = WORD_INDEX.get('<UNK>')
        word_ids = []
        for word in df['text_trimmed'].tolist():
            word_id = WORD_INDEX.get(word, unk_index)
            if word_id is None:
                raise KeyError(
                    f"token {word!r} is not in WORD_INDEX and no '<UNK>' entry exists"
                )
            word_ids.append(word_id)

        self._word_ids = torch.tensor(word_ids, dtype=torch.long)
        self._targets = torch.tensor(
            df['preBoundary'].astype(int).tolist(), dtype=torch.float32
        )

    def __len__(self):
        """Number of windows.

        Only full windows are counted, so trailing rows that do not fill a
        complete window are not served. A non-empty table shorter than
        ``seq_length`` yields exactly one (padded) window.
        """
        n_rows = self._word_ids.shape[0]
        if n_rows == 0:
            return 0
        if n_rows < self.seq_length:
            return 1
        return (n_rows - self.seq_length) // self.sliding_window + 1

    def __getitem__(self, idx):
        """Return ``(inputs, targets, mask)`` for window ``idx``.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
        """
        n_windows = len(self)
        if idx < 0:
            idx += n_windows
        if not 0 <= idx < n_windows:
            raise IndexError("window index out of range")

        start_idx = idx * self.sliding_window
        stop_idx = start_idx + self.seq_length
        word_ids = self._word_ids[start_idx:stop_idx]
        # clone() so callers never get a view into the cached target tensor.
        targets = self._targets[start_idx:stop_idx].clone()

        # Mask marks the valid (non-padded) positions.
        n_valid = word_ids.shape[0]
        mask = torch.ones(n_valid, dtype=torch.float32)

        # Pad up to seq_length: '<PAD>' token, target 0, mask 0.
        n_pad = self.seq_length - n_valid
        if n_pad > 0:
            pad_ids = torch.full((n_pad,), self.WORD_INDEX['<PAD>'], dtype=torch.long)
            word_ids = torch.cat([word_ids, pad_ids])
            targets = torch.cat([targets, torch.zeros(n_pad, dtype=torch.float32)])
            mask = torch.cat([mask, torch.zeros(n_pad, dtype=torch.float32)])

        # One-hot encode: set column word_ids[t] in row t.
        input_tensor = torch.zeros(self.seq_length, len(self.WORD_INDEX))
        input_tensor[torch.arange(self.seq_length), word_ids] = 1

        return input_tensor, targets, mask

def collate_fn(batch):
    """Turn a list of ``(inputs, targets, mask)`` items into three batch tensors.

    Each field is batched independently with ``pad_sequence`` (batch dimension
    first); items shorter than the longest one in the batch are filled with 0.

    Returns:
        Tuple ``(inputs, targets, masks)`` of batched tensors.
    """
    input_seqs, target_seqs, mask_seqs = zip(*batch)

    def _batchify(seqs):
        # Stack along a new leading batch axis, zero-filling shorter sequences.
        return pad_sequence(seqs, batch_first=True, padding_value=0)

    return _batchify(input_seqs), _batchify(target_seqs), _batchify(mask_seqs)

class IntonationRNN(nn.Module):
    """Single GRU followed by a linear layer that emits one value per time step.

    No activation is applied to the linear output, so ``forward`` returns raw
    scores (one per position), with a trailing dimension of size 1.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        # batch_first=True: the GRU consumes (batch, seq, input_size) tensors.
        self.rnn = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        # Projects each hidden state to a single score.
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        """Run the GRU over ``x`` and score every time step."""
        # The GRU returns (per-step outputs, final hidden state); only the
        # per-step outputs are used.
        per_step_states = self.rnn(x)[0]
        return self.fc(per_step_states)





In [None]:
# Windowed dataset over the whole token table.
full_dataset = IntonationDataset(df, WORD_INDEX, SEQUENCE_LENGTH, SLIDING_WINDOW)

# Sizes for a 90/10 train/validation split.
total_size = len(full_dataset)
train_size = int(0.9 * total_size)
val_size = total_size - train_size

# Split with a dedicated, explicitly seeded generator so the partition does not
# depend on how much of the global torch RNG earlier cells have consumed.
# NOTE(review): consecutive windows overlap (stride SLIDING_WINDOW is smaller
# than SEQUENCE_LENGTH), so a random window-level split places the same token
# rows in both train and validation. Validation metrics are therefore
# optimistic; consider splitting by docId instead.
train_dataset, val_dataset = random_split(
    full_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SEED),
)

# Data loaders: shuffle only the training windows.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# Use the model class defined in this notebook. The name previously used here
# (InterpretableGRUModel) is not defined by any cell above, so a fresh
# "Restart & Run All" failed with NameError.
model = IntonationRNN(len(WORD_INDEX), HIDDEN_SIZE)

# reduction='none' keeps one loss value per position, which is what the
# `(loss * masks).sum() / masks.sum()` expressions in training/evaluation need;
# with the default 'mean' reduction the mask multiplies a scalar and cancels out.
criterion = nn.BCEWithLogitsLoss(reduction='none')
optimizer = optim.Adam(model.parameters())

In [None]:
def find_best_threshold(targets, probabilities):
    """Pick the decision threshold that maximises F1 on the given scores.

    Args:
        targets: 1-D array-like of binary ground-truth labels (0 or 1).
        probabilities: 1-D array-like of predicted scores, same length.

    Returns:
        The threshold (one of the values returned by
        ``precision_recall_curve``) with the highest F1. The curve treats a
        score as positive when ``score >= threshold``.

    Side effects:
        Prints the chosen threshold and its F1-score.
    """
    # Compute precision-recall curve
    precisions, recalls, thresholds = precision_recall_curve(targets, probabilities)

    # The curve has one more (precision, recall) point than thresholds: the
    # final point has no associated threshold. Drop it so that every F1 index
    # is also a valid index into `thresholds`.
    precisions = precisions[:-1]
    recalls = recalls[:-1]

    # F1 per threshold; the epsilon avoids division by zero, and NaNs (possible
    # when a class is absent) are treated as the worst score.
    f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-8)
    f1_scores = np.nan_to_num(f1_scores, nan=0.0)

    best_index = np.argmax(f1_scores)
    best_threshold = thresholds[best_index]

    print(f"Best Threshold: {best_threshold:.4f}, Best F1-Score: {f1_scores[best_index]:.4f}")
    return best_threshold

In [None]:
def evaluate(model, data_loader, criterion, threshold=0.5):
    """Compute masked loss and classification metrics over a data loader.

    Only positions whose mask is non-zero contribute to the loss and to the
    metrics, so padded positions do not inflate the scores.

    Args:
        model: module called as ``model(inputs)``; its output is squeezed on
            the last dimension before use (assumes a trailing dim of size 1).
        data_loader: iterable of ``(inputs, targets, masks)`` batches.
        criterion: loss called as ``criterion(logits, targets)``. For the mask
            to take effect it should return one value per position (e.g.
            ``reduction='none'``); a scalar loss is used as-is.
        threshold: a position is predicted positive when its sigmoid
            probability is ``>= threshold`` (inclusive, matching the convention
            of ``precision_recall_curve`` thresholds).

    Returns:
        Tuple ``(avg_loss, accuracy, precision, recall, f1)``.

    Side effects:
        Puts ``model`` in eval mode and leaves it there.
    """
    model.eval()
    total_loss = 0.0
    total_samples = 0
    all_targets = []
    all_probabilities = []

    with torch.no_grad():
        for inputs, targets, masks in data_loader:
            logits = model(inputs).squeeze(-1)
            probabilities = torch.sigmoid(logits)

            # Collect only valid (unmasked) positions for the metrics.
            valid = masks > 0
            all_probabilities.append(probabilities[valid].cpu().numpy())
            all_targets.append(targets[valid].cpu().numpy())

            # Masked mean loss for this batch, weighted by its valid count so
            # the final average is per position rather than per batch.
            n_valid = masks.sum().item()
            if n_valid > 0:
                masked_loss = (criterion(logits, targets) * masks).sum() / masks.sum()
                total_loss += masked_loss.item() * n_valid
                total_samples += n_valid

    # Calculate average loss
    avg_loss = total_loss / total_samples if total_samples > 0 else 0.0

    # Flatten the per-batch pieces into single arrays
    all_probabilities = np.concatenate(all_probabilities) if all_probabilities else np.array([])
    all_targets = np.concatenate(all_targets) if all_targets else np.array([])

    # Calculate metrics
    predictions = (all_probabilities >= threshold).astype(float)
    accuracy = accuracy_score(all_targets, predictions)
    precision = precision_score(all_targets, predictions, zero_division=0)
    recall = recall_score(all_targets, predictions, zero_division=0)
    f1 = f1_score(all_targets, predictions, zero_division=0)

    return avg_loss, accuracy, precision, recall, f1

# Training loop with metric logging
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0
    num_batches = 0

    # Training phase
    for inputs, targets, masks in train_loader:
        # A batch without any valid position has nothing to learn from. Skip it
        # rather than calling backward() on a constant, which has no graph and
        # would raise.
        if masks.sum() == 0:
            continue

        optimizer.zero_grad()
        outputs = model(inputs)

        # Masked mean loss over the valid positions of the batch
        masked_loss = (criterion(outputs.squeeze(-1), targets) * masks).sum() / masks.sum()

        # Backpropagation and optimization step
        masked_loss.backward()
        optimizer.step()
        total_loss += masked_loss.item()
        num_batches += 1

    # Average training loss over the batches that were actually used
    train_loss = total_loss / num_batches if num_batches else 0.0

    # Validation scores for threshold selection. Switch to eval mode first so
    # this pass uses the same inference behaviour as evaluate() below.
    model.eval()
    all_probabilities, all_targets = [], []
    with torch.no_grad():
        for inputs, targets, masks in val_loader:
            outputs = model(inputs)
            probabilities = torch.sigmoid(outputs).squeeze(-1)

            # Keep only valid (unmasked) positions
            valid = masks > 0
            all_probabilities.append(probabilities[valid].cpu().numpy())
            all_targets.append(targets[valid].cpu().numpy())

    all_probabilities = np.concatenate(all_probabilities) if all_probabilities else np.array([])
    all_targets = np.concatenate(all_targets) if all_targets else np.array([])

    # Ensure targets are binary. Every value must be 0 or 1; a split that
    # happens to contain a single class is still binary.
    if not np.isin(all_targets, [0, 1]).all():
        raise ValueError("Targets must be binary (0 or 1). Check your dataset or dataloader.")

    # Find the best threshold only once (during the first epoch).
    # NOTE(review): the threshold is tuned on the same validation data the
    # metrics below are reported on, and it is frozen after epoch 1 while the
    # model keeps changing; treat the reported F1 as optimistic.
    if epoch == 0:
        best_threshold = find_best_threshold(all_targets, all_probabilities)

    # Evaluate on the validation set using the best threshold
    val_loss, accuracy, precision, recall, f1 = evaluate(
        model, val_loader, criterion, threshold=best_threshold
    )

    # Print metrics for the epoch
    print(f"Epoch {epoch + 1}/{EPOCHS}, "
          f"Train Loss: {train_loss:.4f}, "
          f"Validation Loss: {val_loss:.4f}, "
          f"Accuracy: {accuracy:.4f}, "
          f"Precision: {precision:.4f}, "
          f"Recall: {recall:.4f}, "
          f"F1-score: {f1:.4f}, "
          f"Threshold: {best_threshold:.4f}")


Best Threshold: 0.2741, Best F1-Score: 0.5946
Epoch 1/30, Train Loss: 0.4357, Validation Loss: 0.4220, Accuracy: 0.7540, Precision: 0.4977, Recall: 0.7381, F1-score: 0.5945, Threshold: 0.2741
Epoch 2/30, Train Loss: 0.4174, Validation Loss: 0.4139, Accuracy: 0.7524, Precision: 0.4957, Recall: 0.7612, F1-score: 0.6004, Threshold: 0.2741
Epoch 3/30, Train Loss: 0.4107, Validation Loss: 0.4093, Accuracy: 0.7592, Precision: 0.5049, Recall: 0.7593, F1-score: 0.6065, Threshold: 0.2741
Epoch 4/30, Train Loss: 0.4059, Validation Loss: 0.4058, Accuracy: 0.7627, Precision: 0.5097, Recall: 0.7599, F1-score: 0.6101, Threshold: 0.2741
Epoch 5/30, Train Loss: 0.4019, Validation Loss: 0.4030, Accuracy: 0.7639, Precision: 0.5113, Recall: 0.7677, F1-score: 0.6138, Threshold: 0.2741
Epoch 6/30, Train Loss: 0.3980, Validation Loss: 0.4005, Accuracy: 0.7702, Precision: 0.5206, Recall: 0.7568, F1-score: 0.6169, Threshold: 0.2741
Epoch 7/30, Train Loss: 0.3942, Validation Loss: 0.3979, Accuracy: 0.7658, Pre