Hi there!
This is the code template for CW2 task1 of COMP34711 2025/26.

- <span style="color:red; font-size:1em">First of all, please rename the notebook into "{your_student_id}_CW2_task{your_task_number}.ipynb", for example "12345678_CW2_task1.ipynb".</span>

- In this template, we only provide the minimal structure for your coursework.
  
- Please carefully read and organize your code in the template we provided.

## Constants

In [63]:
#Please keep only necessary information in this cell.

#----------------------Please keep all following constants unchanged.----------------------------------------
NUM_ROWS_VALIDATION = 1031 # Number of rows in validation set
NUM_ROWS_TEST = 1053 # Number of rows in test set

#----------------------Please modify the following constants to fit your actual value.-----------------------
STUDENT_ID = '11445473'  # Replace with your actual 8-digits student ID
TRAINING_SET = './data/CW2_training_dataset.csv' # Replace with the actual path to your training dataset csv file
VALIDATION_SET = './data/CW2_validation_dataset.csv'  # Replace with the actual path to your validation dataset csv file
VALIDATION_SET_OUTPUT = f'./data/{STUDENT_ID}_CW2_task1_validation_results.csv'  # Replace with the actual path to your validation prediction csv file
TEST_SET_INPUT = './data/CW2_test_dataset.csv'  # Replace with the actual path to your test prediction csv file

#----------------------Your constants------------------------------------------------
# By adding more constants here, you can help improve the clarity and maintainability of your code and make the reviewing easier for TAs.
HIDDEN_DIM = 64
MAX_SYNOPSIS_LENGHT = 5000
BATCH_SIZE = 16
NUM_EPOCHS = 5

## Installations

In [64]:
# Install required packages for the coursework
# Uncomment and run the following lines if needed

!pip install pandas scikit-learn spacy nltk --quiet
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m42.5 MB/s[0m  [33m0:00:00[0mm0:00:01[0m00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


## Imports

In [65]:
#Please keep all imports of your code cells in this cell

#---------------------Required imports----------------------
import pandas as pd
import re
import sys
import os.path
import csv
from sklearn.metrics import f1_score
#----------------------Your imports-------------------------
import spacy
import numpy as np
import torch.nn as nn
import torch
import nltk
from nltk.tokenize import word_tokenize
from torch.utils.data import Dataset, DataLoader
from IPython.display import clear_output


nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to /mnt/iusers01/fse-
[nltk_data]     ugpgt01/compsci01/y12806ja/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Start of your code cells

- The code cells provided below are demo code format for TAs to quickly locate your implementation.

- You have full right to freely add/delete/edit the titles and codes in the following cells.

- Please follow this genre order: "comedy, cult, flashback, historical, revenge, romantic, scifi, violence".

### Data Loading

In [66]:
#this makes sure to use the gpu if available otherwise the cpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [67]:
# Your code cells here
df = pd.read_csv(TRAINING_SET)

In [68]:
GENRE_COLS = [
    "comedy", "cult", "flashback", "historical",
    "revenge", "romantic", "scifi", "violence"
]

# extract Y from training df
Y_train = df[GENRE_COLS].values   # shape (7127, 8)

# Count positives and negatives per label
pos_counts = Y_train.sum(axis=0)
neg_counts = (Y_train.shape[0] - pos_counts)

# Compute pos_weight = neg/pos
pos_weight = torch.tensor(neg_counts / pos_counts, dtype=torch.float32).to(device)

print(pos_weight)


tensor([ 4.7943,  2.9572,  2.5724, 36.3141,  3.2423,  2.5760, 33.4300,  1.3506],
       device='cuda:0')


### Tokenization

In [69]:
class MovieDataset(Dataset):
    def __init__(self, df, nlp, has_labels=True):
        self.df = df
        self.nlp = nlp
        self.has_labels = has_labels
        self.word_to_vec = {}

        self.genre_cols = [
            "comedy", "cult", "flashback", "historical",
            "revenge", "romantic", "scifi", "violence"
        ]

    def text_to_matrix(self, tokens):
        vectors = []
        for tok in tokens:
            if tok in self.word_to_vec:
                vectors.append(self.word_to_vec[tok])
            else:
                v = self.nlp(tok).vector
                self.word_to_vec[tok] = v
                vectors.append(v)
        return np.array(vectors)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text = f"{row.title} [SEP] {row.plot_synopsis}"

        # tokenize
        tokens = word_tokenize(text)

        # trim
        tokens = tokens[:MAX_SYNOPSIS_LENGHT]

        # embed
        mat = self.text_to_matrix(tokens)

        # lengths
        length = len(mat)

        # pad
        if length < MAX_SYNOPSIS_LENGHT:
            pad = np.zeros((MAX_SYNOPSIS_LENGHT - length, 300), dtype=np.float32)
            mat = np.vstack([mat, pad])

        # convert to tensor
        X = torch.tensor(mat, dtype=torch.float32)

        if not self.has_labels:
            return X, length

        # labels
        y_values = row[self.genre_cols].astype(int).tolist()
        y = torch.tensor(y_values, dtype=torch.float32)

        return X, length, y

    def __len__(self):
        return len(self.df)


### Model and Training

In [70]:
class BiLSTMClassifier(nn.Module):
    def __init__(self, hidden_dim, num_labels=8):
        super().__init__()

        self.lstm = nn.LSTM(
            input_size=300,
            hidden_size=hidden_dim,
            num_layers=1,
            bidirectional=True,
            batch_first=True
        )

        self.fc = nn.Linear(2 * hidden_dim, num_labels)

    def forward(self, x, lengths):
        lengths = lengths.tolist()

        # pack padded sequences
        packed = nn.utils.rnn.pack_padded_sequence(
            x, lengths, batch_first=True, enforce_sorted=False
        )

        _, (h_n, c_n) = self.lstm(packed)

        # h_n shape: (num_layers*2, batch, hidden_dim)
        h_forward = h_n[0]      # last forward state
        h_backward = h_n[1]     # last backward state

        h_final = torch.cat([h_forward, h_backward], dim=1)
        logits = self.fc(h_final)
        return logits


In [71]:
nlp = spacy.load("en_core_web_md")
dataset = MovieDataset(df, nlp)

loader = DataLoader(
    dataset,
    batch_size= BATCH_SIZE,
    shuffle=True
)


In [72]:
model = BiLSTMClassifier(HIDDEN_DIM).to(device)
print(next(model.parameters()).device)

criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    i = 0
    for X_batch, lengths_batch, Y_batch in loader:
        i += 1
        print(i)
        X_batch = X_batch.to(device)
        lengths_batch = lengths_batch.to(device)
        Y_batch = Y_batch.to(device)
        optimizer.zero_grad()

        # forward pass
        logits = model(X_batch, lengths_batch)

        # compute loss
        loss = criterion(logits, Y_batch)

        # backward pass
        loss.backward()

        # update weights
        optimizer.step()

        #reset the grad
        optimizer.zero_grad()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{EPOCHS}, Loss = {total_loss:.4f}")


cuda:0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
2

### Finding the best thresholds

In [73]:
# Load test set
validation_df_raw = pd.read_csv(VALIDATION_SET)

X_validation = MovieDataset(validation_df_raw, spacy.load("en_core_web_md"))
validation_loader = DataLoader(X_validation, batch_size= BATCH_SIZE, shuffle=False)

model.eval()
all_probs = []
all_labels = []

with torch.no_grad():
    for X_batch, lengths_batch, Y_batch in validation_loader:
        X_batch = X_batch.to(device)
        lengths = lengths_batch

        logits = model(X_batch, lengths)
        probs = torch.sigmoid(logits).cpu()

        all_probs.append(probs)
        all_labels.append(Y_batch)

probs_val = torch.cat(all_probs, dim=0).numpy()
y_val = torch.cat(all_labels, dim=0).numpy()

In [74]:
def find_best_thresholds(probs, y_true):
    thresholds = np.linspace(0, 1, 101)  # 0.00 → 1.00
    best_thresholds = []

    for col in range(probs.shape[1]):   # for each genre
        best_f1 = 0
        best_thr = 0.5

        for thr in thresholds:
            preds = (probs[:, col] >= thr).astype(int)
            f1 = f1_score(y_true[:, col], preds, zero_division=0)

            if f1 > best_f1:
                best_f1 = f1
                best_thr = thr

        best_thresholds.append(best_thr)

    return np.array(best_thresholds)

best_thresholds = find_best_thresholds(probs_val, y_val)
print("BEST THRESHOLDS PER GENRE:")
print(best_thresholds)

BEST THRESHOLDS PER GENRE:
[0.56 0.38 0.27 0.96 0.44 0.44 0.58 0.45]


### Prediction

In [75]:
word_to_vec = {}

model.eval()
def predict_dataframe(df, model, has_labels = True):
    dataset = MovieDataset(df, nlp, has_labels)
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=lambda x: x)

    all_preds = []

    model.eval()
    with torch.no_grad():
        for batch in loader:
            X_batch = [b[0] for b in batch]
            lengths = [b[1] for b in batch]

            X_batch = torch.stack(X_batch).to(device)
            lengths = torch.tensor(lengths).to(device)

            logits = model(X_batch, lengths)
            probs = torch.sigmoid(logits)
            thresholds_tensor = torch.tensor(best_thresholds, device=probs.device, dtype=probs.dtype)
            preds = (probs >= thresholds_tensor).int()
            all_preds.append(preds)

    all_preds = torch.cat(all_preds, dim=0).cpu().numpy()
    return all_preds  # shape: (len(df), 8)


## End of your code cells

### Evaluation scripts

In [76]:
def read_data(submission_file_path, gold_standard_file_path):
    """
    Read submission and gold standard files.
    Extract student ID from filename.
    """
    # Try to find student ID from the filename (looks for 8 digit numbers)
    id_regex = r'\d{8}'

    user_id = re.findall(id_regex, submission_file_path)
    print("Found your ID: ", user_id)
    if user_id:
        user_id = user_id[0]
    else:
        user_id = 'Unknown'

    # Load submission CSV
    print(f"\nLoading submission file: {submission_file_path}")
    submission_df = pd.read_csv(submission_file_path, sep=',', header=None,
                                quoting=csv.QUOTE_NONE, encoding='utf-8')

    # Load gold standard CSV
    print(f"Loading gold standard file: {gold_standard_file_path}")
    gold_standard_df = pd.read_csv(gold_standard_file_path, header=None)

    # Remove columns 1 and 2 (keep only ID and labels)
    gold_standard_df = gold_standard_df.drop([1, 2], axis=1)
    # Skip header row
    gold_standard_df = gold_standard_df.iloc[1:]

    return submission_df, gold_standard_df, user_id


def match_and_prepare_data(submission_df, gold_standard_df, user_id):
    """
    Match submission rows with gold standard rows by ID.
    Prepare data for evaluation.
    """
    gold_standard_labels = []
    submission_labels = []
    missed_rows = []
    submission_df_copy = submission_df.copy()

    print(f"\nMatching submission with gold standard...")
    print(f"Gold standard rows: {len(gold_standard_df)}")
    print(f"Submission rows: {len(submission_df_copy)}")

    # Match each gold standard row with submission
    for index, row in gold_standard_df.iterrows():
        row = row.reset_index(drop=True)
        row_found = False
        row_id = row[0]

        # Extract gold standard labels
        row_labels = [int(row[i]) for i in range(1, len(row))]
        gold_standard_labels.append(row_labels)

        # Find corresponding submission row
        for sub_index, submission_row in submission_df_copy.iterrows():
            if submission_row[0].strip() == row_id.strip():
                try:
                    # Extract submission labels
                    submission_row_labels = [int(submission_row[i]) for i in range(1, len(submission_row))]
                except:
                    # Handle malformed labels (take first character if multi-digit)
                    submission_row_labels = [int(str(submission_row[i])[0]) for i in range(1, len(submission_row))]

                submission_labels.append(submission_row_labels)
                row_found = True
                submission_df_copy.drop(sub_index, inplace=True)
                break

        if not row_found:
            # If row is missing, add inverse labels (worst possible prediction)
            missed_rows.append(row_id)
            submission_labels.append([0 if label == 1 else 1 for label in row_labels])

    return gold_standard_labels, submission_labels, missed_rows


def evaluate_submission(gold_standard_labels, submission_labels):
    """
    Calculate weighted F1 score.
    """
    print(f"\nCalculating weighted F1 score...")

    # Calculate weighted F1 score (accounts for class imbalance)
    f1_weighted = f1_score(gold_standard_labels, submission_labels, average='weighted')

    return f1_weighted


def print_results(user_id, f1_weighted, missed_rows):
    """
    Print evaluation results to screen.
    """
    print("\n" + "="*70)
    print("YOUR SUBMISSION EVALUATION REPORT")
    print("="*70)

    # Alert if ID not found in filename
    if user_id == 'Unknown':
        print('WARNING: ID not found in filename!')
        print('   Please ensure your filename contains your 8-digit student ID.')
        print()

    print(f"Your ID: {user_id}")
    print()

    # Display F1 score with visual indicator
    print("EVALUATION RESULTS:")
    print(f"   Weighted F1 Score: {f1_weighted:.4f}")
    print()

    # Report missing rows
    if missed_rows:
        print(f"MISSING DATA ({len(missed_rows)} rows not found):")
        print("-" * 70)
        for i, row in enumerate(missed_rows[:10], 1):  # Show first 10
            print(f"    {i}. Row ID: {row}")
        if len(missed_rows) > 10:
            print(f"    ... and {len(missed_rows) - 10} more missing rows")
        print()
        print("TIP: Make sure your submission includes all required rows.")
        print("        Missing rows are penalized with worst possible predictions.")
    else:
        print("DATA COMPLETENESS: All expected rows found in your submission!")

    print()
    print("="*70)
    print()


def evaluate(submission_path, gold_standard_path):
    """
    Main function to run the submission evaluation script.
    """

    submission_file = submission_path
    gold_standard_file = gold_standard_path

    # Check if files exist
    if not os.path.exists(submission_file):
        print(f"Error: Your submission file '{submission_file}' not found!")
        print("Make sure the file path is correct and the file exists.")
        sys.exit(1)

    if not os.path.exists(gold_standard_file):
        print(f"Error: Gold standard file '{gold_standard_file}' not found!")
        print("Make sure you have the correct gold standard file.")
        sys.exit(1)

    try:
        # Step 1: Read data
        submission_df, gold_standard_df, user_id = read_data(submission_file, gold_standard_file)

        # Step 2: Match and prepare data
        gold_standard_labels, submission_labels, missed_rows = match_and_prepare_data(
            submission_df, gold_standard_df, user_id
        )

        # Step 3: Evaluate
        f1_weighted = evaluate_submission(gold_standard_labels, submission_labels)

        # Step 4: Print results
        print_results(user_id, f1_weighted, missed_rows)

    except Exception as e:
        print(f"Error during evaluation: {str(e)}")
        print("Please check that your files are in the correct CSV format.")
        print("Each row should contain: ID, label1, label2, label3, ...")
        import traceback
        traceback.print_exc()
        sys.exit(1)

### Evaluate the model on the validation dataset

In [None]:
# Please run the evaluation scripts cell above before running the mark_and_record

# Please make sure that output format is like following (no header row, no title and plot columns):
# 94834c61-0e30-4799-9998-6f74f6sbb204	0	1	0	0	1	0	0	0
# 559sdd28-b6a2-4662-ab55-a6678as26a56	0	0	0	0	0	0	1	0
# b71y3317-04cd-42f5-a380-d21dfasdbd36	0	0	0	0	1	0	0	0

evaluation_results = evaluate(VALIDATION_SET_OUTPUT, VALIDATION_SET)

Error: Your submission file './data/11445473_CW2_task1_validation_results.csv' not found!
Make sure the file path is correct and the file exists.


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


### Save predictions to formatted file.

In [None]:
# Now please modify the code to format your output csv file.

# Please make sure that output format is like following (no header row, no tilte and plot columns):
# 94834c61-0e30-4799-9998-6f74f6sbb204	0	1	0	0	1	0	0	0
# 559sdd28-b6a2-4662-ab55-a6678as26a56	0	0	0	0	0	0	1	0
# b71y3317-04cd-42f5-a380-d21dfasdbd36	0	0	0	0	1	0	0	0

test_df = pd.read_csv(TEST_SET_INPUT)
test_preds = predict_dataframe(test_df, model, False)

output_df = pd.DataFrame(test_preds, columns=GENRE_COLS)
output_df.insert(0, 'ID', test_df['ID'])

# For example, if you have a DataFrame named 'output_df', you can save it
assert isinstance(output_df, pd.DataFrame)
assert len(output_df) == NUM_ROWS_TEST, "Output length is not aligned with the testdata.csv."
assert len(output_df.columns) == 9, "Please make sure to follow the format above and keep only IDs and 8 columns of prediction."
output_df.to_csv(f'./data/{STUDENT_ID}_CW2_task1_validation_results.csv', index=False, header=False)