In [None]:
# ===================================================================
# CELL 1: SETUP
# ===================================================================
print("⏳ Installing stable library versions...")
# Install dependencies quietly. Only spaCy (with its transformers and lookups
# extras) is pinned, to 3.7.2; datasets, scikit-learn, pandas and
# google-generativeai are installed at whatever version pip resolves.
!pip install "spacy[transformers,lookups]==3.7.2" datasets scikit-learn pandas google-generativeai --quiet

import os
# Send signal 9 to this notebook's own process so the freshly installed
# packages are loaded by a new process. Nothing after this line runs in the
# current session, so the remaining cells must be run after the restart.
# NOTE(review): assumes the hosting environment (e.g. Colab) restarts the
# kernel automatically once it has been killed — confirm.
os.kill(os.getpid(), 9)

In [None]:
# ===================================================================
# 2. IMPORTS AND DATA PREPARATION
# ===================================================================
import logging
import time

import google.generativeai as genai
import pandas as pd
import spacy
from datasets import load_dataset
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from spacy.tokens import DocBin

# Send INFO-and-above records to the notebook output with a timestamp prefix.
# force=True replaces any handlers that were already attached to the root logger.
logging.basicConfig(
    force=True,
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Download the corpus and turn its 'train' split into a DataFrame
dataset = load_dataset("sepidmnorozy/Finnish_sentiment")
df = pd.DataFrame(dataset['train'])

# Translate the numeric labels into names; the category list holds each
# distinct name once, in order of first appearance
label_map = {1: "POSITIVE", 0: "NEGATIVE"}
df['label_name'] = df['label'].map(label_map)
categories = df['label_name'].unique().tolist()

# Hold out 20% for validation, keeping the class balance in both parts
X_train, X_valid, y_train, y_valid = train_test_split(
    df['text'],
    df['label_name'],
    stratify=df['label_name'],
    random_state=42,
    test_size=0.2,
)

# Function to create .spacy files
def make_docs(data, labels, categories, max_chars=2500):
    """Build a spaCy ``DocBin`` of tokenized texts annotated with category flags.

    Args:
        data: Iterable of raw text strings.
        labels: Iterable of labels aligned with ``data``; every label must be
            one of ``categories``.
        categories: All category names. Each one becomes a key of ``doc.cats``;
            the document's own label maps to True and the rest to False.
        max_chars: Each text is cut to this many characters before
            tokenization. Defaults to 2500, the previously hard-coded limit.

    Returns:
        A ``DocBin`` holding one tokenized ``Doc`` per (text, label) pair.

    Raises:
        ValueError: If a label is not listed in ``categories``. Previously such
            a label was silently added to ``doc.cats`` as an extra category.
    """
    # A blank Finnish pipeline is enough: only the tokenizer is needed here.
    nlp = spacy.blank("fi")
    doc_bin = DocBin()
    for text, label in zip(data, labels):
        if label not in categories:
            raise ValueError(
                f"Label {label!r} is not one of the known categories {list(categories)!r}"
            )
        doc = nlp.make_doc(text[:max_chars])
        cats = dict.fromkeys(categories, False)
        cats[label] = True
        doc.cats = cats
        doc_bin.add(doc)
    return doc_bin

# Convert both splits to DocBin form first, then persist each one to disk
train_doc_bin = make_docs(X_train, y_train, categories)
valid_doc_bin = make_docs(X_valid, y_valid, categories)
train_doc_bin.to_disk("train.spacy")
valid_doc_bin.to_disk("valid.spacy")
logging.info("Data preparation complete.")

In [None]:
# ===================================================================
# 3. DEMO: PROGRAMMATIC LABELING WITH GEMINI
# ===================================================================
# In this section, we demonstrate how an LLM can be used to label
# a sample of the data, and we validate its accuracy.

# --- Configure API and select a sample ---
try:
    # `userdata` reads Colab notebook secrets. It was never imported, so the
    # resulting NameError used to be reported as a secrets problem. Importing
    # it inside the try keeps a missing module on the same logged error path.
    from google.colab import userdata

    genai.configure(api_key=userdata.get('GOOGLE_API_KEY'))
    logging.info("Gemini API configured successfully.")
except Exception as e:
    logging.error(f"Error configuring Gemini API. Check Colab Secrets. Error: {e}")

# Cap the request at the dataset size: DataFrame.sample raises when asked for
# more rows than exist (sampling is without replacement by default).
n_samples_for_demo = min(50, len(df))
# Fixed random_state so the same rows are drawn on every run.
demo_df = df.sample(n=n_samples_for_demo, random_state=42)
logging.info(f"Selected {n_samples_for_demo} samples for the Gemini demo.")

# --- API call function with retries and safe sleep time ---
def get_gemini_label(text, retries=3, retry_delay=10, model_name='gemini-1.5-flash'):
    """Classify one text as POSITIVE or NEGATIVE with the Gemini API.

    Args:
        text: The Finnish text to classify.
        retries: Maximum number of API attempts.
        retry_delay: Seconds to wait between attempts (rate-limit back-off).
        model_name: Gemini model identifier passed to ``genai.GenerativeModel``.

    Returns:
        "POSITIVE" or "NEGATIVE" when the model gives a usable answer, or
        "FAILED" when every attempt raised or returned something else.
    """
    prompt = f"Analyze the sentiment of the following Finnish text. Classify it as either POSITIVE or NEGATIVE. Return only the single word classification. Text: \"{text}\" Classification:"

    model = genai.GenerativeModel(model_name)

    for attempt in range(1, retries + 1):
        try:
            response = model.generate_content(prompt)
            # Accept answers wrapped in whitespace, quotes, markdown markers
            # or trailing punctuation, e.g. ' "Positive."\n'.
            label = response.text.strip(" \t\r\n.!\"'`*").upper()
            if label in ("POSITIVE", "NEGATIVE"):
                return label
            logging.warning(f"Unexpected Gemini reply on attempt {attempt}/{retries}: {label!r}")
        except Exception as e:
            # Best effort: any API or response error counts as a failed attempt.
            logging.warning(f"API call failed on attempt {attempt}/{retries}. Error: {e}")

        # Back off before the next attempt only; sleeping after the last one
        # would just delay returning "FAILED".
        if attempt < retries:
            time.sleep(retry_delay)

    return "FAILED"

# --- Label the sample ---
gemini_labels = []
logging.info("Starting Gemini labeling (this will take several minutes)...")
for i, text in enumerate(demo_df['text']):
    gemini_labels.append(get_gemini_label(text))
    if (i + 1) % 10 == 0:
        logging.info(f"Gemini progress: {i + 1}/{len(demo_df)} texts labeled.")
demo_df['gemini_label'] = gemini_labels
logging.info("Gemini labeling complete.")

# --- Validate Gemini's accuracy ---
# Rows where every API attempt failed carry the sentinel 'FAILED' and are
# excluded, so the metrics only cover texts that were actually labeled.
valid_predictions = demo_df[demo_df['gemini_label'] != 'FAILED']
n_failed = len(demo_df) - len(valid_predictions)
if n_failed:
    logging.warning(f"{n_failed}/{len(demo_df)} texts could not be labeled and are excluded from validation.")

if valid_predictions.empty:
    # Nothing to score (e.g. the API key was not configured): report it
    # instead of computing metrics on an empty frame.
    logging.error("No texts were labeled successfully; skipping Gemini validation.")
else:
    accuracy = accuracy_score(valid_predictions['label_name'], valid_predictions['gemini_label'])
    logging.info(f"Gemini Labeling Accuracy: {accuracy:.2%}")
    print("\n--- Gemini Labeling Validation Report ---")
    print(classification_report(valid_predictions['label_name'], valid_predictions['gemini_label']))

In [None]:
# ===================================================================
# 3. CREATE CUSTOM CONFIG FILE
# ===================================================================
# spaCy training configuration: a Finnish BERT transformer feeding a text
# classifier. The train/dev paths are left null here and supplied on the
# `spacy train` command line.
config_string = """
[paths]
train = null
dev = null
[system]
gpu_allocator = "pytorch"
seed = 42
[nlp]
lang = "fi"
pipeline = ["transformer", "textcat"]
batch_size = 128
[components]
[components.transformer]
factory = "transformer"
[components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v1"
name = "TurkuNLP/bert-base-finnish-cased-v1"
[components.transformer.model.get_spans]
@span_getters = "spacy-transformers.strided_spans.v1"
window = 128
stride = 96
[components.textcat]
factory = "textcat"
[components.textcat.model]
@architectures = "spacy.TextCatEnsemble.v2"
# Connect textcat to the shared transformer. Without this listener nothing in
# the pipeline consumes the transformer output and BERT is never fine-tuned.
[components.textcat.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.textcat.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
[components.textcat.model.linear_model]
@architectures = "spacy.TextCatBOW.v2"
exclusive_classes = true
ngram_size = 1
no_output_layer = false
[corpora]
[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
[training]
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
patience = 1600
max_epochs = 0
eval_frequency = 200
[training.optimizer]
@optimizers = "Adam.v1"
learn_rate = 2e-5
# FIX: Reduce batcher size to prevent out-of-memory errors
[training.batcher]
@batchers = "spacy.batch_by_padded.v1"
size = 500
buffer = 256
discard_oversize = true
"""
# Explicit encoding so the file is written identically on every platform.
with open("config.cfg", "w", encoding="utf-8") as f:
    f.write(config_string)

logging.info("✅ Created custom 'config.cfg' successfully with a smaller batch size.")

In [None]:
# ===================================================================
# 4. TRAIN THE MODEL
# ===================================================================
# Run spaCy's training CLI with config.cfg on GPU 0. The --paths.* options
# supply the train/dev corpus files for the paths that config.cfg leaves as
# null, and --output makes the run write its results under ./training.
!python -m spacy train config.cfg --output ./training --paths.train ./train.spacy --paths.dev ./valid.spacy --gpu-id 0

In [None]:
# ===================================================================
# 5. EVALUATE THE MODEL
# IMPORTANT: After training is complete, restart the session
# (Runtime -> Restart session) before running this cell.
# ===================================================================
import spacy
import spacy_transformers
from sklearn.metrics import classification_report
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Use the GPU for transformer inference when one is available; this must be
# called before the pipeline is loaded and falls back to CPU otherwise.
spacy.prefer_gpu()

# Reload the validation data. Same test_size, random_state and stratification
# as the training split, so these are the rows that were held out.
dataset = load_dataset("sepidmnorozy/Finnish_sentiment")
df = pd.DataFrame(dataset['train'])
_, X_valid, _, y_valid = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42, stratify=df['label'])

# Load the best model
nlp_best = spacy.load("training/model-best")
print("✅ Best model loaded successfully.")

# Get predictions
# Texts are cut to 2500 characters, matching how the train/dev documents were
# built, and streamed through nlp.pipe so the model runs on batches instead
# of one document at a time.
truncated_texts = (text[:2500] for text in X_valid)
predicted_labels = [
    max(doc.cats, key=doc.cats.get)  # highest-scoring category
    for doc in nlp_best.pipe(truncated_texts, batch_size=32)
]

label_map = {1: "POSITIVE", 0: "NEGATIVE"}
true_labels = y_valid.map(label_map)

# Print final report
print("\n--- Final spaCy Model Validation Report ---")
print(classification_report(true_labels, predicted_labels))