In [1]:
import ollama
import time
import pandas as pd
import numpy as np
np.random.seed(1)

# Preparing the annotation prompts & pipeline

## Annotation few-shot (Abortion)

In [None]:
from ollama import chat

def classify_abortion_fewshot(text, LLM_model):
    """Label one speech chunk as abortion/reproductive-rights related via few-shot chat.

    The conversation consists of a system instruction, three solved German
    examples (user question followed by the assistant's label) and finally the
    chunk to classify. It is sent to ``chat`` (imported from ``ollama``).

    Args:
        text: The speech chunk to classify; interpolated into the prompt.
        LLM_model: Model name handed to ``chat`` as ``model``.

    Returns:
        '1' if the stripped reply contains the character '1', otherwise '0'.
        A reply containing neither digit is printed and counted as '0'.
    """
    question = " --- Are topics of abortion and reproductive rights discussed? Classify with '1' for yes or '0' for no."
    system_prompt = (
        "You are an expert classifier. Classify whether political speeches discuss "
        "topics of abortion and/or reproductive rights (including abortion, §218, "
        "reproductive autonomy, family planning, contraception, etc.), either explicitly or implicitly. "
        "If yes, reply only '1'. If not, reply only '0'."
    )
    # (speech, expected label) pairs shown to the model before the real query.
    solved_examples = (
        ("Ich bin gegen die Reform des § 218.", "1"),
        ("Der Verkehrsausschuss tagte heute zum Thema Infrastruktur.", "0"),
        ("Frauen sollen selbst entscheiden dürfen, ob sie ein Kind bekommen.", "1"),
    )

    messages = [{"role": "system", "content": system_prompt}]
    for example_speech, example_label in solved_examples:
        messages.append({"role": "user", "content": f"{example_speech}{question}"})
        messages.append({"role": "assistant", "content": example_label})
    messages.append({"role": "user", "content": f"{text}{question}"})

    generation_options = {
        'seed': 90825,
        'temperature': 0.1,
        'top_p': 0.1,
        'num_predict': 1,
        'num_ctx': 4096,
    }
    response = chat(model=LLM_model, messages=messages, options=generation_options)
    raw_reply = response['message']['content'].strip()

    if '1' in raw_reply:
        annotation = '1'
    else:
        # Anything that is not a recognisable '0' is reported before falling back to '0'.
        if '0' not in raw_reply:
            print(f" {raw_reply}, defaulting to '0'")
        annotation = '0'

    print(f"Annotation: {annotation}")
    return annotation


## Annotation few-shot (Economy)

In [None]:
from ollama import chat

def classify_economy_fewshot(text, LLM_model):
    """Classify one speech chunk as economy-related ('1') or not ('0') via few-shot chat.

    A system instruction, three solved German examples and the chunk itself are
    sent to ``chat`` (imported from ``ollama``) with fixed sampling options.

    Args:
        text: The speech chunk to classify; interpolated into the prompt.
        LLM_model: Model name handed to ``chat`` as ``model``.

    Returns:
        '1' or '0' as a string: the first binary digit found in the reply.
        A reply without any '0'/'1' is printed and counted as '0'.
    """
    messages = [
        {"role": "system", "content": (
            # The trailing space after "discuss" matters: without it the adjacent
            # literals fuse into "discusstopics" in the prompt sent to the model.
            "You are an expert classifier. Classify whether political speeches discuss "
            "topics of economy (including inflation, unemployment, economic growth, etc.), either explicitly or implicitly. "
            "If yes, reply only '1'. If not, reply only '0'."
        )},
        {"role": "user", "content": "Ohne umfassende Reformen der Bürokratie wird der Wirtschaftsstandort Deutschland leiden. --- Are topics of the economy discussed? Classify with '1' for yes or '0' for no."},
        {"role": "assistant", "content": "1"},
        {"role": "user", "content": "Der Verkehrsausschuss tagte heute zum Thema Infrastruktur. --- Are topics of the economy discussed? Classify with '1' for yes or '0' for no."},
        {"role": "assistant", "content": "0"},
        {"role": "user", "content": "Unser Ziel ist, dem Wirtschaftsstandort Deutschland im europäischen Vergleich dauerhaft wettbewerbsfähige Strom- und Gaspreise zu sichern. --- Are topics of the economy discussed? Classify with '1' for yes or '0' for no."},
        {"role": "assistant", "content": "1"},
        {"role": "user", "content": f"{text} --- Are topics of the economy discussed? Classify with '1' for yes or '0' for no."}
    ]

    response = chat(
        model=LLM_model,
        messages=messages,
        options={
            'seed': 90825,
            'temperature': 0.1,
            'top_p': 0.1,
            'num_predict': 10,
            'num_ctx': 4096,
        }
    )

    annotation_str = response['message']['content'].strip()

    # With num_predict=10 the reply can contain more than the bare label
    # (e.g. "0 (not 1)"), so the first binary digit decides instead of
    # checking for '1' anywhere in the text.
    first_digit = next((char for char in annotation_str if char in '01'), None)
    if first_digit is None:
        print(f" {annotation_str}, defaulting to '0'")
        annotation = '0'
    else:
        annotation = first_digit

    print(f"Annotation: {annotation}")
    return annotation


# Collecting samples from prepared corpus

## Selected abortion topic sample 25.04.1974

In [None]:
# Abortion sample 1: read the chunked corpus (first CSV column becomes the index)
# and keep only the rows whose 'date' field equals the string form of 25.04.1974.
Corpus_chunkified = (
    pd.read_csv("./CSV/Corpus_Chunkified_adjusted.csv", index_col=0)
    .loc[lambda corpus: corpus['date'] == "[datetime.date(1974, 4, 25)]"]
    .reset_index(drop=True)
)

# Alternative for later runs, once the annotated sample exists on disk.
# The snippet is an inert string literal and is not executed:
'''Corpus_chunkified = pd.read_csv("./CSV/Corpus_abort_sample_chunked_annotated.csv")
Corpus_chunkified = Corpus_chunkified.reset_index(drop=True)'''

'Corpus_chunkified = pd.read_csv("./CSV/Corpus_abort_sample_chunked_annotated.csv")\nCorpus_chunkified = Corpus_chunkified.reset_index(drop=True)'

## Selected abortion topic sample 2 - 24.06.2022

In [None]:
# Abortion sample 2: keep only the rows whose 'date' field equals the string
# form of 24.06.2022. Corpus_abortion_sample2 keeps the original row labels;
# Corpus_chunkified is the same selection with a fresh 0..n-1 index.
# NOTE(review): unlike the 1974 sample cell, no index_col=0 is passed here —
# confirm whether the first CSV column should be read as the index as well.
Corpus_abortion_sample2 = pd.read_csv("./CSV/Corpus_Chunkified_adjusted.csv").loc[
    lambda corpus: corpus['date'] == "[datetime.date(2022, 6, 24)]"
]
Corpus_chunkified = Corpus_abortion_sample2.reset_index(drop=True)

# Alternative for later runs, once the annotated sample exists on disk.
# The snippet is an inert string literal and is not executed:
'''Corpus_chunkified = pd.read_csv("./CSV/Corpus_abortion_sample2_chunked_annotated.csv")
Corpus_chunkified = Corpus_chunkified.reset_index(drop=True)'''

'Corpus_chunkified = pd.read_csv("/home/pc/Uni/MasterThesis/Scripts/Corpus_abortion_sample2_chunked_annotated.csv")\nCorpus_chunkified = Corpus_chunkified.reset_index(drop=True)'

## Generating random sample

In [None]:
# Economy sample 1: 150 chunks drawn at random from the whole chunked corpus.
# random_state pins the draw so the same rows are selected on every run.
Corpus_econ_sample = (
    pd.read_csv("./CSV/CorpusChunked_adjusted.csv")
    .sample(n=150, random_state=42)
    .reset_index(drop=True)
)
Corpus_chunkified = Corpus_econ_sample


# Alternative for later runs, once the annotated sample exists on disk.
# The snippet is an inert string literal and is not executed:
'''Corpus_chunkified = pd.read_csv("./CSV/Corpus_econ_sample_chunked_annotated.csv")
Corpus_chunkified = Corpus_chunkified.reset_index(drop=True)'''

## Generating sample 2 (diachronic approach)
For the diachronic comparison, a second random sample is drawn from speeches dated 2020 or later, to contrast with the results of the abortion annotation task.

In [None]:
def extract_year(date_str):
    """Return the first run of four consecutive digits in ``date_str`` as an int.

    The argument is converted with ``str()`` first, so non-string values are
    accepted. Returns None when no four-digit run is present.
    """
    import re

    hit = re.search(r'(\d{4})', str(date_str))
    if hit is None:
        return None
    return int(hit.group(1))

# Economy sample 2: derive a 'year' column from the 'date' strings, keep the
# rows from 2020 onwards and draw 150 of them with a fixed random_state.
Corpus_econ_sample2 = (
    pd.read_csv("./CSV/CorpusChunked_adjusted.csv")
    .assign(year=lambda corpus: corpus['date'].apply(extract_year))
    .loc[lambda corpus: corpus['year'] >= 2020]
)
Corpus_chunkified = Corpus_econ_sample2.sample(n=150, random_state=42).reset_index(drop=True)

# NOTE(review): the statement below replaces the sample drawn above with the
# first 150 rows of a previously saved annotated CSV, so the post-2020 draw has
# no effect on what later cells annotate. The file name refers to
# "econ_sample", not to a second sample — confirm this is the intended file.
Corpus_chunkified = pd.read_csv("./CSV/Corpus_econ_sample_chunked_annotated.csv").iloc[0:150]
print(f"Corpus_chunkified loaded with {len(Corpus_chunkified)} entries.")
print(Corpus_chunkified.head())

In [None]:
# Optional sanity check listing the locally available Ollama models.
# The snippet is an inert string literal and is not executed:
'''models = ollama.list()
print([model['model'] for model in models['models']])
del models'''
# Empty frame that collects one row of timing metrics per annotated model.
metric_columns = ['Model', 'Speed', 'Time_taken']
Model_metrics_df = pd.DataFrame(columns=metric_columns)

['phi:2.7b', 'phi:latest', 'gemma3:4b', 'gpt-oss:20b', 'qwen2.5:7b', 'mistral:7b', 'stablelm2:12b', 'localmind/sauerkrautlm:latest', 'dolphin-mixtral:latest', 'llama3.2:latest', 'hf.co/TheBloke/SOLAR-10.7B-Instruct-v1.0-uncensored-GGUF:Q2_K', 'deepseek-r1:8b', 'huihui_ai/deepseek-r1-abliterated:latest', 'huihui_ai/deepseek-r1-abliterated:8b']


# Model tester pipeline

In [None]:
def annotator(model, Corpus_df, Model_metrics_df, start_index=0, end_index=None, batch_size=100, output_file="./CSV/corpus_chunks.csv", topic = "Abortion"):
    """Annotate the rows of ``Corpus_df`` with one model and record timing metrics.

    For every index label in ``range(start_index, end_index)`` the text in the
    "chunk" column is classified and the label is written into a column named
    after ``model``. The frame is modified in place and written to
    ``output_file`` after each batch.

    Args:
        model: Model name; passed to the classifier and used as the name of
            the annotation column.
        Corpus_df: DataFrame with a "chunk" column. Rows are looked up by
            index label, so the index is expected to hold 0..n-1.
        Model_metrics_df: DataFrame that receives one new row with the keys
            'Model', 'Speed' and 'Time_taken'; modified in place.
        start_index: First index label to annotate.
        end_index: One past the last index label; defaults to ``len(Corpus_df)``.
        batch_size: Number of rows between two CSV checkpoints.
        output_file: CSV path the frame is saved to after each batch.
        topic: "Abortion" or "Economy"; selects the few-shot classifier.

    Returns:
        ``Corpus_df`` itself (the same object, now carrying the annotations).

    Raises:
        ValueError: If ``topic`` is not one of the supported topics.
    """
    # Resolve the classifier before any work is done: an unsupported topic is a
    # caller error and should not cost one sleep per row before it is noticed.
    if topic == "Abortion":
        classify = classify_abortion_fewshot
    elif topic == "Economy":
        classify = classify_economy_fewshot
    else:
        raise ValueError(f"Unknown topic: {topic}")

    start_time = time.time()  # wall-clock start of the whole annotation run

    if end_index is None:
        end_index = len(Corpus_df)

    # The column holds string labels and None, so keep it as object dtype; this
    # also covers resuming on a column that was re-read from CSV as numeric.
    if model in Corpus_df.columns:
        Corpus_df[model] = Corpus_df[model].astype(object)
    else:
        Corpus_df[model] = None

    attempted_rows = 0
    for batch_start in range(start_index, end_index, batch_size):
        batch_end = min(batch_start + batch_size, end_index)
        for idx in range(batch_start, batch_end):
            row_mask = Corpus_df.index == idx
            try:
                chunk = Corpus_df.loc[row_mask, "chunk"].values[0]
                Corpus_df.loc[row_mask, model] = classify(chunk, model)
            except Exception as e:
                # Best effort: a failing chunk is logged and left unlabelled so
                # the remaining rows are still processed.
                print(f"Error annotating chunk at index {idx}: {e}")
                Corpus_df.loc[row_mask, model] = None
            attempted_rows += 1
            time.sleep(1)
        # Save after each batch
        Corpus_df.to_csv(output_file, index=False)
        print(f"Saved batch {batch_start}-{batch_end} to {output_file}")

    # Elapsed time covers the complete run, including the one-second pauses.
    elapsed_time = time.time() - start_time
    print(f"Elapsed time for annotation: {elapsed_time:.2f} seconds")
    # Speed is based on the rows actually processed, which differs from
    # len(Corpus_df) whenever only a sub-range was annotated.
    model_speed = attempted_rows / elapsed_time if elapsed_time > 0 else float("nan")
    Model_metrics_df.loc[len(Model_metrics_df)] = {'Model': model, 'Speed': model_speed, 'Time_taken': elapsed_time}
    print(Model_metrics_df)
    return Corpus_df

In [None]:
model_list = ["phi:2.7b", "gemma3:4b", "qwen2.5:7b", "mistral:7b", "stablelm2:12b","llama3.2:latest"]

# Run every model over the same sample. annotator() writes each model's labels
# into Corpus_chunkified in place, in a column named after the model.
for model in model_list:
    annotator(model, Corpus_chunkified, start_index=0, topic="Abortion", Model_metrics_df=Model_metrics_df)

# Take the snapshot once, after all models have run: copying inside the loop
# only repeated the work and left the name undefined for an empty model_list.
Corpus_chunkified_annotated = Corpus_chunkified.copy()
Corpus_chunkified_annotated.to_csv("./CSV/Corpus_abortion_sample3_chunked_annotated.csv", index=False)

## Finishing touches

In [None]:
# Mean time per individual annotation for each model run. The column assignment
# is vectorised, so no loop over the rows is needed, and the divisor is the
# size of the annotated corpus rather than a hard-coded 150.
Model_metrics_df['Time_per_annotation'] = Model_metrics_df['Time_taken'] / len(Corpus_chunkified)

# Working copy that cleanup() writes the normalised annotation columns into.
Corpus_df_annotated_cleaned = Corpus_chunkified.copy()

def cleanup(model, Corpus_df):
    """Normalise one model's annotation column to integer 0/1 labels.

    Every value of ``Corpus_df[model]`` is converted to text; it becomes 1 if
    that text contains the character '1' and 0 otherwise (this includes '0',
    missing values and unparseable replies). The result is stored in the
    module-level ``Corpus_df_annotated_cleaned`` frame, which must already
    exist. A column whose dtype already equals ``int`` is left untouched.

    Args:
        model: Name of the annotation column to clean.
        Corpus_df: DataFrame holding the raw annotation column.

    Returns:
        The module-level ``Corpus_df_annotated_cleaned`` DataFrame.
    """
    if Corpus_df[model].dtype != int:
        labels = Corpus_df[model].map(lambda value: 1 if '1' in str(value) else 0)
        # Always cast to int: previously the cast only ran when unparseable
        # values were present, so clean columns kept the strings '1'/'0'.
        Corpus_df_annotated_cleaned[model] = labels.astype(int)
    return Corpus_df_annotated_cleaned

# Normalise every model column so the evaluation metrics can be computed
# (scikit-learn is expected to work on integer labels).
# The first five columns are metadata; everything after them is an annotation column.
annotation_columns = Corpus_chunkified.columns[5:]
for model_name in annotation_columns:
    Corpus_df_annotated_cleaned = cleanup(model_name, Corpus_chunkified)

## Saving the results after each task

In [None]:
# Write the timing metrics and the cleaned annotations to disk, without the index column.
for frame, csv_path in (
    (Model_metrics_df, "./CSV/Model_metrics_econ_sample2.csv"),
    (Corpus_df_annotated_cleaned, "./CSV/Corpus_Chunked_annotated_cleaned.csv"),
):
    frame.to_csv(csv_path, index=False)

# For the final corpus

In [None]:
# Annotate with the selected model. The labels are written into
# Corpus_chunkified in place, so the annotated frame is taken from that
# variable instead of depending on annotator()'s return value.
annotator("qwen2.5:7b", Corpus_chunkified, start_index=0, topic="Abortion", Model_metrics_df=Model_metrics_df)
Corpus_chunkified_adjusted_full = Corpus_chunkified

# cleanup() returns the module-level Corpus_df_annotated_cleaned frame with the
# normalised column for this model; that frame is what gets saved.
Corpus_chunkified_adjusted_clean = cleanup("qwen2.5:7b", Corpus_chunkified_adjusted_full)
Corpus_chunkified_adjusted_clean.to_csv("./CSV/Corpus_Chunkified_adjusted_full.csv", index=False)