<a href="https://colab.research.google.com/github/its3alih/Thesis/blob/main/ZeroShots.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## FIRST

In [None]:
# Quietly (-q) install transformers, datasets, seqeval and openpyxl into the runtime.
!pip install -q transformers datasets seqeval openpyxl


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [None]:
# Install the llama.cpp Python bindings (imported in later cells as `llama_cpp`).
!pip install llama-cpp-python --quiet


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.9/67.9 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone


In [None]:
pip install transformers pandas




In [None]:
# Download the Q4_K_M-quantized TinyLlama 1.1B Chat GGUF file and save it (-O) as
# tinyllama.gguf, the model_path used by the Llama(...) loaders in the cells below.
!wget -q https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf -O tinyllama.gguf


In [None]:
# STEP 1: Install required packages
!pip install llama-cpp-python wandb pandas scikit-learn openpyxl --quiet

# STEP 2: Import libraries
from llama_cpp import Llama
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import wandb

# STEP 3: Initialize wandb
wandb.init(project="arabic-ner-bies", name="zero-shot-mistral", reinit=True)

# STEP 4: Load model (adjust model path as needed)
llm = Llama(
    model_path="tinyllama.gguf",
    n_ctx=2048,
    n_gpu_layers=0,  # Set to >0 if you're using Colab Pro with T4/A100
    verbose=False
)

# STEP 5: Load BIES-tagged Excel file
df = pd.read_excel("/content/BIES.xlsx")

# Filter valid BIES tags
valid_tags = ["B", "I", "E", "S"]
df = df[df["Word i entity tag"].isin(valid_tags)]

columns = [
    "Word i", "Word i entity tag", "Word i POS", "Stopword",
    "Word i Gazetteers", "Word i Lexical marker", "Word i definiteness"
]
df = df[columns].reset_index(drop=True)

# STEP 6: Define prompt
def make_prompt(row):
    """Build the zero-shot BIES tagging prompt for a single dataset row.

    Args:
        row: Mapping-like object (e.g. a pandas Series from ``iterrows()``)
            indexable by the feature column names used below.

    Returns:
        The prompt text, ending with ``"Entity Tag:"`` so the model's
        completion is the predicted tag.
    """
    prompt_lines = [
        "You are a medical NLP expert.",
        "Classify the word below into one of these entity tags:",
        "B = Beginning of an entity",
        "I = Inside an entity",
        "E = End of an entity",
        "S = Single-word entity",
        "",  # blank line separating the tag legend from the word features
        f"Word: {row['Word i']}",
        f"POS: {row['Word i POS']}",
        f"Stopword: {row['Stopword']}",
        f"Gazetteer: {row['Word i Gazetteers']}",
        f"Lexical marker: {row['Word i Lexical marker']}",
        f"Definiteness: {row['Word i definiteness']}",
        "Entity Tag:",
    ]
    return "\n".join(prompt_lines)

# STEP 7: Inference and prediction
n_test = 10  # You can increase this
test_rows = df.iloc[:n_test]
prompts = [make_prompt(row) for _, row in test_rows.iterrows()]
true_labels = test_rows["Word i entity tag"].tolist()

predicted_labels = []
wandb_table = wandb.Table(columns=["Index", "Word", "True Label", "Predicted Label", "Prompt", "Model Output"])

# Walk the word and gold-tag columns by name instead of relying on the positional
# aliases (_1, _2) that itertuples() assigns to column names containing spaces.
for idx, (prompt, word, true_tag) in enumerate(zip(prompts, test_rows["Word i"], true_labels)):
    response = llm(prompt, max_tokens=10)
    text = response["choices"][0]["text"].strip()

    # The completion can be empty or whitespace-only; text.split()[0] would then
    # raise IndexError and abort the run, so treat it as "no tag produced".
    tokens = text.split()
    tag = tokens[0].upper() if tokens else ""

    # Unrecognised output falls back to "O". NOTE: "O" is not one of the BIES
    # labels in valid_tags, so it appears as an extra class in the metrics.
    prediction = tag if tag in valid_tags else "O"
    predicted_labels.append(prediction)

    # Log prediction to wandb table
    wandb_table.add_data(idx, word, true_tag, prediction, prompt, text)

# STEP 8: Log predictions to wandb
wandb.log({"predictions_table": wandb_table})

# STEP 9: Evaluation
# zero_division=0 scores an undefined precision/recall/F1 as 0 -- the same value the
# scikit-learn default ("warn") yields -- without emitting an UndefinedMetricWarning
# for every label that has no true or no predicted samples.
print("\n📊 Classification Report:\n")
print(classification_report(true_labels, predicted_labels, digits=3, zero_division=0))

# Compute summary metrics
acc = accuracy_score(true_labels, predicted_labels)
macro_p = precision_score(true_labels, predicted_labels, average="macro", zero_division=0)
macro_r = recall_score(true_labels, predicted_labels, average="macro", zero_division=0)
macro_f1 = f1_score(true_labels, predicted_labels, average="macro", zero_division=0)
weighted_f1 = f1_score(true_labels, predicted_labels, average="weighted", zero_division=0)
micro_f1 = f1_score(true_labels, predicted_labels, average="micro", zero_division=0)

print("\n📈 Evaluation Summary:")
print(f"Accuracy:          {acc:.4f}")
print(f"Macro Precision:   {macro_p:.4f}")
print(f"Macro Recall:      {macro_r:.4f}")
print(f"Macro F1 Score:    {macro_f1:.4f}")
print(f"Weighted F1 Score: {weighted_f1:.4f}")
print(f"Micro F1 Score:    {micro_f1:.4f}")

# STEP 10: Log metrics to wandb
wandb.log({
    "accuracy": acc,
    "macro_precision": macro_p,
    "macro_recall": macro_r,
    "macro_f1": macro_f1,
    "weighted_f1": weighted_f1,
    "micro_f1": micro_f1
})

# STEP 11: Finish wandb run
wandb.finish()



📊 Classification Report:

              precision    recall  f1-score   support

           B      0.000     0.000     0.000         3
           E      0.250     0.500     0.333         2
           I      0.500     0.500     0.500         2
           O      0.000     0.000     0.000         0
           S      0.000     0.000     0.000         3

    accuracy                          0.200        10
   macro avg      0.150     0.200     0.167        10
weighted avg      0.150     0.200     0.167        10


📈 Evaluation Summary:
Accuracy:          0.2000
Macro Precision:   0.1500
Macro Recall:      0.2000
Macro F1 Score:    0.1667
Weighted F1 Score: 0.1667
Micro F1 Score:    0.2000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0,1
accuracy,▁
macro_f1,▁
macro_precision,▁
macro_recall,▁
micro_f1,▁
weighted_f1,▁

0,1
accuracy,0.2
macro_f1,0.16667
macro_precision,0.15
macro_recall,0.2
micro_f1,0.2
weighted_f1,0.16667


In [None]:
# STEP 1: Install required packages
!pip install llama-cpp-python wandb pandas scikit-learn openpyxl --quiet

# STEP 2: Import libraries
from llama_cpp import Llama
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import wandb

# STEP 3: Initialize wandb
wandb.init(project="arabic-ner-bi", name="zero-shot-tinyllama-bi", reinit=True)

# STEP 4: Load model (adjust model path as needed)
llm = Llama(
    model_path="tinyllama.gguf",  # Make sure this file exists in your runtime
    n_ctx=2048,
    n_gpu_layers=0,  # >0 for faster inference with GPU
    verbose=False
)

# STEP 5: Load BI-tagged Excel file
df = pd.read_excel("/content/BI.xlsx")  # Change to the correct path if needed

# Keep only valid BI tags
valid_tags = ["B", "I", "O"]
df = df[df["Word i entity tag"].isin(valid_tags)]

# Keep necessary columns
columns = [
    "Word i", "Word i entity tag", "Word i POS", "Stopword",
    "Word i Gazetteers", "Word i Lexical marker", "Word i definiteness"
]
df = df[columns].reset_index(drop=True)

# STEP 6: Define the prompt function
def make_prompt(row):
    """Build the zero-shot BI/O tagging prompt for a single dataset row.

    Args:
        row: Mapping-like object (e.g. a pandas Series from ``iterrows()``)
            indexable by the feature column names used below.

    Returns:
        The prompt text, ending with ``"Entity Tag:"`` so the model's
        completion is the predicted tag.
    """
    instructions = (
        "You are a medical NLP expert.\n"
        "Classify the word below into one of these entity tags:\n"
        "B = Beginning of an entity\n"
        "I = Inside of an entity\n"
        "O = Outside (not part of an entity)\n\n"
    )
    # (label shown in the prompt, source column in the row)
    labelled_columns = (
        ("Word", "Word i"),
        ("POS", "Word i POS"),
        ("Stopword", "Stopword"),
        ("Gazetteer", "Word i Gazetteers"),
        ("Lexical marker", "Word i Lexical marker"),
        ("Definiteness", "Word i definiteness"),
    )
    feature_lines = "".join(f"{label}: {row[column]}\n" for label, column in labelled_columns)
    return instructions + feature_lines + "Entity Tag:"

# STEP 7: Run inference and collect predictions
n_test = 20  # You can increase this
test_rows = df.iloc[:n_test]
prompts = [make_prompt(row) for _, row in test_rows.iterrows()]
true_labels = test_rows["Word i entity tag"].tolist()

predicted_labels = []
wandb_table = wandb.Table(columns=["Index", "Word", "True Label", "Predicted Label", "Prompt", "Model Output"])

# Walk the word and gold-tag columns by name instead of relying on the positional
# aliases (_1, _2) that itertuples() assigns to column names containing spaces.
for idx, (prompt, word, true_tag) in enumerate(zip(prompts, test_rows["Word i"], true_labels)):
    response = llm(prompt, max_tokens=10)
    text = response["choices"][0]["text"].strip()

    # The completion can be empty or whitespace-only; text.split()[0] would then
    # raise IndexError and abort the run, so treat it as "no tag produced".
    tokens = text.split()
    tag = tokens[0].upper() if tokens else ""

    # Unrecognised (or missing) output falls back to "O".
    prediction = tag if tag in valid_tags else "O"
    predicted_labels.append(prediction)

    # Log prediction to wandb table
    wandb_table.add_data(idx, word, true_tag, prediction, prompt, text)

# STEP 8: Log predictions to wandb
wandb.log({"predictions_table": wandb_table})

# STEP 9: Evaluation
# zero_division=0 scores an undefined precision/recall/F1 as 0 -- the same value the
# scikit-learn default ("warn") yields -- without emitting an UndefinedMetricWarning
# for every label that has no true or no predicted samples.
print("\n📊 Classification Report:\n")
print(classification_report(true_labels, predicted_labels, digits=3, zero_division=0))

# Compute summary metrics
acc = accuracy_score(true_labels, predicted_labels)
macro_p = precision_score(true_labels, predicted_labels, average="macro", zero_division=0)
macro_r = recall_score(true_labels, predicted_labels, average="macro", zero_division=0)
macro_f1 = f1_score(true_labels, predicted_labels, average="macro", zero_division=0)
weighted_f1 = f1_score(true_labels, predicted_labels, average="weighted", zero_division=0)
micro_f1 = f1_score(true_labels, predicted_labels, average="micro", zero_division=0)

print("\n📈 Evaluation Summary:")
print(f"Accuracy:          {acc:.4f}")
print(f"Macro Precision:   {macro_p:.4f}")
print(f"Macro Recall:      {macro_r:.4f}")
print(f"Macro F1 Score:    {macro_f1:.4f}")
print(f"Weighted F1 Score: {weighted_f1:.4f}")
print(f"Micro F1 Score:    {micro_f1:.4f}")

# Log metrics to wandb
wandb.log({
    "accuracy": acc,
    "macro_precision": macro_p,
    "macro_recall": macro_r,
    "macro_f1": macro_f1,
    "weighted_f1": weighted_f1,
    "micro_f1": micro_f1
})

# STEP 10: Finish wandb run
wandb.finish()



📊 Classification Report:

              precision    recall  f1-score   support

           B      0.333     0.111     0.167         9
           I      0.600     0.273     0.375        11
           O      0.000     0.000     0.000         0

    accuracy                          0.200        20
   macro avg      0.311     0.128     0.181        20
weighted avg      0.480     0.200     0.281        20


📈 Evaluation Summary:
Accuracy:          0.2000
Macro Precision:   0.3111
Macro Recall:      0.1279
Macro F1 Score:    0.1806
Weighted F1 Score: 0.2812
Micro F1 Score:    0.2000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0,1
accuracy,▁
macro_f1,▁
macro_precision,▁
macro_recall,▁
micro_f1,▁
weighted_f1,▁

0,1
accuracy,0.2
macro_f1,0.18056
macro_precision,0.31111
macro_recall,0.12795
micro_f1,0.2
weighted_f1,0.28125


In [None]:
# STEP 1: Install required packages
!pip install llama-cpp-python wandb pandas scikit-learn openpyxl --quiet

# STEP 2: Import libraries
from llama_cpp import Llama
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import wandb

# STEP 3: Initialize wandb
wandb.init(project="arabic-ner-ie", name="zero-shot-tinyllama-ie", reinit=True)

# STEP 4: Load model (adjust model path as needed)
llm = Llama(
    model_path="tinyllama.gguf",  # Ensure this file exists in your runtime
    n_ctx=2048,
    n_gpu_layers=0,  # Set to >0 if using GPU (e.g., Colab Pro)
    verbose=False
)

# STEP 5: Load IE-tagged Excel file
df = pd.read_excel("/content/IE.xlsx")  # Replace with the correct path if needed

# Keep only valid IE tags
valid_tags = ["I", "E"]
df = df[df["Word i entity tag"].isin(valid_tags)]

# Select necessary columns
columns = [
    "Word i", "Word i entity tag", "Word i POS", "Stopword",
    "Word i Gazetteers", "Word i Lexical marker", "Word i definiteness"
]
df = df[columns].reset_index(drop=True)

# STEP 6: Define the prompt function
def make_prompt(row):
    """Build the zero-shot I/E tagging prompt for a single dataset row.

    Args:
        row: Mapping-like object (e.g. a pandas Series from ``iterrows()``)
            indexable by the feature column names used below.

    Returns:
        The prompt text, ending with ``"Entity Tag:"`` so the model's
        completion is the predicted tag.
    """
    prompt_lines = [
        "You are a medical NLP expert.",
        "Classify the word below into one of these entity tags:",
        "I = Inside of an entity",
        "E = End of an entity",
        "",  # blank line separating the tag legend from the word features
        f"Word: {row['Word i']}",
        f"POS: {row['Word i POS']}",
        f"Stopword: {row['Stopword']}",
        f"Gazetteer: {row['Word i Gazetteers']}",
        f"Lexical marker: {row['Word i Lexical marker']}",
        f"Definiteness: {row['Word i definiteness']}",
        "Entity Tag:",
    ]
    return "\n".join(prompt_lines)

# STEP 7: Run inference and collect predictions
n_test = 20  # Adjust as needed
test_rows = df.iloc[:n_test]
prompts = [make_prompt(row) for _, row in test_rows.iterrows()]
true_labels = test_rows["Word i entity tag"].tolist()

predicted_labels = []
wandb_table = wandb.Table(columns=["Index", "Word", "True Label", "Predicted Label", "Prompt", "Model Output"])

# Walk the word and gold-tag columns by name instead of relying on the positional
# aliases (_1, _2) that itertuples() assigns to column names containing spaces.
for idx, (prompt, word, true_tag) in enumerate(zip(prompts, test_rows["Word i"], true_labels)):
    response = llm(prompt, max_tokens=10)
    text = response["choices"][0]["text"].strip()

    # The completion can be empty or whitespace-only; text.split()[0] would then
    # raise IndexError and abort the run, so treat it as "no tag produced".
    tokens = text.split()
    tag = tokens[0].upper() if tokens else ""

    # Unrecognised output falls back to "O". NOTE: "O" is not one of the I/E
    # labels in valid_tags, so it appears as an extra class in the metrics.
    prediction = tag if tag in valid_tags else "O"
    predicted_labels.append(prediction)

    # Log prediction to wandb table
    wandb_table.add_data(idx, word, true_tag, prediction, prompt, text)

# STEP 8: Log predictions to wandb
wandb.log({"predictions_table": wandb_table})

# STEP 9: Evaluation
# zero_division=0 scores an undefined precision/recall/F1 as 0 -- the same value the
# scikit-learn default ("warn") yields -- without emitting an UndefinedMetricWarning
# for every label that has no true or no predicted samples.
print("\n📊 Classification Report:\n")
print(classification_report(true_labels, predicted_labels, digits=3, zero_division=0))

# Compute summary metrics
acc = accuracy_score(true_labels, predicted_labels)
macro_p = precision_score(true_labels, predicted_labels, average="macro", zero_division=0)
macro_r = recall_score(true_labels, predicted_labels, average="macro", zero_division=0)
macro_f1 = f1_score(true_labels, predicted_labels, average="macro", zero_division=0)
weighted_f1 = f1_score(true_labels, predicted_labels, average="weighted", zero_division=0)
micro_f1 = f1_score(true_labels, predicted_labels, average="micro", zero_division=0)

print("\n📈 Evaluation Summary:")
print(f"Accuracy:          {acc:.4f}")
print(f"Macro Precision:   {macro_p:.4f}")
print(f"Macro Recall:      {macro_r:.4f}")
print(f"Macro F1 Score:    {macro_f1:.4f}")
print(f"Weighted F1 Score: {weighted_f1:.4f}")
print(f"Micro F1 Score:    {micro_f1:.4f}")

# Log metrics to wandb
wandb.log({
    "accuracy": acc,
    "macro_precision": macro_p,
    "macro_recall": macro_r,
    "macro_f1": macro_f1,
    "weighted_f1": weighted_f1,
    "micro_f1": micro_f1
})

# STEP 10: Finish wandb run
wandb.finish()



📊 Classification Report:

              precision    recall  f1-score   support

           E      0.250     0.250     0.250         8
           I      0.500     0.083     0.143        12
           O      0.000     0.000     0.000         0

    accuracy                          0.150        20
   macro avg      0.250     0.111     0.131        20
weighted avg      0.400     0.150     0.186        20


📈 Evaluation Summary:
Accuracy:          0.1500
Macro Precision:   0.2500
Macro Recall:      0.1111
Macro F1 Score:    0.1310
Weighted F1 Score: 0.1857
Micro F1 Score:    0.1500


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0,1
accuracy,▁
macro_f1,▁
macro_precision,▁
macro_recall,▁
micro_f1,▁
weighted_f1,▁

0,1
accuracy,0.15
macro_f1,0.13095
macro_precision,0.25
macro_recall,0.11111
micro_f1,0.15
weighted_f1,0.18571


In [None]:
# STEP 1: Install required packages
!pip install llama-cpp-python wandb pandas scikit-learn openpyxl --quiet

# STEP 2: Import libraries
from llama_cpp import Llama
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import wandb

# STEP 3: Initialize wandb
wandb.init(project="arabic-ner-iob", name="zero-shot-tinyllama-iob", reinit=True)

# STEP 4: Load model (adjust model path as needed)
llm = Llama(
    model_path="tinyllama.gguf",  # Path to your .gguf file
    n_ctx=2048,
    n_gpu_layers=0,  # Set >0 if using GPU (Colab Pro, etc.)
    verbose=False
)

# STEP 5: Load IOB-tagged Excel file
df = pd.read_excel("/content/IOB.xlsx")  # Update this path as needed

# Keep only valid IOB tags
valid_tags = ["I", "O", "B"]
df = df[df["Word i entity tag"].isin(valid_tags)]

# Select the necessary columns
columns = [
    "Word i", "Word i entity tag", "Word i POS", "Stopword",
    "Word i Gazetteers", "Word i Lexical marker", "Word i definiteness"
]
df = df[columns].reset_index(drop=True)

# STEP 6: Define the prompt template
def make_prompt(row):
    """Build the zero-shot IOB tagging prompt for a single dataset row.

    Args:
        row: Mapping-like object (e.g. a pandas Series from ``iterrows()``)
            indexable by the feature column names used below.

    Returns:
        The prompt text, ending with ``"Entity Tag:"`` so the model's
        completion is the predicted tag.
    """
    instructions = (
        "You are a medical NLP expert.\n"
        "Classify the word below into one of these entity tags:\n"
        "B = Beginning of an entity\n"
        "I = Inside of an entity\n"
        "O = Outside any entity\n\n"
    )
    # (label shown in the prompt, source column in the row)
    labelled_columns = (
        ("Word", "Word i"),
        ("POS", "Word i POS"),
        ("Stopword", "Stopword"),
        ("Gazetteer", "Word i Gazetteers"),
        ("Lexical marker", "Word i Lexical marker"),
        ("Definiteness", "Word i definiteness"),
    )
    feature_lines = "".join(f"{label}: {row[column]}\n" for label, column in labelled_columns)
    return instructions + feature_lines + "Entity Tag:"

# STEP 7: Inference
n_test = 20  # Adjust as needed
test_rows = df.iloc[:n_test]
prompts = [make_prompt(row) for _, row in test_rows.iterrows()]
true_labels = test_rows["Word i entity tag"].tolist()

predicted_labels = []
wandb_table = wandb.Table(columns=["Index", "Word", "True Label", "Predicted Label", "Prompt", "Model Output"])

# Walk the word and gold-tag columns by name instead of relying on the positional
# aliases (_1, _2) that itertuples() assigns to column names containing spaces.
for idx, (prompt, word, true_tag) in enumerate(zip(prompts, test_rows["Word i"], true_labels)):
    response = llm(prompt, max_tokens=10)
    text = response["choices"][0]["text"].strip()

    # The completion can be empty or whitespace-only; text.split()[0] would then
    # raise IndexError and abort the run, so treat it as "no tag produced".
    tokens = text.split()
    tag = tokens[0].upper() if tokens else ""

    # Unrecognised (or missing) output falls back to "O".
    prediction = tag if tag in valid_tags else "O"
    predicted_labels.append(prediction)

    # Log to wandb
    wandb_table.add_data(idx, word, true_tag, prediction, prompt, text)

# STEP 8: Log predictions
wandb.log({"predictions_table": wandb_table})

# STEP 9: Evaluation
# zero_division=0 scores an undefined precision/recall/F1 as 0 -- the same value the
# scikit-learn default ("warn") yields -- without emitting an UndefinedMetricWarning
# when a label has no true or no predicted samples.
print("\n📊 Classification Report:\n")
print(classification_report(true_labels, predicted_labels, digits=3, zero_division=0))

# Compute summary metrics
acc = accuracy_score(true_labels, predicted_labels)
macro_p = precision_score(true_labels, predicted_labels, average="macro", zero_division=0)
macro_r = recall_score(true_labels, predicted_labels, average="macro", zero_division=0)
macro_f1 = f1_score(true_labels, predicted_labels, average="macro", zero_division=0)
weighted_f1 = f1_score(true_labels, predicted_labels, average="weighted", zero_division=0)
micro_f1 = f1_score(true_labels, predicted_labels, average="micro", zero_division=0)

print("\n📈 Evaluation Summary:")
print(f"Accuracy:          {acc:.4f}")
print(f"Macro Precision:   {macro_p:.4f}")
print(f"Macro Recall:      {macro_r:.4f}")
print(f"Macro F1 Score:    {macro_f1:.4f}")
print(f"Weighted F1 Score: {weighted_f1:.4f}")
print(f"Micro F1 Score:    {micro_f1:.4f}")

# Log metrics to wandb
wandb.log({
    "accuracy": acc,
    "macro_precision": macro_p,
    "macro_recall": macro_r,
    "macro_f1": macro_f1,
    "weighted_f1": weighted_f1,
    "micro_f1": micro_f1
})

# STEP 10: Finish wandb run
wandb.finish()



📊 Classification Report:

              precision    recall  f1-score   support

           B      0.000     0.000     0.000         3
           I      0.400     0.500     0.444         4
           O      0.692     0.692     0.692        13

    accuracy                          0.550        20
   macro avg      0.364     0.397     0.379        20
weighted avg      0.530     0.550     0.539        20


📈 Evaluation Summary:
Accuracy:          0.5500
Macro Precision:   0.3641
Macro Recall:      0.3974
Macro F1 Score:    0.3789
Weighted F1 Score: 0.5389
Micro F1 Score:    0.5500


0,1
accuracy,▁
macro_f1,▁
macro_precision,▁
macro_recall,▁
micro_f1,▁
weighted_f1,▁

0,1
accuracy,0.55
macro_f1,0.37892
macro_precision,0.3641
macro_recall,0.39744
micro_f1,0.55
weighted_f1,0.53889


In [None]:
# STEP 1: Install required packages
!pip install llama-cpp-python wandb pandas scikit-learn openpyxl --quiet

# STEP 2: Import libraries
from llama_cpp import Llama
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import wandb

# STEP 3: Initialize wandb
wandb.init(project="arabic-ner-iobes", name="zero-shot-tinyllama-iobes", reinit=True)

# STEP 4: Load model (adjust model path as needed)
llm = Llama(
    model_path="tinyllama.gguf",  # Path to your local GGUF model
    n_ctx=2048,
    n_gpu_layers=0,  # Use >0 if you have GPU
    verbose=False
)

# STEP 5: Load IOBES-tagged Excel file
df = pd.read_excel("/content/IOBES.xlsx")  # Update this if your file path is different

# Valid IOBES tags
valid_tags = ["I", "O", "B", "E", "S"]
df = df[df["Word i entity tag"].isin(valid_tags)]

# Keep necessary columns
columns = [
    "Word i", "Word i entity tag", "Word i POS", "Stopword",
    "Word i Gazetteers", "Word i Lexical marker", "Word i definiteness"
]
df = df[columns].reset_index(drop=True)

# STEP 6: Define prompt template
def make_prompt(row):
    """Build the zero-shot IOBES tagging prompt for a single dataset row.

    Args:
        row: Mapping-like object (e.g. a pandas Series from ``iterrows()``)
            indexable by the feature column names used below.

    Returns:
        The prompt text, ending with ``"Entity Tag:"`` so the model's
        completion is the predicted tag.
    """
    prompt_lines = [
        "You are a medical NLP expert.",
        "Classify the following word using one of these entity tags:",
        "B = Beginning of an entity",
        "I = Inside an entity",
        "O = Outside any entity",
        "E = End of a multi-word entity",
        "S = Single-word entity",
        "",  # blank line separating the tag legend from the word features
        f"Word: {row['Word i']}",
        f"POS: {row['Word i POS']}",
        f"Stopword: {row['Stopword']}",
        f"Gazetteer: {row['Word i Gazetteers']}",
        f"Lexical marker: {row['Word i Lexical marker']}",
        f"Definiteness: {row['Word i definiteness']}",
        "Entity Tag:",
    ]
    return "\n".join(prompt_lines)

# STEP 7: Inference
n_test = 20  # Adjust for more or less test samples
test_rows = df.iloc[:n_test]
prompts = [make_prompt(row) for _, row in test_rows.iterrows()]
true_labels = test_rows["Word i entity tag"].tolist()

predicted_labels = []
wandb_table = wandb.Table(columns=["Index", "Word", "True Label", "Predicted Label", "Prompt", "Model Output"])

# Walk the word and gold-tag columns by name instead of relying on the positional
# aliases (_1, _2) that itertuples() assigns to column names containing spaces.
for idx, (prompt, word, true_tag) in enumerate(zip(prompts, test_rows["Word i"], true_labels)):
    response = llm(prompt, max_tokens=10)
    text = response["choices"][0]["text"].strip()

    # The completion can be empty or whitespace-only; text.split()[0] would then
    # raise IndexError and abort the run, so treat it as "no tag produced".
    tokens = text.split()
    tag = tokens[0].upper() if tokens else ""

    # Unrecognised (or missing) output falls back to "O".
    prediction = tag if tag in valid_tags else "O"
    predicted_labels.append(prediction)

    # Log to wandb
    wandb_table.add_data(idx, word, true_tag, prediction, prompt, text)

# STEP 8: Log predictions to wandb
wandb.log({"predictions_table": wandb_table})

# STEP 9: Evaluation
# zero_division=0 scores an undefined precision/recall/F1 as 0 -- the same value the
# scikit-learn default ("warn") yields -- without emitting an UndefinedMetricWarning
# for every label that has no true or no predicted samples.
print("\n📊 Classification Report:\n")
print(classification_report(true_labels, predicted_labels, digits=3, zero_division=0))

# Metrics
acc = accuracy_score(true_labels, predicted_labels)
macro_p = precision_score(true_labels, predicted_labels, average="macro", zero_division=0)
macro_r = recall_score(true_labels, predicted_labels, average="macro", zero_division=0)
macro_f1 = f1_score(true_labels, predicted_labels, average="macro", zero_division=0)
weighted_f1 = f1_score(true_labels, predicted_labels, average="weighted", zero_division=0)
micro_f1 = f1_score(true_labels, predicted_labels, average="micro", zero_division=0)

print("\n📈 Evaluation Summary:")
print(f"Accuracy:          {acc:.4f}")
print(f"Macro Precision:   {macro_p:.4f}")
print(f"Macro Recall:      {macro_r:.4f}")
print(f"Macro F1 Score:    {macro_f1:.4f}")
print(f"Weighted F1 Score: {weighted_f1:.4f}")
print(f"Micro F1 Score:    {micro_f1:.4f}")

# STEP 10: Log metrics
wandb.log({
    "accuracy": acc,
    "macro_precision": macro_p,
    "macro_recall": macro_r,
    "macro_f1": macro_f1,
    "weighted_f1": weighted_f1,
    "micro_f1": micro_f1
})

# STEP 11: Finish wandb run
wandb.finish()



📊 Classification Report:

              precision    recall  f1-score   support

           B      1.000     0.500     0.667         2
           E      0.250     0.500     0.333         2
           I      0.333     0.500     0.400         2
           O      0.667     0.615     0.640        13
           S      0.000     0.000     0.000         1

    accuracy                          0.550        20
   macro avg      0.450     0.423     0.408        20
weighted avg      0.592     0.550     0.556        20


📈 Evaluation Summary:
Accuracy:          0.5500
Macro Precision:   0.4500
Macro Recall:      0.4231
Macro F1 Score:    0.4080
Weighted F1 Score: 0.5560
Micro F1 Score:    0.5500


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0,1
accuracy,▁
macro_f1,▁
macro_precision,▁
macro_recall,▁
micro_f1,▁
weighted_f1,▁

0,1
accuracy,0.55
macro_f1,0.408
macro_precision,0.45
macro_recall,0.42308
micro_f1,0.55
weighted_f1,0.556


In [None]:
# STEP 1: Install required packages
!pip install llama-cpp-python wandb pandas scikit-learn openpyxl --quiet

# STEP 2: Import libraries
from llama_cpp import Llama
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import wandb

# STEP 3: Initialize wandb
wandb.init(project="arabic-ner-ioe", name="zero-shot-tinyllama-ioe", reinit=True)

# STEP 4: Load model
llm = Llama(
    model_path="tinyllama.gguf",  # Local GGUF model
    n_ctx=2048,
    n_gpu_layers=0,
    verbose=False
)

# STEP 5: Load IOE-tagged dataset
df = pd.read_excel("/content/IOE.xlsx")  # Replace with correct path

# Filter for valid IOE tags
valid_tags = ["I", "O", "E"]
df = df[df["Word i entity tag"].isin(valid_tags)]

# Select relevant features
columns = [
    "Word i", "Word i entity tag", "Word i POS", "Stopword",
    "Word i Gazetteers", "Word i Lexical marker", "Word i definiteness"
]
df = df[columns].reset_index(drop=True)

# STEP 6: Prompt generator
def make_prompt(row):
    """Build the zero-shot IOE tagging prompt for a single dataset row.

    Args:
        row: Mapping-like object (e.g. a pandas Series from ``iterrows()``)
            indexable by the feature column names used below.

    Returns:
        The prompt text, ending with ``"Entity Tag:"`` so the model's
        completion is the predicted tag.
    """
    instructions = (
        "You are a medical NLP expert.\n"
        "Classify the following word using one of these entity tags:\n"
        "I = Inside an entity\n"
        "O = Outside any entity\n"
        "E = End of an entity\n\n"
    )
    # (label shown in the prompt, source column in the row)
    labelled_columns = (
        ("Word", "Word i"),
        ("POS", "Word i POS"),
        ("Stopword", "Stopword"),
        ("Gazetteer", "Word i Gazetteers"),
        ("Lexical marker", "Word i Lexical marker"),
        ("Definiteness", "Word i definiteness"),
    )
    feature_lines = "".join(f"{label}: {row[column]}\n" for label, column in labelled_columns)
    return instructions + feature_lines + "Entity Tag:"

# STEP 7: Run inference
n_test = 20  # Sample size for testing
test_rows = df.iloc[:n_test]
prompts = [make_prompt(row) for _, row in test_rows.iterrows()]
true_labels = test_rows["Word i entity tag"].tolist()

predicted_labels = []
wandb_table = wandb.Table(columns=["Index", "Word", "True Label", "Predicted Label", "Prompt", "Model Output"])

# Walk the word and gold-tag columns by name instead of relying on the positional
# aliases (_1, _2) that itertuples() assigns to column names containing spaces.
for idx, (prompt, word, true_tag) in enumerate(zip(prompts, test_rows["Word i"], true_labels)):
    response = llm(prompt, max_tokens=10)
    text = response["choices"][0]["text"].strip()

    # The completion can be empty or whitespace-only; text.split()[0] would then
    # raise IndexError and abort the run, so treat it as "no tag produced".
    tokens = text.split()
    tag = tokens[0].upper() if tokens else ""

    # Unrecognised (or missing) output falls back to "O".
    prediction = tag if tag in valid_tags else "O"
    predicted_labels.append(prediction)

    # Log prediction
    wandb_table.add_data(idx, word, true_tag, prediction, prompt, text)

# STEP 8: Log predictions table
wandb.log({"predictions_table": wandb_table})

# STEP 9: Evaluate
# zero_division=0 scores an undefined precision/recall/F1 as 0 -- the same value the
# scikit-learn default ("warn") yields -- without emitting an UndefinedMetricWarning
# for every label that has no true or no predicted samples.
print("\n📊 Classification Report:\n")
print(classification_report(true_labels, predicted_labels, digits=3, zero_division=0))

# Calculate metrics
acc = accuracy_score(true_labels, predicted_labels)
macro_p = precision_score(true_labels, predicted_labels, average="macro", zero_division=0)
macro_r = recall_score(true_labels, predicted_labels, average="macro", zero_division=0)
macro_f1 = f1_score(true_labels, predicted_labels, average="macro", zero_division=0)
weighted_f1 = f1_score(true_labels, predicted_labels, average="weighted", zero_division=0)
micro_f1 = f1_score(true_labels, predicted_labels, average="micro", zero_division=0)

print("\n📈 Evaluation Summary:")
print(f"Accuracy:          {acc:.4f}")
print(f"Macro Precision:   {macro_p:.4f}")
print(f"Macro Recall:      {macro_r:.4f}")
print(f"Macro F1 Score:    {macro_f1:.4f}")
print(f"Weighted F1 Score: {weighted_f1:.4f}")
print(f"Micro F1 Score:    {micro_f1:.4f}")

# STEP 10: Log metrics
wandb.log({
    "accuracy": acc,
    "macro_precision": macro_p,
    "macro_recall": macro_r,
    "macro_f1": macro_f1,
    "weighted_f1": weighted_f1,
    "micro_f1": micro_f1
})

# STEP 11: Finish run
wandb.finish()



📊 Classification Report:

              precision    recall  f1-score   support

           E      0.000     0.000     0.000         3
           I      0.000     0.000     0.000         4
           O      0.688     0.846     0.759        13

    accuracy                          0.550        20
   macro avg      0.229     0.282     0.253        20
weighted avg      0.447     0.550     0.493        20


📈 Evaluation Summary:
Accuracy:          0.5500
Macro Precision:   0.2292
Macro Recall:      0.2821
Macro F1 Score:    0.2529
Weighted F1 Score: 0.4931
Micro F1 Score:    0.5500


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0,1
accuracy,▁
macro_f1,▁
macro_precision,▁
macro_recall,▁
micro_f1,▁
weighted_f1,▁

0,1
accuracy,0.55
macro_f1,0.25287
macro_precision,0.22917
macro_recall,0.28205
micro_f1,0.55
weighted_f1,0.4931


In [None]:
# STEP 1: Install required packages
!pip install llama-cpp-python wandb pandas scikit-learn openpyxl --quiet

# STEP 2: Import libraries
from llama_cpp import Llama
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import wandb

# STEP 3: Initialize wandb
wandb.init(project="arabic-ner-io", name="zero-shot-tinyllama-io", reinit=True)

# STEP 4: Load model
llm = Llama(
    model_path="tinyllama.gguf",  # Make sure this file exists in your Colab or local runtime
    n_ctx=2048,
    n_gpu_layers=0,
    verbose=False
)

# STEP 5: Load IO-tagged dataset
df = pd.read_excel("/content/IO.xlsx")  # Replace with actual path

# Filter for valid IO tags
valid_tags = ["I", "O"]
df = df[df["Word i entity tag"].isin(valid_tags)]

# Select relevant features
columns = [
    "Word i", "Word i entity tag", "Word i POS", "Stopword",
    "Word i Gazetteers", "Word i Lexical marker", "Word i definiteness"
]
df = df[columns].reset_index(drop=True)

# STEP 6: Prompt generator
def make_prompt(row):
    """Build the zero-shot IO tagging prompt for a single token.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) indexable by
            "Word i", "Word i POS", "Stopword", "Word i Gazetteers",
            "Word i Lexical marker" and "Word i definiteness".

    Returns:
        str: Instructions, the tag legend, one "Label: value" line per
        feature, and a final "Entity Tag:" cue with no trailing newline.
    """
    instructions = [
        "You are a medical NLP expert.",
        "Classify the following word using one of these entity tags:",
        "I = Inside an entity",
        "O = Outside any entity",
        "",  # blank line separating the legend from the features
    ]
    feature_columns = (
        ("Word", "Word i"),
        ("POS", "Word i POS"),
        ("Stopword", "Stopword"),
        ("Gazetteer", "Word i Gazetteers"),
        ("Lexical marker", "Word i Lexical marker"),
        ("Definiteness", "Word i definiteness"),
    )
    feature_lines = [f"{label}: {row[column]}" for label, column in feature_columns]
    return "\n".join(instructions + feature_lines + ["Entity Tag:"])

# STEP 7: Inference
# Only the first n_test rows of df are scored.
n_test = 20  # Adjust for more test samples
test_rows = df.iloc[:n_test]
prompts = [make_prompt(row) for _, row in test_rows.iterrows()]
# Read words and gold tags by column name rather than through the positional
# `_1` / `_2` attributes itertuples() assigns to non-identifier column names.
words = test_rows["Word i"].tolist()
true_labels = test_rows["Word i entity tag"].tolist()

predicted_labels = []
wandb_table = wandb.Table(columns=["Index", "Word", "True Label", "Predicted Label", "Prompt", "Model Output"])

for idx, (prompt, word, true_label) in enumerate(zip(prompts, words, true_labels)):
    response = llm(prompt, max_tokens=10)
    text = response["choices"][0]["text"].strip()
    # An empty or whitespace-only completion makes text.split() return [],
    # so indexing [0] would raise IndexError and abort the whole run.
    tokens = text.split()
    tag = tokens[0].upper() if tokens else ""
    # Anything that is not one of valid_tags is counted as "O".
    prediction = tag if tag in valid_tags else "O"
    predicted_labels.append(prediction)

    # Log to wandb table
    wandb_table.add_data(idx, word, true_label, prediction, prompt, text)

# STEP 8: Log predictions
wandb.log({"predictions_table": wandb_table})

# STEP 9: Evaluation
# zero_division=0 yields the same scores as sklearn's default for classes
# with no predicted or true samples, without emitting a warning per call.
print("\n📊 Classification Report:\n")
print(classification_report(true_labels, predicted_labels, digits=3, zero_division=0))

# Calculate metrics
acc = accuracy_score(true_labels, predicted_labels)
macro_p = precision_score(true_labels, predicted_labels, average="macro", zero_division=0)
macro_r = recall_score(true_labels, predicted_labels, average="macro", zero_division=0)
macro_f1 = f1_score(true_labels, predicted_labels, average="macro", zero_division=0)
weighted_f1 = f1_score(true_labels, predicted_labels, average="weighted", zero_division=0)
micro_f1 = f1_score(true_labels, predicted_labels, average="micro", zero_division=0)

print("\n📈 Evaluation Summary:")
print(f"Accuracy:          {acc:.4f}")
print(f"Macro Precision:   {macro_p:.4f}")
print(f"Macro Recall:      {macro_r:.4f}")
print(f"Macro F1 Score:    {macro_f1:.4f}")
print(f"Weighted F1 Score: {weighted_f1:.4f}")
print(f"Micro F1 Score:    {micro_f1:.4f}")

# STEP 10: Log metrics
wandb.log({
    "accuracy": acc,
    "macro_precision": macro_p,
    "macro_recall": macro_r,
    "macro_f1": macro_f1,
    "weighted_f1": weighted_f1,
    "micro_f1": micro_f1
})

# STEP 11: Finish wandb run
wandb.finish()



📊 Classification Report:

              precision    recall  f1-score   support

           I      0.500     0.143     0.222         7
           O      0.667     0.923     0.774        13

    accuracy                          0.650        20
   macro avg      0.583     0.533     0.498        20
weighted avg      0.608     0.650     0.581        20


📈 Evaluation Summary:
Accuracy:          0.6500
Macro Precision:   0.5833
Macro Recall:      0.5330
Macro F1 Score:    0.4982
Weighted F1 Score: 0.5810
Micro F1 Score:    0.6500


0,1
accuracy,▁
macro_f1,▁
macro_precision,▁
macro_recall,▁
micro_f1,▁
weighted_f1,▁

0,1
accuracy,0.65
macro_f1,0.49821
macro_precision,0.58333
macro_recall,0.53297
micro_f1,0.65
weighted_f1,0.581


## SECOND

In [None]:
# STEP 1: Install required packages
!pip install llama-cpp-python wandb pandas scikit-learn openpyxl --quiet

# STEP 2: Import libraries
from llama_cpp import Llama
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import wandb

# STEP 3: Initialize wandb
# The run name identifies the model actually loaded below (tinyllama.gguf),
# following the "zero-shot-tinyllama-<scheme>" pattern of the sibling cells.
wandb.init(project="arabic-ner-bies", name="zero-shot-tinyllama-bies", reinit=True)

# STEP 4: Load model (adjust model path as needed)
llm = Llama(
    model_path="tinyllama.gguf",
    n_ctx=2048,
    n_gpu_layers=0,  # Set to >0 if you're using Colab Pro with T4/A100
    verbose=False
)

# STEP 5: Load BIES-tagged Excel file
df = pd.read_excel("/content/BIES2.xlsx")

# Keep only rows whose gold tag is one of the BIES tags
valid_tags = ["B", "I", "E", "S"]
df = df[df["Word i entity tag"].isin(valid_tags)]

# Narrow the frame to the word, its gold tag and the features used in the prompt
columns = [
    "Word i", "Word i entity tag", "Word i POS", "Stopword",
    "Word i Gazetteers", "Word i Lexical marker", "Word i definiteness"
]
df = df[columns].reset_index(drop=True)

# STEP 6: Define prompt
def make_prompt(row):
    """Build the zero-shot BIES tagging prompt for a single token.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) indexable by
            "Word i", "Word i POS", "Stopword", "Word i Gazetteers",
            "Word i Lexical marker" and "Word i definiteness".

    Returns:
        str: Instructions, the tag legend, one "Label: value" line per
        feature, and a final "Entity Tag:" cue with no trailing newline.
    """
    instructions = [
        "You are a medical NLP expert.",
        "Classify the word below into one of these entity tags:",
        "B = Beginning of an entity",
        "I = Inside an entity",
        "E = End of an entity",
        "S = Single-word entity",
        "",  # blank line separating the legend from the features
    ]
    feature_columns = (
        ("Word", "Word i"),
        ("POS", "Word i POS"),
        ("Stopword", "Stopword"),
        ("Gazetteer", "Word i Gazetteers"),
        ("Lexical marker", "Word i Lexical marker"),
        ("Definiteness", "Word i definiteness"),
    )
    feature_lines = [f"{label}: {row[column]}" for label, column in feature_columns]
    return "\n".join(instructions + feature_lines + ["Entity Tag:"])

# STEP 7: Inference and prediction
# Only the first n_test rows of df are scored.
n_test = 10  # You can increase this
test_rows = df.iloc[:n_test]
prompts = [make_prompt(row) for _, row in test_rows.iterrows()]
# Read words and gold tags by column name rather than through the positional
# `_1` / `_2` attributes itertuples() assigns to non-identifier column names.
words = test_rows["Word i"].tolist()
true_labels = test_rows["Word i entity tag"].tolist()

predicted_labels = []
wandb_table = wandb.Table(columns=["Index", "Word", "True Label", "Predicted Label", "Prompt", "Model Output"])

for idx, (prompt, word, true_label) in enumerate(zip(prompts, words, true_labels)):
    response = llm(prompt, max_tokens=10)
    text = response["choices"][0]["text"].strip()
    # An empty or whitespace-only completion makes text.split() return [],
    # so indexing [0] would raise IndexError and abort the whole run.
    tokens = text.split()
    tag = tokens[0].upper() if tokens else ""
    # NOTE(review): "O" is not in valid_tags for this scheme, so every
    # unparseable completion is scored under an extra "O" class that has no
    # gold samples and still takes part in the macro averages. Confirm this
    # is the intended treatment of invalid outputs.
    prediction = tag if tag in valid_tags else "O"
    predicted_labels.append(prediction)

    # Log prediction to wandb table
    wandb_table.add_data(idx, word, true_label, prediction, prompt, text)

# STEP 8: Log predictions to wandb
wandb.log({"predictions_table": wandb_table})

# STEP 9: Evaluation
# zero_division=0 yields the same scores as sklearn's default for classes
# with no predicted or true samples, without emitting a warning per call.
print("\n📊 Classification Report:\n")
print(classification_report(true_labels, predicted_labels, digits=3, zero_division=0))

# Compute metrics
acc = accuracy_score(true_labels, predicted_labels)
macro_p = precision_score(true_labels, predicted_labels, average="macro", zero_division=0)
macro_r = recall_score(true_labels, predicted_labels, average="macro", zero_division=0)
macro_f1 = f1_score(true_labels, predicted_labels, average="macro", zero_division=0)
weighted_f1 = f1_score(true_labels, predicted_labels, average="weighted", zero_division=0)
micro_f1 = f1_score(true_labels, predicted_labels, average="micro", zero_division=0)

print("\n📈 Evaluation Summary:")
print(f"Accuracy:          {acc:.4f}")
print(f"Macro Precision:   {macro_p:.4f}")
print(f"Macro Recall:      {macro_r:.4f}")
print(f"Macro F1 Score:    {macro_f1:.4f}")
print(f"Weighted F1 Score: {weighted_f1:.4f}")
print(f"Micro F1 Score:    {micro_f1:.4f}")

# STEP 10: Log metrics to wandb
wandb.log({
    "accuracy": acc,
    "macro_precision": macro_p,
    "macro_recall": macro_r,
    "macro_f1": macro_f1,
    "weighted_f1": weighted_f1,
    "micro_f1": micro_f1
})

# STEP 11: Finish wandb run
wandb.finish()



📊 Classification Report:

              precision    recall  f1-score   support

           B      0.000     0.000     0.000         3
           E      0.250     0.500     0.333         2
           I      0.500     0.500     0.500         2
           O      0.000     0.000     0.000         0
           S      0.000     0.000     0.000         3

    accuracy                          0.200        10
   macro avg      0.150     0.200     0.167        10
weighted avg      0.150     0.200     0.167        10


📈 Evaluation Summary:
Accuracy:          0.2000
Macro Precision:   0.1500
Macro Recall:      0.2000
Macro F1 Score:    0.1667
Weighted F1 Score: 0.1667
Micro F1 Score:    0.2000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0,1
accuracy,▁
macro_f1,▁
macro_precision,▁
macro_recall,▁
micro_f1,▁
weighted_f1,▁

0,1
accuracy,0.2
macro_f1,0.16667
macro_precision,0.15
macro_recall,0.2
micro_f1,0.2
weighted_f1,0.16667


In [None]:
# STEP 1: Install required packages
!pip install llama-cpp-python wandb pandas scikit-learn openpyxl --quiet

# STEP 2: Import libraries
from llama_cpp import Llama
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import wandb

# STEP 3: Open the Weights & Biases run that this cell logs to
wandb.init(
    project="arabic-ner-bi",
    name="zero-shot-tinyllama-bi",
    reinit=True,
)

# STEP 4: Load the local GGUF model (the file must exist in the runtime;
# n_gpu_layers > 0 gives faster inference on a GPU)
llm = Llama(model_path="tinyllama.gguf", n_ctx=2048, n_gpu_layers=0, verbose=False)

# STEP 5: Load the BI-tagged Excel file, keep only rows carrying a valid tag,
# and narrow the frame to the columns the prompt needs
valid_tags = ["B", "I", "O"]
columns = [
    "Word i",
    "Word i entity tag",
    "Word i POS",
    "Stopword",
    "Word i Gazetteers",
    "Word i Lexical marker",
    "Word i definiteness",
]
df = (
    pd.read_excel("/content/BI2.xlsx")  # Change to the correct path if needed
    .loc[lambda frame: frame["Word i entity tag"].isin(valid_tags), columns]
    .reset_index(drop=True)
)

# STEP 6: Define the prompt function
def make_prompt(row):
    """Build the zero-shot BI(O) tagging prompt for a single token.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) indexable by
            "Word i", "Word i POS", "Stopword", "Word i Gazetteers",
            "Word i Lexical marker" and "Word i definiteness".

    Returns:
        str: Instructions, the tag legend, one "Label: value" line per
        feature, and a final "Entity Tag:" cue with no trailing newline.
    """
    instructions = [
        "You are a medical NLP expert.",
        "Classify the word below into one of these entity tags:",
        "B = Beginning of an entity",
        "I = Inside of an entity",
        "O = Outside (not part of an entity)",
        "",  # blank line separating the legend from the features
    ]
    feature_columns = (
        ("Word", "Word i"),
        ("POS", "Word i POS"),
        ("Stopword", "Stopword"),
        ("Gazetteer", "Word i Gazetteers"),
        ("Lexical marker", "Word i Lexical marker"),
        ("Definiteness", "Word i definiteness"),
    )
    feature_lines = [f"{label}: {row[column]}" for label, column in feature_columns]
    return "\n".join(instructions + feature_lines + ["Entity Tag:"])

# STEP 7: Run inference and collect predictions
# Only the first n_test rows of df are scored.
n_test = 20  # You can increase this
test_rows = df.iloc[:n_test]
prompts = [make_prompt(row) for _, row in test_rows.iterrows()]
# Read words and gold tags by column name rather than through the positional
# `_1` / `_2` attributes itertuples() assigns to non-identifier column names.
words = test_rows["Word i"].tolist()
true_labels = test_rows["Word i entity tag"].tolist()

predicted_labels = []
wandb_table = wandb.Table(columns=["Index", "Word", "True Label", "Predicted Label", "Prompt", "Model Output"])

for idx, (prompt, word, true_label) in enumerate(zip(prompts, words, true_labels)):
    response = llm(prompt, max_tokens=10)
    text = response["choices"][0]["text"].strip()
    # An empty or whitespace-only completion makes text.split() return [],
    # so indexing [0] would raise IndexError and abort the whole run.
    tokens = text.split()
    tag = tokens[0].upper() if tokens else ""
    # Anything that is not one of valid_tags is counted as "O".
    prediction = tag if tag in valid_tags else "O"
    predicted_labels.append(prediction)

    # Log prediction to wandb table
    wandb_table.add_data(idx, word, true_label, prediction, prompt, text)

# STEP 8: Log predictions to wandb
wandb.log({"predictions_table": wandb_table})

# STEP 9: Evaluation
# zero_division=0 yields the same scores as sklearn's default for classes
# with no predicted or true samples, without emitting a warning per call.
print("\n📊 Classification Report:\n")
print(classification_report(true_labels, predicted_labels, digits=3, zero_division=0))

# Compute the metrics that are printed and then logged to wandb
acc = accuracy_score(true_labels, predicted_labels)
macro_p = precision_score(true_labels, predicted_labels, average="macro", zero_division=0)
macro_r = recall_score(true_labels, predicted_labels, average="macro", zero_division=0)
macro_f1 = f1_score(true_labels, predicted_labels, average="macro", zero_division=0)
weighted_f1 = f1_score(true_labels, predicted_labels, average="weighted", zero_division=0)
micro_f1 = f1_score(true_labels, predicted_labels, average="micro", zero_division=0)

print("\n📈 Evaluation Summary:")
print(f"Accuracy:          {acc:.4f}")
print(f"Macro Precision:   {macro_p:.4f}")
print(f"Macro Recall:      {macro_r:.4f}")
print(f"Macro F1 Score:    {macro_f1:.4f}")
print(f"Weighted F1 Score: {weighted_f1:.4f}")
print(f"Micro F1 Score:    {micro_f1:.4f}")

wandb.log({
    "accuracy": acc,
    "macro_precision": macro_p,
    "macro_recall": macro_r,
    "macro_f1": macro_f1,
    "weighted_f1": weighted_f1,
    "micro_f1": micro_f1
})

# STEP 10: Finish wandb run
wandb.finish()



📊 Classification Report:

              precision    recall  f1-score   support

           B      0.667     0.333     0.444        12
           I      0.400     0.250     0.308         8
           O      0.000     0.000     0.000         0

    accuracy                          0.300        20
   macro avg      0.356     0.194     0.251        20
weighted avg      0.560     0.300     0.390        20


📈 Evaluation Summary:
Accuracy:          0.3000
Macro Precision:   0.3556
Macro Recall:      0.1944
Macro F1 Score:    0.2507
Weighted F1 Score: 0.3897
Micro F1 Score:    0.3000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0,1
accuracy,▁
macro_f1,▁
macro_precision,▁
macro_recall,▁
micro_f1,▁
weighted_f1,▁

0,1
accuracy,0.3
macro_f1,0.25071
macro_precision,0.35556
macro_recall,0.19444
micro_f1,0.3
weighted_f1,0.38974


In [None]:
# STEP 1: Install required packages
!pip install llama-cpp-python wandb pandas scikit-learn openpyxl --quiet

# STEP 2: Import libraries
from llama_cpp import Llama
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import wandb

# STEP 3: Open the Weights & Biases run that this cell logs to
wandb.init(
    project="arabic-ner-ie",
    name="zero-shot-tinyllama-ie",
    reinit=True,
)

# STEP 4: Load the local GGUF model (the file must exist in the runtime;
# set n_gpu_layers > 0 if using a GPU, e.g. Colab Pro)
llm = Llama(model_path="tinyllama.gguf", n_ctx=2048, n_gpu_layers=0, verbose=False)

# STEP 5: Load the IE-tagged Excel file, keep only rows carrying a valid tag,
# and narrow the frame to the columns the prompt needs
valid_tags = ["I", "E"]
columns = [
    "Word i",
    "Word i entity tag",
    "Word i POS",
    "Stopword",
    "Word i Gazetteers",
    "Word i Lexical marker",
    "Word i definiteness",
]
df = (
    pd.read_excel("/content/IE2.xlsx")  # Replace with the correct path if needed
    .loc[lambda frame: frame["Word i entity tag"].isin(valid_tags), columns]
    .reset_index(drop=True)
)

# STEP 6: Define the prompt function
def make_prompt(row):
    """Build the zero-shot IE tagging prompt for a single token.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) indexable by
            "Word i", "Word i POS", "Stopword", "Word i Gazetteers",
            "Word i Lexical marker" and "Word i definiteness".

    Returns:
        str: Instructions, the tag legend, one "Label: value" line per
        feature, and a final "Entity Tag:" cue with no trailing newline.
    """
    instructions = [
        "You are a medical NLP expert.",
        "Classify the word below into one of these entity tags:",
        "I = Inside of an entity",
        "E = End of an entity",
        "",  # blank line separating the legend from the features
    ]
    feature_columns = (
        ("Word", "Word i"),
        ("POS", "Word i POS"),
        ("Stopword", "Stopword"),
        ("Gazetteer", "Word i Gazetteers"),
        ("Lexical marker", "Word i Lexical marker"),
        ("Definiteness", "Word i definiteness"),
    )
    feature_lines = [f"{label}: {row[column]}" for label, column in feature_columns]
    return "\n".join(instructions + feature_lines + ["Entity Tag:"])

# STEP 7: Run inference and collect predictions
# Only the first n_test rows of df are scored.
n_test = 20  # Adjust as needed
test_rows = df.iloc[:n_test]
prompts = [make_prompt(row) for _, row in test_rows.iterrows()]
# Read words and gold tags by column name rather than through the positional
# `_1` / `_2` attributes itertuples() assigns to non-identifier column names.
words = test_rows["Word i"].tolist()
true_labels = test_rows["Word i entity tag"].tolist()

predicted_labels = []
wandb_table = wandb.Table(columns=["Index", "Word", "True Label", "Predicted Label", "Prompt", "Model Output"])

for idx, (prompt, word, true_label) in enumerate(zip(prompts, words, true_labels)):
    response = llm(prompt, max_tokens=10)
    text = response["choices"][0]["text"].strip()
    # An empty or whitespace-only completion makes text.split() return [],
    # so indexing [0] would raise IndexError and abort the whole run.
    tokens = text.split()
    tag = tokens[0].upper() if tokens else ""
    # NOTE(review): "O" is not in valid_tags for this scheme, so every
    # unparseable completion is scored under an extra "O" class that has no
    # gold samples and still takes part in the macro averages. Confirm this
    # is the intended treatment of invalid outputs.
    prediction = tag if tag in valid_tags else "O"
    predicted_labels.append(prediction)

    # Log prediction to wandb table
    wandb_table.add_data(idx, word, true_label, prediction, prompt, text)

# STEP 8: Log predictions to wandb
wandb.log({"predictions_table": wandb_table})

# STEP 9: Evaluation
# zero_division=0 yields the same scores as sklearn's default for classes
# with no predicted or true samples, without emitting a warning per call.
print("\n📊 Classification Report:\n")
print(classification_report(true_labels, predicted_labels, digits=3, zero_division=0))

# Compute the metrics that are printed and then logged to wandb
acc = accuracy_score(true_labels, predicted_labels)
macro_p = precision_score(true_labels, predicted_labels, average="macro", zero_division=0)
macro_r = recall_score(true_labels, predicted_labels, average="macro", zero_division=0)
macro_f1 = f1_score(true_labels, predicted_labels, average="macro", zero_division=0)
weighted_f1 = f1_score(true_labels, predicted_labels, average="weighted", zero_division=0)
micro_f1 = f1_score(true_labels, predicted_labels, average="micro", zero_division=0)

print("\n📈 Evaluation Summary:")
print(f"Accuracy:          {acc:.4f}")
print(f"Macro Precision:   {macro_p:.4f}")
print(f"Macro Recall:      {macro_r:.4f}")
print(f"Macro F1 Score:    {macro_f1:.4f}")
print(f"Weighted F1 Score: {weighted_f1:.4f}")
print(f"Micro F1 Score:    {micro_f1:.4f}")

wandb.log({
    "accuracy": acc,
    "macro_precision": macro_p,
    "macro_recall": macro_r,
    "macro_f1": macro_f1,
    "weighted_f1": weighted_f1,
    "micro_f1": micro_f1
})

# STEP 10: Finish wandb run
wandb.finish()



📊 Classification Report:

              precision    recall  f1-score   support

           E      0.500     0.333     0.400        12
           I      0.500     0.125     0.200         8
           O      0.000     0.000     0.000         0

    accuracy                          0.250        20
   macro avg      0.333     0.153     0.200        20
weighted avg      0.500     0.250     0.320        20


📈 Evaluation Summary:
Accuracy:          0.2500
Macro Precision:   0.3333
Macro Recall:      0.1528
Macro F1 Score:    0.2000
Weighted F1 Score: 0.3200
Micro F1 Score:    0.2500


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0,1
accuracy,▁
macro_f1,▁
macro_precision,▁
macro_recall,▁
micro_f1,▁
weighted_f1,▁

0,1
accuracy,0.25
macro_f1,0.2
macro_precision,0.33333
macro_recall,0.15278
micro_f1,0.25
weighted_f1,0.32


In [None]:
# STEP 1: Install required packages
!pip install llama-cpp-python wandb pandas scikit-learn openpyxl --quiet

# STEP 2: Import libraries
from llama_cpp import Llama
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import wandb

# STEP 3: Open the Weights & Biases run that this cell logs to
wandb.init(
    project="arabic-ner-iob",
    name="zero-shot-tinyllama-iob",
    reinit=True,
)

# STEP 4: Load the local .gguf model (set n_gpu_layers > 0 if using a GPU,
# e.g. Colab Pro)
llm = Llama(model_path="tinyllama.gguf", n_ctx=2048, n_gpu_layers=0, verbose=False)

# STEP 5: Load the IOB-tagged Excel file, keep only rows carrying a valid tag,
# and narrow the frame to the columns the prompt needs
valid_tags = ["I", "O", "B"]
columns = [
    "Word i",
    "Word i entity tag",
    "Word i POS",
    "Stopword",
    "Word i Gazetteers",
    "Word i Lexical marker",
    "Word i definiteness",
]
df = (
    pd.read_excel("/content/IOB2.xlsx")  # Update this path as needed
    .loc[lambda frame: frame["Word i entity tag"].isin(valid_tags), columns]
    .reset_index(drop=True)
)

# STEP 6: Define the prompt template
def make_prompt(row):
    """Build the zero-shot IOB tagging prompt for a single token.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) indexable by
            "Word i", "Word i POS", "Stopword", "Word i Gazetteers",
            "Word i Lexical marker" and "Word i definiteness".

    Returns:
        str: Instructions, the tag legend, one "Label: value" line per
        feature, and a final "Entity Tag:" cue with no trailing newline.
    """
    instructions = [
        "You are a medical NLP expert.",
        "Classify the word below into one of these entity tags:",
        "B = Beginning of an entity",
        "I = Inside of an entity",
        "O = Outside any entity",
        "",  # blank line separating the legend from the features
    ]
    feature_columns = (
        ("Word", "Word i"),
        ("POS", "Word i POS"),
        ("Stopword", "Stopword"),
        ("Gazetteer", "Word i Gazetteers"),
        ("Lexical marker", "Word i Lexical marker"),
        ("Definiteness", "Word i definiteness"),
    )
    feature_lines = [f"{label}: {row[column]}" for label, column in feature_columns]
    return "\n".join(instructions + feature_lines + ["Entity Tag:"])

# STEP 7: Inference
# Only the first n_test rows of df are scored.
n_test = 20  # Adjust as needed
test_rows = df.iloc[:n_test]
prompts = [make_prompt(row) for _, row in test_rows.iterrows()]
# Read words and gold tags by column name rather than through the positional
# `_1` / `_2` attributes itertuples() assigns to non-identifier column names.
words = test_rows["Word i"].tolist()
true_labels = test_rows["Word i entity tag"].tolist()

predicted_labels = []
wandb_table = wandb.Table(columns=["Index", "Word", "True Label", "Predicted Label", "Prompt", "Model Output"])

for idx, (prompt, word, true_label) in enumerate(zip(prompts, words, true_labels)):
    response = llm(prompt, max_tokens=10)
    text = response["choices"][0]["text"].strip()
    # An empty or whitespace-only completion makes text.split() return [],
    # so indexing [0] would raise IndexError and abort the whole run.
    tokens = text.split()
    tag = tokens[0].upper() if tokens else ""
    # Anything that is not one of valid_tags is counted as "O".
    prediction = tag if tag in valid_tags else "O"
    predicted_labels.append(prediction)

    # Log to wandb
    wandb_table.add_data(idx, word, true_label, prediction, prompt, text)

# STEP 8: Log predictions
wandb.log({"predictions_table": wandb_table})

# STEP 9: Evaluation
# zero_division=0 yields the same scores as sklearn's default for classes
# with no predicted or true samples, without emitting a warning per call.
print("\n📊 Classification Report:\n")
print(classification_report(true_labels, predicted_labels, digits=3, zero_division=0))

# Compute the metrics that are printed and then logged to wandb
acc = accuracy_score(true_labels, predicted_labels)
macro_p = precision_score(true_labels, predicted_labels, average="macro", zero_division=0)
macro_r = recall_score(true_labels, predicted_labels, average="macro", zero_division=0)
macro_f1 = f1_score(true_labels, predicted_labels, average="macro", zero_division=0)
weighted_f1 = f1_score(true_labels, predicted_labels, average="weighted", zero_division=0)
micro_f1 = f1_score(true_labels, predicted_labels, average="micro", zero_division=0)

print("\n📈 Evaluation Summary:")
print(f"Accuracy:          {acc:.4f}")
print(f"Macro Precision:   {macro_p:.4f}")
print(f"Macro Recall:      {macro_r:.4f}")
print(f"Macro F1 Score:    {macro_f1:.4f}")
print(f"Weighted F1 Score: {weighted_f1:.4f}")
print(f"Micro F1 Score:    {micro_f1:.4f}")

wandb.log({
    "accuracy": acc,
    "macro_precision": macro_p,
    "macro_recall": macro_r,
    "macro_f1": macro_f1,
    "weighted_f1": weighted_f1,
    "micro_f1": micro_f1
})

# STEP 10: Finish wandb run
wandb.finish()



📊 Classification Report:

              precision    recall  f1-score   support

           B      0.000     0.000     0.000         3
           I      0.400     0.500     0.444         4
           O      0.692     0.692     0.692        13

    accuracy                          0.550        20
   macro avg      0.364     0.397     0.379        20
weighted avg      0.530     0.550     0.539        20


📈 Evaluation Summary:
Accuracy:          0.5500
Macro Precision:   0.3641
Macro Recall:      0.3974
Macro F1 Score:    0.3789
Weighted F1 Score: 0.5389
Micro F1 Score:    0.5500


0,1
accuracy,▁
macro_f1,▁
macro_precision,▁
macro_recall,▁
micro_f1,▁
weighted_f1,▁

0,1
accuracy,0.55
macro_f1,0.37892
macro_precision,0.3641
macro_recall,0.39744
micro_f1,0.55
weighted_f1,0.53889


In [None]:
# STEP 1: Install required packages
!pip install llama-cpp-python wandb pandas scikit-learn openpyxl --quiet

# STEP 2: Import libraries
from llama_cpp import Llama
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import wandb

# STEP 3: Open the Weights & Biases run that this cell logs to
wandb.init(
    project="arabic-ner-iobes",
    name="zero-shot-tinyllama-iobes",
    reinit=True,
)

# STEP 4: Load the local GGUF model (use n_gpu_layers > 0 if you have a GPU)
llm = Llama(model_path="tinyllama.gguf", n_ctx=2048, n_gpu_layers=0, verbose=False)

# STEP 5: Load the IOBES-tagged Excel file, keep only rows carrying a valid
# tag, and narrow the frame to the columns the prompt needs
valid_tags = ["I", "O", "B", "E", "S"]
columns = [
    "Word i",
    "Word i entity tag",
    "Word i POS",
    "Stopword",
    "Word i Gazetteers",
    "Word i Lexical marker",
    "Word i definiteness",
]
df = (
    pd.read_excel("/content/IOBES2.xlsx")  # Update this if your file path is different
    .loc[lambda frame: frame["Word i entity tag"].isin(valid_tags), columns]
    .reset_index(drop=True)
)

# STEP 6: Define prompt template
def make_prompt(row):
    """Build the zero-shot IOBES tagging prompt for a single token.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) indexable by
            "Word i", "Word i POS", "Stopword", "Word i Gazetteers",
            "Word i Lexical marker" and "Word i definiteness".

    Returns:
        str: Instructions, the tag legend, one "Label: value" line per
        feature, and a final "Entity Tag:" cue with no trailing newline.
    """
    instructions = [
        "You are a medical NLP expert.",
        "Classify the following word using one of these entity tags:",
        "B = Beginning of an entity",
        "I = Inside an entity",
        "O = Outside any entity",
        "E = End of a multi-word entity",
        "S = Single-word entity",
        "",  # blank line separating the legend from the features
    ]
    feature_columns = (
        ("Word", "Word i"),
        ("POS", "Word i POS"),
        ("Stopword", "Stopword"),
        ("Gazetteer", "Word i Gazetteers"),
        ("Lexical marker", "Word i Lexical marker"),
        ("Definiteness", "Word i definiteness"),
    )
    feature_lines = [f"{label}: {row[column]}" for label, column in feature_columns]
    return "\n".join(instructions + feature_lines + ["Entity Tag:"])

# STEP 7: Inference
# Only the first n_test rows of df are scored.
n_test = 20  # Adjust for more or less test samples
test_rows = df.iloc[:n_test]
prompts = [make_prompt(row) for _, row in test_rows.iterrows()]
# Read words and gold tags by column name rather than through the positional
# `_1` / `_2` attributes itertuples() assigns to non-identifier column names.
words = test_rows["Word i"].tolist()
true_labels = test_rows["Word i entity tag"].tolist()

predicted_labels = []
wandb_table = wandb.Table(columns=["Index", "Word", "True Label", "Predicted Label", "Prompt", "Model Output"])

for idx, (prompt, word, true_label) in enumerate(zip(prompts, words, true_labels)):
    response = llm(prompt, max_tokens=10)
    text = response["choices"][0]["text"].strip()
    # An empty or whitespace-only completion makes text.split() return [],
    # so indexing [0] would raise IndexError and abort the whole run.
    tokens = text.split()
    tag = tokens[0].upper() if tokens else ""
    # Anything that is not one of valid_tags is counted as "O".
    prediction = tag if tag in valid_tags else "O"
    predicted_labels.append(prediction)

    # Log to wandb
    wandb_table.add_data(idx, word, true_label, prediction, prompt, text)

# STEP 8: Log predictions to wandb
wandb.log({"predictions_table": wandb_table})

# STEP 9: Evaluation
# zero_division=0 yields the same scores as sklearn's default for classes
# with no predicted or true samples, without emitting a warning per call.
print("\n📊 Classification Report:\n")
print(classification_report(true_labels, predicted_labels, digits=3, zero_division=0))

# Metrics
acc = accuracy_score(true_labels, predicted_labels)
macro_p = precision_score(true_labels, predicted_labels, average="macro", zero_division=0)
macro_r = recall_score(true_labels, predicted_labels, average="macro", zero_division=0)
macro_f1 = f1_score(true_labels, predicted_labels, average="macro", zero_division=0)
weighted_f1 = f1_score(true_labels, predicted_labels, average="weighted", zero_division=0)
micro_f1 = f1_score(true_labels, predicted_labels, average="micro", zero_division=0)

print("\n📈 Evaluation Summary:")
print(f"Accuracy:          {acc:.4f}")
print(f"Macro Precision:   {macro_p:.4f}")
print(f"Macro Recall:      {macro_r:.4f}")
print(f"Macro F1 Score:    {macro_f1:.4f}")
print(f"Weighted F1 Score: {weighted_f1:.4f}")
print(f"Micro F1 Score:    {micro_f1:.4f}")

# STEP 10: Log metrics
wandb.log({
    "accuracy": acc,
    "macro_precision": macro_p,
    "macro_recall": macro_r,
    "macro_f1": macro_f1,
    "weighted_f1": weighted_f1,
    "micro_f1": micro_f1
})

# STEP 11: Finish wandb run
wandb.finish()



📊 Classification Report:

              precision    recall  f1-score   support

           B      1.000     0.500     0.667         2
           E      0.250     0.500     0.333         2
           I      0.333     0.500     0.400         2
           O      0.667     0.615     0.640        13
           S      0.000     0.000     0.000         1

    accuracy                          0.550        20
   macro avg      0.450     0.423     0.408        20
weighted avg      0.592     0.550     0.556        20


📈 Evaluation Summary:
Accuracy:          0.5500
Macro Precision:   0.4500
Macro Recall:      0.4231
Macro F1 Score:    0.4080
Weighted F1 Score: 0.5560
Micro F1 Score:    0.5500


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0,1
accuracy,▁
macro_f1,▁
macro_precision,▁
macro_recall,▁
micro_f1,▁
weighted_f1,▁

0,1
accuracy,0.55
macro_f1,0.408
macro_precision,0.45
macro_recall,0.42308
micro_f1,0.55
weighted_f1,0.556


In [None]:
# STEP 1: Install required packages
!pip install llama-cpp-python wandb pandas scikit-learn openpyxl --quiet

# STEP 2: Import libraries
from llama_cpp import Llama
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import wandb

# STEP 3: Open the Weights & Biases run that this cell logs to
wandb.init(
    project="arabic-ner-ioe",
    name="zero-shot-tinyllama-ioe",
    reinit=True,
)

# STEP 4: Load the local GGUF model
llm = Llama(model_path="tinyllama.gguf", n_ctx=2048, n_gpu_layers=0, verbose=False)

# STEP 5: Load the IOE-tagged dataset, keep only rows carrying a valid tag,
# and narrow the frame to the feature columns the prompt uses
valid_tags = ["I", "O", "E"]
columns = [
    "Word i",
    "Word i entity tag",
    "Word i POS",
    "Stopword",
    "Word i Gazetteers",
    "Word i Lexical marker",
    "Word i definiteness",
]
df = (
    pd.read_excel("/content/IOE2.xlsx")  # Replace with correct path
    .loc[lambda frame: frame["Word i entity tag"].isin(valid_tags), columns]
    .reset_index(drop=True)
)

# STEP 6: Prompt generator
def make_prompt(row):
    """Build the zero-shot IOE tagging prompt for a single token.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) indexable by
            "Word i", "Word i POS", "Stopword", "Word i Gazetteers",
            "Word i Lexical marker" and "Word i definiteness".

    Returns:
        str: Instructions, the tag legend, one "Label: value" line per
        feature, and a final "Entity Tag:" cue with no trailing newline.
    """
    instructions = [
        "You are a medical NLP expert.",
        "Classify the following word using one of these entity tags:",
        "I = Inside an entity",
        "O = Outside any entity",
        "E = End of an entity",
        "",  # blank line separating the legend from the features
    ]
    feature_columns = (
        ("Word", "Word i"),
        ("POS", "Word i POS"),
        ("Stopword", "Stopword"),
        ("Gazetteer", "Word i Gazetteers"),
        ("Lexical marker", "Word i Lexical marker"),
        ("Definiteness", "Word i definiteness"),
    )
    feature_lines = [f"{label}: {row[column]}" for label, column in feature_columns]
    return "\n".join(instructions + feature_lines + ["Entity Tag:"])

# STEP 7: Run inference
# Only the first n_test rows of df are scored.
n_test = 20  # Sample size for testing
test_rows = df.iloc[:n_test]
prompts = [make_prompt(row) for _, row in test_rows.iterrows()]
# Read words and gold tags by column name rather than through the positional
# `_1` / `_2` attributes itertuples() assigns to non-identifier column names.
words = test_rows["Word i"].tolist()
true_labels = test_rows["Word i entity tag"].tolist()

predicted_labels = []
wandb_table = wandb.Table(columns=["Index", "Word", "True Label", "Predicted Label", "Prompt", "Model Output"])

for idx, (prompt, word, true_label) in enumerate(zip(prompts, words, true_labels)):
    response = llm(prompt, max_tokens=10)
    text = response["choices"][0]["text"].strip()
    # An empty or whitespace-only completion makes text.split() return [],
    # so indexing [0] would raise IndexError and abort the whole run.
    tokens = text.split()
    tag = tokens[0].upper() if tokens else ""
    # Anything that is not one of valid_tags is counted as "O".
    prediction = tag if tag in valid_tags else "O"
    predicted_labels.append(prediction)

    # Log prediction
    wandb_table.add_data(idx, word, true_label, prediction, prompt, text)

# STEP 8: Log predictions table
wandb.log({"predictions_table": wandb_table})

# STEP 9: Evaluate
# zero_division=0 yields the same scores as sklearn's default for classes
# with no predicted or true samples, without emitting a warning per call.
print("\n📊 Classification Report:\n")
print(classification_report(true_labels, predicted_labels, digits=3, zero_division=0))

# Calculate metrics
acc = accuracy_score(true_labels, predicted_labels)
macro_p = precision_score(true_labels, predicted_labels, average="macro", zero_division=0)
macro_r = recall_score(true_labels, predicted_labels, average="macro", zero_division=0)
macro_f1 = f1_score(true_labels, predicted_labels, average="macro", zero_division=0)
weighted_f1 = f1_score(true_labels, predicted_labels, average="weighted", zero_division=0)
micro_f1 = f1_score(true_labels, predicted_labels, average="micro", zero_division=0)

print("\n📈 Evaluation Summary:")
print(f"Accuracy:          {acc:.4f}")
print(f"Macro Precision:   {macro_p:.4f}")
print(f"Macro Recall:      {macro_r:.4f}")
print(f"Macro F1 Score:    {macro_f1:.4f}")
print(f"Weighted F1 Score: {weighted_f1:.4f}")
print(f"Micro F1 Score:    {micro_f1:.4f}")

# STEP 10: Log metrics
wandb.log({
    "accuracy": acc,
    "macro_precision": macro_p,
    "macro_recall": macro_r,
    "macro_f1": macro_f1,
    "weighted_f1": weighted_f1,
    "micro_f1": micro_f1
})

# STEP 11: Finish run
wandb.finish()



📊 Classification Report:

              precision    recall  f1-score   support

           E      0.000     0.000     0.000         3
           I      0.000     0.000     0.000         4
           O      0.688     0.846     0.759        13

    accuracy                          0.550        20
   macro avg      0.229     0.282     0.253        20
weighted avg      0.447     0.550     0.493        20


📈 Evaluation Summary:
Accuracy:          0.5500
Macro Precision:   0.2292
Macro Recall:      0.2821
Macro F1 Score:    0.2529
Weighted F1 Score: 0.4931
Micro F1 Score:    0.5500


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0,1
accuracy,▁
macro_f1,▁
macro_precision,▁
macro_recall,▁
micro_f1,▁
weighted_f1,▁

0,1
accuracy,0.55
macro_f1,0.25287
macro_precision,0.22917
macro_recall,0.28205
micro_f1,0.55
weighted_f1,0.4931


In [None]:
# STEP 1: Install required packages
!pip install llama-cpp-python wandb pandas scikit-learn openpyxl --quiet

# STEP 2: Import libraries
from llama_cpp import Llama
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import wandb

# STEP 3: Open a new wandb run for the IO-tagging experiment
wandb.init(
    project="arabic-ner-io",
    name="zero-shot-tinyllama-io",
    reinit=True,
)

# STEP 4: Load model
# "tinyllama.gguf" is a relative path, so the file must already exist in the
# working directory of the Colab or local runtime before this line runs.
_llama_settings = {
    "model_path": "tinyllama.gguf",
    "n_ctx": 2048,
    "n_gpu_layers": 0,
    "verbose": False,
}
llm = Llama(**_llama_settings)

# STEP 5: Load IO-tagged dataset
# Only rows whose gold tag is one of these survive the filter below.
valid_tags = ["I", "O"]

# Feature columns kept for prompting, plus the gold tag column.
columns = [
    "Word i",
    "Word i entity tag",
    "Word i POS",
    "Stopword",
    "Word i Gazetteers",
    "Word i Lexical marker",
    "Word i definiteness",
]

# Read -> keep valid-tag rows and the selected columns -> renumber rows from 0.
df = (
    pd.read_excel("/content/IO2.xlsx")  # Replace with actual path
    .loc[lambda frame: frame["Word i entity tag"].isin(valid_tags), columns]
    .reset_index(drop=True)
)

# STEP 6: Prompt generator
def make_prompt(row):
    """Build the zero-shot IO-tagging prompt for a single token.

    Args:
        row: Mapping-like record (for example a DataFrame row) that can be
            indexed by the feature column names listed below.

    Returns:
        str: The instruction header, one ``label: value`` line per feature,
        and a trailing ``Entity Tag:`` cue with no newline after it.
    """
    header_lines = [
        "You are a medical NLP expert.",
        "Classify the following word using one of these entity tags:",
        "I = Inside an entity",
        "O = Outside any entity",
        "",  # blank line separating the instructions from the token features
    ]
    # (label shown in the prompt, column read from the row), in output order.
    labelled_columns = (
        ("Word", "Word i"),
        ("POS", "Word i POS"),
        ("Stopword", "Stopword"),
        ("Gazetteer", "Word i Gazetteers"),
        ("Lexical marker", "Word i Lexical marker"),
        ("Definiteness", "Word i definiteness"),
    )
    feature_lines = [f"{label}: {row[column]}" for label, column in labelled_columns]
    return "\n".join(header_lines + feature_lines + ["Entity Tag:"])

# STEP 7: Inference
# Run the model on the first `n_test` rows and collect one predicted tag per
# token, logging every prompt/response pair to a wandb table.
n_test = 20  # Adjust for more test samples
test_rows = df.iloc[:n_test]
prompts = [make_prompt(row) for _, row in test_rows.iterrows()]
true_labels = test_rows["Word i entity tag"].tolist()
# Read the words by column name rather than through itertuples' positional
# aliases (`_1`, `_2`), which silently depend on the column order.
words = test_rows["Word i"].tolist()

predicted_labels = []
wandb_table = wandb.Table(columns=["Index", "Word", "True Label", "Predicted Label", "Prompt", "Model Output"])

# NOTE(review): no temperature/seed is passed here, so the library's default
# sampling settings apply — confirm whether deterministic decoding is wanted
# for reproducible metrics.
for idx, (prompt, word, true_label) in enumerate(zip(prompts, words, true_labels)):
    response = llm(prompt, max_tokens=10)
    text = response["choices"][0]["text"].strip()

    # The model may return an empty or whitespace-only completion; indexing
    # the first token would then raise IndexError and abort the whole run.
    # Treat that case like any other unrecognised answer.
    tokens = text.split()
    tag = tokens[0].upper() if tokens else ""

    # Anything that is not exactly one of the valid tags falls back to "O".
    prediction = tag if tag in valid_tags else "O"
    predicted_labels.append(prediction)

    # Log to wandb table
    wandb_table.add_data(
        idx,
        word,
        true_label,
        prediction,
        prompt,
        text
    )

# STEP 8: Log predictions
wandb.log({"predictions_table": wandb_table})

# STEP 9: Evaluation
# A zero-shot model can easily never predict one of the classes, which makes
# that class's precision undefined. zero_division=0 scores it as 0 (the same
# value the default "warn" setting uses) without emitting an
# UndefinedMetricWarning for every metric call.
print("\n📊 Classification Report:\n")
print(classification_report(true_labels, predicted_labels, digits=3, zero_division=0))

# Calculate metrics
acc = accuracy_score(true_labels, predicted_labels)
macro_p = precision_score(true_labels, predicted_labels, average="macro", zero_division=0)
macro_r = recall_score(true_labels, predicted_labels, average="macro", zero_division=0)
macro_f1 = f1_score(true_labels, predicted_labels, average="macro", zero_division=0)
weighted_f1 = f1_score(true_labels, predicted_labels, average="weighted", zero_division=0)
micro_f1 = f1_score(true_labels, predicted_labels, average="micro", zero_division=0)

print("\n📈 Evaluation Summary:")
print(f"Accuracy:          {acc:.4f}")
print(f"Macro Precision:   {macro_p:.4f}")
print(f"Macro Recall:      {macro_r:.4f}")
print(f"Macro F1 Score:    {macro_f1:.4f}")
print(f"Weighted F1 Score: {weighted_f1:.4f}")
print(f"Micro F1 Score:    {micro_f1:.4f}")

# STEP 10: Log metrics (all six scalars in a single wandb.log call)
wandb.log({
    "accuracy": acc,
    "macro_precision": macro_p,
    "macro_recall": macro_r,
    "macro_f1": macro_f1,
    "weighted_f1": weighted_f1,
    "micro_f1": micro_f1
})

# STEP 11: Finish wandb run
wandb.finish()



📊 Classification Report:

              precision    recall  f1-score   support

           I      0.500     0.143     0.222         7
           O      0.667     0.923     0.774        13

    accuracy                          0.650        20
   macro avg      0.583     0.533     0.498        20
weighted avg      0.608     0.650     0.581        20


📈 Evaluation Summary:
Accuracy:          0.6500
Macro Precision:   0.5833
Macro Recall:      0.5330
Macro F1 Score:    0.4982
Weighted F1 Score: 0.5810
Micro F1 Score:    0.6500


0,1
accuracy,▁
macro_f1,▁
macro_precision,▁
macro_recall,▁
micro_f1,▁
weighted_f1,▁

0,1
accuracy,0.65
macro_f1,0.49821
macro_precision,0.58333
macro_recall,0.53297
micro_f1,0.65
weighted_f1,0.581
