In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os 
import sys 
sys.path.append(os.path.abspath(os.path.join('..')))
from transformers import pipeline
from scripts.vendor_scorecard_engine import score_vendors

# Read the scraped Telegram CSV into `df`; the trailing expression makes the
# cell display the first five rows.
df = pd.read_csv(filepath_or_buffer="../data/telegram_data.csv")
df.head(n=5)

In [None]:
from transformers import pipeline

# Build a token-classification pipeline from a Hugging Face checkpoint.
# NOTE(review): a later cell assigns `ner_pipeline` again with a different
# model, so on a top-to-bottom run this object is replaced before it is used.
ner_pipeline = pipeline(
    task="token-classification",
    model="Davlan/bert-base-multilingual-cased-ner-hrl",
    aggregation_strategy="simple",
)

In [None]:
# Read the scraped Telegram CSV, rebinding `df` to a freshly loaded frame.
df = pd.read_csv(filepath_or_buffer="../data/telegram_data.csv")

In [None]:
# Lower-case every column label, then trim leading/trailing whitespace, so the
# labels can be matched by exact name afterwards.
df.columns = df.columns.str.lower().str.strip()

# Show the resulting labels for a quick sanity check.
print("Columns after load:", list(df.columns))

In [None]:
# Rename columns to match expectations in score_vendors.
# Reassigning (instead of inplace=True) keeps the step chain-friendly; labels
# that are not present in the frame are simply left untouched by rename().
df = df.rename(columns={
    'channel title': 'vendor',
    'message': 'text',
    'date': 'timestamp',
})

# Seed NumPy's global RNG so the simulated values are reproducible. The call is
# kept unconditional so any later code relying on the global seed still sees it.
np.random.seed(42)

# Simulate 'views' ONLY when the scraped data does not already provide the
# column; overwriting unconditionally would silently replace real view counts
# with random numbers.
if 'views' not in df.columns:
    df['views'] = np.random.randint(100, 5000, size=len(df))

In [None]:
# Convert the timestamp column to datetimes; values that cannot be parsed
# become NaT because of errors='coerce'.
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')

# Report how many rows failed to parse, then keep only the rows that did.
if df['timestamp'].isna().any():
    print(f"Dropping {df['timestamp'].isnull().sum()} rows with invalid timestamps")
    df = df[df['timestamp'].notna()]

# Strip timezone info if present (to avoid warnings in score_vendors)
df['timestamp'] = df['timestamp'].dt.tz_localize(None)

In [None]:
import warnings

from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Load your model (replace with your model path or Hugging Face model name).
# NOTE(review): the default below looks like a base (not NER-fine-tuned)
# checkpoint — confirm against its model card, or point this at a fine-tuned one.
model_name = "Davlan/afro-xlmr-base"  # or "models/saved_model_dir" for local fine-tuned model

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# A checkpoint saved without a token-classification head gets a freshly
# initialised head whose labels are the placeholders LABEL_0, LABEL_1, ...
# Predictions from such a head are not meaningful entities, so surface that
# loudly instead of letting it pass silently into the scorecard.
if all(str(label).startswith("LABEL_") for label in model.config.id2label.values()):
    warnings.warn(
        f"Model '{model_name}' only exposes placeholder labels "
        f"{list(model.config.id2label.values())}; it does not look fine-tuned for NER, "
        "so extracted entities may be meaningless. Use a fine-tuned checkpoint."
    )

# Now define the NER pipeline (this rebinds `ner_pipeline` for the cells below)
ner_pipeline = pipeline(
    "token-classification",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple"
)

In [None]:
# Keep only the rows whose text is present and is not blank after trimming.
df_with_text = df.loc[df['text'].notna() & df['text'].str.strip().ne('')]

# Score the vendors with the NER pipeline, write the result to CSV (without the
# index column), and print the first rows.
score_df = score_vendors(df_with_text, ner_pipeline)
score_df.to_csv("../data/vendor_scorecard.csv", index=False)
print(score_df.head())