In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Load the training and validation splits (train first, then val)
train_df, val_df = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/val.csv')
)

In [3]:
# Helper function for cleaning text
def clean_html(text):
    """Strip HTML tags from a snippet and decode a small set of HTML entities.

    Steps, in order:
      1. Missing values (anything ``pd.isna`` reports as missing) are
         returned unchanged.
      2. Everything between ``<`` and the next ``>`` is removed, including
         tags that contain line breaks.
      3. Runs of whitespace are collapsed to one space and the ends trimmed.
      4. The entities ``&amp; &lt; &gt; &quot; &#34; &apos; &#39;`` are decoded.

    Entities are decoded in a single regex pass so that already-decoded
    output is never decoded a second time (``&amp;lt;`` becomes ``&lt;``,
    not ``<``). Decoding happens after tag removal so that an escaped
    ``&lt;b&gt;`` is kept as literal text rather than stripped as a tag.

    Args:
        text: Raw snippet; converted with ``str()`` when not missing.

    Returns:
        The cleaned string, or ``text`` itself when it is missing.
    """
    if pd.isna(text):
        return text

    # Remove HTML tags; DOTALL lets '.' match newlines inside a tag
    clean = re.sub(r'<.*?>', '', str(text), flags=re.DOTALL)

    # Remove extra whitespace
    clean = re.sub(r'\s+', ' ', clean).strip()

    # Replace HTML entities in one pass (no double-decoding)
    entity_chars = {
        'amp': '&',
        'lt': '<',
        'gt': '>',
        'quot': '"',
        '#34': '"',
        'apos': "'",
        '#39': "'",
    }
    clean = re.sub(
        r'&(amp|lt|gt|quot|apos|#34|#39);',
        lambda match: entity_chars[match.group(1)],
        clean,
    )
    return clean

In [4]:
# Add a cleaned copy of the raw 'snip' column to each split, then show the
# training frame
for split_df in (train_df, val_df):
    split_df['cleaned_text'] = split_df['snip'].apply(clean_html)
print(train_df)

                                                    snip   channel  \
0      first of all, it feels like covid again but in...  FOXNEWSW   
1      to be a software drivenrganization where softw...     CSPAN   
2      you discuss the <b>power</b> <b>of</b> <em>ai<...    CSPAN2   
3      <em>ai</em> <b>bots</b> <b>like</b> chatgpt an...   BBCNEWS   
4      . >> i could sleep <b>ten</b> <b>hours</b> <em...  FOXNEWSW   
...                                                  ...       ...   
19868  cardiovascular science, but they're also pione...  FOXNEWSW   
19869  <b>i</b> <b>of</b> <em>ai</em> <b>in</b> <b>di...   BBCNEWS   
19870  weighing down on the major averages, both tech...      KTVU   
19871  i also <b>think</b> <b>crypto</b> <em>ai</em> ...    CSPAN2   
19872  as we have worked to monitor the adoption iden...    CSPAN2   

                                            cleaned_text  
0      first of all, it feels like covid again but in...  
1      to be a software drivenrganization

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

In [6]:
# Learn the channel -> integer mapping on the training split, then reuse the
# fitted encoder for the validation split.
label_encoder = LabelEncoder()
train_df['channel_encoded'] = label_encoder.fit_transform(train_df['channel'])
val_df['channel_encoded'] = label_encoder.transform(val_df['channel'])

# Features are the cleaned snippets; targets are the encoded channels
X_train, y_train = train_df['cleaned_text'], train_df['channel_encoded']
X_val, y_val = val_df['cleaned_text'], val_df['channel_encoded']

# Summarise which channels the encoder learned from the training split ...
class_names = label_encoder.classes_
print(f"Training classes: {class_names}")
print(f"Num training classes: {len(class_names)}")

# ... and which channels actually occur in the validation split
val_class_names = val_df['channel'].unique()
print(f"\nValidation classes: {val_class_names}")
print(f"Num validation classes: {len(val_class_names)}")

Training classes: ['1TV' 'ALJAZ' 'BBCNEWS' 'BELARUSTV' 'BLOOMBERG' 'CNBC' 'CNNW' 'COM'
 'CSPAN' 'CSPAN2' 'CSPAN3' 'DW' 'FBC' 'FOXNEWSW' 'GBN' 'KDTV' 'KGO' 'KNTV'
 'KPIX' 'KQED' 'KRON' 'KSTS' 'KTVU' 'LINKTV' 'MSNBCW' 'NTV' 'PRESSTV' 'RT'
 'RUSSIA1' 'RUSSIA24' 'SFGTV']
Num training classes: 31

Validation classes: ['BLOOMBERG' 'KPIX' 'CNNW' 'CSPAN' 'BBCNEWS' 'FOXNEWSW' 'KTVU' 'KRON'
 'KNTV' 'FBC' 'CNBC' 'KDTV' 'CSPAN2' 'KGO' 'DW' 'CSPAN3' 'GBN' 'ALJAZ'
 'MSNBCW' 'RT' 'KSTS' 'SFGTV' 'KQED']
Num validation classes: 23


In [8]:
def train_and_evaluate(vectorizer, vectorizer_name, X_train, y_train, X_val, y_val,
                       class_names, solver='liblinear', max_iter=100):
    """
    Trains a Logistic Regression model using the specified vectorizer
    and evaluates its performance.

    The vectorizer is fitted on ``X_train`` only and then applied to
    ``X_val``. Progress messages, overall accuracy and a per-class
    classification report for each split are printed.

    Args:
        vectorizer: Unfitted text vectorizer exposing ``fit_transform`` and
            ``transform`` (it is fitted in place).
        vectorizer_name: Human-readable name used in the printed messages.
        X_train, y_train: Training texts and their integer-encoded labels.
        X_val, y_val: Validation texts and their integer-encoded labels.
        class_names: Sequence indexed by encoded label, giving display names.
        solver: ``LogisticRegression`` solver. Defaults to ``'liblinear'``,
            the value this function previously hard-coded.
        max_iter: Iteration cap passed to ``LogisticRegression``. Defaults to
            100, the value this function previously hard-coded.

    Returns:
        Tuple ``(model, vectorizer)``: the fitted classifier and the fitted
        vectorizer.
    """
    print(f"\n--- Training and Evaluating with {vectorizer_name} ---")

    # Fit the vocabulary on training text only, then reuse it for validation
    print(f"Fitting {vectorizer_name}...")
    X_train_vec = vectorizer.fit_transform(X_train)
    print(f"Transforming validation data with {vectorizer_name}...")
    X_val_vec = vectorizer.transform(X_val)
    print(f"Feature shape (Train): {X_train_vec.shape}")

    # Train Logistic Regression model
    print("Training Logistic Regression model...")
    # random_state is fixed for reproducibility.
    # NOTE(review): recent scikit-learn releases warn about 'liblinear' on
    # multiclass targets — confirm against the installed version.
    model = LogisticRegression(max_iter=max_iter, random_state=42, solver=solver)
    model.fit(X_train_vec, y_train)
    print("Model training complete.")

    # --- Evaluation ---
    print("\nEvaluating model...")
    y_train_pred = model.predict(X_train_vec)
    y_val_pred = model.predict(X_val_vec)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    print(f"\nOverall Accuracy ({vectorizer_name}):")
    print(f"  Training Set: {train_accuracy:.4f}")
    print(f"  Validation Set: {val_accuracy:.4f}")

    # Classification reports (per-class metrics). Only labels that occur in
    # the true or predicted values of a split are reported, and they are
    # passed explicitly so `target_names` always lines up with the rows even
    # when a split is missing some classes.
    for split_name, y_true, y_pred in (
        ("Training", y_train, y_train_pred),
        ("Validation", y_val, y_val_pred),
    ):
        print(f"\nClassification Report - {split_name} Set ({vectorizer_name}):")
        labels = sorted(set(np.unique(y_true)) | set(np.unique(y_pred)))
        names_subset = [class_names[i] for i in labels]
        print(classification_report(
            y_true, y_pred, labels=labels, target_names=names_subset, zero_division=0
        ))

    return model, vectorizer # Return trained model and vectorizer if needed later

# --- 4. TF-IDF features ---
# English stop words are dropped using the vectorizer's built-in list
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_model, _ = train_and_evaluate(
    tfidf_vectorizer,
    "TF-IDF Vectorizer",
    X_train,
    y_train,
    X_val,
    y_val,
    class_names,
)

# --- 5. Raw count features ---
# Same stop-word handling, but plain term counts instead of TF-IDF weights
count_vectorizer = CountVectorizer(stop_words='english')
count_model, _ = train_and_evaluate(
    count_vectorizer,
    "Count Vectorizer",
    X_train,
    y_train,
    X_val,
    y_val,
    class_names,
)

print("\n--- Part A Complete ---")


--- Training and Evaluating with TF-IDF Vectorizer ---
Fitting TF-IDF Vectorizer...
Transforming validation data with TF-IDF Vectorizer...
Feature shape (Train): (19873, 66333)
Training Logistic Regression model...
Model training complete.

Evaluating model...

Overall Accuracy (TF-IDF Vectorizer):
  Training Set: 0.8479
  Validation Set: 0.5153

Classification Report - Training Set (TF-IDF Vectorizer):
              precision    recall  f1-score   support

         1TV       0.95      0.51      0.67       158
       ALJAZ       1.00      0.89      0.94       230
     BBCNEWS       0.70      0.96      0.81      1576
   BELARUSTV       0.97      0.51      0.67       109
   BLOOMBERG       0.91      0.95      0.93      1441
        CNBC       0.87      0.87      0.87      1319
        CNNW       0.82      0.96      0.88      2725
         COM       1.00      0.17      0.30        40
       CSPAN       0.73      0.83      0.78       913
      CSPAN2       0.71      0.69      0.70       9

In [9]:
def train_and_evaluate(vectorizer, vectorizer_name, X_train, y_train, X_val, y_val,
                       class_names, solver='newton-cg', max_iter=100):
    """
    Trains a Logistic Regression model using the specified vectorizer
    and evaluates its performance.

    The vectorizer is fitted on ``X_train`` only and then applied to
    ``X_val``. Progress messages, overall accuracy and a per-class
    classification report for each split are printed.

    Args:
        vectorizer: Unfitted text vectorizer exposing ``fit_transform`` and
            ``transform`` (it is fitted in place).
        vectorizer_name: Human-readable name used in the printed messages.
        X_train, y_train: Training texts and their integer-encoded labels.
        X_val, y_val: Validation texts and their integer-encoded labels.
        class_names: Sequence indexed by encoded label, giving display names.
        solver: ``LogisticRegression`` solver. Defaults to ``'newton-cg'``,
            the value this function previously hard-coded.
        max_iter: Iteration cap passed to ``LogisticRegression``. Defaults to
            100, the value this function previously hard-coded.

    Returns:
        Tuple ``(model, vectorizer)``: the fitted classifier and the fitted
        vectorizer.
    """
    print(f"\n--- Training and Evaluating with {vectorizer_name} ---")

    # Fit the vocabulary on training text only, then reuse it for validation
    print(f"Fitting {vectorizer_name}...")
    X_train_vec = vectorizer.fit_transform(X_train)
    print(f"Transforming validation data with {vectorizer_name}...")
    X_val_vec = vectorizer.transform(X_val)
    print(f"Feature shape (Train): {X_train_vec.shape}")

    # Train Logistic Regression model
    print("Training Logistic Regression model...")
    # random_state is fixed for reproducibility
    model = LogisticRegression(max_iter=max_iter, random_state=42, solver=solver)
    model.fit(X_train_vec, y_train)
    print("Model training complete.")

    # --- Evaluation ---
    print("\nEvaluating model...")
    y_train_pred = model.predict(X_train_vec)
    y_val_pred = model.predict(X_val_vec)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    print(f"\nOverall Accuracy ({vectorizer_name}):")
    print(f"  Training Set: {train_accuracy:.4f}")
    print(f"  Validation Set: {val_accuracy:.4f}")

    # Classification reports (per-class metrics). Only labels that occur in
    # the true or predicted values of a split are reported, and they are
    # passed explicitly so `target_names` always lines up with the rows even
    # when a split is missing some classes.
    for split_name, y_true, y_pred in (
        ("Training", y_train, y_train_pred),
        ("Validation", y_val, y_val_pred),
    ):
        print(f"\nClassification Report - {split_name} Set ({vectorizer_name}):")
        labels = sorted(set(np.unique(y_true)) | set(np.unique(y_pred)))
        names_subset = [class_names[i] for i in labels]
        print(classification_report(
            y_true, y_pred, labels=labels, target_names=names_subset, zero_division=0
        ))

    return model, vectorizer # Return trained model and vectorizer if needed later

# --- 4. TF-IDF features ---
# English stop words are dropped using the vectorizer's built-in list
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_model, _ = train_and_evaluate(
    tfidf_vectorizer,
    "TF-IDF Vectorizer",
    X_train,
    y_train,
    X_val,
    y_val,
    class_names,
)

# --- 5. Raw count features ---
# Same stop-word handling, but plain term counts instead of TF-IDF weights
count_vectorizer = CountVectorizer(stop_words='english')
count_model, _ = train_and_evaluate(
    count_vectorizer,
    "Count Vectorizer",
    X_train,
    y_train,
    X_val,
    y_val,
    class_names,
)

print("\n--- Part A Complete ---")


--- Training and Evaluating with TF-IDF Vectorizer ---
Fitting TF-IDF Vectorizer...
Transforming validation data with TF-IDF Vectorizer...
Feature shape (Train): (19873, 66333)
Training Logistic Regression model...
Model training complete.

Evaluating model...

Overall Accuracy (TF-IDF Vectorizer):
  Training Set: 0.8683
  Validation Set: 0.5235

Classification Report - Training Set (TF-IDF Vectorizer):
              precision    recall  f1-score   support

         1TV       0.97      0.60      0.74       158
       ALJAZ       1.00      0.91      0.95       230
     BBCNEWS       0.74      0.97      0.84      1576
   BELARUSTV       0.97      0.59      0.73       109
   BLOOMBERG       0.93      0.95      0.94      1441
        CNBC       0.88      0.90      0.89      1319
        CNNW       0.85      0.97      0.91      2725
         COM       1.00      0.17      0.30        40
       CSPAN       0.74      0.85      0.79       913
      CSPAN2       0.72      0.73      0.73       9

In [6]:
# ---------------------- 1. Imports & basic setup ----------------------
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import Dataset, DatasetDict, load_metric

# ---------------------- 2. Source data --------------------------------
# Uses the `train_df` frame already held in memory. To start from disk
# instead, load it first, e.g.:
# train_df = pd.read_csv("your_dataframe.csv")

labels = train_df["channel"].tolist()
texts = train_df["cleaned_text"].astype(str).tolist()

# Map channel names to integer class IDs
le = LabelEncoder()
int_labels = le.fit_transform(labels)

# Two-way lookup between class IDs and channel names (used at inference)
id2label = dict(enumerate(le.classes_))
label2id = {name: idx for idx, name in id2label.items()}

# ---------------------- 3. Train/validation split --------------------
# Hold out 10% of the rows, stratified by label, with a fixed seed
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    int_labels,
    test_size=0.1,
    random_state=42,
    stratify=int_labels,
)

# Wrap each split in a 🤗 Dataset and group them under named keys
train_ds = Dataset.from_dict(dict(text=train_texts, label=train_labels))
val_ds = Dataset.from_dict(dict(text=val_texts, label=val_labels))

dataset = DatasetDict({"train": train_ds, "validation": val_ds})

# ---------------------- 4. Tokenisation function ---------------------
# MODEL_NAME is the single checkpoint identifier used for both the tokenizer
# (here) and the classification model (created later in this cell), so the
# two always come from the same pretrained checkpoint.
MODEL_NAME = "distilbert-base-uncased"           # lightweight; swap for another model if desired
tokenizer  = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    """Tokenize the "text" field of a batch with the module-level tokenizer.

    Truncation is enabled; no padding is requested here.
    """
    batch_texts = batch["text"]
    return tokenizer(batch_texts, truncation=True)

# Tokenize both splits in batches; the raw "text" column is dropped so only
# the tokenizer outputs and the "label" column remain.
dataset = dataset.map(tokenize, batched=True, remove_columns=["text"])

# ---------------------- 5. Model & training prep ---------------------
# One output unit per channel known to the label encoder.
num_labels = len(le.classes_)                    # =31
# The id<->label mappings are handed to the model so they travel with it.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

# tokenize() does not pad, so padding is left to the collator at batch time.
data_collator = DataCollatorWithPadding(tokenizer)

# NOTE(review): `load_metric` is reported as deprecated/removed in newer
# `datasets` releases — confirm against the installed version.
accuracy = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Compute classification accuracy for a ``Trainer`` evaluation pass.

    Accuracy is calculated directly with NumPy, so this function does not
    depend on a separately loaded metric object.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` holds one score
            per class along the last axis; ``labels`` holds the true
            integer class IDs.

    Returns:
        ``{"accuracy": value}`` where ``value`` is the fraction of examples
        whose highest-scoring class equals the true label, as a Python float.
    """
    logits, labels = eval_pred
    # Highest-scoring class along the last axis is the prediction
    preds = np.asarray(logits).argmax(-1)
    return {"accuracy": float(np.mean(preds == np.asarray(labels)))}

# Training configuration: evaluate and checkpoint once per epoch, then
# reload the checkpoint with the best "accuracy" when training finishes.
# NOTE(review): newer `transformers` releases are reported to rename
# `evaluation_strategy` to `eval_strategy` — confirm against the installed
# version.
training_args = TrainingArguments(
    output_dir="news-channel-clf",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    # Expected to correspond to the metric produced by compute_metrics
    metric_for_best_model="accuracy",
    report_to="none"               # disable W&B/Comet if not needed
)

# ---------------------- 6. Trainer -------------------------------
# Ties together the model, the tokenized splits, the padding collator and
# the metric function.
# NOTE(review): newer `transformers` releases are reported to deprecate the
# `tokenizer=` argument in favour of `processing_class=` — confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# ---------------------- 7. Train! -------------------------------
trainer.train()

# ---------------------- 8. Quick evaluation ----------------------
# Evaluates on the eval_dataset given to the Trainer (the held-out 10% of
# train_df, not the separate val_df).
print(trainer.evaluate())

# ---------------------- 9. Saving -------------------------------
# Model and tokenizer are written to the same directory so they can be
# reloaded together.
trainer.save_model("news-channel-clf/best_model")
tokenizer.save_pretrained("news-channel-clf/best_model")

# To reload later:
# model = AutoModelForSequenceClassification.from_pretrained("news-channel-clf/best_model")
# tokenizer = AutoTokenizer.from_pretrained("news-channel-clf/best_model")

# ---------------------- 10. Inference helper ---------------------
def predict_channel(raw_text: str) -> str:
    """Predict the channel name for a single piece of text.

    Uses the module-level ``tokenizer``, ``model`` and ``id2label`` mapping.
    The model is switched to evaluation mode and the forward pass runs
    without gradient tracking, so dropout cannot perturb the prediction and
    no autograd graph is built.

    Args:
        raw_text: The text to classify (truncated by the tokenizer if needed).

    Returns:
        The channel name mapped from the highest-scoring class ID.
    """
    # Disable dropout and other train-only behaviour before predicting
    model.eval()
    inputs = tokenizer(raw_text, return_tensors="pt", truncation=True).to(model.device)
    # Inference only: skip gradient bookkeeping
    with torch.no_grad():
        logits = model(**inputs).logits
    pred_id = logits.argmax(-1).item()
    return str(id2label[pred_id])

# Example:
# print(predict_channel("Breaking: central bank raises interest rates again..."))

ModuleNotFoundError: No module named 'transformers'

In [2]:
from transformers import AutoTokenizer
from datasets import Dataset


  from .autonotebook import tqdm as notebook_tqdm
