In [1]:
from datasets import load_dataset, Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
import pandas as pd

## Train Dataset

In [12]:
# Load the labelled URL dataset from the Hugging Face Hub (returns a DatasetDict).
ds = load_dataset("habanoz/classifier_1300_610_fetched")

In [13]:
ds

DatasetDict({
    train: Dataset({
        features: ['url', 'length', 'offset', 'filename', 'tag', 'src', 'label', 'raw', 'text'],
        num_rows: 11462
    })
})

In [14]:
# Materialize the "train" split as a pandas DataFrame for feature engineering.
df_org = ds['train'].to_pandas()

In [2]:
import re
import urllib.parse

def prep_url(url):
    """Reduce a URL to a normalized path/query string for feature extraction.

    Steps:
      1. Percent-decode the URL.
      2. Drop the scheme and host, keeping only what follows the first "/"
         after the host (path, query, fragment).
      3. Replace numeric tokens with placeholders: 20xx/19xx -> "<year>",
         4+ digits -> "<number>", 3 digits -> "<3number>", 2 digits ->
         "<2number>". Numbers with a leading zero and single digits are
         left untouched.

    Args:
        url: URL string, e.g. "https://host/path/2024/item-42".

    Returns:
        The normalized remainder, e.g. "path/<year>/item-<2number>". A URL
        with no path yields "" (or just its "?query" / "#fragment" part), so
        the scheme and host never leak into the features.
    """
    url = urllib.parse.unquote(url)

    scheme_sep = url.find("://")
    # Without a scheme, search from offset 2 -- the value the previous
    # `find(...) + 3` arithmetic produced -- which also skips a leading "//".
    host_start = scheme_sep + 3 if scheme_sep != -1 else 2

    path_sep = url.find("/", host_start)
    if path_sep != -1:
        remainder = url[path_sep + 1:]
    else:
        # No "/" after the host: previously the whole URL (scheme + host) was
        # returned. Keep only a query/fragment if one is present.
        host_and_tail = url[host_start:]
        marks = [
            idx
            for idx in (host_and_tail.find("?"), host_and_tail.find("#"))
            if idx != -1
        ]
        remainder = host_and_tail[min(marks):] if marks else ""

    # Order matters: years are claimed before the generic 4+-digit rule, and
    # longer numbers before shorter ones.
    substitutions = (
        (r"\b[2][0]\d{2}\b", "<year>"),
        (r"\b[1][9]\d{2}\b", "<year>"),
        (r"\b[1-9][0-9]{3,}\b", "<number>"),
        (r"\b[1-9][0-9]{2}\b", "<3number>"),
        (r"\b[1-9][0-9]\b", "<2number>"),
    )
    for pattern, placeholder in substitutions:
        remainder = re.sub(pattern, placeholder, remainder)

    return remainder

def process_row(row):
    """Attach the normalized URL path to one DataFrame row.

    Meant to be used with ``DataFrame.apply(..., axis=1)``: reads the row's
    ``url`` field, stores the result of ``prep_url`` under ``url_p`` and
    returns the row.
    """
    row['url_p'] = prep_url(row.url)
    return row


In [16]:
# Add the normalized-path feature column. Mapping over the single `url`
# column avoids the row-wise DataFrame.apply, which builds a Series per row
# and re-infers every column's dtype; `assign` leaves `df_org` untouched.
df = df_org.assign(url_p=df_org['url'].map(prep_url))

In [17]:
# Hold out every aljazeera URL as the private test set; all remaining rows
# are available for training and validation.
is_aljazeera = df['url'].str.contains('aljazeera', case=False)
private_test = df[is_aljazeera]
train_data = df[~is_aljazeera]

# 90/10 train/validation split of the remaining rows with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    train_data['url_p'],
    train_data['label'],
    test_size=0.1,
    random_state=42,
)

### Push to Hub

In [10]:
# Wrap the feature-augmented frame as a Dataset for upload.
# NOTE: this rebinds `ds`, which earlier held the DatasetDict returned by load_dataset.
ds = Dataset.from_pandas(df)

In [19]:
# ds.push_to_hub("habanoz/classifier_1300_610_url_p")

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/habanoz/classifier_1300_610_url_p/commit/d9daf1a6c08292db50e74631aa91f62732b42f24', commit_message='Upload dataset', commit_description='', oid='d9daf1a6c08292db50e74631aa91f62732b42f24', pr_url=None, pr_revision=None, pr_num=None)

In [20]:
# Two-column (url_p, label) frames for each split.
split_frames = {
    "train": pd.concat([X_train, y_train], axis=1),
    "validation": pd.concat([X_val, y_val], axis=1),
    "test": private_test[['url_p', 'label']],
}

# preserve_index=False drops the pandas index at conversion time, so no
# "__index_level_0__" column is created. This also works when a frame has a
# plain RangeIndex, where remove_columns("__index_level_0__") would raise
# because the column does not exist.
ds_dict = DatasetDict({
    split_name: Dataset.from_pandas(frame, preserve_index=False)
    for split_name, frame in split_frames.items()
})

In [21]:
# ds_dict.push_to_hub("habanoz/classifier_1300_610_url_p_svc_training_splits")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/habanoz/classifier_1300_610_url_p_svc_training_splits/commit/1ba4fcd98366f450d94a0a73d86f7ca7b68400cb', commit_message='Upload dataset', commit_description='', oid='1ba4fcd98366f450d94a0a73d86f7ca7b68400cb', pr_url=None, pr_revision=None, pr_num=None)

## Train an SVM Classifier

In [22]:
# Feature extraction: fit TF-IDF (n-grams of length 1 to 3) on the training
# URLs only, then reuse the fitted vectorizer for the validation and
# private-test inputs so no information from them enters the vocabulary.
vectorizer = TfidfVectorizer(ngram_range=(1,3))
X_train_vectorized = vectorizer.fit_transform(X_train)
X_val_vectorized = vectorizer.transform(X_val)
X_private_test_vectorized = vectorizer.transform(private_test['url_p'])

# RBF-kernel SVM with fixed hyperparameters, scored on the validation split.
clf = SVC(kernel='rbf',C=20, gamma=0.1)
clf.fit(X_train_vectorized, y_train)
y_val_pred = clf.predict(X_val_vectorized)
accuracy = accuracy_score(y_val, y_val_pred)

print(classification_report(y_val, y_val_pred))
print(f"Validation Accuracy: {accuracy:.4f}")

# Evaluate the fitted classifier on the private test set (the held-out
# aljazeera URLs, which were excluded from training and validation).
y_private_test_pred = clf.predict(X_private_test_vectorized)
private_test_accuracy = accuracy_score(private_test['label'], y_private_test_pred)
# Per-class (precision, recall, f-score, support) arrays, printed as a raw tuple.
private_precision_recall_fscore_support = precision_recall_fscore_support(private_test['label'], y_private_test_pred)
print(f"Private Test Set Accuracy: {private_test_accuracy:.4f}")
print(f"Other Private Test Set Metrics: {private_precision_recall_fscore_support}")

              precision    recall  f1-score   support

           0       1.00      0.99      1.00       547
           1       0.99      1.00      0.99       478

    accuracy                           1.00      1025
   macro avg       0.99      1.00      1.00      1025
weighted avg       1.00      1.00      1.00      1025

Validation Accuracy: 0.9951
Private Test Set Accuracy: 0.9352
Other Private Test Set Metrics: (array([0.91811024, 0.95384615]), array([0.9557377, 0.9147541]), array([0.93654618, 0.93389121]), array([610, 610]))


## Save Classifier

In [3]:
import joblib
import os

In [23]:
# Create the model directory if it is missing. exist_ok=True replaces the
# check-then-mkdir pair, removing the race between the check and the create
# and keeping the cell safe to re-run.
os.makedirs("../.models", exist_ok=True)

# Persist the fitted classifier and its vectorizer; both are needed at
# inference time.
joblib.dump(clf, '../.models/news_classifier.joblib')
joblib.dump(vectorizer, '../.models/news_vectorizer.joblib')


['../.models/news_vectorizer.joblib']

## Load Classifier

In [4]:
# Reload the persisted vectorizer and classifier from disk.
# NOTE: joblib.load unpickles arbitrary objects -- only load files you trust.
loaded_vectorizer = joblib.load('../.models/news_vectorizer.joblib')
loaded_cls = joblib.load('../.models/news_classifier.joblib')

def classify_url(url):
    """Predict the label of a single raw URL with the reloaded model.

    The URL is normalized by ``prep_url`` (the normalized form is printed),
    vectorized by ``loaded_vectorizer`` and passed to ``loaded_cls``.

    Returns:
        The predicted label for this one URL.
    """
    processed = prep_url(url)
    print(f"Processed URL: {processed}")
    features = loaded_vectorizer.transform([processed])
    predictions = loaded_cls.predict(features)
    return predictions[0]

# Example usage: classify a few sample URLs and report each prediction.
new_urls = [
    "https://tr.euronews.com/hava-durumu/kuzey-amerika/amerika-birlesik-devletleri/wi/cesme-sehir",
    "https://t24.com.tr/video/kaplan-tapinaginda-40-kaplan-yavrusunun-olusu-bulundu,2279",
    "https://www.t24.com.tr/yazarlar/bekir-agirdir/sorun-cozme-kapasitesi-dusuk-ulke,33689",
]

for new_url in new_urls:
    prediction = classify_url(new_url)
    print(f"\nThe URL {new_url} is classified as: {prediction}")

Processed URL: hava-durumu/kuzey-amerika/amerika-birlesik-devletleri/wi/cesme-sehir

The URL https://tr.euronews.com/hava-durumu/kuzey-amerika/amerika-birlesik-devletleri/wi/cesme-sehir is classified as: 0
Processed URL: video/kaplan-tapinaginda-<2number>-kaplan-yavrusunun-olusu-bulundu,<number>

The URL https://t24.com.tr/video/kaplan-tapinaginda-40-kaplan-yavrusunun-olusu-bulundu,2279 is classified as: 1
Processed URL: yazarlar/bekir-agirdir/sorun-cozme-kapasitesi-dusuk-ulke,<number>

The URL https://www.t24.com.tr/yazarlar/bekir-agirdir/sorun-cozme-kapasitesi-dusuk-ulke,33689 is classified as: 1


## Validation

In [27]:
# Load the train/validation/test splits dataset from the Hub to re-check the saved model.
svc_ds = load_dataset("habanoz/classifier_1300_610_url_p_svc_training_splits")

Downloading readme:   0%|          | 0.00/518 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/337k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/37.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/29.3k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9217 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1025 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1220 [00:00<?, ? examples/s]

In [28]:
# Reload the persisted artifacts so this check exercises exactly what was saved.
loaded_vectorizer = joblib.load('../.models/news_vectorizer.joblib')
loaded_cls = joblib.load('../.models/news_classifier.joblib')

# Score the reloaded model on the "test" split of the downloaded dataset.
test_split = svc_ds['test']
X_private_test_vectorized = loaded_vectorizer.transform(test_split['url_p'])
y_private_test_pred = loaded_cls.predict(X_private_test_vectorized)
private_test_accuracy = accuracy_score(test_split['label'], y_private_test_pred)

print(f"Private Test Set Accuracy: {private_test_accuracy:.4f}")

Private Test Set Accuracy: 0.9352
