In [None]:
from sqlalchemy import create_engine
import pandas as pd

# Establish connection using SQLAlchemy
engine = create_engine('postgresql+psycopg2://postgres:password@localhost:5432/dataset_bakalarka')

query = """
WITH eligible_samples AS (
  SELECT url, content, category,
         ROW_NUMBER() OVER (PARTITION BY category ORDER BY RANDOM()) AS rn
  FROM web_data
  WHERE url NOT IN (SELECT url FROM web_features)
)
SELECT url, content, category
FROM eligible_samples
WHERE rn <= 50
AND category NOT IN ('Adult', 'News', 'Recreation', 'Shopping')
LIMIT 500;
"""

chunks = []
for chunk in pd.read_sql_query(query, engine, chunksize=10):
    chunks.append(chunk)

df = pd.concat(chunks, ignore_index=True)


In [24]:
print(df['category'].value_counts())

category
Computers    50
Games        50
Health       50
Reference    50
Science      50
Society      50
Sports       50
Name: count, dtype: int64


In [25]:
from GroupExtractor import ExtractFeatures

label_map = {
        0: 'Adult',
        1: 'Computers',
        2: 'Games',
        3: 'Health',
        4: 'News',
        5: 'Recreation',
        6: 'Reference',
        7: 'Science',
        8: 'Shopping',
        9: 'Society',
        10: 'Sports'
    }

X_new = ExtractFeatures(df['content'])
Y_true = df['category'].map({v: k for k, v in label_map.items()})

  soup_list = [BeautifulSoup(html, 'lxml') for html in html_list]


In [26]:
import joblib
import numpy as np

label_map = {
        0: 'Adult',
        1: 'Computers',
        2: 'Games',
        3: 'Health',
        4: 'News',
        5: 'Recreation',
        6: 'Reference',
        7: 'Science',
        8: 'Shopping',
        9: 'Society',
        10: 'Sports'
    }

model = joblib.load('svm_linear_model.joblib')
y_pred = model.predict(X_new)

from sklearn.metrics import accuracy_score, classification_report

print(classification_report(
    Y_true.map(label_map),
    [label_map[pred] for pred in y_pred]
))


              precision    recall  f1-score   support

   Computers       0.78      0.58      0.67        50
       Games       0.79      0.52      0.63        50
      Health       0.70      0.74      0.72        50
        News       0.00      0.00      0.00         0
  Recreation       0.00      0.00      0.00         0
   Reference       0.81      0.60      0.69        50
     Science       0.76      0.38      0.51        50
    Shopping       0.00      0.00      0.00         0
     Society       0.75      0.72      0.73        50
      Sports       0.84      0.76      0.80        50

    accuracy                           0.61       350
   macro avg       0.54      0.43      0.47       350
weighted avg       0.78      0.61      0.68       350



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
