In [None]:
from sqlalchemy import create_engine
import pandas as pd
import gc

import os

# Establish connection using SQLAlchemy.
# The connection URL is taken from the DATABASE_URL environment variable so that
# credentials do not have to be stored in the notebook; the fallback is the
# original local-development URL, so the cell still runs without any setup.
DATABASE_URL = os.environ.get(
    "DATABASE_URL",
    'postgresql+psycopg2://postgres:password@localhost:5432/dataset_bakalarka',
)
engine = create_engine(DATABASE_URL)

# Select up to 100,000 rows of a single category in random order.
# ORDER BY RANDOM() is evaluated by the database on every run, so the sample
# differs between runs.
# NOTE(review): the outputs recorded further down in this notebook show the
# category "Shopping" while this query selects 'Computers' — confirm which
# category the saved results were produced with.
query = """
SELECT content, category
FROM (
  SELECT content, category,
         ROW_NUMBER() OVER (PARTITION BY category ORDER BY RANDOM()) AS rn
  FROM web_data
  WHERE category = 'Computers'
) sub
WHERE rn <= 100000
"""

# Read the result in 10,000-row chunks and concatenate them once at the end.
chunks = list(pd.read_sql_query(query, engine, chunksize=10000))

df = pd.concat(chunks, ignore_index=True)
# Drop the per-chunk frames now that they have been copied into `df`.
del chunks
gc.collect()
df.head()

Unnamed: 0,content,category
0,"<!DOCTYPE html><html lang=""en-US""><head><meta ...",Computers
1,"<script type=""text/javascript"" src=""/?q6qcZpOO...",Computers
2,"\r\n\r\n\r\n<!DOCTYPE html>\r\n<html xmlns=""ht...",Computers
3,"<html>\n<head>\n <meta http-equiv=""Content-T...",Computers
4,"<!doctype html>\n<html class=""no-js no-touch"" ...",Computers


In [2]:
from bs4 import BeautifulSoup

def clean_html(text):
    """Return the text inside the <body> of an HTML document.

    Parameters
    ----------
    text : str or bytes
        Raw HTML markup. Anything else (e.g. None or NaN from a NULL database
        value) is treated as "no document".

    Returns
    -------
    str
        The body's text nodes joined with single spaces, or "" when the input
        is missing, empty, or has no <body> element.
    """
    # Missing values would make BeautifulSoup raise; an empty document has no body.
    if not isinstance(text, (str, bytes)) or not text:
        return ""

    soup = BeautifulSoup(text, "lxml")
    body = soup.body
    if body is None:
        return ""
    return body.get_text(separator=" ")

# Extract the body text of every page into a new column.
df['clean_content'] = df['content'].map(clean_html)
df.head()
# Displayed cell result: number of rows per category.
df['category'].value_counts()

  soup = BeautifulSoup(text, "lxml")


category
Shopping    37261
Name: count, dtype: int64

In [None]:
import spacy

# prefer_gpu() returns True when spaCy activated the GPU and False when it falls
# back to the CPU. require_gpu() raises instead of returning False, so it can
# neither report the GPU status nor let the notebook run on a CPU-only machine.
is_gpu_enabled = spacy.prefer_gpu()
print(f"Is GPU enabled: {is_gpu_enabled}")
nlp = spacy.load('en_core_web_md')
# Allow documents of up to 5,000,000 characters to be processed.
nlp.max_length = 5000000

def lemmatize_text(text):
    """Lemmatize `text` with the module-level spaCy pipeline `nlp`.

    Returns a single string made of the lemma of every token, separated by
    single spaces.
    """
    lemmas = (token.lemma_ for token in nlp(text))
    return " ".join(lemmas)

# Replace the extracted page text with its lemmatized form.
df['clean_content'] = df['clean_content'].map(lemmatize_text)

Is GPU enabled: True


In [4]:
from langdetect import detect

def is_english(text):
    """Return True when `detect` classifies `text` as English ('en').

    Detection is best-effort: any ordinary error raised by `detect` (for
    example on empty text) is treated as "not English" and yields False.
    """
    try:
        return detect(text) == 'en'
    except Exception:
        # Deliberately not a bare `except:` — KeyboardInterrupt and SystemExit
        # must propagate so a long-running .apply() can still be interrupted.
        return False

# langdetect's detection is randomized unless a seed is set; fixing the seed
# makes the language flags (and therefore the rows kept below) reproducible.
from langdetect import DetectorFactory

DetectorFactory.seed = 0

# Apply the language detection function
df['is_english'] = df['clean_content'].apply(is_english)

# Calculate the number of non-English samples
non_english_count = df['is_english'].value_counts().get(False, 0)
print(f"Number of non-English samples removed: {non_english_count}")

# Filter out non-English samples
df = df[df['is_english']].drop(columns=['is_english'])

Number of non-English samples removed: 467


In [None]:
import pandas as pd
import re
import string


# Read the base stopword list, one word per line. The encoding is given
# explicitly so the file is decoded the same way on every platform instead of
# with the locale-dependent default.
with open("stopwords-en.txt", "r", encoding="utf-8") as file:
    stopwords_list = file.read().splitlines()

stopwords = set(stopwords_list)
# Additional hand-picked stopwords that are merged into `stopwords` below.
# The collection is built with set(), so entries that are listed more than once
# (e.g. 'start', 'member', 'update') collapse to a single element.
custom_stopwords = set([
    # Generic website / boilerplate vocabulary
    'contact', 'service', 'policy', 'site', 'privacy', 'support', 'email', 'blog',
    'post', 'learn', 'read', 'offer', 'provide', 'include', 'click', 'update',
    'feature', 'link', 'search', 'website', 'program', 'start', 'view', 'resource',
    'experience', 'list', 'free', 'info', 'shop', 'video', 'share', 'member',
    'add', 'start', 'work', 'order', 'day', 'people', 'history', 'office',
    'time', 'year', 'event', 'national', 'state', 'high', 'month', 'week', 'open',
    'cookies', 'menu', 'cart', 'browser', 'select', 'choose', 'hope', 'enjoy', 'disabled',
    'facebook', 'twitter', 'youtube', 'instagram', 'account', 'cookie', 'subscribe',
    'newsletter', 'sign', 'message', 'comment', 'form', 'login', 'user', 'member',
    'join', 'write', 'update', 'search', 'review',
    # Month names, weekday names, their abbreviations and other date words
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august',
    'september', 'october', 'november', 'december', 'year', 'today', 'yesterday', 'tomorrow', 'datum', 'date',
    'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun',
    # Mostly place names
    'regional', 'albuquerque', 'chicago', 'minneapolis', 'philadelphia', 'phoenix', 'rhode', 'island', 'scottsdale', 'washington', 'wisconsin', 'michigan',
    'bay', 'beach', 'dakota', 'florida', 'georgia', 'hampshire', 'harbor', 'iowa', 'maine',  'missouri', 'park', 'virginia', 'vista', 'wisconsin', 'massachusetts',
    'minnesota',
    # Presumably the individual words of recurring page phrases
    # (e.g. 'skip', 'content', 'main'; 'los', 'angeles') — TODO confirm origin
    'skip', 'content', 'main', 'term', 'condition', 'toggle', 'navigation', 'wordpress', 'social', 'medium', 'upcoming', 'event',
    'photo', 'gallery', 'news', 'frequently', 'question', 'ask', 'press', 'release', 'quick', 'link', 'continue', 'read', 'phone', 'fax', 'answer', 'question',
    'board', 'director', 'real', 'estate', 'los', 'angeles', 'new', 'york', 'city', 'san', 'francisco', 'power', 'united', 'kingdom', 'states', 'america', 'fran', 'ais',
    'north', 'carolina', 'las', 'vegas', 'annual', 'report', 'highly', 'recommend', 'rss', 'feed', 'white', 'paper', 'hong', 'kong', 'credit', 'card', 'mental', 'health', 'public', 'save', 'money',
    'annual', 'meeting', 'wide', 'range', 'care', 'gift', 'professional', 'live', 'stream', 'quality', 'product', 'project', 'management', 'meet', 'nonprofit', 'organization', 'blogthis', 'pinter',
    'design', 'success', 'story', 'summer', 'camp', 'chain', 'register', 'trademark', 'username', 'password', 'certificate', 'plan', 'visit', 'regular', 'price', 'covid', 'pandemic', 'south', 'africa', 'west', 'east', 'regional',

    # Bait words for shopping
    'accessories', 'custom', 'art', 'online', 'copyright', 'item', 'special', 'company', 'create', 'business', 'book', 'collection', 'sell', 'hand', 'size', 'close',
    'love', 'faq', 'set', 'family', 'check', 'box', 'black', 'life', 'easy', 'follow', 'kit', 'style', 'usa', 'address', 'natural', 'color', 'equipment', 'american',
    'jewelry', 'return', 'kits', 'music', 'original', 'books', 'food', 'gifts', 'fine', 'brand', 'water', 'body', 'selection', 'flower', 'series', 'car', 'wood', 'receive', 'accessory', 'model', 'option',
    'tools', 'glass', 'light', 'furniture', 'bags', 'perfect', 'oil', 'cover', 'baby', 'piece', 'classic', 'cards', 'red', 'air', 'dog', 'blue', 'christmas', 'type', 'bag', 'women', 'gold', 'wall', 'wedding',
    'silver', 'green', 'lead', 'outdoor', 'compare', 'coffee', 'usd', 'tea', 'inch', 'wine', 'rights', 'mail', 'unique', 'send', 'print', 'log', 'build', 'hour', 'beautiful', 'browse', 'map', 'pay', 'guarantee', 'guide',
    'conditions', 'team', 'complete', 'country', 'late', 'base', 'change', 'feel', 'international', 'personal', 'house', 'leave', 'fit', 'material', 'popular', 'standard', 'play', 'performance', 'repair', 'club',
    'pro', 'center', 'hot', 'control', 'safety', 'single', 'chocolate', 'spring', 'skin', 'cut', 'watch', 'star', 'party', 'details', 'bar', 'door', 'mini', 'game', 'lighting', 'steel', 'storage', 'clearance', 'rings',
    'cat', 'birthday', 'golf', 'msrp', 'reserve', 'links', 'artist', 'image', 'fresh', 'process', 'local', 'happy', 'road', 'variety', 'request', 'fun', 'fast', 'note', 'grow', 'safe', 'featured', 'picture',
    'travel', 'holiday', 'deliver', 'child', 'paint', 'wear', 'deal', 'training', 'display', 'pattern', 'digital', 'pre', 'code', 'edition', 'systems', 'frame', 'stand', 'tree', 'replacement', 'track', 'super',
    'dvd', 'head', 'engine', 'weight', 'audio', 'hair', 'medical', 'scale', 'hats', 'lights', 'battery', 'enter', 'friend', 'street', 'result', 'discover', 'download', 'favorite', 'canada', 'require', 'traditional', 'ready',
    'choice', 'school', 'modern', 'carry', 'hard', 'class', 'premium', 'finish', 'key', 'security', 'extra', 'machine', 'short', 'military', 'mount', 'double', 'decor', 'soft', 'horse', 'pink', 'wheel', 'plastic', 'ball', 'crystal',
    'bed', 'boat', 'vinyl', 'bead', 'guitar', 'seed', 'cable', 'gloves', 'filters', 'jackets', 'wheels', 'flags', 'testimonial', 'bring', 'location', 'accept', 'current', 'touch', 'market', 'media', 'heart', 'warranty', 'level', 'pick',
    'commercial', 'step', 'space', 'animal', 'antique', 'vehicle', 'essential', 'dry', 'eye', 'clean', 'unit', 'personalized', 'fish', 'protection', 'plate', 'electric', 'quantity', 'floor', 'cross', 'hardware', 'fishing', 'games', 'bird',
    'wire', 'covers', 'tops', 'icon', 'filter', 'chair', 'tables', 'coin', 'tie', 'pool', 'stay', 'simple', 'javascript', 'industry', 'protect', 'express', 'cost', 'specialty', 'field', 'lot', 'retail', 'drive', 'mobile', 'age', 'advanced',
    'john', 'software', 'designs', 'sound', 'exclusive', 'sweet', 'apple', 'woman', 'shape', 'rock', 'foot', 'sea', 'french', 'flowers', 'window', 'solid', 'western', 'arrivals', 'candy', 'earrings', 'ford', 'fly', 'fuel', 'knife', 'socks',
    'pins', 'stamps', 'serve', 'dealer', 'studio', 'videos', 'source', 'award', 'technology', 'secure', 'handmade', 'california', 'trade', 'healthy', 'painting', 'excellent', 'season', 'collections', 'certificates', 'logo', 'stone', 'fruit',
    'children', 'truck', 'mix', 'king', 'winter', 'memorial', 'racing', 'rose', 'bulk', 'yellow', 'bath', 'soap', 'pads', 'seat', 'bike', 'hat', 'islands', 'cheese', 'cables', 'pants', 'mens', 'batteries',
    'community', 'connect', 'solution', 'corporate', 'sitemap', 'master', 'amazing', 'english', 'friendly', 'ensure', 'farm', 'prints', 'total', 'bear', 'rate', 'blend', 'floral', 'cool', 'dark', 'multi', 'dollar', 'kid',
    'cream', 'marine', 'label', 'fashion', 'wooden', 'mask', 'brown', 'toy', 'packs', 'pin', 'camera', 'silk', 'stamp', 'vat', 'flag', 'events', 'submit', 'paypal', 'idea', 'terms', 'access', 'google', 'staff', 'sales', 'occasion', 'button',
    'title', 'night', 'british', 'apply', 'pure', 'match', 'ground', 'speed', 'auto', 'flat', 'band', 'cup', 'cleaning', 'awards', 'magic', 'stainless', 'sterling', 'university', 'sauce', 'models', 'frames', 'masks', 'county', 'cake', 'shorts',
    # NOTE: the run from 'events' through 'shorts' on the next two lines repeats
    # entries already listed just above; the duplicates are harmless in a set.
    'rugs', 'events', 'submit', 'paypal', 'idea', 'terms', 'access', 'google', 'staff', 'sales', 'occasion', 'button', 'title', 'night', 'british', 'apply', 'pure', 'match', 'ground', 'speed', 'auto', 'flat', 'band', 'cup', 'cleaning', 'awards',
    'magic', 'stainless', 'sterling', 'university', 'sauce', 'models', 'frames', 'masks', 'county', 'cake', 'shorts', 'resources', 'garden', 'improve', 'flavor', 'individual', 'australia', 'direct', 'theme', 'tips', 'texas', 'issue', 'record',
    'fall', 'wild', 'heavy', 'comfort', 'block', 'radio', 'orange', 'bracelets', 'girl', 'pocket', 'sleep', 'gun', 'plates', 'candle', 'mats', 'sleeve', 'clock', 'rug', 'seeds', 'pond', 'chess',
    'owner', 'explore', 'nature', 'recipes', 'taste', 'simply', 'installation', 'hold', 'expert', 'rare', 'magazine', 'mark', 'recipe', 'charge', 'drop', 'energy', 'mountain', 'factory', 'handle', 'adult', 'screen', 'seller', 'patterns',
    'interior', 'square', 'shower', 'honey', 'caps', 'doll', 'personalize', 'florist', 'brake', 'footwear', 'llc', 'mailing', 'locate', 'win', 'enable', 'notice', 'worldwide', 'pricing', 'mind', 'daily', 'pinterest', 'quote', 'application', 'expand',
    'anniversary', 'mother', 'edge', 'deep', 'sheet', 'roll', 'salt', 'canvas', 'motorcycle', 'bars', 'library', 'gas', 'rod', 'favorites', 'belts', 'cad', 'quilt', 'pumps', 'sku',
    'hear', 'wonderful', 'ingredient', 'method', 'treat', 'additional', 'device', 'chart', 'grade', 'arrive', 'war', 'basic', 'heat', 'colour', 'panel', 'santa', 'indoor', 'ice', 'powder', 'iron', 'fiber', 'arrangement', 'nut', 'pump', 'wool',
    'dining', 'baseball', 'football', 'tile', 'piano', 'iphone', 'funeral', 'client', 'benefit', 'true', 'maintenance', 'memory', 'tech', 'treatment', 'cold', 'aluminum', 'motor', 'acrylic', 'auction', 'dance', 'bass', 'tube', 'cap', 'stitch',
    'pad', 'tape', 'holders', 'jewellery', 'boards', 'signs', 'racks', 'trees', 'height', 'beads', 'bulbs', 'cigar', 'ties', 'develop', 'testimonials', 'minute', 'stuff', 'specials', 'tour', 'gourmet', 'arrow', 'train', 'sugar', 'film',
    'miscellaneous', 'weather', 'suit', 'slide', 'portable', 'brass', 'oils', 'desk', 'purple', 'necklace', 'college', 'belt', 'stands', 'aid', 'mounts', 'maps', 'ornaments', 'exhaust', 'boot', 'manufacture', 'inventory', 'policies', 'production',
    'previous', 'purpose', 'status', 'ultimate', 'living', 'stick', 'london', 'pop', 'fan', 'royal', 'ideas', 'file', 'letter', 'ultra', 'rear', 'colors', 'bbq', 'string', 'rubber', 'rack', 'clocks', 'embroidery', 'beer', 'led', 'esc', 'chf',
     'visa', 'future', 'instruction', 'person', 'sample', 'refund', 'advice', 'player', 'action', 'walk', 'decorative', 'race', 'instrument', 'wrap', 'industrial', 'grand', 'manual', 'measure', 'duty', 'jersey', 'copper', 'fitness', 'pearl',
    'tank', 'suspension', 'tags', 'hospital', 'ski', 'arms'

    
])
# Union of the base and custom stopwords, rebound to an alphabetically sorted list.
stopwords = sorted(stopwords | custom_stopwords)

# Translation table and patterns are built once instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_WHITESPACE_RE = re.compile(r'\s+')
_DIGITS_RE = re.compile(r'\d+')
_NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')


def further_clean_text(text, stopwords):
    """Normalize `text` and drop short words and stopwords.

    Steps, in order: collapse whitespace runs to one space, delete digits,
    replace runs of non-ASCII characters with a space, delete ASCII
    punctuation, lower-case, then keep only words that have at least three
    characters and are not in `stopwords`.

    Parameters
    ----------
    text : str
        Text to clean. Non-string values (e.g. None or NaN) yield "".
    stopwords : iterable of str
        Words to remove. Any iterable is accepted; it is converted to a set
        once per call so each membership test is a hash lookup rather than a
        linear scan (the notebook passes a sorted list).

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    if not isinstance(text, str):
        return ""
    if not isinstance(stopwords, (set, frozenset)):
        stopwords = set(stopwords)

    # Normalize spaces; replaces all kinds of whitespace with a single space
    text = _WHITESPACE_RE.sub(' ', text)

    # Remove all numbers (digits) from the text
    text = _DIGITS_RE.sub('', text)

    # Remove non-English (non-ASCII) characters
    text = _NON_ASCII_RE.sub(' ', text)

    # Remove punctuation, then lower-case to standardize for stopword removal
    text = text.translate(_PUNCTUATION_TABLE).lower()

    # Joining the split words already leaves no leading/trailing whitespace
    return ' '.join(
        word for word in text.split()
        if len(word) >= 3 and word not in stopwords
    )

# `stopwords` is a sorted list at this point; testing membership in a list for
# every word of every page is a linear scan, so build the lookup set once here.
stopword_set = frozenset(stopwords)
df['clean_content'] = df['clean_content'].apply(lambda x: further_clean_text(x, stopword_set))
df.head()

Unnamed: 0,content,category,clean_content
0,"<!doctype html>\n<html xmlns:og=""http://opengr...",Shopping,shipping shipping shipping divine chakra cherr...
1,"\n<!DOCTYPE html>\n\n<html lang=""en"">\n<head>\...",Shopping,loading qty flash pause repeat repeat mute unm...
2,"<!DOCTYPE html><html\nlang=""en-US""><head><meta...",Shopping,tennis fuse tennis geeks heelys unisex kids vo...
3,"<!DOCTYPE html>\n<html lang=""en"">\n<head>\n\t<...",Shopping,tulegoose pillow pillows comforters tulegoose ...
4,"\r\n<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4....",Shopping,services shopping basket basket preprinted rev...


In [22]:

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# TF-IDF model limited to 100 vocabulary terms, with scikit-learn's built-in
# English stop-word list applied.
vectorizer = TfidfVectorizer(stop_words='english', max_features=100)
tfidf_matrix = vectorizer.fit_transform(df['clean_content'])

# Average weight of each term over all documents, as a flat 1-D array.
feature_names = vectorizer.get_feature_names_out()
mean_tfidf = np.asarray(tfidf_matrix.mean(axis=0)).ravel()

# Terms ordered from highest to lowest mean TF-IDF weight.
descending = np.argsort(mean_tfidf)[::-1]
top_keywords = [feature_names[idx] for idx in descending]

# Print as a Python array
print(top_keywords)


['products', 'sale', 'shipping', 'store', 'stock', 'purchase', 'delivery', 'shopping', 'ship', 'catalog', 'services', 'supply', 'items', 'wholesale', 'checkout', 'category', 'craft', 'tool', 'pack', 'payment', 'supplies', 'vintage', 'returns', 'table', 'produce', 'gear', 'discount', 'categories', 'plant', 'limited', 'reviews', 'metal', 'options', 'basket', 'pet', 'brands', 'wishlist', 'clothing', 'organic', 'designer', 'sport', 'faqs', 'fabric', 'sports', 'leather', 'boxes', 'kitchen', 'sets', 'kids', 'apparel', 'package', 'manufacturer', 'beauty', 'shirt', 'shirts', 'toll', 'private', 'remember', 'articles', 'addition', 'bottle', 'article', 'shoes', 'toys', 'building', 'sellers', 'diamond', 'cotton', 'wait', 'increase', 'strong', 'practice', 'david', 'dress', 'electronic', 'chairs', 'boots', 'native', 'baskets', 'cast', 'pair', 'necklaces', 'cars', 'milk', 'indian', 'shoe', 'herbal', 'spray', 'lock', 'plants', 'coat', 'dresses', 'trailer', 'navy', 'coins', 'jacket', 'roast', 'yarn', '