In [None]:
import gc
import os

import pandas as pd
from sqlalchemy import create_engine

# The connection string is taken from the DATASET_DB_URL environment variable so
# that real credentials never need to be written into the notebook. The fallback
# is the original local-development default, so existing setups keep working.
DB_URL = os.environ.get(
    "DATASET_DB_URL",
    'postgresql+psycopg2://postgres:password@localhost:5432/dataset_bakalarka',
)
CHUNK_SIZE = 10000  # rows per chunk while streaming the query result

# Establish connection using SQLAlchemy
engine = create_engine(DB_URL)

# Random sample of at most 100000 rows of the 'Games' category: rows are
# numbered in random order within the category and only rn <= 100000 is kept.
query = """
SELECT content, category
FROM (
  SELECT content, category,
         ROW_NUMBER() OVER (PARTITION BY category ORDER BY RANDOM()) AS rn
  FROM web_data
  WHERE category = 'Games'
) sub
WHERE rn <= 100000
"""

# Stream the result in chunks, then stitch the pieces into one frame.
chunks = list(pd.read_sql_query(query, engine, chunksize=CHUNK_SIZE))

df = pd.concat(chunks, ignore_index=True)
# Release the per-chunk frames immediately; kernel memory persists across cells.
del chunks
gc.collect()
df.head()

Unnamed: 0,content,category
0,"\n<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0...",Games
1,"<!DOCTYPE html>\r\n<html lang=""en-US"" class=""u...",Games
2,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.1//...",Games
3,"<!doctype html><html lang=""en""><head><meta cha...",Games
4,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.01 T...",Games


In [2]:
from bs4 import BeautifulSoup

def clean_html(text):
    """Return the text contained in the <body> of an HTML document.

    Args:
        text: Raw HTML markup as ``str`` or ``bytes``. Anything else
            (e.g. ``None`` or ``NaN`` standing in for a missing value) is
            treated as an empty document.

    Returns:
        The body's text with text nodes joined by a single space, or ``""``
        when the input is missing, empty, or has no <body> element.
    """
    # Missing values are not markup; handing them to the parser would raise
    # and abort the whole DataFrame.apply over one bad row.
    if not isinstance(text, (str, bytes)) or not text:
        return ""
    soup = BeautifulSoup(text, "lxml")
    body = soup.body
    return body.get_text(separator=" ") if body else ""

# Strip the markup from every page and keep the body text in a new column.
df['clean_content'] = df['content'].apply(clean_html)
# Rows per category that were actually loaded. This is the cell's displayed
# result; a bare expression before the last line would display nothing.
df['category'].value_counts()

  soup = BeautifulSoup(text, "lxml")


category
Games    6404
Name: count, dtype: int64

In [None]:
import spacy

# require_gpu() raises when no GPU can be used, so reaching the print below
# already implies the value is True.
is_gpu_enabled = spacy.require_gpu()
print(f"Is GPU enabled: {is_gpu_enabled}")
# Only token lemmas are read from this pipeline. The dependency parser and NER
# are the components whose memory use max_length exists to bound, so they are
# switched off before the limit is raised. They stay loaded and can be turned
# back on with nlp.enable_pipe() if needed.
nlp = spacy.load('en_core_web_md', disable=['parser', 'ner'])
nlp.max_length = 5000000

def lemmatize_text(text):
    """Return ``text`` with every token replaced by its lemma.

    The text is run through the module-level spaCy pipeline ``nlp`` and the
    lemmas of the resulting tokens are joined with single spaces.
    """
    lemmas = (tok.lemma_ for tok in nlp(text))
    return " ".join(lemmas)

# Overwrite the extracted text with its lemmatized form, one document at a time.
df['clean_content'] = df['clean_content'].map(lemmatize_text)

Is GPU enabled: True


In [4]:
from langdetect import detect

def is_english(text):
    """Tell whether ``text`` is detected as English.

    Args:
        text: The string to classify.

    Returns:
        ``True`` when the detector reports ``'en'``. ``False`` for any other
        language, for empty/blank or non-string input, and when detection
        fails for any reason.
    """
    # Nothing to detect on; answer directly instead of relying on the
    # detector raising for such input.
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == 'en'
    except Exception:
        # Best effort: an undetectable document counts as non-English.
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still
        # stops a long-running apply.
        return False

# Flag every document with the detector's verdict (True means English).
df['is_english'] = df['clean_content'].map(is_english)

# Report how many rows are about to be discarded.
non_english_count = df['is_english'].value_counts().get(False, 0)
print(f"Number of non-English samples removed: {non_english_count}")

# Keep the English rows only, then drop the helper flag column again.
df = df.loc[df['is_english']].drop(columns='is_english')

Number of non-English samples removed: 110


In [None]:
import pandas as pd
import re
import string


# Base stop-word list read from disk, one entry per line of the file. The
# encoding is pinned so the file decodes the same way on every platform
# instead of depending on the locale default.
with open("stopwords-en.txt", "r", encoding="utf-8") as file:
    stopwords_list = file.read().splitlines()

stopwords = set(stopwords_list)
# Hand-picked additions to the file-based stop-word list. Repeated entries are
# harmless because the container is a set.
custom_stopwords = {
    # Generic website / navigation vocabulary
    'contact', 'service', 'policy', 'site', 'privacy', 'support', 'email', 'blog',
    'post', 'learn', 'read', 'offer', 'provide', 'include', 'click', 'update',
    'feature', 'link', 'search', 'website', 'program', 'start', 'view', 'resource',
    'experience', 'list', 'free', 'info', 'shop', 'video', 'share', 'member',
    'add', 'start', 'work', 'order', 'day', 'people', 'history', 'office',
    'time', 'year', 'event', 'national', 'state', 'high', 'month', 'week', 'open',
    'cookies', 'menu', 'cart', 'browser', 'select', 'choose', 'hope', 'enjoy', 'disabled',
    'facebook', 'twitter', 'youtube', 'instagram', 'account', 'cookie', 'subscribe',
    'newsletter', 'sign', 'message', 'comment', 'form', 'login', 'user', 'member',
    'join', 'write', 'update', 'search', 'review',

    # Months, weekdays and other date words
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august',
    'september', 'october', 'november', 'december', 'year', 'today', 'yesterday', 'tomorrow', 'datum', 'date',
    'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun',

    # Place names
    'regional', 'albuquerque', 'chicago', 'minneapolis', 'philadelphia', 'phoenix', 'rhode', 'island', 'scottsdale', 'washington', 'wisconsin', 'michigan',
    'bay', 'beach', 'dakota', 'florida', 'georgia', 'hampshire', 'harbor', 'iowa', 'maine', 'missouri', 'park', 'virginia', 'vista', 'wisconsin', 'massachusetts',
    'minnesota',

    # Further page boilerplate and common phrases
    'skip', 'content', 'main', 'term', 'condition', 'toggle', 'navigation', 'wordpress', 'social', 'medium', 'upcoming', 'event',
    'photo', 'gallery', 'news', 'frequently', 'question', 'ask', 'press', 'release', 'quick', 'link', 'continue', 'read', 'phone', 'fax', 'answer', 'question',
    'board', 'director', 'real', 'estate', 'los', 'angeles', 'new', 'york', 'city', 'san', 'francisco', 'power', 'united', 'kingdom', 'states', 'america', 'fran', 'ais',
    'north', 'carolina', 'las', 'vegas', 'annual', 'report', 'highly', 'recommend', 'rss', 'feed', 'white', 'paper', 'hong', 'kong', 'credit', 'card', 'mental', 'health', 'public', 'save', 'money',
    'annual', 'meeting', 'wide', 'range', 'care', 'gift', 'professional', 'live', 'stream', 'quality', 'product', 'project', 'management', 'meet', 'nonprofit', 'organization', 'blogthis', 'pinter',
    'design', 'success', 'story', 'summer', 'camp', 'chain', 'register', 'trademark', 'username', 'password', 'certificate', 'plan', 'visit', 'regular', 'price', 'covid', 'pandemic', 'south', 'africa', 'west', 'east', 'regional',

    # Games
    'close', 'change', 'fun', 'map', 'guide', 'file', 'book', 'set', 'late', 'base', 'software', 'follow', 'code', 'store', 'easy', 'build', 'result', 'posts', 'word', 'life',
    'log', 'archive', 'star', 'lot', 'rule', 'complete', 'item', 'space', 'send', 'series', 'current', 'special', 'table', 'hour', 'leave', 'age', 'classic', 'love', 'note', 'race', 'feel',
    'pick', 'machine', 'edition', 'final', 'dark', 'hand', 'action', 'topic', 'article', 'issue', 'party', 'type', 'bet', 'black', 'total', 'skill', 'option', 'gold', 'hold', 'king',
    'links', 'faq', 'enter', 'fast', 'forums', 'host', 'company', 'source', 'entertainment', 'fan', 'official', 'image', 'windows', 'art', 'events', 'music', 'original', 'access', 'family',
    'bit', 'require', 'bring', 'future', 'bug', 'discussion', 'mobile', 'class', 'pack', 'super', 'house', 'control', 'night', 'title', 'popular', 'daily', 'hard', 'unit',
    'receive', 'horse', 'master', 'racing', 'improve', 'match', 'hotel', 'prize', 'hit', 'pro', 'pass', 'championship', 'gain',
    'stuff', 'address', 'tool', 'business', 'internet', 'model', 'collection', 'idea', 'theme', 'happy', 'sale', 'screen', 'return', 'watch', 'sell', 'remember', 'talk', 'simple',
    'category', 'draw', 'lead', 'lose', 'affiliate', 'minute', 'entry', 'red', 'style', 'pay', 'force', 'happen', 'deal', 'football', 'chance', 'pool', 'piece', 'attack', 'cup',
    'reserve', 'english', 'staff', 'stay', 'explore', 'box', 'guest', 'schedule', 'sound', 'publish', 'unique', 'location', 'purchase', 'reviews', 'cover', 'finally', 'school',
    'decide', 'light', 'speed', 'land', 'step', 'standard', 'single', 'pretty', 'break', 'bad', 'thread', 'rank', 'switch',
    'mail', 'customer', 'random', 'develop', 'previous', 'custom', 'track', 'fall', 'ship', 'app', 'mark', 'favorite', 'edit', 'platform',
    'monster', 'head', 'person', 'international', 'core', 'sport', 'brand', 'legend', 'grand', 'cash', 'double',
    'submit', 'picture', 'database', 'engine', 'print', 'center', 'color', 'library', 'notice', 'market', 'modern', 'field', 'advanced', 'finish', 'drop', 'rate', 'ticket',
    'earn', 'elite', 'personal', 'graphic', 'farm', 'ready', 'active', 'country', 'wait', 'calendar', 'command', 'basic', 'partner', 'local', 'expect',
    'record', 'choice', 'heroes', 'media', 'language', 'developer', 'flight', 'letter', 'focus', 'mind', 'apply', 'cost', 'death', 'limit', 'ability', 'details', 'tour', 'quiz',
    'relate', 'activity', 'network', 'format', 'articles', 'technology', 'bar', 'american', 'short', 'john', 'vote', 'rating', 'client', 'author', 'movie', 'tips', 'membership', 'reach',
    'guy', 'nice', 'reason', 'die', 'button', 'guides', 'usa', 'pre', 'session', 'ultimate', 'ball', 'roll', 'reward', 'registration', 'load', 'drive', 'dead', 'window', 'discuss',
    'grow', 'display', 'remove', 'competition', 'connect', 'addition', 'travel',
}
# Merge the file-based and hand-picked words and bind `stopwords` to the
# combined vocabulary as an alphabetically sorted list.
stopwords = sorted(stopwords | custom_stopwords)

# Function to further clean the text
def further_clean_text(text, stopwords):
    """Normalise plain text and drop uninformative tokens.

    Steps, in order: collapse whitespace runs to one space, delete digits,
    replace runs of non-ASCII characters with a space, delete ASCII
    punctuation, lower-case, then discard tokens shorter than three
    characters or contained in ``stopwords``.

    Args:
        text: The string to clean.
        stopwords: Collection of lower-case words to remove. A list or tuple
            is converted to a frozenset once per call so each token lookup is
            a hash probe instead of a linear scan.

    Returns:
        The surviving tokens joined by single spaces (``""`` if none survive).
    """
    # The notebook passes a sorted list; `in` on a list is O(len(stopwords))
    # for every token, which dominates the runtime on long documents.
    if isinstance(stopwords, (list, tuple)):
        stopwords = frozenset(stopwords)

    # Normalize spaces; replaces all kinds of whitespace with a single space
    text = re.sub(r'\s+', ' ', text)

    # Remove all numbers (digits) from the text
    text = re.sub(r'\d+', '', text)

    # Remove non-English characters
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Convert text to lower case to standardize for stopwords removal
    text = text.lower()

    # Split into words, remove short words and stopwords. Joining split()
    # tokens never yields leading/trailing whitespace, so no strip is needed.
    return ' '.join(
        word for word in text.split()
        if len(word) >= 3 and word not in stopwords
    )

# further_clean_text tests every token with `in`. `stopwords` is a sorted list
# at this point, so build a frozenset once and pass that instead: hash lookups
# rather than a linear scan per token. The cleaned text is identical.
stopword_lookup = frozenset(stopwords)
df['clean_content'] = df['clean_content'].apply(lambda x: further_clean_text(x, stopword_lookup))
df.head()

Unnamed: 0,content,category,clean_content
0,"\n<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0...",Games,keyblades hearts offline hearts trailer mercha...
1,"<!DOCTYPE html>\r\n<html lang=""en-US"" class=""u...",Games,ufstarfleet ufstarfleet discover ufstarfleet t...
2,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.1//...",Games,gameboy mame cheat files tools simply massive ...
3,"<!doctype html><html lang=""en""><head><meta cha...",Games,achievement xbox xbox arcade xbox application ...
4,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.01 T...",Games,games detailed spec regard game console tomb r...


In [34]:

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Fit a TF-IDF model over the cleaned documents, capped at 100 vocabulary terms
# and with scikit-learn's built-in English stop words removed.
vectorizer = TfidfVectorizer(stop_words='english', max_features=100)
tfidf_matrix = vectorizer.fit_transform(df['clean_content'])

# Mean TF-IDF weight of every term across all documents.
feature_names = vectorizer.get_feature_names_out()
mean_tfidf = np.asarray(tfidf_matrix.mean(axis=0)).flatten()

# Terms ordered from highest to lowest mean weight.
ranking = mean_tfidf.argsort()[::-1]
top_keywords = [feature_names[idx] for idx in ranking]

# Print as a Python array
print(top_keywords)


['game', 'play', 'games', 'online', 'download', 'player', 'forum', 'version', 'copyright', 'casino', 'create', 'club', 'check', 'gaming', 'community', 'team', 'server', 'character', 'win', 'bridge', 'chess', 'tournament', 'puzzle', 'level', 'fantasy', 'war', 'poker', 'development', 'adventure', 'friend', 'wiki', 'rules', 'rpg', 'battle', 'strategy', 'arcade', 'season', 'magic', 'virtual', 'guild', 'nintendo', 'league', 'quest', 'role', 'challenge', 'patch', 'clan', 'chat', 'dragon', 'xbox', 'discord', 'score', 'slot', 'campaign', 'combat', 'pinball', 'fight', 'mod', 'wars', 'winner', 'playing', 'mode', 'bonus', 'multiplayer', 'weapon', 'mission', 'universe', 'tournaments', 'sports', 'beta', 'major', 'playstation', 'players', 'key', 'award', 'dice', 'wow', 'dungeon', 'upgrade', 'kill', 'cheat', 'steam', 'lottery', 'sudoku', 'betting', 'blue', 'gambling', 'gameplay', 'bingo', 'warcraft', 'expansion', 'hero', 'solitaire', 'increase', 'armor', 'beat', 'sims', 'blackjack', 'minecraft', 'col