In [19]:
import pandas as pd

df = pd.read_csv("assets/source_websites.csv")
df

Unnamed: 0,url,url_host_registered_domain,content_languages
0,http://brainlingo.com/html/top.htm,brainlingo.com,lin
1,https://www.everybarataa.com/,everybarataa.com,orm
2,https://www.everybarataa.com/a/prayers.html,everybarataa.com,orm
3,https://www.everybarataa.com/a/trinity1.html,everybarataa.com,orm
4,https://www.everybarataa.com/a/where.html,everybarataa.com,orm
...,...,...,...
127253,https://app.revoride.com/login,revoride.com,swa
127254,https://rexdlapk.com/ng/blog/disappearing-mess...,rexdlapk.com,hau
127255,https://rexdlapk.com/ng/blog/edit-sent-message...,rexdlapk.com,hau
127256,http://mrw.reycreo.com/gaming/contact-us,reycreo.com,kin


In [None]:
import os
import json
import gzip

folder = "output/parsed"
for file in os.listdir(folder):
    if file[-8:] != ".json.gz":
        continue
    with gzip.open(os.path.join(folder, file), "rt") as f:
        for line in f:
            line = json.loads(line)
            print(line["language"])
            break
    break


In [61]:
df = df.sample(frac=1, random_state=0)
with open("output/seed_urls.txt", "w")  as f:
    for url in df.url:
        f.write(url)
        f.write("\n")

In [74]:
import re

def remove_extra_whitespace(text):
    return re.sub(r'\s+', ' ', text).strip()

# Example usage
text = "This   is   a   test    string."
clean_text = remove_extra_whitespace(text)
print(clean_text)  # Output: "This is a test string."


This is a test string.


In [47]:
from dataclasses import dataclass
import os
import pandas as pd
from abc import ABC, abstractmethod
from typing import List
import requests
import re
import threading
import json
from joblib import Parallel, delayed

@dataclass
class CrawlerConfig:

    def __init__(self,
        output_folder : str = "output",
        html_folder : str = "html",
        seed_file : str = "websites_to_crawl.csv",
        languages : List[str] = ['swa', 'kin', 'yor', 'run', 'hau', 'amh', 'orm', 'lin'],
        ):

        self.output_folder : str = output_folder
        self.html_folder : str = html_folder
        self.seed_file : str = seed_file
        self.languages = languages

def download(url):
    html = requests.get(url).text
    json_data = {
        "url": url,
        "html": html
    }
    json_data = json.dumps(json_data)
    return json_data

class HTMLStore:

    def __init__(self, config : CrawlerConfig):

        self.html_folder = os.path.join(config.output_folder, config.html_folder)
        if not os.path.exists(self.html_folder):
            os.makedirs(self.html_folder)

        self.current_round = 1
        self.crawled_urls = set()

        self.downloaded_urls_file = os.path.join(config.output_folder, "downloaded_urls.txt")
        
        self.downloaded_urls = set()
        if os.path.exists(self.downloaded_urls_file):
            self.downloaded_urls = set(open(self.downloaded_urls_file).readlines())

        self.downloaded_urls_writer = open(self.downloaded_urls_file, "a")

        self.dump_file_index = 1
        self.dump_writer = None

    def init_dump_writer(self):
        while True:
            self.dump_file = os.path.join(self.html_folder, f"{self.dump_file_index:05}.json")
            if os.path.exists(self.dump_file):
                self.dump_file_index += 1
            else:
                break

        self.dump_writer = open(self.dump_file, "w")
        self.dump_writer_lines = 0

    def download_urls(self, urls : List[str]):

        if self.dump_writer is None:
            self.init_dump_writer()
            
        batch_size = 100
        for i in range(0, len(urls), batch_size):
            batch = urls[i:i+batch_size]

            def download(url):
                json_data = {
                    "url": url,
                }

                try:
                    r = requests.get(url)
                    json_data["status"] = r.status_code
                    if r.status_code >= 200 and r.status_code < 300: 
                        json_data["html"] = r.text
                except Exception as e:
                    json_data["status"] = -1
                    json_data["error"] = str(e)

                return json_data

            data = Parallel(n_jobs=5)(delayed(download)(x) for x in urls)

            for row in data:
                self.dump_writer.write(json.dumps(row))
                self.dump_writer.write("\n")
                self.downloaded_urls_writer.write(url)
                self.downloaded_urls_writer.write("\n")
            

config = CrawlerConfig()
html_store = HTMLStore(config)
url = "https://www.everybarataa.com/a/prayers.html"
urls = list(df.sample(n=10, random_state=0).url)
html_store.download_urls(urls)

In [60]:
from bs4 import BeautifulSoup

def extract_paragraphs(infile):
    html = open(infile)
    html = html.readlines()[1:]
    html = "\n".join(html)

    soup = BeautifulSoup(html, "html.parser")
    for p in extract_paragraphs_from_soup(soup):
        yield p

def extract_paragraphs_from_soup(soup):

    for line in soup.get_text().split("\n"):
        line = line.strip()

        if len(line) == 0:
            continue

        # needs to have a minimum length
        if len(line) < 50:
            continue

        # needs to contain at least one sentence marks
        sentence_marks = ".,!?"
        counts = sum([line.count(x) for x in sentence_marks])

        if counts == 0:
            continue

        # needs to have a ratio of upper / lower characters
        lower = "abcdefghijklmnobqrstuvwxyz"
        upper = lower.upper()

        lower_ratio = sum(line.count(x) for x in lower) / len(line)
        upper_ratio = sum(line.count(x) for x in upper) / len(line)

        if lower_ratio > 0.95 or upper_ratio > 0.2:
            continue

        # should not end with ...
        needle = "..."
        if line[-3:] == needle:
            continue

        yield line

def analyze_download(line):
    data = json.loads(line)
    print(data["url"])
    soup = BeautifulSoup(data["html"], "html.parser")
    paragraphs = extract_paragraphs_from_soup(soup)
    print(list(paragraphs))

infile = "output/html/00001.json"
for line in open(infile, "r"):
    analyze_download(line)
    break

http://rw.inductiondeepfryer.com/household-induction-cooker/
["ï»¿     Uruganda rwinjiza uruganda rukora ibicuruzwa n'ababitanga - Uruganda rwo mu rugo rwinjira mu Bushinwa                      Hamagara: + 86-0757-29311597            Murugo Ibicuruzwa  Guteka Kwinjiza Ubucuruzi Kwinjiza Ubucuruzi Byimbitse Ubushyuhe bwo Kwinjira Umutetsi wo murugo  Gukomatanya Infrared na Induction Cooker Guteka murugo Urugo Rutemewe Urugo Imitwe myinshi-Induction Guteka     Isosiyete  Ibyerekeye Twebwe Urugendo Imurikagurisha Icyemezo   OEM / ODM Amakuru Twandikire         English                      Murugo Ibicuruzwa  Umutetsi wo murugo  Guteka murugo  ibicuruzwa         Ibyiciro   Guteka Kwinjiza Ubucuruzi Kwinjiza Ubucuruzi Byimbitse Ubushyuhe bwo Kwinjira Umutetsi wo murugo  Gukomatanya Infrared na Induction Cooker Guteka murugo Urugo Rutemewe Urugo Imitwe myinshi-Induction Guteka       Ibicuruzwa byihariye          Indabyo z'ubucuruzi Indyo ...        Imirimo iremereye Yubatswe muri Commerc ... 

In [52]:
html = """
<div class="row bigbox container mi-df-local locked-single">test</div>
"""

soup = BeautifulSoup(html, "html.parser")


In [38]:
def xx(url):
    return url

with multiprocessing.Pool(4) as pool:
    downloads = pool.map(xx, ["https://darshopping.shop/product/4-in-1-laser-measuring-tool-laser-level-measuring-ruler/"])

Process SpawnPoolWorker-137:
Traceback (most recent call last):
  File "/opt/miniconda3/envs/kinybench/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/miniconda3/envs/kinybench/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/miniconda3/envs/kinybench/lib/python3.12/multiprocessing/pool.py", line 114, in worker
    task = get()
           ^^^^^
  File "/opt/miniconda3/envs/kinybench/lib/python3.12/multiprocessing/queues.py", line 389, in get
    return _ForkingPickler.loads(res)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: Can't get attribute 'xx' on <module '__main__' (<class '_frozen_importlib.BuiltinImporter'>)>
Process SpawnPoolWorker-139:
Process SpawnPoolWorker-138:
Process SpawnPoolWorker-136:
Process SpawnPoolWorker-135:


KeyboardInterrupt: 

In [62]:
import fasttext
from huggingface_hub import hf_hub_download

model_path = hf_hub_download(repo_id="facebook/fasttext-language-identification", filename="model.bin")
model = fasttext.load_model(model_path)
model.predict("Hello, world!")


  from .autonotebook import tqdm as notebook_tqdm


(('__label__eng_Latn',), array([0.61224753]))

In [66]:
from urllib.parse import urlparse

domain = urlparse('http://example.test/foo/bar').netloc
print(domain) # --> www.example.test



example.test


In [70]:
urlp = urlparse('http://sub.example.test/foo/bar')
urlp

ParseResult(scheme='http', netloc='sub.example.test', path='/foo/bar', params='', query='', fragment='')

In [73]:
urlp = urlparse('/foo/bar')
clean_url = f"{urlp.scheme}://{urlp.netloc}{urlp.path}"
clean_url

':///foo/bar'