In [2]:
import requests
import warnings
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
from langdetect import detect
from selenium import webdriver
import os
from googlesearch import search
import time


# Suppress warnings whose message starts with "Unverified HTTPS request";
# the HTTP helpers in this notebook call requests.get(..., verify=False).
warnings.filterwarnings("ignore", message="Unverified HTTPS request")
# Create a Chrome WebDriver instance as soon as this cell runs.
# NOTE(review): `driver` is not referenced by any code visible in this part of
# the notebook — confirm it is still needed (and quit it when done).
driver = webdriver.Chrome()

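# Dependencies (install before running this notebook):
# pip install requests
# pip install beautifulsoup4
# pip install pandas
# pip install langdetect
# pip install google      # provides the `googlesearch` module
# pip install selenium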


In [3]:
def removeHtmlTags(htmlText):
    """Return the plain text of `htmlText` with URL-bearing tokens removed.

    The input is parsed with BeautifulSoup's 'html.parser', its text is
    extracted with single-space separators, and the result is split on
    whitespace. Every token containing "https://" or "http://" is dropped and
    the remaining tokens are re-joined with single spaces, so the returned
    string contains no newlines or repeated whitespace.
    """
    plainText = BeautifulSoup(htmlText, 'html.parser').get_text(separator=" ", strip=True)
    keptTokens = []
    for token in plainText.split():
        # Skip any token that embeds an http(s) URL.
        if "https://" in token or "http://" in token:
            continue
        keptTokens.append(token)
    return ' '.join(keptTokens)


def extractText(url, timeout=30):
    """Download `url` and return its paragraph-like text, one paragraph per line.

    Only lines of the page's visible text that are longer than 200 characters
    and contain a '.' are kept (this drops titles, headers and menu entries).
    Each kept line is then cleaned with removeHtmlTags (which also strips
    URL tokens) and the results are joined with '\n'.

    The line filter is applied BEFORE cleaning: removeHtmlTags collapses all
    whitespace, including newlines, so filtering afterwards would only ever
    see one single line and the per-line check would be meaningless.

    Parameters:
        url: address of the page to fetch.
        timeout: seconds to wait for the server before giving up, so that an
            unresponsive host cannot block the crawl indefinitely.

    Returns:
        The extracted text (possibly an empty string when no line qualifies),
        or None when the request fails or the status code is not 200.
    """
    try:
        res = requests.get(url, verify=False, timeout=timeout)
    except requests.RequestException as e:
        # A single unreachable page should not abort the whole crawl.
        print(f"Request failed for {url}: {e}")
        return None
    if res.status_code != 200:
        return None

    soup = BeautifulSoup(res.content, 'html.parser')
    paragraphs = []
    for line in soup.get_text().split('\n'):
        # Same line filter as canTextBeExtracted: non-blank, long, has a period.
        if len(line.strip()) > 0 and len(line) > 200 and '.' in line:
            cleaned = removeHtmlTags(line)
            if cleaned:
                paragraphs.append(cleaned)
    return '\n'.join(paragraphs)

def canTextBeExtracted(url, timeout=30):
    """Tell whether `url` holds a sufficiently long article detected as Romanian.

    The page's visible text is split into lines; lines longer than 200
    characters that contain a '.' are kept, joined with spaces and cleaned
    with removeHtmlTags. The page qualifies when the cleaned text is longer
    than 500 characters and langdetect's detect() reports 'ro'.

    Parameters:
        url: address of the page to check.
        timeout: seconds to wait for the server before giving up.

    Returns:
        True when the page qualifies; False otherwise, including when the
        request fails or the status code is not 200.
    """
    try:
        res = requests.get(url, verify=False, timeout=timeout)
    except requests.RequestException as e:
        # Treat network failures as "nothing to extract" instead of aborting.
        print(f"Request failed for {url}: {e}")
        return False
    if res.status_code != 200:
        return False

    soup = BeautifulSoup(res.content, 'html.parser')
    lines = soup.get_text().split('\n')
    text = ' '.join(line.strip() for line in lines if len(line.strip()) > 0 and len(line) > 200 and '.' in line)
    text = removeHtmlTags(text)
    # Only articles longer than 500 characters are considered; the length check
    # runs first so detect() is never called on short text.
    return len(text) > 500 and detect(text) == 'ro'

from urllib.parse import urlparse, urljoin

def findLinks(url, sitesToVisit, visitedSites, timeout=30):
    """Append new same-domain links found on `url` to `sitesToVisit`.

    A link is queued when all of the following hold:
      * it is not equal to an anchor inside a <nav class="navbar"> or
        <footer class="footer"> element;
      * its href does not start with '#' or 'javascript:';
      * after resolution against `url` its host equals the host of `url`;
      * it is not already in `visitedSites` or `sitesToVisit`;
      * its lower-cased absolute URL contains none of the blacklist keywords
        (substring match against the whole URL).

    Parameters:
        url: page to scan for links.
        sitesToVisit: list mutated in place; new links are appended to it.
        visitedSites: list of already processed links (read only here).
        timeout: seconds to wait for the server before giving up.

    Returns:
        0 when the page was fetched with status 200 (even if no link was
        added), 1 when the request failed or returned another status code.
    """
    base_domain = urlparse(url).netloc.lower()
    blacklist = ['youtube.com', 'instagram.com', 'pinterest.com', 'twitter.com', 'facebook.com',
                 'login', 'cookie', 'cookies', 'politica-de-confidentialitate', 'despre-noi',
                 'termeni-si-conditii', 'contact', 'privacy-policy', 'search', 'archive',
                 'tag', 'category', 'forum', 'login', 'register', 'profile', 'logout',
                 'sign-up','log-in','my-account','privacy','conditii','service','terms','comment','comentariu','respond',
                 'conditions','about','sitemap','cont','comments','feed','politica-editoriala','cum-ne-poti-ajuta',
                 'password','paywall','arhiva','archive','termeni','despre','admin','newsletter',
                 'cart', 'checkout', 'shop', 'store', 'download', 'subscribe', 'unsubscribe','produs','abonare',
                 'terms-of-service', 'about-us', 'faq', 'donate', 'events', 'calendar',
                 'faq', 'gallery', 'help', 'guidelines', 'policy']
    try:
        res = requests.get(url, verify=False, timeout=timeout)
    except requests.RequestException as e:
        # Report the failure through the existing error code instead of raising.
        print(f"Request failed for {url}: {e}")
        return 1
    if res.status_code != 200:
        return 1

    soup = BeautifulSoup(res.content, 'html.parser')
    navbar = soup.find('nav', class_='navbar')
    footer = soup.find('footer', class_='footer')

    # Collect navigation/footer anchors once instead of re-scanning both
    # containers for every link on the page.
    chromeLinks = []
    if navbar is not None:
        chromeLinks.extend(navbar.find_all('a'))
    if footer is not None:
        chromeLinks.extend(footer.find_all('a'))

    for link in soup.find_all('a', href=True):
        if link in chromeLinks:
            continue
        href = link['href']
        if href.startswith('#') or href.startswith('javascript:'):
            continue
        currentLink = urljoin(url, href)
        if urlparse(currentLink).netloc.lower() != base_domain:
            continue
        if currentLink in visitedSites or currentLink in sitesToVisit:
            continue
        loweredLink = currentLink.lower()
        if any(keyword in loweredLink for keyword in blacklist):
            continue
        sitesToVisit.append(currentLink)
    return 0

def _appendToCsv(data, file_name):
    """Append `data` to `file_name`, writing the header only for a new or empty file."""
    needsHeader = not os.path.isfile(file_name) or os.path.getsize(file_name) == 0
    data.to_csv(file_name, index=False, mode='a', header=needsHeader)


def extractAll(nrOfSitesToVisit, url, file_name,siteIdx):
    """Crawl the site at `url` and collect up to `nrOfSitesToVisit` article texts.

    Links are discovered with findLinks, starting from `url`. Each queued link
    is tested with canTextBeExtracted and, if it qualifies, its text is fetched
    with extractText. Texts already collected for this site are skipped. When
    the queue runs dry, previously visited pages are re-scanned for links,
    newest first, up to 2 * nrOfSitesToVisit attempts.

    The crawl stops when enough articles were collected, when no more links
    can be found, or when 180 seconds pass without a newly collected article.

    Parameters:
        nrOfSitesToVisit: number of articles to collect.
        url: start page of the site.
        file_name: CSV file the collected rows are appended to.
        siteIdx: value stored in the 'siteIdx' column of every row.

    Returns:
        A DataFrame with columns 'siteIdx', 'link', 'Text' holding the rows
        collected for this site (empty when the start page cannot be scanned;
        nothing is written to the CSV in that case).

    Rows collected before an unexpected exception are still appended to the
    CSV; the exception then propagates to the caller.
    """
    columns = ['siteIdx', 'link', 'Text']
    noProgressLimit = 180  # seconds without a new article before giving up
    records = []           # collected rows; converted to a DataFrame once at the end
    seenTexts = set()      # texts already collected, for duplicate detection
    sitesToVisit = []
    visitedSites = []
    initialSitesToVisit = nrOfSitesToVisit
    tries = 1

    if findLinks(url, sitesToVisit, visitedSites) != 0:
        return pd.DataFrame(columns=columns)

    startTime = time.time()
    try:
        while nrOfSitesToVisit > 0:
            if sitesToVisit:
                link = sitesToVisit.pop()
                if canTextBeExtracted(link):
                    text = extractText(link)
                    if text and text not in seenTexts:
                        # Successful, non-duplicate extraction.
                        seenTexts.add(text)
                        records.append({'siteIdx': siteIdx, 'link': link, 'Text': text})
                        nrOfSitesToVisit -= 1
                        startTime = time.time()
                visitedSites.append(link)
                print("Visited site: ",link)
            elif len(visitedSites) >= tries and tries <= 2 * initialSitesToVisit:
                findLinksCode = findLinks(visitedSites[-tries], sitesToVisit, visitedSites)
                # Move on to an older page when this one failed OR yielded no
                # new links; otherwise the same page would be fetched over and
                # over until the time limit.
                if findLinksCode == 1 or not sitesToVisit:
                    tries += 1
            else:
                print('Only', initialSitesToVisit - nrOfSitesToVisit, 'articles out of', initialSitesToVisit, 'for', url)
                break
            if time.time() - startTime > noProgressLimit:
                print("3 minutes time limit exceeded")
                break
    finally:
        # Persist whatever was collected, even if a request raised mid-crawl.
        data = pd.DataFrame(records, columns=columns)
        _appendToCsv(data, file_name)
    return data



def processUrls(urls, fileName):
    """Crawl every site in `urls` and gather the extracted articles.

    `fileName` is (re)created at the start containing only the CSV header row
    'siteIdx,link,Text'; extractAll appends each site's rows to it. Sites are
    numbered from 1 in the order given. A failure while processing one site
    is printed and does not stop the remaining sites.

    Parameters:
        urls: iterable of start URLs, one per site.
        fileName: path of the CSV file to (over)write.

    Returns:
        A DataFrame concatenating the frames returned by extractAll. When no
        site returned any rows, an empty DataFrame with columns 'link' and
        'Text' is returned.
    """
    # Start from a fresh file that already has the header: an empty file would
    # exist on disk, so later appends would never add the column names.
    pd.DataFrame(columns=['siteIdx', 'link', 'Text']).to_csv(fileName, index=False)

    frames = []
    for siteIdx, url in enumerate(urls, 1):
        try:
            data = extractAll(100, url,fileName,siteIdx)
        except Exception as e:
            print(f"Error occurred while processing {url}: {e}")
            continue
        # extractAll may return nothing usable; skip such results explicitly
        # instead of handing them to pd.concat.
        if data is not None and not data.empty:
            frames.append(data)

    if not frames:
        return pd.DataFrame(columns=['link', 'Text'])
    # Concatenate once at the end instead of re-copying the frame per site.
    return pd.concat(frames, ignore_index=True)


In [5]:
def findBlogs(query, language, numResults=100):
    """Run a web search for `query` and return the result URLs as a list.

    `numResults` is passed to the search call as both `num` and `stop`, and
    `language` as `lang`. Any exception raised while searching is printed and
    swallowed; the URLs gathered up to that point (possibly none) are returned.
    """
    foundUrls = []
    try:
        for hit in search(query, num=numResults, lang=language, stop=numResults):
            foundUrls.append(hit)
    except Exception as err:
        print("An error occurred:", err)
    return foundUrls

In [23]:
# Search for parenting blogs on .ro domains and print a numbered list of the URLs.
# NOTE(review): the heading always says "100", but findBlogs returns whatever
# the search yielded (an empty list on error), so the real count may be lower.
query = "site:.ro parenting blog"
parentingBlogsRo = findBlogs(query, "ro")
print("100 Romanian Parenting Blogs:")
for i, blog in enumerate(parentingBlogsRo, 1):
    print(f"{i}. {blog}")


100 Romanian Parenting Blogs:
1. https://www.printesaurbana.ro/category/parenting
2. https://www.parentingconstient.ro/blog/
3. https://mamicaurbana.ro/
4. https://parentool.ro/blog/
5. https://zoso.ro/articole/parenting/
6. https://www.ceaicumamici.ro/
7. https://suntpitic.ro/
8. https://www.recenziidetop.ro/bloguri-parentale/
9. https://www.parentingineradigitala.ro/blog
10. https://www.bebelorelli.ro/blog/parenting
11. https://www.clubulcopiilor.ro/tema/parenting/
12. https://parentingpr.ro/blog/
13. https://lumealuifram.ro/blog/
14. https://mamalapatrat.ro/blog/
15. https://www.printesaurbana.ro/search/label/carti-de-parenting
16. https://www.parentingineradigitala.ro/blog?tag=greseli+de+parenting
17. https://clinica-hope.ro/blog-parenting/
18. https://uraniacremene.ro/blog/
19. https://parentool.ro/parenting/
20. http://blog.pandoram.ro/category/parenting/
21. https://www.parentingads.ro/
22. https://www.bebelorelli.ro/blog
23. https://www.blog.bloomcoding.ro/category/parenting/
2

In [5]:
# Search for parenting blogs on .md domains and print a numbered list of the URLs.
# NOTE(review): "md" is passed as the search language; presumably it was meant
# as the Moldovan domain/country code — confirm the search backend accepts it
# as a language value.
query = "site:.md parenting blog"
parentingBlogsMd = findBlogs(query, "md")
print("100 Moldovan Parenting Blogs:")
for i, blog in enumerate(parentingBlogsMd, 1):
    print(f"{i}. {blog}")
#do not delete!!!!!!!!!!

100 Moldovan Parenting Blogs:
1. https://www.educatieparentala.md/
2. https://blog.blogtop.md/25-bloguri-de-parenting-din-moldova/
3. https://ea.md/tag/blog-de-parenting/
4. https://odoras.md/articole/topuri/topul-celor-mai-interesante-bloguri-de-parenting-din-romania/
5. https://smartkids.md/tag/parenting/
6. https://www.twinkl.md/blog/helping-my-child-to-form-their-numbers
7. https://blog.blogtop.md/category/uncategorized/
8. https://www.drepturilecopilului.md/index.php?option=com_content&view=category&layout=blog&id=85&Itemid=836&lang=en
9. https://iticket.md/event/bloggerita-printesa-urbana-despre-parenting-cu-iubire
10. https://diez.md/2019/12/07/ai-nevoie-de-sfaturi-sau-vrei-sa-cunosti-experienta-altor-parinti-iata-25-bloguri-de-parenting-din-moldova/
11. https://tibidoo.md/blog/page/2/
12. https://wb.md/2Gxpcf9
13. https://ea.md/baietelul-ei-a-inspirat-o-sa-creeze-un-blog-cunoaste-o-pe-a-blonde-mommy-mariana-mereuta-foto/
14. https://www.twinkl.md/blog/modern-art-a-guide-for-par

In [15]:
# Search for personal sport blogs on .md domains and print a numbered list of the URLs.
# NOTE(review): "md" is passed as the search language — confirm it is a valid
# language value for the search backend.
query = "site:.md sport (experiențe OR povestiri OR jurnal) blog personal"
sportBlogsMd = findBlogs(query, "md")
print("100 Moldovan Sport Blogs:")
for i, blog in enumerate(sportBlogsMd, 1):
    print(f"{i}. {blog}")


100 Moldovan Sport Blogs:
1. https://fmf.md/blog
2. https://www.siteweb.md/blog/item/57-ce-reprezinta-totusi-un-blog-cum-cream-un-blog
3. https://fmf.md/industria-dezvoltata-a-fotbalului-inseamna-experiente-si-cunostinte-de-care-noi-avem-nevoie
4. https://www.jurnal.md/ro/news/fc9f7ee007fda8c8/cristiano-ronaldo-va-produce-un-serial-despre-fotbal.html
5. https://nataalbot.md/page/48/
6. https://voloshin.md/en/category/interview/about-sport/
7. http://www.medialab.md/upload/teze/o_1b5iabtae1hu21ilcq9hprv1v75e.pdf
8. https://www.jurnaltv.md/category/desteptarea/11
9. http://www.mariamarian.articol.md/?pg=view_art&id=2616
10. https://locals.md/2016/e-la-blog-totul-despre-sanatate-frumusete-moda-si-sport/
11. https://blogosfera.md/view-post-v-105717-0-romana.html
12. https://blogosfera.md/view-post-v-105797-0-romana.html
13. https://blogosfera.md/view-post-v-177442-0-romana.html
14. https://blogosfera.md/view-post-v-18013-0-romana.html
15. https://blogosfera.md/view-post-v-208540-0-romana.h

In [14]:
# Search for personal agriculture blogs on .md domains and print a numbered list of the URLs.
# NOTE(review): "md" is passed as the search language — confirm it is a valid
# language value for the search backend.
query = "site:.md agricultura (experiențe OR sfaturi OR jurnal) blog personal"
agricultureBlogsMd = findBlogs(query, "md")
print("100 Md Agriculture Blogs:")
for i, blog in enumerate(agricultureBlogsMd, 1):
    print(f"{i}. {blog}")


100 Md Agriculture Blogs:
1. https://agrobiznes.md/5-sfaturi-practice-pentru-o-recolta-de-cartofi-de-calitate.html
2. https://ecolocal.md/blog/
3. https://utm.md/en/blog/2024/02/27/modernizing-moldovan-agriculture-the-vision-of-utm-rector-viorel-bostan/
4. https://www.jurnal.md/ro/social/2017/4/20/producatoarea-agricola-aliona-mandatii-intimidata-pentru-pozitia-sa-civica/
5. https://nataalbot.md/author/nataalbot/page/16/
6. https://agromedia.md/agricultura-moderna/zootehnie/cresterea-animalelor/cresterea-iepurilor-hrana-ingrijire
7. https://agrobiznes.md/sfaturi-la-intretinerea-gainilor-in-perioada-rece.html
8. https://www.zdg.md/tag/agricultori/
9. https://anvelope.md/sfaturi/sfaturi-de-top-pentru-alegerea-anvelopelor-agricole-69
10. https://www.maib.md/en/blog
11. https://abdc.ucipifad.md/curs/ghidul-agricultorului-pentru-lansarea-si-dezvoltarea-gospodariilor-taranesti-in-republica-moldova/
12. https://youth.md/page/941/?post_type=news&order&orderby
13. https://www.zdg.md/importante/

In [16]:
# Search for travel blogs on .md domains (English keywords) and print a numbered list of the URLs.
# NOTE(review): "md" is passed as the search language — confirm it is a valid
# language value for the search backend.
query = "site:.md (travel blog OR travel journal OR travel tips OR travel experiences OR adventure blog)"
travelingBlogsMd = findBlogs(query, "md")
print("100 Md Traveling Blogs:")
for i, blog in enumerate(travelingBlogsMd, 1):
    print(f"{i}. {blog}")


100 Md Traveling Blogs:
1. http://www.goadventure.md/
2. https://autocare.md/en/blog/
3. https://ibn.idsi.md/vizualizare_articol/174717
4. https://www.ways.md/en/blog?page=3
5. https://goadventure.md/blog
6. http://xamax.md/2016/01/25/looking-for-an-advanture/
7. https://autocare.md/en/blog/tag/lifestyle-and-entertainment/
8. https://pandatur.md/en/group-excursions
9. https://www.ways.md/en/blog/travel-stories-by-wendela-kilmer
10. https://flamingotur.md/home-2/
11. https://astitour.md/
12. http://xamax.md/category/%D0%B1%D0%B5%D0%B7-%D1%80%D1%83%D0%B1%D1%80%D0%B8%D0%BA%D0%B8/
13. https://bookshop.md/en/carti/travel-activity-book-2/
14. http://winetours.md/eng/about-us
15. https://999.md/ro/59974823
16. https://www.maib.md/en/noutati/bucurati-va-de-calatorii-cu-mastercard
17. https://www.elefant.md/how-to-travel-with-kids-without-losing-your-mind-full-color-edition-real-world-tips-and-practical-solutions-for-traveling-with-your-children_f2f00a54-904f-11eb-964a-0242c0b10011
18. https://

In [18]:
# Search for personal book/reading blogs on .md domains and print a numbered list of the URLs.
# NOTE(review): the query is about books ("carte", "cititor", "lectura"), yet the
# variable is named beautyBlogsMd and the heading says "Beauty" — looks like a
# copy-paste leftover; confirm before renaming, since later cells may use this name.
query = "site:.md (carte blog OR cititor blog OR lectura blog) personal"
beautyBlogsMd = findBlogs(query, "md")
print("100 Md Beauty Blogs:")
for i, blog in enumerate(beautyBlogsMd, 1):
    print(f"{i}. {blog}")


100 Md Beauty Blogs:
1. https://cartier.md/blog/
2. https://nataalbot.md/carti/
3. https://nataalbot.md/categorie-produs/carti/
4. https://999.md/ro/60719000
5. https://www.elefant.md/be-a-fashion-blogger-build-your-blog-turn-it-into-a-profitable-business-and-attract-brands-paperback_b3dc022d-b4f7-5bf5-9fc0-48b4722701dc
6. https://blog.blogtop.md/top-20-bloguri-din-moldova-august-2019/
7. https://www.elefant.md/learn-how-to-become-a-blogger-an-easy-step-by-step-guide-to-starting-your-own-blog-paperback_d05cd37b-4aba-4b69-9b5b-236adf54664f
8. https://www.bncreanga.md/?page=17
9. https://diez.md/2018/04/20/blogtop-desemnat-cele-mai-populare-bloguri-din-moldova-ale-lunii-martie-2018-care-sunt-acestea/
10. https://blogosfera.md/read.php?post=http://clubbib2.wordpress.com/2014/05/15/opt-romane-clasice-de-citit-intr-o-viata/&id=315178&s=a
11. https://travelblog.md/carti-pe-cale-sa-schimbe-vieti-sau-carti-bune-pe-care-sa-le-iei-in-vacanta/
12. https://www.zdg.md/blog/viata-redactiei/in-fond-f

In [7]:
# Search for personal agriculture blogs on .ro domains and print a numbered list of the URLs.
# NOTE(review): the recorded run of this cell hit "HTTP Error 429: Too Many
# Requests" and produced no results; findBlogs swallows the error, so the
# heading is printed above an empty list. Re-run after a pause.
query = "site:.ro agricultura (experiențe OR sfaturi OR jurnal) blog personal"
agricultureBlogsRo = findBlogs(query, "ro")
print("100 Ro Agriculture Blogs:")
for i, blog in enumerate(agricultureBlogsRo, 1):
    print(f"{i}. {blog}")

An error occurred: HTTP Error 429: Too Many Requests
100 Ro Agriculture Blogs:
