In [1]:
import requests
import warnings
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
from langdetect import detect
from selenium import webdriver
import os
from googlesearch import search
import time
from urllib.parse import urlparse, urljoin


warnings.filterwarnings("ignore", message="Unverified HTTPS request")
driver = webdriver.Chrome()

# pip install requests
# pip install beautifulsoup4
# pip install pandas
# pip install langdetect
# pip install google


In [2]:
def removeHtmlTags(htmlText):
    soup = BeautifulSoup(htmlText, 'html.parser')
    for htmlText in soup.find_all(['header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        htmlText.decompose()

    text = soup.get_text(separator=" ", strip=True)
    words = text.split()
    # Filter out words containing "https://"
    filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
    filtered_text = ' '.join(filtered_words)
    return filtered_text


def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')
        # visibleText = soup.get_text()
        body_tag = soup.body
        if body_tag:
                visibleText = body_tag.get_text(separator='\n', strip=True)
                visibleText = removeHtmlTags(visibleText)
                lines = visibleText.split('\n')
                # i check for '.' and for a line longer that 200 chars to reduce titles, headers
                text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0 and len(line) > 200 and '.' in line)
                return text
        else:
            print("Body tag not found.")
    return None


def findLinks(url, sitesToVisit, visitedSites):
    base_domain = urlparse(url).netloc.lower()
    blacklist = ['youtube.com', 'instagram.com', 'pinterest.com', 'twitter.com', 'facebook.com',
                 'login', 'cookie', 'cookies', 'politica-de-confidentialitate', 'despre-noi', 
                 'termeni-si-conditii', 'contact', 'privacy-policy', 'search', 'archive', 
                 'tag', 'category', 'forum', 'login', 'register', 'profile', 'logout', 
                 'sign-up','log-in','my-account','privacy','conditii','service','terms','comment','comentariu','respond',
                 'conditions','about','sitemap','cont','comments','feed','politica-editoriala','cum-ne-poti-ajuta',
                 'password','paywall','arhiva','archive','termeni','despre','admin','newsletter',
                 'cart', 'checkout', 'shop', 'store', 'download', 'subscribe', 'unsubscribe','produs','abonare',
                 'terms-of-service', 'about-us', 'faq', 'donate', 'events', 'calendar', 
                 'faq', 'gallery', 'help', 'guidelines', 'policy']
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        soup = BeautifulSoup(res.content, 'html.parser')
        navbar = soup.find('nav', class_='navbar')
        footer = soup.find('footer', class_='footer')
        links = soup.find_all('a', href=True)
        for link in links:
            if navbar and link in navbar.find_all('a') or footer and link in footer.find_all('a'):
                continue
            href = link['href']
            if href.startswith('#') or href.startswith('javascript:'):
                continue  
            currentLink = urljoin(url, href)
            current_domain = urlparse(currentLink).netloc.lower()
            if current_domain == base_domain:
                if currentLink not in visitedSites and currentLink not in sitesToVisit and all(b not in currentLink.lower() for b in blacklist):
                    sitesToVisit.append(currentLink)
        return 0
    return 1

def extractAll(url, file_name,siteIdx, nrOfSitesToVisit = 100):
    data = pd.DataFrame(columns=['siteIdx','link','text'])
    sitesToVisit = []
    visitedSites = []
    initialSitesToVisit = nrOfSitesToVisit
    tries = 1
    findLinksCode = findLinks(url, sitesToVisit, visitedSites)
    startTime = time.time()
    if findLinksCode == 0:
        while nrOfSitesToVisit:
            if sitesToVisit:
                link = sitesToVisit[-1]
                text = extractText(link)
                if text:
                    #succesful text extraction
                    new_row = {'siteIdx':siteIdx, 'link': link, 'text': text}
                    data = pd.concat([data, pd.DataFrame([new_row])], ignore_index=True)
                    nrOfSitesToVisit -= 1
                    startTime = time.time()
                    # print("Succesfull visited site: ",link)

                visitedSites.append(link)
                sitesToVisit.pop()
                # print("Visited site: ",link)
            elif nrOfSitesToVisit:
                if len(visitedSites) > tries: 
                    findLinksCode = findLinks(visitedSites[tries], sitesToVisit, visitedSites)
                    tries += 1
                else:
                    print('Only', initialSitesToVisit - nrOfSitesToVisit, 'articles out of', initialSitesToVisit, 'for', url)
                    nrOfSitesToVisit = 0
            endTime = time.time()
            if endTime - startTime > 300:
                print("3 minute time limit exceeded, ",tries," unsuccesful tries")
                if not os.path.isfile(file_name):
                    data.to_csv(file_name, index=False)
                else:
                    data.to_csv(file_name, index=False, mode='a', header=False)
                return data
    if not os.path.isfile(file_name):
        data.to_csv(file_name, index=False)
    else:
        data.to_csv(file_name, index=False, mode='a', header=False)
    return data



def processUrls(nrOfSitesToVisit,urls, fileName):

    with open(fileName, 'w'):
        pass

    allData = pd.DataFrame(columns=['siteIdx','link', 'Text'])
    siteIdx = 0
    for url in urls:
        try:
            siteIdx += 1
            data = extractAll(nrOfSitesToVisit, url,fileName,siteIdx)
            allData = pd.concat([allData, data], ignore_index=True) 
        except Exception as e:
            print(f"Error occurred while processing {url}: {e}")
    return allData


In [95]:
agricultureBlogsMd = [
"https://agroexpert.md/",
"https://agrobiznes.md/",
"https://agrotv.md/category/stiri/agricultura/",
"https://md.agrointel.ro/category/fermier-in-republica-moldova",
"https://agricolahub.md/blog/"
]

fileName = "agricultureMd.csv"
with open(fileName, 'w'):
    pass

agricultureDataMd = pd.DataFrame(columns=['siteIdx','link', 'text'])
# agricultureDataMd = pd.concat([agricultureDataMd, ...], ignore_index=True) 

In [49]:
#agroexpert
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()
            
        for strong_tag in soup.find_all('strong'):
            strong_tag.extract()

        for fulger_tag in soup.find_all('p', class_="p-socials"):
            fulger_tag.extract()

        content_div = soup.find('div', class_='content')

        
        if content_div:
            extracted_content = content_div.get_text()
            soup = BeautifulSoup(extracted_content, 'html.parser')
            text = soup.get_text(separator=" ", strip=True)
            words = text.split()
            # Filter out words containing "https://"
            filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
            filtered_text = ' '.join(filtered_words)
            lines = filtered_text.split('\n')
            text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
            if len(text) > 1000 and detect(text) == 'ro':
                return text
      
    
blogIdx = 1
url = agricultureBlogsMd[blogIdx-1]
agroexpertData = extractAll(url,fileName,blogIdx,100)



In [53]:
#agrobiznes
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        meta_tag = soup.find('meta', property="og:description")

        if meta_tag:
            extracted_content = meta_tag['content']
            
            if extracted_content:
                soup = BeautifulSoup(extracted_content, 'html.parser')
                text = soup.get_text(separator=" ", strip=True)
                words = text.split()
                # Filter out words containing "https://"
                filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
                filtered_text = ' '.join(filtered_words)
                lines = filtered_text.split('\n')
                text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
                if len(text) > 1000 and detect(text) == 'ro':
                    return text
      
    
blogIdx = 2
url = agricultureBlogsMd[blogIdx-1]
agrobiznesData = extractAll(url,fileName,blogIdx,100)


  soup = BeautifulSoup(extracted_content, 'html.parser')


In [109]:
#agrotv
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()
            

        content_div = soup.find('div', class_='entry-content')

        
        if content_div:
            extracted_content = content_div.get_text()
            soup = BeautifulSoup(extracted_content, 'html.parser')
            text = soup.get_text(separator=" ", strip=True)
            words = text.split()
            # Filter out words containing "https://"
            filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
            filtered_text = ' '.join(filtered_words)
            lines = filtered_text.split('\n')
            text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
            if len(text) > 1000 and detect(text) == 'ro':
                return text
      
    
blogIdx = 3
url = agricultureBlogsMd[blogIdx-1]
agrotvData = extractAll(url,fileName,blogIdx,100)



1 minute time limit exceeded,  140  unsuccesful tries


In [91]:
#agrointel
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()

        for address_tag in soup.find_all('address'):
            address_tag.extract()

        for h1_tag in soup.find_all('h1'):
            h1_tag.extract()

        for ads_tag in soup.find_all('ins', class_="adsbygoogle"):
            ads_tag.extract()

        content_div = soup.find('article')

        
        if content_div:
            extracted_content = content_div.get_text()
            soup = BeautifulSoup(extracted_content, 'html.parser')
            text = soup.get_text(separator=" ", strip=True)
            words = text.split()
            # Filter out words containing "https://"
            filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
            filtered_text = ' '.join(filtered_words)
            lines = filtered_text.split('\n')
            text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
            if len(text) > 1000 and detect(text) == 'ro':
                return text
      
    
blogIdx = 4
url = agricultureBlogsMd[blogIdx-1]
agrointelData = extractAll(url,fileName,blogIdx,100)

In [104]:
#agricolahub
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()

        div = soup.find('div',class_='ty-mainbox-body')
        if div:
            content_div = div.find_all('p')

            if content_div:
                extracted_paragraphs = []
                
                for p_tag in content_div:
                    paragraph_text = p_tag.get_text(separator=" ", strip=True)
                    extracted_paragraphs.append(paragraph_text)
                
                text = ' '.join(extracted_paragraphs)
                
                words = text.split()
                filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
                filtered_text = ' '.join(filtered_words)
                lines = filtered_text.split('\n')
                text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
                
                if len(text) > 1000 and detect(text) == 'ro':
                    return text
      
    
blogIdx = 5
url = agricultureBlogsMd[blogIdx-1]
agricolahubData = extractAll(url,fileName,blogIdx,100)

In [111]:
agricultureDataMd = pd.DataFrame(columns=['siteIdx','link', 'text'])
agricultureDataMd = pd.concat([agricultureDataMd, agroexpertData], ignore_index=True) 
agricultureDataMd = pd.concat([agricultureDataMd, agrobiznesData], ignore_index=True) 
agricultureDataMd = pd.concat([agricultureDataMd, agrotvData], ignore_index=True) 
agricultureDataMd = pd.concat([agricultureDataMd, agrointelData], ignore_index=True) 
agricultureDataMd = pd.concat([agricultureDataMd, agricolahubData], ignore_index=True) 

agricultureDataMd.to_csv(fileName, index=False)


In [44]:
#################################################RO######AGRICULTURE##############################################
agricultureBlogsRo = [
"https://agrointel.ro/category/stiri-agricole",
"https://blog.magazialucostica.ro/",
"https://www.stiriagricole.ro/",
"https://www.agroinfo.ro/",
"https://agropress.ro/"

]

fileName = "agricultureRo.csv"
with open(fileName, 'w'):
    pass

agricultureDataRo = pd.DataFrame(columns=['siteIdx','link', 'text'])
# agricultureDataMd = pd.concat([agricultureDataMd, ...], ignore_index=True) 

In [9]:
#agrointel
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()

        for address_tag in soup.find_all('address'):
            address_tag.extract()

        for h1_tag in soup.find_all('h1'):
            h1_tag.extract()

        for ads_tag in soup.find_all('ins', class_="adsbygoogle"):
            ads_tag.extract()

        content_div = soup.find('article')

        
        if content_div:
            extracted_content = content_div.get_text()
            soup = BeautifulSoup(extracted_content, 'html.parser')
            text = soup.get_text(separator=" ", strip=True)
            words = text.split()
            # Filter out words containing "https://"
            filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
            filtered_text = ' '.join(filtered_words)
            lines = filtered_text.split('\n')
            text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
            if len(text) > 1000 and detect(text) == 'ro':
                return text
      
    
blogIdx = 1
url = agricultureBlogsRo[blogIdx-1]
agrointelRoData = extractAll(url,fileName,blogIdx,100)

In [12]:
#magazialucostica

def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()

        div = soup.find('article')
        if div:
            content_div = div.find_all('p')

            if content_div:
                extracted_paragraphs = []
                
                for p_tag in content_div:
                    paragraph_text = p_tag.get_text(separator=" ", strip=True)
                    extracted_paragraphs.append(paragraph_text)
                
                text = ' '.join(extracted_paragraphs)
                
                words = text.split()
                filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
                filtered_text = ' '.join(filtered_words)
                lines = filtered_text.split('\n')
                text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
                
                if len(text) > 1000 and detect(text) == 'ro':
                    return text
      
    
blogIdx = 2
url = agricultureBlogsRo[blogIdx-1]
magazialucosticaData = extractAll(url,fileName,blogIdx,100)


In [32]:
#stiriagricole

def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()

        div = soup.find('div',class_='elementor-element-3e16f961')
        if div:
            content_div = div.find_all('p')

            if content_div:
                extracted_paragraphs = []
                
                for p_tag in content_div:
                    paragraph_text = p_tag.get_text(separator=" ", strip=True)
                    extracted_paragraphs.append(paragraph_text)
                
                text = ' '.join(extracted_paragraphs)
                
                words = text.split()
                filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
                filtered_text = ' '.join(filtered_words)
                lines = filtered_text.split('\n')
                text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
                
                if len(text) > 1000 and detect(text) == 'ro':
                    return text
      
    
blogIdx = 3
url = agricultureBlogsRo[blogIdx-1]
stiriAgricoleData = extractAll(url,fileName,blogIdx,100)


In [31]:
#agroinfo
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()


        content_div = soup.find('div', id='news_body')

        
        if content_div:
            extracted_content = content_div.get_text()
            soup = BeautifulSoup(extracted_content, 'html.parser')
            text = soup.get_text(separator=" ", strip=True)
            words = text.split()
            # Filter out words containing "https://"
            filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
            filtered_text = ' '.join(filtered_words)
            lines = filtered_text.split('\n')
            text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
            if len(text) > 1000 and detect(text) == 'ro':
                return text
      
    
blogIdx = 4
url = agricultureBlogsRo[blogIdx-1]
agroinfoRoData = extractAll(url,fileName,blogIdx,100)

  k = self.parse_starttag(i)


1 minute time limit exceeded,  18  unsuccesful tries


In [43]:
#agropress
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()


        content_div = soup.find('div', class_='td-post-content')

        
        if content_div:
            extracted_content = content_div.get_text()
            soup = BeautifulSoup(extracted_content, 'html.parser')
            text = soup.get_text(separator=" ", strip=True)
            words = text.split()
            # Filter out words containing "https://"
            filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
            filtered_text = ' '.join(filtered_words)
            lines = filtered_text.split('\n')
            text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
            if len(text) > 1000 and detect(text) == 'ro':
                return text
      
    
    
blogIdx = 5
url = agricultureBlogsRo[blogIdx-1]
agropressData = extractAll(url,fileName,blogIdx,100)

In [46]:
agricultureDataRo = pd.DataFrame(columns=['siteIdx','link', 'text'])
agricultureDataRo = pd.concat([agricultureDataRo, agrointelRoData], ignore_index=True) 
agricultureDataRo = pd.concat([agricultureDataRo, magazialucosticaData], ignore_index=True) 
agricultureDataRo = pd.concat([agricultureDataRo, stiriAgricoleData], ignore_index=True) 
agricultureDataRo = pd.concat([agricultureDataRo, agroinfoRoData], ignore_index=True) 
agricultureDataRo = pd.concat([agricultureDataRo, agropressData], ignore_index=True) 

agricultureDataRo.to_csv(fileName, index=False)


In [20]:
culturalBlogsMd = [
"https://nataalbot.md/category/evenimente-speciale/",
"https://eucitesc.md/blog-full-width/page/61/",
"https://teosunny.wordpress.com/",
"http://www.vitalie-vovc.com/",
"https://ganduridespletite.com/"
]

fileName = "culturalMd.csv"
with open(fileName, 'w'):
    pass

# cultureDataRo = pd.DataFrame(columns=['siteIdx','link', 'text'])


In [18]:
#natal bot
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()

        div = soup.find('div',class_='post-entry')
        if div:
            content_div = div.find_all('p')

            if content_div:
                extracted_paragraphs = []
                
                for p_tag in content_div:
                    paragraph_text = p_tag.get_text(separator=" ", strip=True)
                    extracted_paragraphs.append(paragraph_text)
                
                text = ' '.join(extracted_paragraphs)
                
                words = text.split()
                filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
                filtered_text = ' '.join(filtered_words)
                lines = filtered_text.split('\n')
                text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
                
                if len(text) > 1000 and detect(text) == 'ro':
                    return text
      
    
blogIdx = 1
url = culturalBlogsMd[blogIdx-1]
natalBotData = extractAll(url,fileName,blogIdx,100)


In [13]:
#eucitesc
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()

        div = soup.find('div',class_='single-entry-summary-post-content')
        if div:
            content_div = div.find_all('p')

            if content_div:
                extracted_paragraphs = []
                
                for p_tag in content_div:
                    paragraph_text = p_tag.get_text(separator=" ", strip=True)
                    extracted_paragraphs.append(paragraph_text)
                
                text = ' '.join(extracted_paragraphs)
                
                words = text.split()
                filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
                filtered_text = ' '.join(filtered_words)
                lines = filtered_text.split('\n')
                text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
                
                if len(text) > 1000 and detect(text) == 'ro':
                    return text
      
    
      
    
blogIdx = 2
url = culturalBlogsMd[blogIdx-1]
eucitescData = extractAll(url,fileName,blogIdx,100)


In [14]:
#teosunny
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()

        for img_tag in soup.find_all('img'):
            img_tag.extract()
        
        for tag in soup.find_all('div', class_='p-tag'):
            tag.extract()

        for tag in soup.find_all('div', class_='sharedaddy'):
            tag.extract()

        content_div = soup.find('div', class_='p-con')
        if content_div:
            extracted_content = content_div.get_text()
            soup = BeautifulSoup(extracted_content, 'html.parser')
            text = soup.get_text(separator=" ", strip=True)
            words = text.split()
            # Filter out words containing "https://"
            filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
            filtered_text = ' '.join(filtered_words)
            lines = filtered_text.split('\n')
            text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
            if len(text) > 1000 and detect(text) == 'ro':
                return text
      
    
blogIdx = 3
url = culturalBlogsMd[blogIdx-1]
teosunnyData = extractAll(url,fileName,blogIdx,100)


In [10]:
#vitalie
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()

        for img_tag in soup.find_all('img'):
            img_tag.extract()
        
        for tag in soup.find_all('div', class_='p-tag'):
            tag.extract()

        content_div = soup.find('div', class_='contenuArticle')
        if content_div:
            extracted_content = content_div.get_text()
            soup = BeautifulSoup(extracted_content, 'html.parser')
            text = soup.get_text(separator=" ", strip=True)
            words = text.split()
            # Filter out words containing "https://"
            filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
            filtered_text = ' '.join(filtered_words)
            lines = filtered_text.split('\n')
            text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
            if len(text) > 1000 and detect(text) == 'ro':
                return text
      
    
blogIdx = 4
url = culturalBlogsMd[blogIdx-1]
vitalieData = extractAll(url,fileName,blogIdx,100)


Only 62 articles out of 100 for http://www.vitalie-vovc.com/


In [23]:
#ganduri despletite
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()

        for img_tag in soup.find_all('img'):
            img_tag.extract()
        
        div = soup.find('div', class_='entry-content')
        if div:
            content_div = div.find_all('p')

            if content_div:
                extracted_paragraphs = []
                
                for p_tag in content_div:
                    paragraph_text = p_tag.get_text(separator=" ", strip=True)
                    extracted_paragraphs.append(paragraph_text)
                
                text = ' '.join(extracted_paragraphs)
                
                words = text.split()
                filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
                filtered_text = ' '.join(filtered_words)
                lines = filtered_text.split('\n')
                text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
                
                if len(text) > 1000 and detect(text) == 'ro':
                    return text
      
blogIdx = 5
url = culturalBlogsMd[blogIdx-1]
ganduriDespletiteData = extractAll(url,fileName,blogIdx,100)


In [24]:
culturalDataMd = pd.DataFrame(columns=['siteIdx','link', 'text'])
culturalDataMd = pd.concat([culturalDataMd, natalBotData], ignore_index=True) 
culturalDataMd = pd.concat([culturalDataMd, eucitescData], ignore_index=True) 
culturalDataMd = pd.concat([culturalDataMd, teosunnyData], ignore_index=True) 
culturalDataMd = pd.concat([culturalDataMd, vitalieData], ignore_index=True) 
culturalDataMd = pd.concat([culturalDataMd, ganduriDespletiteData], ignore_index=True) 

culturalDataMd.to_csv(fileName, index=False)

In [9]:
culturalBlogsRo = [
"https://lecturile-emei.blogspot.com/",
"https://jurnalul-unei-cititoare.ro/blog",
"https://blogdecititori.ro",
"https://www.inefabil.ro/",
"https://booknation.ro/",

 ]

fileName = "culturalRo.csv"
with open(fileName, 'w'):
    pass

In [58]:
#lecturileEmei
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()

        for img_tag in soup.find_all('img'):
            img_tag.extract()

        for tr_div in soup.find_all ('tr'):
            tr_div.extract()
        
        div = soup.find('div', class_='entry-content')
    
        text = div.get_text(separator=" ", strip=True)
        words = text.split()
        filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
        filtered_text = ' '.join(filtered_words)
        lines = filtered_text.split('\n')
        text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
        
        if len(text) > 1000 and detect(text) == 'ro':
            return text
      
blogIdx = 1
url = culturalBlogsRo[blogIdx-1]
lecturileemeiData = extractAll(url,fileName,blogIdx,2)


In [6]:
#jurnalul
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()

        for img_tag in soup.find_all('img'):
            img_tag.extract()

        for h_div in soup.find_all ('h1'):
            h_div.extract()
        
        for h_div in soup.find_all ('h2'):
            h_div.extract()

        for h_div in soup.find_all ('h3'):
            h_div.extract()

        for h_div in soup.find_all ('h4'):
            h_div.extract()

        content_div = soup.find_all('div',class_='sqs-html-content')

        if content_div:
            extracted_paragraphs = []
            
            for p_tag in content_div:
                paragraph_text = p_tag.get_text(separator=" ", strip=True)
                extracted_paragraphs.append(paragraph_text)
            
            text = ' '.join(extracted_paragraphs)
            
            words = text.split()
            filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
            filtered_text = ' '.join(filtered_words)
            lines = filtered_text.split('\n')
            text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
            
            if len(text) > 1000 and detect(text) == 'ro':
                return text
      
blogIdx = 2
url = culturalBlogsRo[blogIdx-1]
jurnalulData = extractAll(url,fileName,blogIdx,100)


In [7]:
#blog de cititori
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()

        for script_tag in soup.find_all('script'):
            script_tag.extract()

        for script_tag in soup.find_all('div', class_="comments_intro"):
            script_tag.extract()

        
        for script_tag in soup.find_all('div', class_="commentmetadata"):
            script_tag.extract()

        for script_tag in soup.find_all("div", id="commentsAdd"):
            script_tag.extract()

        div = soup.find('article')
        if div:
            content_div = div.find_all('p')

            if content_div:
                extracted_paragraphs = []
                
                for p_tag in content_div:
                    paragraph_text = p_tag.get_text(separator=" ", strip=True)
                    extracted_paragraphs.append(paragraph_text)
                
                text = ' '.join(extracted_paragraphs)
                
                words = text.split()
                filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
                filtered_text = ' '.join(filtered_words)
                lines = filtered_text.split('\n')
                text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
                
                if len(text) > 1000 and detect(text) == 'ro':
                    return text
      
      
blogIdx = 3
url = culturalBlogsRo[blogIdx-1]
blogDeCititoriData = extractAll(url,fileName,blogIdx,100)


In [8]:
#inefabil
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()

        for img_tag in soup.find_all("span", class_="rt-reading-time"):
            img_tag.extract()

        div = soup.find('div', class_="post-entry")
        if div:
            content_div = div.find_all("span")

            if content_div:
                extracted_paragraphs = []
                
                for p_tag in content_div:
                    paragraph_text = p_tag.get_text(separator=" ", strip=True)
                    extracted_paragraphs.append(paragraph_text)
                
                text = ' '.join(extracted_paragraphs)
                
                words = text.split()
                filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
                filtered_text = ' '.join(filtered_words)
                lines = filtered_text.split('\n')
                text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
                
                if len(text) > 1000 and detect(text) == 'ro':
                    return text
      
      
blogIdx = 4
url = culturalBlogsRo[blogIdx-1]
inefabilData = extractAll(url,fileName,blogIdx,100)


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Only 10 articles out of 100 for https://www.inefabil.ro/


In [13]:
#booknation
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()

        for img_tag in soup.find_all('img'):
            img_tag.extract()
        
        content_div = soup.find('div', class_='et_pb_post_content_0_tb_body')
        if content_div:
            extracted_content = content_div.get_text()
            soup = BeautifulSoup(extracted_content, 'html.parser')
            text = soup.get_text(separator=" ", strip=True)
            words = text.split()
            # Filter out words containing "https://"
            filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
            filtered_text = ' '.join(filtered_words)
            lines = filtered_text.split('\n')
            text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
            if len(text) > 1000 and detect(text) == 'ro':
                return text
      
      
      
blogIdx = 5
url = culturalBlogsRo[blogIdx-1]
booknation = extractAll(url,fileName,blogIdx,50)


In [15]:
culturalDataRo = pd.DataFrame(columns=['siteIdx','link', 'text'])
culturalDataRo = pd.concat([culturalDataRo, lecturileemeiData], ignore_index=True) 
culturalDataRo = pd.concat([culturalDataRo, jurnalulData], ignore_index=True) 
culturalDataRo = pd.concat([culturalDataRo, blogDeCititoriData], ignore_index=True) 
culturalDataRo = pd.concat([culturalDataRo, inefabilData], ignore_index=True) 
culturalDataRo = pd.concat([culturalDataRo, booknation], ignore_index=True) 

culturalDataRo.to_csv(fileName, index=False)



In [60]:
parentingBlogsMd = [
"https://www.planetamami.com/",
"https://copilariamamicilor.wordpress.com/",
"https://jurnaldetatic.blogspot.com/",
"https://vulpea.blog/category/maternitate/"
]


fileName = "parentingMd.csv"
with open(fileName, 'w'):
    pass


In [22]:
#planetamami
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()

        for garb in soup.find_all(class_='entry-title'):
            garb.extract()

        div = soup.find('article', class_="post")
        if div:
            content_div = div.find_all('p')

            if content_div:
                extracted_paragraphs = []
                
                for p_tag in content_div:
                    paragraph_text = p_tag.get_text(separator=" ", strip=True)
                    extracted_paragraphs.append(paragraph_text)
                
                text = ' '.join(extracted_paragraphs)
                
                words = text.split()
                filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
                filtered_text = ' '.join(filtered_words)
                lines = filtered_text.split('\n')
                text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
                
                if len(text) > 1000 and detect(text) == 'ro':
                    return text
      
      
      
blogIdx = 1
url = parentingBlogsMd[blogIdx-1]
planetaMami = extractAll(url,fileName,blogIdx,100)


In [40]:
#copilariamamicilor
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()

        div = soup.find('div', class_="entry-content")
        if div:
            content_div = div.find_all('p')

            if content_div:
                extracted_paragraphs = []
                
                for p_tag in content_div:
                    paragraph_text = p_tag.get_text(separator=" ", strip=True)
                    extracted_paragraphs.append(paragraph_text)
                
                text = ' '.join(extracted_paragraphs)
                
                words = text.split()
                filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
                filtered_text = ' '.join(filtered_words)
                lines = filtered_text.split('\n')
                text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
                
                if len(text) > 1000 and detect(text) == 'ro':
                    return text
      
      
      
blogIdx = 2
url = parentingBlogsMd[blogIdx-1]
copilariaMamicilor = extractAll(url,fileName,blogIdx,100)


3 minute time limit exceeded,  210  unsuccesful tries


In [62]:
#jurnaldetatic
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()

        div = soup.find('div', class_="entry-content")
        if div:
            content_div = div.find_all('p')

            if content_div:
                extracted_paragraphs = []
                
                for p_tag in content_div:
                    paragraph_text = p_tag.get_text(separator=" ", strip=True)
                    extracted_paragraphs.append(paragraph_text)
                
                text = ' '.join(extracted_paragraphs)
                
                words = text.split()
                filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
                filtered_text = ' '.join(filtered_words)
                lines = filtered_text.split('\n')
                text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
                
                if len(text) > 1000 and detect(text) == 'ro':
                    return text
      
      
      
blogIdx = 3
url = parentingBlogsMd[blogIdx-1]
jurnaldetatic = extractAll(url,fileName,blogIdx,100)


Only 20 articles out of 100 for https://jurnaldetatic.blogspot.com/


In [64]:
#vulpea
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()

        div = soup.find('div', class_="entry-content")
        if div:
            content_div = div.find_all('p')

            if content_div:
                extracted_paragraphs = []
                
                for p_tag in content_div:
                    paragraph_text = p_tag.get_text(separator=" ", strip=True)
                    extracted_paragraphs.append(paragraph_text)
                
                text = ' '.join(extracted_paragraphs)
                
                words = text.split()
                filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
                filtered_text = ' '.join(filtered_words)
                lines = filtered_text.split('\n')
                text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
                
                if len(text) > 1000 and detect(text) == 'ro':
                    return text
                

blogIdx = 4
url = parentingBlogsMd[blogIdx-1]
vulpea = extractAll(url,fileName,blogIdx,100)

3 minute time limit exceeded,  440  unsuccesful tries


In [65]:
parentingDataMd = pd.DataFrame(columns=['siteIdx','link', 'text'])
parentingDataMd = pd.concat([parentingDataMd, planetaMami], ignore_index=True) 
parentingDataMd = pd.concat([parentingDataMd, copilariaMamicilor], ignore_index=True) 
parentingDataMd = pd.concat([parentingDataMd, jurnaldetatic], ignore_index=True) 
parentingDataMd = pd.concat([parentingDataMd, vulpea], ignore_index=True) 

parentingDataMd.to_csv(fileName, index=False)


In [56]:
parentingBlogsRo = [
"https://www.parentingconstient.ro/blog/",
"https://suntpitic.ro/",
"https://www.parentingineradigitala.ro/blog",
"https://meseriadeparinte.ro/",
"https://fricidemamici.ro/category/parinteala/"
]

fileName = "parentingRo.csv"
with open(fileName, 'w'):
    pass

parentingDataRo = pd.DataFrame(columns=['siteIdx','link', 'text'])

In [11]:
#parenting constient
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        div = soup.find('div', class_="entry-content")
        if div:
            content_div = div.find_all('p')

            if content_div:
                extracted_paragraphs = []
                
                for p_tag in content_div:
                    paragraph_text = p_tag.get_text(separator=" ", strip=True)
                    extracted_paragraphs.append(paragraph_text)
                
                text = ' '.join(extracted_paragraphs)
                
                words = text.split()
                filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
                filtered_text = ' '.join(filtered_words)
                lines = filtered_text.split('\n')
                text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
                
                if len(text) > 1000 and detect(text) == 'ro':
                    return text
                

blogIdx = 1
url = parentingBlogsRo[blogIdx-1]
parenting_constient = extractAll(url,fileName,blogIdx,100)

Only 70 articles out of 100 for https://www.parentingconstient.ro/blog/


In [49]:
#sunt pitic
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        div = soup.find('div', class_="entry-content")
        if div:
            content_div = div.find_all('p')

            if content_div:
                extracted_paragraphs = []
                
                for p_tag in content_div:
                    paragraph_text = p_tag.get_text(separator=" ", strip=True)
                    extracted_paragraphs.append(paragraph_text)
                
                text = ' '.join(extracted_paragraphs)
                
                words = text.split()
                filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
                filtered_text = ' '.join(filtered_words)
                lines = filtered_text.split('\n')
                text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
                
                if len(text) > 1000 and detect(text) == 'ro':
                    return text
                

blogIdx = 2
url = parentingBlogsRo[blogIdx-1]
suntpitic = extractAll(url,fileName,blogIdx,100)

In [16]:
#era digitala
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        div = soup.find('div', class_="blog-post-body__content")
        if div:
            content_div = div.find_all('p')

            if content_div:
                extracted_paragraphs = []
                
                for p_tag in content_div:
                    paragraph_text = p_tag.get_text(separator=" ", strip=True)
                    extracted_paragraphs.append(paragraph_text)
                
                text = ' '.join(extracted_paragraphs)
                
                words = text.split()
                filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
                filtered_text = ' '.join(filtered_words)
                lines = filtered_text.split('\n')
                text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
                
                if len(text) > 1000 and detect(text) == 'ro':
                    return text
                

blogIdx = 3
url = parentingBlogsRo[blogIdx-1]
era_digitala = extractAll(url,fileName,blogIdx,100)

Only 52 articles out of 100 for https://www.parentingineradigitala.ro/blog


In [54]:
#mederia_de_parinte
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()
            

        content_div = soup.find('div', class_="the_content")

        
        if content_div:
            extracted_content = content_div.get_text()
            soup = BeautifulSoup(extracted_content, 'html.parser')
            text = soup.get_text(separator=" ", strip=True)
            words = text.split()
            # Filter out words containing "https://"
            filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
            filtered_text = ' '.join(filtered_words)
            lines = filtered_text.split('\n')
            text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
            if len(text) > 100 and detect(text) == 'ro':
                return text
      
    
blogIdx = 4
url = parentingBlogsRo[blogIdx-1]
mederia_de_parinte = extractAll(url,fileName,blogIdx,100)



In [58]:
#frici de mamici
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()
            

        content_div = soup.find('div', class_="cm-entry-summary")

        
        if content_div:
            extracted_content = content_div.get_text()
            soup = BeautifulSoup(extracted_content, 'html.parser')
            text = soup.get_text(separator=" ", strip=True)
            words = text.split()
            # Filter out words containing "https://"
            filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
            filtered_text = ' '.join(filtered_words)
            lines = filtered_text.split('\n')
            text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
            if len(text) > 100 and detect(text) == 'ro':
                return text
      
    
blogIdx = 5
url = parentingBlogsRo[blogIdx-1]
frici_de_mamici = extractAll(url,fileName,blogIdx,100)



In [60]:
parentingDataRo = pd.DataFrame(columns=['siteIdx','link', 'text'])
parentingDataRo = pd.concat([parentingDataRo, parenting_constient], ignore_index=True) 
parentingDataRo = pd.concat([parentingDataRo, suntpitic], ignore_index=True) 
parentingDataRo = pd.concat([parentingDataRo, era_digitala], ignore_index=True) 
parentingDataRo = pd.concat([parentingDataRo, mederia_de_parinte], ignore_index=True) 
parentingDataRo = pd.concat([parentingDataRo, frici_de_mamici], ignore_index=True) 

parentingDataRo.to_csv(fileName, index=False)


In [89]:
sportBlogsRo = [
"https://www.holokolo.ro/blog/reactor",
"https://www.humanfitness.ro/blog",
"https://www.imreandrea.ro/blog/",
"https://dancefit.ro/blog/",
"https://almalibre.ro/blog/",
"https://alexanderflorescu.ro/blog/",
"https://doinaiosif.ro/blog",
]

fileName = "sportRo.csv"
with open(fileName, 'w'):
    pass


In [65]:
#kolokolo
import json
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        script_tag = soup.find('script', {'type': 'application/ld+json'})
        
        if script_tag:
            json_content = json.loads(script_tag.string)
            article_body = json_content.get('articleBody', '')
            soup = BeautifulSoup(article_body, 'html.parser')
            text = soup.get_text(separator=" ", strip=True)
            words = text.split()
            
            filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
            filtered_text = ' '.join(filtered_words)
            lines = filtered_text.split('\n')
            text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0 and '.' in line)
            
            if len(text) > 100 and detect(text) == 'ro':
                return text
            
blogIdx = 1
url = sportBlogsRo[blogIdx-1]
holokolo = extractAll(url,fileName,blogIdx,100)

3 minute time limit exceeded,  23  unsuccesful tries


In [69]:
#human fitness
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()
            

        content_div = soup.find('div', class_="elementor-element-493dc825")

        
        if content_div:
            extracted_content = content_div.get_text()
            soup = BeautifulSoup(extracted_content, 'html.parser')
            text = soup.get_text(separator=" ", strip=True)
            words = text.split()
            # Filter out words containing "https://"
            filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
            filtered_text = ' '.join(filtered_words)
            lines = filtered_text.split('\n')
            text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
            if len(text) > 100 and detect(text) == 'ro':
                return text
      
    
blogIdx = 2
url = sportBlogsRo[blogIdx-1]
human_fitness = extractAll(url,fileName,blogIdx,100)



Only 60 articles out of 100 for https://www.humanfitness.ro/blog


In [73]:
#imreandrea
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()
            

        content_div = soup.find('div', class_="text-content")

        
        if content_div:
            extracted_content = content_div.get_text()
            soup = BeautifulSoup(extracted_content, 'html.parser')
            text = soup.get_text(separator=" ", strip=True)
            words = text.split()
            # Filter out words containing "https://"
            filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
            filtered_text = ' '.join(filtered_words)
            lines = filtered_text.split('\n')
            text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
            if len(text) > 100 and detect(text) == 'ro':
                return text
      
    
blogIdx = 3
url = sportBlogsRo[blogIdx-1]
imreandrea = extractAll(url,fileName,blogIdx,100)

Only 34 articles out of 100 for https://www.imreandrea.ro/blog/


In [71]:
#dance fit
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()
            

        content_div = soup.find('div', class_="entry-content")

        
        if content_div:
            extracted_content = content_div.get_text()
            soup = BeautifulSoup(extracted_content, 'html.parser')
            text = soup.get_text(separator=" ", strip=True)
            words = text.split()
            # Filter out words containing "https://"
            filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
            filtered_text = ' '.join(filtered_words)
            lines = filtered_text.split('\n')
            text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
            if len(text) > 100 and detect(text) == 'ro':
                return text
      
    
blogIdx = 4
url = sportBlogsRo[blogIdx-1]
dance_fit = extractAll(url,fileName,blogIdx,100)



Only 41 articles out of 100 for https://dancefit.ro/blog/


In [75]:
#alma_libre
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()
            

        content_div = soup.find('div', class_="entry-content")

        
        if content_div:
            extracted_content = content_div.get_text()
            soup = BeautifulSoup(extracted_content, 'html.parser')
            text = soup.get_text(separator=" ", strip=True)
            words = text.split()
            # Filter out words containing "https://"
            filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
            filtered_text = ' '.join(filtered_words)
            lines = filtered_text.split('\n')
            text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
            if len(text) > 100 and detect(text) == 'ro':
                return text
      
    
blogIdx = 5
url = sportBlogsRo[blogIdx-1]
alma_libre = extractAll(url,fileName,blogIdx,100)



Only 29 articles out of 100 for https://almalibre.ro/blog/


In [77]:
#alexander_florescu
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()
            

        content_div = soup.find('div', class_="entry-content")

        
        if content_div:
            extracted_content = content_div.get_text()
            soup = BeautifulSoup(extracted_content, 'html.parser')
            text = soup.get_text(separator=" ", strip=True)
            words = text.split()
            # Filter out words containing "https://"
            filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
            filtered_text = ' '.join(filtered_words)
            lines = filtered_text.split('\n')
            text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
            if len(text) > 100 and detect(text) == 'ro':
                return text
      
    
blogIdx = 6
url = sportBlogsRo[blogIdx-1]
alexander_florescu = extractAll(url,fileName,blogIdx,100)



Only 37 articles out of 100 for https://alexanderflorescu.ro/blog/


In [92]:
#doina_iosif
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()
            

        content_div = soup.find('div', class_="elementor-widget-theme-post-content")

        
        if content_div:
            extracted_content = content_div.get_text()
            soup = BeautifulSoup(extracted_content, 'html.parser')
            text = soup.get_text(separator=" ", strip=True)
            words = text.split()
            # Filter out words containing "https://"
            filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
            filtered_text = ' '.join(filtered_words)
            lines = filtered_text.split('\n')
            text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
            if len(text) > 1000 and detect(text) == 'ro':
                return text
      
    
blogIdx = 7
url = sportBlogsRo[blogIdx-1]
doina_iosif = extractAll(url,fileName,blogIdx,100)



Only 56 articles out of 100 for https://doinaiosif.ro/blog


In [93]:
sportDataRo = pd.DataFrame(columns=['siteIdx','link', 'text'])
sportDataRo = pd.concat([sportDataRo, holokolo], ignore_index=True) 
sportDataRo = pd.concat([sportDataRo, human_fitness], ignore_index=True) 
sportDataRo = pd.concat([sportDataRo, imreandrea], ignore_index=True) 
sportDataRo = pd.concat([sportDataRo, dance_fit], ignore_index=True) 
sportDataRo = pd.concat([sportDataRo, alma_libre], ignore_index=True) 
sportDataRo = pd.concat([sportDataRo, alexander_florescu], ignore_index=True) 
sportDataRo = pd.concat([sportDataRo, doina_iosif], ignore_index=True) 


sportDataRo.to_csv(fileName, index=False)

In [115]:
sportBlogsMd = [
"https://blog.antrenor.md/",
"https://sandugrecu.blogspot.com/",
"https://acvilasport.md/blog",
"https://dinamo.md/blog/",
"https://unica.md/sport/",
"https://www.ellefitness.md/ro/blog",
"https://fmf.md/blog"
]

fileName = "sportMd.csv"
with open(fileName, 'w'):
    pass


In [106]:
#antrenor
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')


        content_div = soup.find_all('p')

        if content_div:
            extracted_paragraphs = []
            
            for p_tag in content_div:
                paragraph_text = p_tag.get_text(separator=" ", strip=True)
                extracted_paragraphs.append(paragraph_text)
            
            text = ' '.join(extracted_paragraphs)
            
            words = text.split()
            filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
            filtered_text = ' '.join(filtered_words)
            lines = filtered_text.split('\n')
            text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
            
            if len(text) > 100 and detect(text) == 'ro':
                return text
                
      
    
blogIdx = 1
url = sportBlogsMd[blogIdx-1]
antrenor = extractAll(url,fileName,blogIdx,100)



Only 12 articles out of 100 for https://blog.antrenor.md/


In [108]:
#sandu_grecu
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()
            

        content_div = soup.find('div', class_="entry-content")

        
        if content_div:
            extracted_content = content_div.get_text()
            soup = BeautifulSoup(extracted_content, 'html.parser')
            text = soup.get_text(separator=" ", strip=True)
            words = text.split()
            # Filter out words containing "https://"
            filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
            filtered_text = ' '.join(filtered_words)
            lines = filtered_text.split('\n')
            text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
            if len(text) > 1000 and detect(text) == 'ro':
                return text
      
    
blogIdx = 2
url = sportBlogsMd[blogIdx-1]
sandu_grecu = extractAll(url,fileName,blogIdx,100)



  soup = BeautifulSoup(extracted_content, 'html.parser')


In [110]:

#acvila sport
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()
            

        content_div = soup.find('div', class_="text mt-20-768")

        
        if content_div:
            extracted_content = content_div.get_text()
            soup = BeautifulSoup(extracted_content, 'html.parser')
            text = soup.get_text(separator=" ", strip=True)
            words = text.split()
            # Filter out words containing "https://"
            filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
            filtered_text = ' '.join(filtered_words)
            lines = filtered_text.split('\n')
            text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
            if len(text) > 1000 and detect(text) == 'ro':
                return text
      
    
blogIdx = 3
url = sportBlogsMd[blogIdx-1]
acvila_sport = extractAll(url,fileName,blogIdx,100)



3 minute time limit exceeded,  634  unsuccesful tries


In [114]:
#dinamo
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()

        div = soup.find('div',class_='main-content')
        if div:
            content_div = div.find_all('p')

            if content_div:
                extracted_paragraphs = []
                
                for p_tag in content_div:
                    paragraph_text = p_tag.get_text(separator=" ", strip=True)
                    extracted_paragraphs.append(paragraph_text)
                
                text = ' '.join(extracted_paragraphs)
                
                words = text.split()
                filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
                filtered_text = ' '.join(filtered_words)
                lines = filtered_text.split('\n')
                text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
                
                if len(text) > 1000 and detect(text) == 'ro':
                    return text
      
blogIdx = 4
url = sportBlogsMd[blogIdx-1]
dinamo = extractAll(url,fileName,blogIdx,100)

In [117]:

#unica
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()

        div = soup.find('div',class_='article-content')
        if div:
            content_div = div.find_all('p')

            if content_div:
                extracted_paragraphs = []
                
                for p_tag in content_div:
                    paragraph_text = p_tag.get_text(separator=" ", strip=True)
                    extracted_paragraphs.append(paragraph_text)
                
                text = ' '.join(extracted_paragraphs)
                
                words = text.split()
                filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
                filtered_text = ' '.join(filtered_words)
                lines = filtered_text.split('\n')
                text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
                
                if len(text) > 1000 and detect(text) == 'ro':
                    return text
      
blogIdx = 5
url = sportBlogsMd[blogIdx-1]
unica = extractAll(url,fileName,blogIdx,100)

In [118]:
sportDataMd = pd.DataFrame(columns=['siteIdx','link', 'text'])
sportDataMd = pd.concat([sportDataMd, antrenor], ignore_index=True) 
sportDataMd = pd.concat([sportDataMd, sandu_grecu], ignore_index=True) 
sportDataMd = pd.concat([sportDataMd, acvila_sport], ignore_index=True) 
sportDataMd = pd.concat([sportDataMd, dinamo], ignore_index=True) 
sportDataMd = pd.concat([sportDataMd, unica], ignore_index=True) 


sportDataMd.to_csv(fileName, index=False)

In [125]:
travelBlogsMd = [
"https://nataalbot.md/category/calatorii/",
"https://travelblog.md/",
"http://gurez.md/",
"https://orheianca.eu/",
"https://anamariaursublog.wordpress.com/",
"https://eucalatorul.net/",
"https://altblogdecalatorii.wordpress.com/",
"https://wagabond.blog/"
]

fileName = "travelMd.csv"
with open(fileName, 'w'):
    pass


In [122]:
#nataalbot
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        meta_tag = soup.find('meta', property="og:description")

        if meta_tag:
            extracted_content = meta_tag['content']
            
            if extracted_content:
                soup = BeautifulSoup(extracted_content, 'html.parser')
                text = soup.get_text(separator=" ", strip=True)
                words = text.split()
                # Filter out words containing "https://"
                filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
                filtered_text = ' '.join(filtered_words)
                lines = filtered_text.split('\n')
                text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
                if len(text) > 1000 and detect(text) == 'ro':
                    return text
                

blogIdx = 1
url = travelBlogsMd[blogIdx-1]
nataalbot = extractAll(url,fileName,blogIdx,100)

In [124]:
#travel blog
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()

        div = soup.find('div',class_='entry-content')
        if div:
            content_div = div.find_all('p')

            if content_div:
                extracted_paragraphs = []
                
                for p_tag in content_div:
                    paragraph_text = p_tag.get_text(separator=" ", strip=True)
                    extracted_paragraphs.append(paragraph_text)
                
                text = ' '.join(extracted_paragraphs)
                
                words = text.split()
                filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
                filtered_text = ' '.join(filtered_words)
                lines = filtered_text.split('\n')
                text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
                
                if len(text) > 1000 and detect(text) == 'ro':
                    return text
      
blogIdx = 2
url = travelBlogsMd[blogIdx-1]
travel_blog = extractAll(url,fileName,blogIdx,100)

In [127]:
#gurez
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()

        div = soup.find('div',class_='post-entry')
        if div:
            content_div = div.find_all('p')

            if content_div:
                extracted_paragraphs = []
                
                for p_tag in content_div:
                    paragraph_text = p_tag.get_text(separator=" ", strip=True)
                    extracted_paragraphs.append(paragraph_text)
                
                text = ' '.join(extracted_paragraphs)
                
                words = text.split()
                filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
                filtered_text = ' '.join(filtered_words)
                lines = filtered_text.split('\n')
                text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
                
                if len(text) > 1000 and detect(text) == 'ro':
                    return text
      
blogIdx = 3
url = travelBlogsMd[blogIdx-1]
gurez = extractAll(url,fileName,blogIdx,100)

In [130]:
#orheianca
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()

        post_tags_div = soup.find('div', class_='post-tags')
        if post_tags_div:
            post_tags_div.extract()

        div = soup.find('div',id='content-area')
        if div:
            content_div = div.find_all('p')

            if content_div:
                extracted_paragraphs = []
                
                for p_tag in content_div:
                    paragraph_text = p_tag.get_text(separator=" ", strip=True)
                    extracted_paragraphs.append(paragraph_text)
                
                text = ' '.join(extracted_paragraphs)
                
                words = text.split()
                filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
                filtered_text = ' '.join(filtered_words)
                lines = filtered_text.split('\n')
                text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
                
                if len(text) > 1000 and detect(text) == 'ro':
                    return text
      
blogIdx = 4
url = travelBlogsMd[blogIdx-1]
orheianca = extractAll(url,fileName,blogIdx,100)

In [132]:
#ursul
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()

        div = soup.find('div',class_='entry-content')
        if div:
            content_div = div.find_all('p')

            if content_div:
                extracted_paragraphs = []
                
                for p_tag in content_div:
                    paragraph_text = p_tag.get_text(separator=" ", strip=True)
                    extracted_paragraphs.append(paragraph_text)
                
                text = ' '.join(extracted_paragraphs)
                
                words = text.split()
                filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
                filtered_text = ' '.join(filtered_words)
                lines = filtered_text.split('\n')
                text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
                
                if len(text) > 1000 and detect(text) == 'ro':
                    return text
      
blogIdx = 5
url = travelBlogsMd[blogIdx-1]
ursul = extractAll(url,fileName,blogIdx,100)

Only 54 articles out of 100 for https://anamariaursublog.wordpress.com/


In [133]:
travelDataMd = pd.DataFrame(columns=['siteIdx','link', 'text'])
travelDataMd = pd.concat([travelDataMd, nataalbot], ignore_index=True) 
travelDataMd = pd.concat([travelDataMd, travel_blog], ignore_index=True) 
travelDataMd = pd.concat([travelDataMd, gurez], ignore_index=True) 
travelDataMd = pd.concat([travelDataMd, orheianca], ignore_index=True) 
travelDataMd = pd.concat([travelDataMd, ursul], ignore_index=True) 


travelDataMd.to_csv(fileName, index=False)



In [150]:
travelBlogsRo = [
"https://www.imperatortravel.ro/",
"https://lipa-lipa.ro/",
"https://sacalatorim.ro/",
"https://travelista.ro/",
"https://blogulmeudecalator.ro/",
"https://chiperaria.wordpress.com/",
"https://www.mihaijurca.ro/"
]


fileName = "travelRo.csv"
with open(fileName, 'w'):
    pass


In [139]:
#imperator travel

def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()

        post_tags_div = soup.find('div', class_="comment-respond")
        if post_tags_div:
            post_tags_div.extract()


        content_div = soup.find_all('p')

        if content_div:
            extracted_paragraphs = []
            
            for p_tag in content_div:
                paragraph_text = p_tag.get_text(separator=" ", strip=True)
                extracted_paragraphs.append(paragraph_text)
            
            text = ' '.join(extracted_paragraphs)
            
            words = text.split()
            filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
            filtered_text = ' '.join(filtered_words)
            lines = filtered_text.split('\n')
            text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
            
            if len(text) > 1000 and detect(text) == 'ro':
                return text
    
blogIdx = 1
url = travelBlogsRo[blogIdx-1]
imperator = extractAll(url,fileName,blogIdx,100)

In [142]:
#lipa-lipa
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()

        div = soup.find('div',class_='entry-content')
        if div:
            content_div = div.find_all('p')

            if content_div:
                extracted_paragraphs = []
                
                for p_tag in content_div:
                    paragraph_text = p_tag.get_text(separator=" ", strip=True)
                    extracted_paragraphs.append(paragraph_text)
                
                text = ' '.join(extracted_paragraphs)
                
                words = text.split()
                filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
                filtered_text = ' '.join(filtered_words)
                lines = filtered_text.split('\n')
                text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
                
                if len(text) > 1000 and detect(text) == 'ro':
                    return text
      
blogIdx = 2
url = travelBlogsMd[blogIdx-1]
lipa_lipa = extractAll(url,fileName,blogIdx,100)

In [146]:
# sa calatorim

def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()

        post_tags_div = soup.find('div', class_="tdm-descr")
        if post_tags_div:
            post_tags_div.extract()

        post_tags_div = soup.find('div', class_="comments")
        if post_tags_div:
            post_tags_div.extract()

        post_tags_div = soup.find('div', class_="tds-title")
        if post_tags_div:
            post_tags_div.extract()

        content_div = soup.find_all('p')

        if content_div:
            extracted_paragraphs = []
            
            for p_tag in content_div:
                paragraph_text = p_tag.get_text(separator=" ", strip=True)
                extracted_paragraphs.append(paragraph_text)
            
            text = ' '.join(extracted_paragraphs)
            
            words = text.split()
            filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
            filtered_text = ' '.join(filtered_words)
            lines = filtered_text.split('\n')
            text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
            
            if len(text) > 1000 and detect(text) == 'ro':
                return text
    
blogIdx = 3
url = travelBlogsRo[blogIdx-1]
sacalatorim = extractAll(url,fileName,blogIdx,100)

In [152]:
# travelista
def extractText(url):
    res = requests.get(url, verify=False)
    if res.status_code == 200:
        htmlPage = res.content
        soup = BeautifulSoup(htmlPage, 'html.parser')

        for a_tag in soup.find_all('a'):
            a_tag.extract()
        
        for img_tag in soup.find_all('img'):
            img_tag.extract()

        div = soup.find('div',class_='mkdf-post-text-main')
        if div:
            content_div = div.find_all('p')

            if content_div:
                extracted_paragraphs = []
                
                for p_tag in content_div:
                    paragraph_text = p_tag.get_text(separator=" ", strip=True)
                    extracted_paragraphs.append(paragraph_text)
                
                text = ' '.join(extracted_paragraphs)
                
                words = text.split()
                filtered_words = [word for word in words if "https://" not in word and "http://" not in word]
                filtered_text = ' '.join(filtered_words)
                lines = filtered_text.split('\n')
                text = '\n'.join(line.strip() for line in lines if len(line.strip()) > 0  and '.' in line)
                
                if len(text) > 1000 and detect(text) == 'ro':
                    return text
      
blogIdx = 4
url = travelBlogsRo[blogIdx-1]
travelista = extractAll(url,fileName,blogIdx,100)

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHA

In [154]:
travelDataRo = pd.DataFrame(columns=['siteIdx','link', 'text'])
travelDataRo = pd.concat([travelDataRo, imperator], ignore_index=True) 
travelDataRo = pd.concat([travelDataRo, lipa_lipa], ignore_index=True) 
travelDataRo = pd.concat([travelDataRo, sacalatorim], ignore_index=True) 
travelDataRo = pd.concat([travelDataRo, travelista], ignore_index=True) 


travelDataRo.to_csv(fileName, index=False)