# Web Scrapping

In [61]:
# import libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [62]:
# Define headers to make the request look like it's from a real browser
headers = {
    "Accept": "application/json, text/plain, */*",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
}

# Send GET request to JobLeads and get page content
website = requests.get(
    "https://www.jobleads.com/search/jobs?noQuizAfterRegister=1&quizClosable=1&campaignId=100&location_country=PAK&keywords=indeed",
    headers=headers
).text  # Convert response to text


In [63]:
# Load the HTML text into BeautifulSoup so we can start scraping.
soup = BeautifulSoup(website, 'html.parser')

In [64]:
print(soup.prettify())

<!DOCTYPE html>
<html data-theme="aurora" lang="en-US" translate="no">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1.0, maximum-scale=1.0" name="viewport"/>
  <title>
   Enhance Your Job Search Now | JobLeads Job Search
  </title>
  <link href="/nuxt/css/fonts-aurora.css" nonce="e0hE6AbWCsrbqItfnyuFiw==" rel="stylesheet"/>
  <style nonce="e0hE6AbWCsrbqItfnyuFiw==">
  </style>
  <style nonce="e0hE6AbWCsrbqItfnyuFiw==">
  </style>
  <style nonce="e0hE6AbWCsrbqItfnyuFiw==">
   :export{mobile:768px;sm:320px;xsm:640px;md:768px;lg:980px;xl:1280px;xxl:1440px;uhd:1920px}.jlu-icon{align-items:center;display:inline-flex;height:24px;justify-content:center;width:24px}.jlu-icon:after{background-color:currentColor;content:"";display:flex;height:100%;-webkit-mask-position:center;mask-position:center;-webkit-mask-repeat:no-repeat;mask-repeat:no-repeat;-webkit-mask-size:contain;mask-size:contain;width:100%}.jlu-icon_account:after{-webkit-mask-image:url("data:ima

In [65]:
# Get the text from the very first <div> tag found in the HTML.
soup.find_all('div')[0].text

'Jobs Headhunters Free resume review About Us ENRegisterLoginEmpoweringjob seekersToolsJobsResume reviewHeadhuntersCompanyAbout usCareers at JobLeadsSite noticeReviewsSupportHelpPartner integrationATS PartnersSocialYouTubeLinkedInInstagramFacebookPrivacy PolicyTerms of UseCountriesChoose your preferences and find exciting, senior-level jobs in 40+ countriesAmericaArgentinaBrazilCanadaChileColombiaMexicoPeruUnited StatesVenezuelaEuropeAustriaBelgiumDenmarkFinlandFranceGermanyIrelandItalyNetherlandsNorwayPolandPortugalSpainSwedenSwitzerlandUnited KingdomAsiaBahrainHong KongIndiaIndonesiaKuwaitMalaysiaOmanPakistanPhilippinesQatarSaudi ArabiaSingaporeTurkeyUnited Arab EmiratesOceaniaAustraliaNew ZealandAfricaSouth Africa © JobLeads 2007 - 2025 | All rights reserved'

In [66]:
# Count total <div> elements on the page
len(soup.find_all('div'))

31

In [67]:
# Loop through all <div> elements and print their text content without extra spaces
for i in soup.find_all('div'):
    print(i.text.strip())

Jobs Headhunters Free resume review About Us ENRegisterLoginEmpoweringjob seekersToolsJobsResume reviewHeadhuntersCompanyAbout usCareers at JobLeadsSite noticeReviewsSupportHelpPartner integrationATS PartnersSocialYouTubeLinkedInInstagramFacebookPrivacy PolicyTerms of UseCountriesChoose your preferences and find exciting, senior-level jobs in 40+ countriesAmericaArgentinaBrazilCanadaChileColombiaMexicoPeruUnited StatesVenezuelaEuropeAustriaBelgiumDenmarkFinlandFranceGermanyIrelandItalyNetherlandsNorwayPolandPortugalSpainSwedenSwitzerlandUnited KingdomAsiaBahrainHong KongIndiaIndonesiaKuwaitMalaysiaOmanPakistanPhilippinesQatarSaudi ArabiaSingaporeTurkeyUnited Arab EmiratesOceaniaAustraliaNew ZealandAfricaSouth Africa © JobLeads 2007 - 2025 | All rights reserved
Jobs Headhunters Free resume review About Us ENRegisterLoginEmpoweringjob seekersToolsJobsResume reviewHeadhuntersCompanyAbout usCareers at JobLeadsSite noticeReviewsSupportHelpPartner integrationATS PartnersSocialYouTubeLinkedIn

In [68]:
# Loop through all <p> (paragraph) elements and print their cleaned text
for i in soup.find_all('p'):
    print(i.text.strip())

America
Europe
Asia
Oceania
Africa
© JobLeads 2007 - 2025 | All rights reserved


In [69]:
# Loop through all <button> elements and print their text content
for i in soup.find_all('button'):
    print(i.text.strip())

Register
Login
Countries


In [70]:
company = soup.find_all('div', class_='JobsCard_container__kErNU')

In [72]:
company

[]

# new scraping

In [73]:
def scrape_wiki_page(page_name):
    url = f"https://en.wikipedia.org/wiki/{page_name}"
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/120.0.0.0 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9"
    }
    
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    
    # Extract page title
    title = soup.select_one("h1#firstHeading").get_text(strip=True)
    
    # Extract **all paragraphs**
    paragraphs = soup.select("div.mw-parser-output > p")
    all_text = []
    for p in paragraphs:
        text = p.get_text(strip=True)
        if text:
            all_text.append(text) 
    
    full_text = "\n".join(all_text)   # join with new lines
    
    return {
        "title": title,
        "url": url,
        "full_text": full_text
    }

# --- Scrape multiple pages ---
pages = ["Artificial_intelligence", "Machine_learning", "Deep_learning"]
results = [scrape_wiki_page(p) for p in pages]

# --- Save into CSV ---
df = pd.DataFrame(results)
df.to_csv("../Dataset/wikipedia_full_articles.csv", index=False, encoding="utf-8")

print("✅ Scraping complete! Saved to wikipedia_full_articles.csv")
print(df.head())


✅ Scraping complete! Saved to wikipedia_full_articles.csv
                     title                                                url  \
0  Artificial intelligence  https://en.wikipedia.org/wiki/Artificial_intel...   
1         Machine learning     https://en.wikipedia.org/wiki/Machine_learning   
2            Deep learning        https://en.wikipedia.org/wiki/Deep_learning   

                                           full_text  
0  Artificial intelligence(AI) is the capability ...  
1                                                     
2  Inmachine learning,deep learningfocuses on uti...  
