In [30]:
#pip install selenium webdriver-manager pandas beautifulsoup4

In [108]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
from bs4 import BeautifulSoup

# Set up Selenium: build the Chrome options, obtain the driver path from
# webdriver-manager's install(), and start a single Chrome session.
options = Options()
# options.add_argument("--headless")  # Uncomment to run in headless mode (no GUI)
service = Service(ChromeDriverManager().install())
# `driver` is a module-level global; scrape_job_details() and scrape_jobs()
# below both navigate with it, and it is closed with driver.quit() after scraping.
driver = webdriver.Chrome(service=service, options=options)

def _text_or_na(soup, tag, css_class):
    """Return the stripped text of the first `tag` carrying `css_class`, or 'N/A' if absent."""
    node = soup.find(tag, class_=css_class)
    return node.text.strip() if node else 'N/A'


def scrape_job_details(job_url):
    """Open a job-detail page and extract its header fields.

    Uses the module-level ``driver`` to load ``job_url``, waits up to 10 seconds
    for the title element to be present, then parses the rendered page source.

    Args:
        job_url: Absolute URL of the job-detail page.

    Returns:
        dict with the keys 'Job Title', 'Company', 'Location', 'Salary',
        'Experience', 'Jobposted', 'Education' and 'Role'. A field whose element
        is not found on the page is reported as 'N/A'.

    Raises:
        Whatever ``WebDriverWait.until`` raises when the title element does not
        appear within the timeout; the caller is expected to handle it.
    """
    driver.get(job_url)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'styles_jd-header-title__rZwM1'))
    )

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Output key -> (tag, CSS class). Each selector is written once and looked up
    # once, instead of being repeated in both the condition and the value.
    field_selectors = {
        'Job Title': ('h1', 'styles_jd-header-title__rZwM1'),
        'Company': ('div', 'styles_jd-header-comp-name__MvqAI'),
        'Location': ('span', 'styles_jhc__location__W_pVs'),
        'Salary': ('div', 'styles_jhc__salary__jdfEC'),
        'Experience': ('div', 'styles_jhc__exp__k_giM'),
        'Jobposted': ('span', 'styles_jhc__stat__PgY67'),
        'Education': ('div', 'styles_education__KXFkO'),
        'Role': ('div', 'styles_details__Y424J'),
    }

    return {
        field: _text_or_na(soup, tag, css_class)
        for field, (tag, css_class) in field_selectors.items()
    }

def scrape_jobs(base_url, max_pages=50):
    """Scrape job listings from consecutive search-result pages.

    For each result page the listing cards are collected, each card's title link
    is followed, and the detail page is scraped with ``scrape_job_details``.

    Args:
        base_url: Search-results URL. The page number is appended as a ``page``
            query parameter ('&page=N' when the URL already has a query string,
            '?page=N' otherwise).
        max_pages: Number of result pages to visit, starting at page 1.
            Defaults to 50, the previously hard-coded value.

    Returns:
        pandas.DataFrame with one row per successfully scraped job.

    Notes:
        If no listing appears on a result page within 10 seconds, pagination
        stops and the jobs collected so far are returned, rather than the
        timeout propagating and discarding everything already scraped.
        Failures on an individual job are printed and skipped.
    """
    # Local import: selenium is already a dependency of this notebook, and this
    # keeps the function self-contained without touching the import cell.
    from selenium.common.exceptions import TimeoutException

    job_data = []
    separator = '&' if '?' in base_url else '?'

    for page in range(1, max_pages + 1):
        url = f"{base_url}{separator}page={page}"  # Construct the URL for each page
        driver.get(url)
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'srp-jobtuple-wrapper'))
            )
        except TimeoutException:
            # Most likely we ran past the last results page; keep what we have.
            print(f"No job listings found on {url}; stopping pagination")
            break

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        job_listings = soup.find_all('div', class_='srp-jobtuple-wrapper')

        print(f"Loaded {len(job_listings)} job listings from {url}")

        for job in job_listings:
            try:
                # Get job link; skip cards that carry no usable title link.
                title_link = job.find('a', class_='title')
                job_link = title_link.get('href') if title_link else None
                if not job_link:
                    print("Error processing job: listing has no title link")
                    continue
                full_job_url = job_link if job_link.startswith('http') else 'https://www.naukri.com' + job_link

                # Scrape job details
                job_details = scrape_job_details(full_job_url)

                # Print details
                print(job_details)
                job_data.append(job_details)

            except Exception as e:
                # Best-effort: one broken detail page must not abort the whole run.
                print(f"Error processing job: {e}")

    # Convert list to DataFrame
    return pd.DataFrame(job_data)

# Start scraping from the first page
try:
    jobs_df = scrape_jobs('https://www.naukri.com/data-scientist-jobs?experience=0')
finally:
    # Always close the browser, even when scraping raises, so no Chrome
    # process is left running.
    driver.quit()

# Nothing is written to disk in this cell (the CSV export is done separately),
# so the message reports only what actually happened.
#jobs_df.to_csv('naukri_jobs.csv', index=False)
print(f"Scraping completed: {len(jobs_df)} jobs collected (not yet saved to disk)")


Loaded 20 job listings from https://www.naukri.com/data-scientist-jobs?experience=0&page=1
{'Job Title': 'Data Scientist', 'Company': 'Decision Point4.3110 Reviews', 'Location': 'Gurugram', 'Salary': 'Not Disclosed', 'Experience': '0 - 3 years', 'Jobposted': 'Posted: 1 day ago', 'Education': 'EducationUG: Any Graduate', 'Role': 'Role: Data Scientist,'}
{'Job Title': 'Data Scientist', 'Company': 'Aptlogica Technologies4.514 Reviews', 'Location': 'Pune', 'Salary': 'Not Disclosed', 'Experience': '0 - 2 years', 'Jobposted': 'Posted: 9 days ago', 'Education': 'EducationUG: Any GraduatePG: Any Postgraduate', 'Role': 'Role: Full Stack Data Scientist,'}
{'Job Title': 'Data Scientist - Phd only', 'Company': 'NDS Infoserv2.38 Reviews', 'Location': 'Mumbai', 'Salary': '12-15 Lacs P.A.', 'Experience': '0 - 2 years', 'Jobposted': 'Posted: 4 days ago', 'Education': 'EducationDoctorate: Ph.D/Doctorate in Electronics/Telecommunication, Computers', 'Role': 'Role: Data Scientist,'}
{'Job Title': 'Quanti

In [109]:
# Preview the scraped jobs; as the cell's last expression the DataFrame is
# rendered by the notebook.
jobs_df

Unnamed: 0,Job Title,Company,Location,Salary,Experience,Jobposted,Education,Role
0,Data Scientist,Decision Point4.3110 Reviews,Gurugram,Not Disclosed,0 - 3 years,Posted: 1 day ago,EducationUG: Any Graduate,"Role: Data Scientist,"
1,Data Scientist,Aptlogica Technologies4.514 Reviews,Pune,Not Disclosed,0 - 2 years,Posted: 9 days ago,EducationUG: Any GraduatePG: Any Postgraduate,"Role: Full Stack Data Scientist,"
2,Data Scientist - Phd only,NDS Infoserv2.38 Reviews,Mumbai,12-15 Lacs P.A.,0 - 2 years,Posted: 4 days ago,EducationDoctorate: Ph.D/Doctorate in Electron...,"Role: Data Scientist,"
3,Quantitative Analyst / Data Scientist,Bestex Research,Bengaluru,Not Disclosed,0 - 3 years,Posted: 30+ days ago,EducationUG: Any GraduatePG: MBA/PGDM in Marke...,"Role: Data Scientist,"
4,Principal Data Scientist/Senior Data Scientist...,Benovymed Healthcare,"Noida, Gurugram",Not Disclosed,0 - 5 years,Posted: 19 days ago,EducationUG: B.Tech/B.E. in Any Specialization...,"Role: Data Scientist,"
...,...,...,...,...,...,...,...,...
995,Data Scientist,Clustor Computing3.74 Reviews,Nagpur,Unpaid,No fixed duration,Posted: 30+ days ago,EducationUG: Any GraduatePG: Any Postgraduate,"Role: Data Scientist,"
996,Data Scientist,Scimitar Research,Pune,Not Disclosed,0 - 1 years,Posted: 30+ days ago,EducationUG: Any GraduatePG: Any Postgraduate,"Role: Data Scientist,"
997,Data scientist,Growthjockey4.37 Reviews,Gurugram,Not Disclosed,0 - 1 years,Posted: 30+ days ago,EducationUG: Any GraduatePG: Any Postgraduate,"Role: Data Scientist,"
998,ML / Data Scientist / Gen AI Specialist,Aiqod3.76 Reviews,Pune,Not Disclosed,0 - 4 years,Posted: 30+ days ago,EducationUG: Any GraduatePG: Any Postgraduate,"Role: Data Scientist,"


In [110]:
# Save the scraped jobs to CSV (path is relative to the working directory);
# index=False keeps the DataFrame index out of the file.
jobs_df.to_csv('naukri_jobs_50pages.csv', index=False)


In [3]:
import pandas as pd

# Reload the scraped jobs from the same relative path the export step writes to
# ('naukri_jobs_50pages.csv'), rather than a machine-specific absolute path,
# so the notebook runs on any machine and from a fresh kernel.
naukri = pd.read_csv("naukri_jobs_50pages.csv")
naukri

Unnamed: 0,Job Title,Company,Location,Salary,Experience,Jobposted,Education,Role
0,Data Scientist,Decision Point4.3110 Reviews,Gurugram,Not Disclosed,0 - 3 years,Posted: 1 day ago,EducationUG: Any Graduate,"Role: Data Scientist,"
1,Data Scientist,Aptlogica Technologies4.514 Reviews,Pune,Not Disclosed,0 - 2 years,Posted: 9 days ago,EducationUG: Any GraduatePG: Any Postgraduate,"Role: Full Stack Data Scientist,"
2,Data Scientist - Phd only,NDS Infoserv2.38 Reviews,Mumbai,12-15 Lacs P.A.,0 - 2 years,Posted: 4 days ago,EducationDoctorate: Ph.D/Doctorate in Electron...,"Role: Data Scientist,"
3,Quantitative Analyst / Data Scientist,Bestex Research,Bengaluru,Not Disclosed,0 - 3 years,Posted: 30+ days ago,EducationUG: Any GraduatePG: MBA/PGDM in Marke...,"Role: Data Scientist,"
4,Principal Data Scientist/Senior Data Scientist...,Benovymed Healthcare,"Noida, Gurugram",Not Disclosed,0 - 5 years,Posted: 19 days ago,EducationUG: B.Tech/B.E. in Any Specialization...,"Role: Data Scientist,"
...,...,...,...,...,...,...,...,...
993,Data Scientist,Clustor Computing3.74 Reviews,Nagpur,Unpaid,No fixed duration,Posted: 30+ days ago,EducationUG: Any GraduatePG: Any Postgraduate,"Role: Data Scientist,"
994,Data Scientist,Scimitar Research,Pune,Not Disclosed,0 - 1 years,Posted: 30+ days ago,EducationUG: Any GraduatePG: Any Postgraduate,"Role: Data Scientist,"
995,Data scientist,Growthjockey4.37 Reviews,Gurugram,Not Disclosed,0 - 1 years,Posted: 30+ days ago,EducationUG: Any GraduatePG: Any Postgraduate,"Role: Data Scientist,"
996,ML / Data Scientist / Gen AI Specialist,Aiqod3.76 Reviews,Pune,Not Disclosed,0 - 4 years,Posted: 30+ days ago,EducationUG: Any GraduatePG: Any Postgraduate,"Role: Data Scientist,"


In [4]:
# Count missing values per column.
naukri.isna().sum()

Job Title     0
Company       0
Location      0
Salary        0
Experience    0
Jobposted     0
Education     0
Role          0
dtype: int64

In [5]:
# The scraped text carries the field label as a prefix ("Education...",
# "Posted: ...", "Role: ..."). Each pattern is anchored with ^ so only that
# leading label is removed; an unanchored replace would also delete the same
# word where it legitimately occurs inside the value (e.g. a degree "in Education").
# .str.strip() drops the space left behind after the label.
# NOTE(review): 'Role' values still end with a trailing comma from the page markup.

# Remove the leading "Education" label from the 'Education' column
naukri['Education'] = naukri['Education'].str.replace(r'^Education', '', regex=True).str.strip()

# Remove the leading "Posted:" label from the 'Jobposted' column
naukri['Jobposted'] = naukri['Jobposted'].str.replace(r'^Posted:', '', regex=True).str.strip()

# Remove the leading "Role:" label from the 'Role' column
naukri['Role'] = naukri['Role'].str.replace(r'^Role:', '', regex=True).str.strip()


In [6]:
# The scraped company text has the rating and review count glued onto the name,
# e.g. "Decision Point4.3110 Reviews" = "Decision Point" + rating 4.3 + "110 Reviews".
# Match an optional one-decimal rating (d.d) followed by the review count and
# " Review(s)" at the end of the string. Matching the rating as exactly d.d keeps
# digits that belong to the company name itself (a name ending in a number is no
# longer swallowed by a greedy \d+).
pattern = r'(?:\d\.\d)?\d+ Reviews?$'

# Remove the rating/review suffix from the 'Company' column, then strip any
# leading or trailing whitespace
naukri['Company'] = naukri['Company'].str.replace(pattern, '', regex=True).str.strip()

In [7]:
# Display the cleaned DataFrame to check the result of the clean-up cells above.
naukri

Unnamed: 0,Job Title,Company,Location,Salary,Experience,Jobposted,Education,Role
0,Data Scientist,Decision Point,Gurugram,Not Disclosed,0 - 3 years,1 day ago,UG: Any Graduate,"Data Scientist,"
1,Data Scientist,Aptlogica Technologies,Pune,Not Disclosed,0 - 2 years,9 days ago,UG: Any GraduatePG: Any Postgraduate,"Full Stack Data Scientist,"
2,Data Scientist - Phd only,NDS Infoserv,Mumbai,12-15 Lacs P.A.,0 - 2 years,4 days ago,Doctorate: Ph.D/Doctorate in Electronics/Telec...,"Data Scientist,"
3,Quantitative Analyst / Data Scientist,Bestex Research,Bengaluru,Not Disclosed,0 - 3 years,30+ days ago,UG: Any GraduatePG: MBA/PGDM in Marketing,"Data Scientist,"
4,Principal Data Scientist/Senior Data Scientist...,Benovymed Healthcare,"Noida, Gurugram",Not Disclosed,0 - 5 years,19 days ago,UG: B.Tech/B.E. in Any SpecializationPG: M.Tec...,"Data Scientist,"
...,...,...,...,...,...,...,...,...
993,Data Scientist,Clustor Computing,Nagpur,Unpaid,No fixed duration,30+ days ago,UG: Any GraduatePG: Any Postgraduate,"Data Scientist,"
994,Data Scientist,Scimitar Research,Pune,Not Disclosed,0 - 1 years,30+ days ago,UG: Any GraduatePG: Any Postgraduate,"Data Scientist,"
995,Data scientist,Growthjockey,Gurugram,Not Disclosed,0 - 1 years,30+ days ago,UG: Any GraduatePG: Any Postgraduate,"Data Scientist,"
996,ML / Data Scientist / Gen AI Specialist,Aiqod,Pune,Not Disclosed,0 - 4 years,30+ days ago,UG: Any GraduatePG: Any Postgraduate,"Data Scientist,"


In [13]:
import getpass
import os

import pandas as pd
import mysql.connector
from mysql.connector import errorcode


# MySQL database connection settings.
# The password is deliberately not stored in the notebook: it is taken from the
# MYSQL_PASSWORD environment variable, falling back to an interactive prompt.
config = {
    "user": "root",
    "password": os.environ.get("MYSQL_PASSWORD") or getpass.getpass("MySQL password: "),
    "host": "localhost",
    "database": "naukri",
    "raise_on_warnings": True
}

# Columns written to the `job` table, in the order of the placeholders below.
job_columns = [
    'Job Title', 'Company', 'Location', 'Salary',
    'Experience', 'Jobposted', 'Education', 'Role',
]

insert_job_sql = """
    INSERT INTO job (`Job Title`,Company,Location,Salary,Experience,Jobposted,Education,`Role`)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
"""

# Initialised up front so the finally block can tell whether they were ever
# created; otherwise a failed connect() would raise NameError there and hide
# the real connection error.
connection = None
cursor = None

try:
    # Connect to the database
    connection = mysql.connector.connect(**config)
    cursor = connection.cursor()

    # Insert all rows into the job table in one parameterised batch
    rows = list(naukri[job_columns].itertuples(index=False, name=None))
    cursor.executemany(insert_job_sql, rows)

    # Commit the transaction (nothing is persisted if an error occurs before this)
    connection.commit()

except mysql.connector.Error as err:
    if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
        print("Something is wrong with your user name or password")
    elif err.errno == errorcode.ER_BAD_DB_ERROR:
        print("Database does not exist")
    else:
        print(err)
finally:
    if cursor is not None:
        cursor.close()
    if connection is not None:
        connection.close()