In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time
import requests

# Configure Selenium WebDriver
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run headless Chrome
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Initialize the WebDriver
driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)

# Function to get job URLs from a single page
def get_job_urls(page_url):
    driver.get(page_url)
    time.sleep(5)  # Wait for the page to load
    
    # Find elements with the specified class containing job links
    job_elements = driver.find_elements(By.TAG_NAME, 'a')

    job_urls = []
    for job_element in job_elements:
        url = job_element.get_attribute('href')
        if url and '/companies/' in url:
            job_urls.append(url)

    return job_urls

# Function to extract job details from the job page
def extract_job_details(job_url):
    driver.get(job_url)
    time.sleep(5)  # Wait for the page to load
    
    job_details = {
        "Poste": "",
        "Entreprise": "",
        "Localisation": "",
        "Description": "",
        "Contrat": "",
        "Salaire": "",
        "Teletravail": "",
        "Experience": "",
        "Education": "",
        "Competences": "",
        "Date de debut": "",
        "Date de parution": "",
        "Secteur d'activite": "",
        "Nom du site": "Welcome to the jungle",
        "Lien du site": job_url
    }
    
    # Job title
    job_title_selectors = ['.sc-gvZAcH.lodDwl.wui-text', '.sc-fulCBj.bCgioU.wui-text']
    for selector in job_title_selectors:
        try:
            job_details["Poste"] = driver.find_element(By.CSS_SELECTOR, selector).text
            break
        except:
            continue
    
    # Company
    company_selectors = ['.sc-fulCBj.rSLAX.wui-text', '.sc-gvZAcH.inYPTi.wui-text', '.sc-fulCBj.hRsmOP.wui-text', '.sc-emIrwa.Trejm']
    for selector in company_selectors:
        try:
            job_details["Entreprise"] = driver.find_element(By.CSS_SELECTOR, selector).text
            break
        except:
            continue

    # Location
    location_selectors = ['.sc-1eoldvz-0.bZJPQK', '.q7vo0q-2.hKloXu']
    for selector in location_selectors:
        try:
            job_details["Localisation"] = driver.find_element(By.CSS_SELECTOR, selector).text
            break
        except:
            continue
    
    # Description
    try:
        job_details["Description"] = driver.find_element(By.CSS_SELECTOR, '.sc-1tacsq-1.kqgROr').text
    except:
        pass

    # Secteur d'activité
    try:
        job_details["Secteur d'activite"] = driver.find_element(By.CSS_SELECTOR, '.sc-bOhtcR.dVVhYl').text
    except:
        pass

    # Date de parution
    try:
        time_element = driver.find_element(By.CSS_SELECTOR, '.sc-fulCBj.kphePP.wui-text time')
    
        # Get the value of the datetime attribute
        datetime_value = time_element.get_attribute('datetime')
    
        # Extract just the date part (first 10 characters)
        date_part = datetime_value[:10]
    
        # Assign the extracted date to "Date de parution" key in job_details
        job_details["Date de parution"] = date_part

    except:

        pass
    
    # Combined elements for contrat, début, salaire, télétravail, experience, education
    elements_selectors = ['.sc-dQEtJz.iIerXh', '.sc-bOhtcR.eDrxLt']
    combined_text = ""
    for selector in elements_selectors:
        elements = driver.find_elements(By.CSS_SELECTOR, selector)
        if elements:
            combined_text = " | ".join([el.text for el in elements])
            break
    
    job_details["combined"] = combined_text

    # Parse combined text into separate fields
    parsed_data = parse_combined_text(job_details["combined"])
    job_details.update(parsed_data)
    
    # Remove combined column
    del job_details["combined"]

    # Competences
    competences_selectors = ['.sc-bOhtcR.kOIrWV']
    for selector in competences_selectors:
        try:
            competences_elements = driver.find_elements(By.CSS_SELECTOR, selector)
            job_details["Competences"] = " , ".join([el.text for el in competences_elements])
            break
        except:
            continue
    
    return job_details

# Function to parse combined text into separate columns
def parse_combined_text(combined_text):
    data = {
        "Contrat": "",
        "Salaire": "",
        "Date de debut": "",
        "Teletravail": "",
        "Experience": "",
        "Education": ""
    }
    
    parts = combined_text.split(" | ")
    
    for part in parts:
        if "Lieu" in part:
            data["Localisation"] = part.replace("Lieu : ", "").strip()
        elif part.startswith("Salaire"):
            data["Salaire"] = part.replace("Salaire : ", "").strip()
        elif part.startswith("Début"):
            data["Date de debut"] = part.replace("Début : ", "").strip()
        elif part.startswith("Télétravail"):
            data["Teletravail"] = part.replace("Télétravail : ", "").strip()
        elif part.startswith("Expérience"):
            data["Experience"] = part.replace("Expérience : ", "").strip()
        elif part.startswith("Éducation"):
            data["Education"] = part.replace("Éducation : ", "").strip()
        else:
            # Assume first part is the contract and place/position split by "|"
            if not data["Contrat"]:
                contrat_lieu = part.split(" | ")
                data["Contrat"] = contrat_lieu[0].strip()
                if len(contrat_lieu) > 1:
                    data["Localisation"] = contrat_lieu[1].strip()

    return data

# Main function
def main():
    # Base URL with a placeholder for the page number
    base_url = "https://www.welcometothejungle.com/fr/jobs?refinementList%5Boffices.country_code%5D%5B%5D=FR&refinementList%5Bexperience_level_minimum%5D%5B%5D=0-1&query=data&page={}&aroundQuery=France"
    
    job_data_list = []
    
    for page in range(1, 2):  # Loop through pages 1 to 27
        page_url = base_url.format(page)
        job_urls = get_job_urls(page_url)
        
        for job_url in job_urls:
            job_details = extract_job_details(job_url)
            
            # Ensure job_details contains valid data before appending
            if job_details["Poste"] and job_details["Entreprise"]:
                job_data_list.append(job_details)
    
    # Convert the list of dictionaries to a DataFrame
    df = pd.DataFrame(job_data_list)
    
    # Clean the DataFrame columns by removing specific prefixes including newline characters
    df['Education'] = df['Education'].str.replace('Éducation :\n', '', regex=False)
    df['Experience'] = df['Experience'].str.replace('Expérience :\n', '', regex=False)
    df['Salaire'] = df['Salaire'].str.replace('Salaire :\n', '', regex=False)
    df['Date de debut'] = df['Date de debut'].str.replace('Début :\n', '', regex=False)
    df['Teletravail'] = df['Teletravail'].str.replace('Télétravail\n', '', regex=False)
    
    # Rearrange columns in the specified order
    df = df[["Poste", "Entreprise", "Localisation", "Contrat", "Description", "Nom du site", "Lien du site", 
             "Experience", "Education", "Competences", "Salaire", "Teletravail", "Date de debut", 
             "Date de parution", "Secteur d'activite"]]
    
    # Replace empty strings with NaN (missing values)
    df.replace("", pd.NA, inplace=True)

    # Drop duplicated values
    df = df.drop_duplicates()

    df['Date de parution'] = df['Date de parution'].fillna(method='ffill')
    
    return df

# Run the main function and save the result in a variable
if __name__ == "__main__":
    job_df = main()
    print(job_df.head())  # Print the first few rows of the DataFrame to verify

# Close the Selenium driver
driver.quit()

                                               Poste  \
0  Alternance - Data/Business Analyst SFE Junior ...   
2                      Data Analyst IA - Stage - H/F   
4  Data Scientist Industrie 4.0 en alternance - P...   
6        Data Quality Operator en alternance - Paris   
8               ALT - DATA SCIENTIST / ACTUARIAT F/H   

                       Entreprise Localisation                     Contrat  \
0                        BIOGARAN     Colombes                  Alternance   
2                        DATALEON        Paris             Stage\n(2 mois)   
4  PSB - PARIS SCHOOL OF BUSINESS        Paris                  Alternance   
6  PSB - PARIS SCHOOL OF BUSINESS        Paris                  Alternance   
8                 GENERALI FRANCE        Paris  Alternance\n(12 à 24 mois)   

                                         Description            Nom du site  \
0  Au sein de la direction Export, vous travaille...  Welcome to the jungle   
2  Tes missions :\nDévelopper des sc

In [3]:
job_df['Date de parution'][0] = "2024-06-30"
job_df.head()

Unnamed: 0,Poste,Entreprise,Localisation,Contrat,Description,Nom du site,Lien du site,Experience,Education,Competences,Salaire,Teletravail,Date de debut,Date de parution,Secteur d'activite
0,Alternance - Data/Business Analyst SFE Junior ...,BIOGARAN,Colombes,Alternance,"Au sein de la direction Export, vous travaille...",Welcome to the jungle,https://www.welcometothejungle.com/fr/companie...,> 6 mois,Bac +5 / Master,"Excel , Power bi",Non spécifié,Télétravail non autorisé,,2024-06-30,"Pharmaceutique / Biotechnologique, Santé"
2,Data Analyst IA - Stage - H/F,DATALEON,Paris,Stage\n(2 mois),Tes missions :\nDévelopper des scripts Python ...,Welcome to the jungle,https://www.welcometothejungle.com/fr/companie...,< 6 mois,Bac +1,"Sql , Python",Non spécifié,Télétravail occasionnel,01 juillet 2024,2024-06-29,"Logiciels, Intelligence artificielle / Machine..."
4,Data Scientist Industrie 4.0 en alternance - P...,PSB - PARIS SCHOOL OF BUSINESS,Paris,Alternance,Contrat d'alternance basé à Paris et au sein d...,Welcome to the jungle,https://www.welcometothejungle.com/fr/companie...,< 6 mois,Bac +3,"Hadoop , Azure , Snowflake , Aws , Python",Non spécifié,Télétravail fréquent,,2024-06-29,"Education, Formation"
6,Data Quality Operator en alternance - Paris,PSB - PARIS SCHOOL OF BUSINESS,Paris,Alternance,Contrat d'alternance basé à Paris et au sein d...,Welcome to the jungle,https://www.welcometothejungle.com/fr/companie...,< 6 mois,Bac +3,Sql,Non spécifié,Télétravail fréquent,,2024-06-29,"Education, Formation"
8,ALT - DATA SCIENTIST / ACTUARIAT F/H,GENERALI FRANCE,Paris,Alternance\n(12 à 24 mois),Vos missions\n Le Generali Climate Lab est exp...,Welcome to the jungle,https://www.welcometothejungle.com/fr/companie...,> 6 mois,Bac +4,Python,Non spécifié,Télétravail non autorisé,,2024-06-29,"Banque, Assurance, FinTech / InsurTech"


In [4]:
job_df.to_csv('welcome_to_the_jungle_30_06.csv', index=False)