# Creating the URLs from job posting information

In [68]:
import pandas as pd
import re

# Function to replace spaces and punctuation with hyphens
def replace_with_hyphen(text):
    """Collapse every run of whitespace / non-word characters into one hyphen.

    Word characters (letters, digits, underscore) are left untouched. A run
    at the very start or end of *text* is replaced as well, so the result may
    begin or end with '-'.
    """
    separator_run = re.compile(r'[\s\W]+')
    return separator_run.sub('-', text)

def get_job_url(csv_file_path):
    """Load a job-listing CSV and derive a posting URL for every row.

    The CSV must provide the columns 'job-title', 'company', 'locations'
    and 'job-id'. For each of them an 'edited_<column>' helper column is
    added (text columns are hyphenated with ``replace_with_hyphen``; the id
    is converted to a string), and the four helpers are joined with '-'
    behind the 'https://foundit.in/job/' prefix to form 'job-url'.

    Args:
        csv_file_path: Path of the CSV file to read.

    Returns:
        The DataFrame read from the file, extended with the helper columns
        and the 'job-url' column.
    """
    postings = pd.read_csv(csv_file_path)

    # Hyphenate the free-text fields; the id only needs a string conversion
    for column in ('job-title', 'company', 'locations'):
        postings['edited_' + column] = postings[column].apply(replace_with_hyphen)
    postings['edited_job-id'] = postings['job-id'].astype(str)

    # Build the URL piece by piece: prefix + title, then "-<part>" for the rest
    url = 'https://foundit.in/job/' + postings['edited_job-title']
    for part in ('edited_company', 'edited_locations', 'edited_job-id'):
        url = url + "-" + postings[part]
    postings['job-url'] = url

    return postings

In [69]:
# Load the listings CSV and attach a 'job-url' column to every row
df = get_job_url("foundit-IT.csv")

In [70]:
# Keep the generated URLs as a Series; they are the input to the scraping step
job_url = df['job-url']

# Scrape job posting information from URLs

In [73]:
import pandas as pd
import requests
import time
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

# Create a fake user agent generator
ua = UserAgent()

# Create a request header containing a fake user agent.
# `ua.random` returns a randomly chosen user-agent string; it is evaluated
# once here, so this dict holds a single fixed value until it is replaced.
request_headers = {
    'user-agent': ua.random
}

# Function to scrape job information from a URL
def scrape_job_info(url, *, delay=2, timeout=30):
    """Download one job-posting page and extract a few fields from it.

    Uses the module-level ``ua`` (user-agent generator) and
    ``request_headers`` (default headers) objects.

    Args:
        url: Address of the job posting to fetch.
        delay: Seconds of "rest" time before the GET request is sent.
        timeout: Seconds to wait for the server before the request is
            abandoned (reported like any other request error).

    Returns:
        A dict with the keys 'years_of_experience', 'job_description' and
        'job_type'. All three are 'N/A' when the request fails; a single
        field is 'N/A' when its element cannot be located in the page.
    """
    missing = 'N/A'

    def extract(locate):
        # `find` returns None when nothing matches, and the child lookups can
        # run out of nodes; treat both as "field not present" rather than
        # letting one odd page abort the whole scraping run.
        try:
            return locate().text.strip()
        except (AttributeError, IndexError):
            return missing

    # Rest between consecutive GET requests
    time.sleep(delay)

    # Send a freshly chosen random user agent with this request while keeping
    # any other default headers.
    headers = {**request_headers, 'user-agent': ua.random}

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
    except requests.RequestException as e:
        print(f"Error accessing {url}: {e}")
        return {
            'years_of_experience': missing,
            'job_description': missing,
            'job_type': missing
        }

    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract job information
    return {
        'years_of_experience': extract(
            lambda: soup.find('span', string=re.compile("years", re.I))
        ),
        'job_description': extract(
            lambda: soup.find('div', id="jobDescription").contents[0].contents[1]
        ),
        'job_type': extract(
            lambda: soup.find('span', string=re.compile("job type", re.I)).parent.contents[1]
        )
    }

# Scrape every posting URL; each call yields one dict of extracted fields
job_info_list = job_url.apply(scrape_job_info)

# Expand the per-posting dicts into columns and place them beside the original data
job_info_df = pd.DataFrame(list(job_info_list))
result_df = pd.concat((df, job_info_df), axis="columns")

# Show the combined DataFrame
print(result_df)

   web-scraper-order                              web-scraper-start-url  \
0       1739412261-1  https://www.foundit.in/srp/results?sort=1&limi...   
1       1739412261-2  https://www.foundit.in/srp/results?sort=1&limi...   
2       1739412261-3  https://www.foundit.in/srp/results?sort=1&limi...   
3       1739412261-4  https://www.foundit.in/srp/results?sort=1&limi...   
4       1739412261-5  https://www.foundit.in/srp/results?sort=1&limi...   
..               ...                                                ...   
95     1739412261-96  https://www.foundit.in/srp/results?sort=1&limi...   
96     1739412261-97  https://www.foundit.in/srp/results?sort=1&limi...   
97     1739412261-98  https://www.foundit.in/srp/results?sort=1&limi...   
98     1739412261-99  https://www.foundit.in/srp/results?sort=1&limi...   
99    1739412261-100  https://www.foundit.in/srp/results?sort=1&limi...   

                                            job-title  \
0                                  Process

In [75]:
# Save the combined listing + scraped data to disk
# (pandas writes the row index as the first column by default)
result_df.to_csv("foundit-data.csv")