In [14]:
import os
import time
from urllib.parse import urlencode

import pandas as pd
import requests
from bs4 import BeautifulSoup

class PJF():
    """Collect resume links from postjobfree.com searches and scrape them.

    Typical use is ``search_and_scraping(title=...)``: it requests the
    search result pages, gathers the resume links found on them, then
    downloads every link that is not yet recorded in the CSV index, saving
    the resume text to a ``.txt`` file and its metadata to the CSV.
    """

    # Site root; hrefs found on search pages are appended to it.
    BASE_URL = "https://www.postjobfree.com"
    # 35 characters long: everything after this many characters of a resume
    # link is used as the resume ID (presumably links always start with it).
    RESUME_PREFIX = BASE_URL + "/resume/"
    # Root folder for the per-keyword text files.
    OUTPUT_DIR = '.../postjobfree'
    # CSV index of already scraped resumes (';'-separated).
    CSV_PATH = 'cv_data.csv'
    CSV_COLUMNS = ['Keyword', 'ID', 'Link', 'Title', 'Location', 'PostedDate']
    # Result pages 1..MAX_PAGES are requested for every search.
    MAX_PAGES = 5
    # Seconds to wait for the server before giving up on a request.
    REQUEST_TIMEOUT = 30
    # Pauses (seconds) after each search page / resume page request.
    SEARCH_DELAY = 0.5
    RESUME_DELAY = 0.2

    def __init__(self):
        # Every search page URL built so far (kept across search() calls).
        self.search_url = []
        # Every distinct resume link found so far (kept across search() calls).
        self.cv_urls = []
        # Keyword recorded with each scraped resume; also names the output folder.
        self.keywords = ''

    def load_cv_data(self, path: str) -> None:
        """Load the CSV index at *path* into ``self.cv_data``.

        A missing or completely empty file yields an empty frame with the
        expected columns instead of an error.
        """
        try:
            self.cv_data = pd.read_csv(path, sep=';')
        except (FileNotFoundError, pd.errors.EmptyDataError):
            self.cv_data = pd.DataFrame(columns=self.CSV_COLUMNS)

    def _build_search_url(self, allwords, nowords, title, textwords,
                          location, radius, r, page) -> str:
        """Return the search URL for one result page, with encoded values."""
        # urlencode turns spaces into '+' (as the site URLs did before) and
        # also escapes characters such as '&', '+', '#' and '/' that would
        # otherwise corrupt the query string.
        query = urlencode([
            ('q', allwords),
            ('n', nowords),
            ('t', title),
            ('d', textwords),
            ('l', location),
            ('radius', radius),
            ('r', r),
            ('p', page),
        ])
        return f"{self.BASE_URL}/resumes?{query}"

    def _resume_id(self, resume_link: str) -> str:
        """Return the file-system friendly ID derived from *resume_link*."""
        return resume_link[len(self.RESUME_PREFIX):].replace('/', '_')

    def search(self, allwords:str='', nowords:str='', title:str='', textwords:str='', location:str='', radius:int=500, r:int=100):
        """Request the search result pages and collect their resume links.

        The arguments are sent as the query parameters ``q`` (allwords),
        ``n`` (nowords), ``t`` (title), ``d`` (textwords), ``l`` (location),
        ``radius`` and ``r``; pages 1..MAX_PAGES are requested. New links are
        appended to ``self.cv_urls``.
        """
        new_urls = [
            self._build_search_url(allwords, nowords, title, textwords,
                                   location, radius, r, page)
            for page in range(1, self.MAX_PAGES + 1)
        ]
        self.search_url.extend(new_urls)

        # Only fetch the pages of THIS search; iterating self.search_url
        # would re-download the pages of every earlier search as well.
        for url in new_urls:
            try:
                self.get_resume_links(url)
            except Exception as e:
                # Best effort: one failing page must not stop the others.
                print(e)
                print("Error in scraping", url)
            time.sleep(self.SEARCH_DELAY)
        print("Total number of resume:", len(self.cv_urls))

    def scraping(self):
        """Scrape every link in ``self.cv_urls`` that is not in the CSV yet."""
        # Initialize the dataframe or load the existing one
        self.load_cv_data(self.CSV_PATH)
        # Build the lookup once instead of recomputing it for every link.
        if len(self.cv_data) > 0:
            already_scraped = set(self.cv_data['Link'])
        else:
            already_scraped = set()

        # NOTE: cv_urls accumulates across search() calls, so links gathered
        # by earlier searches are visited again here and skipped via the CSV.
        total = len(self.cv_urls)
        for i, link in enumerate(self.cv_urls, start=1):
            print(f"{self.keywords}: Scraping link {i} / {total}")
            if link in already_scraped:
                print(f"Link {i} already scraped")
                continue
            if self.get_resume_content(link):
                already_scraped.add(link)

    def search_and_scraping(self, allwords:str='', nowords:str='', title:str='', textwords:str='', location:str='', radius:int=500, r:int=100):
        """Run ``search`` with the given arguments, then ``scraping``.

        *title* is stored as ``self.keywords`` and therefore labels the
        resumes scraped by this call.
        """
        self.keywords = title
        self.search(allwords, nowords, title, textwords, location, radius, r)
        self.scraping()

    def get_resume_links(self, search_url:str):
        """Append the resume links found on one search page to ``cv_urls``.

        Raises whatever ``requests`` raises for connection problems,
        timeouts or an HTTP error status.
        """
        page = requests.get(search_url, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly on 4xx/5xx instead of parsing an error page as "no results".
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'html.parser')

        seen = set(self.cv_urls)
        for heading in soup.find_all('h3', class_='itemTitle'):
            anchor = heading.a
            href = anchor.get('href') if anchor is not None else None
            if not href:
                # A heading without a usable link should not abort the page.
                continue
            combined_link = self.BASE_URL + href
            if combined_link in seen:
                print("Duplicate link found", href)
                continue
            seen.add(combined_link)
            self.cv_urls.append(combined_link)

    def save_to_new_txt(self, text:str, filename:str):
        """Write *text* to ``<OUTPUT_DIR>/<keyword folder>/<filename>.txt``.

        The folder name is ``self.keywords`` lower-cased with spaces and path
        separators replaced by underscores. Errors while writing are printed,
        not raised.
        """
        # Path separators in a keyword (e.g. "UX/UI Designer") would otherwise
        # create an unintended nested directory.
        title = self.keywords.replace(' ', '_').replace('/', '_').replace('\\', '_').lower()
        directory = f'{self.OUTPUT_DIR}/{title}'
        # Ensure the directory exists
        os.makedirs(directory, exist_ok=True)

        # Create the file and write text
        file_path = os.path.join(directory, f"{filename}.txt")
        try:
            # Explicit UTF-8: resume text is arbitrary web content and the
            # platform default encoding may not be able to represent it.
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(text)
            print(f"File saved successfully at {file_path}")
        except Exception as e:
            print(f"Error saving the file: {e}")

    def get_resume_content(self, resume_link:str):
        """Download one resume, save its text and append it to the CSV index.

        Expects ``self.cv_data`` to have been set by ``load_cv_data``. Any
        failure is printed and swallowed so a batch run can continue.

        Returns:
            True when the resume was scraped and recorded, False otherwise.
        """
        scraped = False
        try:
            page = requests.get(resume_link, timeout=self.REQUEST_TIMEOUT)
            page.raise_for_status()
            soup = BeautifulSoup(page.content, 'html.parser')
            title = soup.find('h1').text
            location = soup.find('a', class_='colorLocation').text
            posted_date = soup.find('span', class_='colorDate').text
            text = '\n'.join(
                element.text for element in soup.find_all('div', class_='normalText')
            )
            text = text.replace(';', ',')

            resume_id = self._resume_id(resume_link)
            # Save the resume text to a text file
            self.save_to_new_txt(text, resume_id)

            # Scalars are wrapped in lists to build a single-row DataFrame.
            row = pd.DataFrame({
                'Keyword': [self.keywords],
                'ID': [resume_id],
                'Link': [resume_link],
                'Title': [title],
                'Location': [location],
                'PostedDate': [posted_date],
            })

            # Concatenate with the existing cv_data
            self.cv_data = pd.concat([self.cv_data, row], ignore_index=True)
            # Rewrite the CSV after every resume so progress survives a crash.
            self.cv_data.to_csv(self.CSV_PATH, index=False, sep=';')
            print("Scraped link:", resume_link[len(self.BASE_URL):])
            scraped = True

        except Exception as e:
            print(e)
            print(f"Error in scraping {resume_link}")
        time.sleep(self.RESUME_DELAY)
        return scraped


In [None]:
# Scrape resumes for every job title below, one search per title.
ResumeCraw = PJF()
# Each title appears once: a repeated title would only trigger a redundant
# second search for the same keyword.
keywords = ['Software Engineer', 'Frontend Developer', 'Backend Developer', 'Full Stack Developer', 'Mobile Developer', 
            'DevOps Engineer', 'Embedded Systems Engineer', 'Data Scientist', 'Data Engineer', 'Data Analyst', 'Machine Learning Engineer', 
            'AI Research Scientist', 'Product Manager', 'Technical Program Manager', 'UX/UI Designer', 'Interaction Designer', 'Cloud Engineer', 
            'Site Reliability Engineer (SRE)', 'Infrastructure Engineer', 'Network Engineer', 'Cybersecurity Engineer', 'Security Analyst', 'Penetration Tester', 
            'Information Security Manager', 'QA Engineer', 'Test Automation Engineer', 'Performance Engineer', 'Research Scientist', 'Algorithm Engineer', 'IT Support Specialist', 
            'Systems Administrator', 'Database Administrator (DBA)', 'Robotics Engineer', 'NLP Engineer', 'Computer Vision Engineer', 'Blockchain Developer', 'Cryptography Engineer', 
            'Technical Writer', 'AI Operations Engineer', 'Game Developer', 'Game Engine Developer', 'Solutions Architect', 'Technical Consultant', 'BI Developer', 'BI Analyst',
            'Project Manager', 'Engineering Manager', 'Development Manager', 'IT Manager', 
            'Operations Manager', 'Quality Assurance Manager', 'Data Science Manager', 'Data Engineering Manager', 'Security Manager', 'Sales Engineering Manager', 
            'Customer Success Manager', 'Marketing Manager', 'Technical Support Manager', 'Change Manager', 'DevOps Manager', 'Compliance Manager', 'Release Manager', 
            'Business Intelligence Manager', 'Digital Marketing Manager', 'HR Manager', 'Talent Acquisition Manager', 'Training and Development Manager', 'Finance Manager', 
            'Risk Manager', 'Strategy Manager', 'Product Marketing Manager'
            ]
for i, keyword in enumerate(keywords, start=1):
    print(f"Scraping keyword {i} / {len(keywords)}")
    # The keyword is passed as the title filter of the search.
    ResumeCraw.search_and_scraping(title=keyword)