In [10]:
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime, timedelta


## GoodJobs.eu

In [526]:
# URL of the job posting whose text should be extracted
url = 'https://goodjobs.eu/jobs/bereichsleiterin-wirtschaft-und-finanzen-kleeblatt-pflegeheime-ggmbh'

# Send a GET request to the URL. The timeout keeps the cell from hanging
# indefinitely on a stalled connection, and raise_for_status() stops the
# notebook on an HTTP error instead of letting later cells parse an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [527]:
# Parse the HTML content of the webpage
soup = BeautifulSoup(response.text, 'html.parser')

In [528]:
def clean_filename():
    """Build a file-name-safe title from the parsed page's ``<title>``.

    Reads the module-level ``soup``. Every character that is neither a word
    character nor whitespace is removed, then the literal ``GoodJobs`` is
    dropped and the result is stripped.

    Returns:
        The cleaned title, or ``"No title found"`` when the page has no title.
    """
    title = soup.title.text if soup.title else "No title found"
    # Strip before filtering; the surrounding whitespace is not part of the title.
    cleaned_title = re.sub(r'[^\w\s]', '', title.strip())
    return cleaned_title.replace('GoodJobs', '').strip()

In [529]:
# Function to get lists nested under different sections, e.g. Profile, Tasks, etc.
def get_lists(section):
    """Return the ``<li>`` tags of the checkmark list inside a page section.

    Reads the module-level ``soup``.

    Args:
        section: ``id`` attribute of the ``<section>`` element to search.

    Returns:
        The ``<li>`` tags found in the section's checkmark-list ``<div>``, or
        an empty list when the section or that ``<div>`` does not exist.
    """
    section_tag = soup.find('section', id=section)
    if section_tag is None:
        return []
    list_div = section_tag.find('div', class_='text-style-body text-responsive-xs checkmark-list')
    if list_div is None:
        return []
    return list_div.find_all('li')

In [530]:
# def extract_ansprechpartnerin():
#     keywords = ["ANSPRECHPARTNERIN", "ANSPRECHPARTNER", "Ansprechpartner"]
#     # combine for or condition
#     pattern = re.compile("|".join(keywords))

#     # Find all <strong> elements containing any of the keywords
#     matching_elements = soup.find_all('strong', string=pattern)

#     # Get the text from the <p> element following each matching <strong>
#     for strong_element in matching_elements:
#         parent_p = strong_element.find_parent('p')
#         if parent_p:
#             next_p = parent_p.find_next_sibling('p')
#             if next_p:
#                 next_next_p = next_p.find_next_sibling('p')
#                 if next_next_p:
#                     return next_p.text.strip()

In [531]:
def extract_ansprechpartnerin():
    """Return the headline text of the contact box on the parsed page.

    Reads the module-level ``soup``. Looks up the ``<div>`` with the layout
    classes below and, inside it, the ``<h2>`` headline.

    Returns:
        The headline text with newlines removed, each 12-space run replaced by
        a single space and the ends stripped; ``None`` when either element is
        missing.
    """
    container = soup.find('div', class_='col-span-full lg:col-span-8 lg:row-start-3 border-r-2')
    if not container:
        return None

    headline = container.find('h2', class_='text-style-headline text-responsive-l')
    if not headline:
        return None

    flattened = headline.get_text().replace('\n', '')
    # The replaced literal is exactly twelve spaces wide.
    collapsed = flattened.replace('            ', ' ')
    return collapsed.strip()

In [532]:
extract_ansprechpartnerin()

'Stefan Ebert'

In [553]:
def extract_description(section_title):
    """Collect the non-empty paragraph texts of one page section.

    Reads the module-level ``soup``.

    Args:
        section_title: ``id`` attribute of the ``<section>`` to read.

    Returns:
        A list with the stripped text of every ``<p>`` inside the section's
        checkmark-list ``<div>``, empty paragraphs omitted. An empty list is
        returned when the section or the ``<div>`` is missing.
    """
    section = soup.find('section', id=section_title)
    if not section:
        return []

    body = section.find('div', class_='text-style-body text-responsive-xs checkmark-list')
    if not body:
        return []

    stripped_texts = (paragraph.get_text(strip=True) for paragraph in body.find_all('p'))
    return [text for text in stripped_texts if text != '']

In [534]:
# extract job criteria e.g. Starting Date, Location, etc.
def extract_job_criteria(keyword):
    """Return the value that follows a ``<strong>`` label on the parsed page.

    Reads the module-level ``soup``. The text node after the label is expected
    to look like ``": value"``.

    Args:
        keyword: Exact text of the ``<strong>`` tag, e.g. ``"Job-Typ"``.

    Returns:
        Everything after the first ``": "`` in the following text node, or
        ``None`` when the label, the text node, or the ``": "`` separator is
        missing.
    """
    strong_tag = soup.find('strong', string=keyword)
    if not strong_tag:
        return None

    next_sibling = strong_tag.find_next_sibling(string=True)
    if not next_sibling:
        return None

    # partition() never raises when the separator is absent and keeps values
    # that themselves contain ": " intact.
    _, separator, value = next_sibling.strip().partition(': ')
    return value if separator else None

In [535]:
def extract_company():
    """Return the text of the first ``<h2>`` in the ``company`` section.

    Reads the module-level ``soup``.

    Returns:
        The heading's text exactly as stored (not stripped), or ``None`` when
        the section or the heading is missing.
    """
    section = soup.find('section', id='company')
    heading = section.find('h2') if section else None
    return heading.text if heading else None

In [536]:
def extract_split(search_term):
    """Return the text that follows ``search_term`` in the first matching ``<p>``.

    Reads the module-level ``soup``.

    Args:
        search_term: Label to look for inside a paragraph's string.

    Returns:
        The stripped text after the last occurrence of ``search_term`` in the
        first ``<p>`` whose string contains it, or ``None`` when no paragraph
        matches.
    """
    def mentions_term(text):
        # Paragraphs without a single string arrive as None.
        return text and search_term in text

    paragraph = soup.find('p', string=mentions_term)
    if not paragraph:
        return None

    trailing_piece = paragraph.get_text().split(search_term)[-1]
    return trailing_piece.strip()

In [537]:
def transform_date(date_string):
    """Turn a dot-separated date into a reversed, bracketed ``[[...]]`` link.

    ``"31.05.2024"`` becomes ``"[[2024-05-31]]"``.

    Args:
        date_string: Date text with ``.`` between its components. May be
            ``None`` or blank, e.g. when the page shows no date.

    Returns:
        The components in reverse order joined with ``-`` and wrapped in
        double square brackets, or ``None`` when there is nothing to convert.
    """
    if not date_string or not date_string.strip():
        return None
    # Ignore empty components so a trailing dot or padding does not produce
    # stray dashes in the result.
    parts = [part.strip() for part in date_string.split('.') if part.strip()]
    return "[[" + '-'.join(reversed(parts)) + "]]"

In [538]:
output_filename = f"{clean_filename()}.md"


In [539]:
# Pull the individual metadata fields for the tags header out of the parsed page.
# Each helper returns None when its element is not found.
jobtyp = extract_job_criteria("Job-Typ")
starting_date = extract_split("Arbeitsbeginn: ")
company = extract_company()
# The text after "Job online bis" is passed through transform_date.
deadline = transform_date(extract_split("Job online bis"))
ansprechpartnerin = extract_ansprechpartnerin()
berufserfahrung = extract_split("Berufserfahrung: ")

In [540]:
tags = f"""\
type:: stellenausschreibung
institution:: {company}
Teilzeit:: {jobtyp}
status:: offen
starting_date:: {starting_date}
kommentar:: offen
deadline:: {deadline}
ansprechpartner:: {ansprechpartnerin}
website:: [Stellenausschreibung]({url})
berufserfahrung:: {berufserfahrung}
kennziffer:: offen
email:: offen

"""

In [554]:
extract_description("bewerbungsprozess")

['Werden Sie Teil der Kleeblatt Familie. Wir freuen uns auf Sie!',
 'Ihre Daten werden nach Abschluss des Verfahrens binnen einer Frist von drei Monaten vernichtet.']

In [541]:
# Collect the content of each page section by its <section> id.
# extract_description returns paragraph strings; get_lists returns <li> tags,
# so the writer cell reads `.text` on those items.
job_description_list = extract_description("intro")
profile_list = get_lists("anforderungen")
task_list = get_lists("aufgaben")
benefits_list = get_lists("benefits")
bewerbungsprozess_list = extract_description("bewerbungsprozess")

In [542]:
with open(output_filename, 'w', encoding='utf-8') as markdown_file:
    # Write tags to markdown
    markdown_file.write(tags)
    # Write the job description paragraphs
    markdown_file.write("- ## Jobbeschreibung\n")
    for item in job_description_list:
        markdown_file.write(f"\t- {item}\n")
    markdown_file.write("\n")
    # Write Profil / Requirements to markdown
    markdown_file.write("- ## Profil\n")
    for item in profile_list:
        markdown_file.write(f"\t- {item.text.strip()}\n")
    # Write Tasks / Aufgaben to markdown
    markdown_file.write("- ## Aufgaben\n")
    for item in task_list:
        markdown_file.write(f"\t- {item.text.strip()}\n")
    # Write Benefits to markdown (iterate benefits_list, not task_list, so the
    # section no longer repeats the tasks)
    markdown_file.write("- ## Benefits\n")
    for item in benefits_list:
        markdown_file.write(f"\t- {item.text.strip()}\n")
    # Write the application process paragraphs
    markdown_file.write("- ## Bewerbungsprozess\n")
    for item in bewerbungsprozess_list:
        markdown_file.write(f"\t- {item}\n")

## LinkedIn

### Import different job offers for testing

In [6]:
import tkinter as tk
from tkinter import filedialog
from bs4 import BeautifulSoup

# Create a Tkinter root window
root = tk.Tk()
root.withdraw()  # Hide the root window

# Open a file dialog for the user to select the HTML file
html_file_path = filedialog.askopenfilename(title="Select HTML file", filetypes=(("HTML files", "*.html"), ("All files", "*.*")))

# The hidden root is only needed while the dialog is open; destroy it so no
# invisible Tk window lingers in the kernel afterwards.
root.destroy()

# Check if a file was selected (askopenfilename returns an empty value on cancel)
if html_file_path:
    # Read the HTML content from the selected file
    with open(html_file_path, "r", encoding="utf-8") as file:
        html_content = file.read()

    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Now you can use 'soup' to process the HTML content further
    print("HTML file parsed successfully.")
else:
    # `soup` is not reassigned in this branch, so it keeps whatever an earlier
    # cell stored in it.
    print("No file selected.")



2024-04-18 15:40:43.845 Python[3054:55539] +[CATransaction synchronize] called within transaction


No file selected.


In [7]:
# Open the HTML file and read its content
with open("/Users/juliankilchling/Downloads/Data Analyst (f_m_d) Scholz & Friends LinkedIn.html", "r", encoding="utf-8") as file:
    html_content = file.read()

# Parse the HTML content
soup = BeautifulSoup(html_content, 'html.parser')

In [220]:
# Adjust
with open("/Users/juliankilchling/Downloads/(1) LinkedIn.html", "r", encoding="utf-8") as file:
    html_content = file.read()

# Parse the HTML content
soup = BeautifulSoup(html_content, 'html.parser')

In [278]:
#ey
html = "/Users/juliankilchling/Downloads/EY senior consultant.html"

In [426]:
# Funke
html = "/Users/juliankilchling/Downloads/(Senior Data Analyst (m_w_d) FUNKE LinkedIn.html"

In [9]:
# Adjust
html = "/Users/juliankilchling/Downloads/(1) LinkedIn.html"

In [11]:
# Scholz & Friends
html = "/Users/juliankilchling/Downloads/Data Analyst (f_m_d) Scholz & Friends.html"

In [12]:
# Read the saved posting whose path was stored in `html` by one of the cells
# above (whichever of them was run last decides which posting is loaded).
with open(html, "r", encoding="utf-8") as file:
    html_content = file.read()

# Parse the HTML content; this rebinds the module-level `soup` used by all
# following cells.
soup = BeautifulSoup(html_content, 'html.parser')

## Code

In [13]:
#create dictionary to store results of webscrape
linkedin_dict = {}

In [14]:
# Get Job Title => WORKS
# Reads the <h1> carrying the top-card job-title classes and stores its
# stripped text under "title".
# NOTE(review): no guard for a missing <h1>; presumably this raises
# AttributeError on pages with a different layout - confirm.
linkedin_dict["title"] = soup.find('h1', class_='t-24 t-bold job-details-jobs-unified-top-card__job-title').text.strip()
print(linkedin_dict)

{'title': 'Data Analyst (f/m/d)'}


In [15]:
# Get company name: text of the first <a> inside the top card's primary
# description <div>.
job_details_div = soup.find('div', {'class':'job-details-jobs-unified-top-card__primary-description-without-tagline mb2'})
linkedin_dict['company'] = job_details_div.find('a').text.strip()

# Get publication date: first <span> in that <div> whose text contains "Vor"
# (e.g. "Vor 1 Woche").
# NOTE(review): the literal "Vor" presumably only matches pages saved with a
# German LinkedIn UI - confirm before using other locales.
linkedin_dict['pub_date'] = job_details_div.find(lambda tag: tag.name == 'span' and 'Vor' in tag.get_text()).text.strip()


In [16]:
job_details_2 = soup.find('li', {'class':'job-details-jobs-unified-top-card__job-insight job-details-jobs-unified-top-card__job-insight--highlight'})

# Find out the job type (full-time, part-time, etc.).
# AttributeError covers a missing <li> or inner <span> (find returned None);
# IndexError covers an empty find_all result, which the [0] lookup would
# otherwise turn into an unhandled exception.
try:
    linkedin_dict["jobtyp"] = job_details_2.find_all('span', {'class':'job-details-jobs-unified-top-card__job-insight-view-model-secondary'})[0].find('span', {'aria-hidden':'true'}).text.strip()
except (AttributeError, IndexError):
    linkedin_dict["jobtyp"] = ""


In [17]:
#Testing
linkedin_dict

{'title': 'Data Analyst (f/m/d)',
 'company': 'Scholz & Friends',
 'pub_date': 'Vor 1 Woche',
 'jobtyp': 'Vollzeit'}

In [18]:
# Get skills from LinkedIn Job Posting
# Every bold <h3> inside the "how you match" card is a group heading; the <a>
# that follows it holds the skill names of that group.
qualifications_div = soup.find('div', id='how-you-match-card-container')
qualifications = qualifications_div.find_all('h3', {'class':'t-14 t-bold'})

for qual in qualifications:
    # Turn the final " und" into a comma and drop newlines.
    key_qual = qual.find_next_sibling("a").text.replace(" und",",").replace("\n","").strip()
    # Map the two German headings onto shorter dict keys.
    # NOTE(review): [2:] drops the first two characters of the stripped heading -
    # presumably a leading count or marker; confirm against the saved HTML.
    qual_stripped = qual.text.replace("Kenntnisse fehlen auf Ihrem Profil","Fehlende Kenntnisse").replace("Kenntnisse auf Ihrem Profil","Bestehende Kenntnisse").strip()[2:].lstrip()
    linkedin_dict[qual_stripped] = key_qual
print(linkedin_dict)

{'title': 'Data Analyst (f/m/d)', 'company': 'Scholz & Friends', 'pub_date': 'Vor 1 Woche', 'jobtyp': 'Vollzeit', 'Bestehende Kenntnisse': 'Datenanalytik, Datenvisualisierung, Looker (Software), Soziale Medien', 'Fehlende Kenntnisse': 'Benchmarking, Business Insights, Dashboard, Handlungsentwicklung, Kennzahlen, Round Tables'}


In [22]:
# Get job description section
# Find the article element with the specified class
job_description = soup.find('article', class_='jobs-description__container jobs-description__container--condensed')

### Extract Aufgaben / Rolle / Benefits

In [20]:
#WORKS
#Define text replacements for section titles

# Define replacements
# Maps heading fragments found in postings onto the canonical section names.
# Lookup is by substring in insertion order and stops at the first hit, so
# the order of the entries matters.
replacements = {
    # -> Profil
    "Profil": "Profil",
    "PROFIL": "Profil",
    "Qualifications": "Profil",
    "WIR LIEBEN": "Profil",
    "Das bringst du mit": "Profil",
    # -> Aufgaben (the duplicated "Deine Aufgaben" key was removed; a dict
    # keeps only one entry per key anyway)
    "AUFGABEN": "Aufgaben",
    "Aufgaben": "Aufgaben",
    "Responsibilities": "Aufgaben",
    "Das erwartet dich bei uns": "Aufgaben",
    "Deine Aufgaben": "Aufgaben",
    "DU LIEBST": "Aufgaben",
    # -> Benefits
    "BENEFITS": "Benefits",
    "Perks": "Benefits",
    "Das bieten wir dir": "Benefits",
    "Benefits": "Benefits"
}

'Entwicklung und Pflege von DashboardsEntwicklung und Erstellung von Reportings für KundenErstellung von Audiences und Insights als Basis für StrategieausarbeitungenAbleitung von HandlungsempfehlungenUnterstützung\n bei der Erarbeitung von Kanalstrategien und strategischen Empfehlungen \nfür Kunden im Bereich Datenanalyse/ReportingUnterstützung der Kolleg*innen bei der Interpretation von DatenAufbau und Vorantreiben der Dateninfrastruktur des Teams'

In [75]:
%%python

import tkinter as tk
from tkinter import simpledialog
from bs4 import BeautifulSoup

# Scholz & Friends
html = "/Users/juliankilchling/Downloads/Data Analyst (f_m_d) Scholz & Friends.html"
with open(html, "r", encoding="utf-8") as file:
    html_content = file.read()

# Parse the HTML content
soup = BeautifulSoup(html_content, 'html.parser')

job_description = soup.find('article', class_='jobs-description__container jobs-description__container--condensed')
# Find all <ul> elements within the job description
uls = job_description.find_all('ul')

linkedin_dict={}

# Define replacements
# Maps heading fragments found in postings onto the canonical section names.
# Lookup is by substring in insertion order and stops at the first hit, so
# the order of the entries matters.
replacements = {
    # -> Profil
    "Profil": "Profil",
    "PROFIL": "Profil",
    "Qualifications": "Profil",
    "WIR LIEBEN": "Profil",
    "Das bringst du mit": "Profil",
    # -> Aufgaben (the duplicated "Deine Aufgaben" key was removed; a dict
    # keeps only one entry per key anyway)
    "AUFGABEN": "Aufgaben",
    "Aufgaben": "Aufgaben",
    "Responsibilities": "Aufgaben",
    "Das erwartet dich bei uns": "Aufgaben",
    "Deine Aufgaben": "Aufgaben",
    "DU LIEBST": "Aufgaben",
    # -> Benefits
    "BENEFITS": "Benefits",
    "Perks": "Benefits",
    "Das bieten wir dir": "Benefits",
    "Benefits": "Benefits"
}

options = ["Aufgaben", "Profil", "Benefits", "Jobbeschreibung", "Firmenprofil"]



def get_selected_option(loop_number):
    """Ask the user which section a list belongs to and return the choice.

    Opens a ``Toplevel`` window on the module-level ``root`` with a dropdown
    of the five section names and a confirm button, then blocks until that
    window is closed.

    Args:
        loop_number: Value shown in the window title after the prompt text.

    Returns:
        The section name selected in the dropdown; the first entry when the
        user does not change the selection.
    """
    choices = ["Aufgaben", "Profil", "Benefits", "Jobbeschreibung", "Firmenprofil"]

    window = tk.Toplevel(root)
    window.title("Wähle Section aus für:{}".format(loop_number))
    window.geometry("700x400")  # width x height of the dialog

    # Holds the current dropdown value; starts on the first choice.
    choice_var = tk.StringVar(window)
    choice_var.set(choices[0])

    tk.OptionMenu(window, choice_var, *choices).pack()
    # Confirming simply closes the window; the value is read afterwards.
    tk.Button(window, text="Auswählen", command=window.destroy).pack()

    window.wait_window()
    return choice_var.get()

# Create a Tkinter root window
root = tk.Tk()
root.withdraw()  # Hide the root window

def _find_section_heading(ul, max_steps=3):
    """Return the nearest non-empty text before ``ul``'s parent, or ``''``.

    Walks back over at most ``max_steps`` previous siblings of ``ul.parent``
    and stops at the first one whose stripped text is not empty.
    """
    node = ul.parent
    for _ in range(max_steps):
        node = node.find_previous_sibling()
        if node is None:
            # Ran out of siblings before finding any heading text.
            return ''
        heading = node.text.strip()
        if heading:
            return heading
    return ''


def _canonical_section(heading):
    """Map a raw heading onto a canonical section name via ``replacements``."""
    for fragment, section_name in replacements.items():
        if fragment in heading:
            return section_name
    return heading


# Extract list items from each <ul> and file them under their section
list_of_lists = []

# The heading of the FIRST list is what the description cell splits on, so it
# is derived once from uls[0] itself rather than from the sibling depth at
# which some later list happened to find its heading.
keyword = _find_section_heading(uls[0]) if uls else ''

for ul in uls:
    # Text of every <li>, stripped and without newline characters
    list_entries = [li.get_text(strip=True).replace('\n', '') for li in ul.find_all('li')]

    heading = _find_section_heading(ul)
    if heading:
        parent = _canonical_section(heading)
        if parent not in options:
            # Unknown heading: let the user decide, showing the list content.
            parent = get_selected_option(ul.text.strip().replace('\n', ' '))
    else:
        # No heading text nearby at all: ask the user.
        parent = get_selected_option(ul.text.strip())

    # NOTE(review): two lists that resolve to the same section overwrite each
    # other here - confirm whether they should be merged instead.
    linkedin_dict[parent] = list_entries

# Close the Tkinter root window
root.destroy()




print(linkedin_dict)


{'Aufgaben': ['Entwicklung und Pflege von Dashboards', 'Entwicklung und Erstellung von Reportings für Kunden', 'Erstellung von Audiences und Insights als Basis für Strategieausarbeitungen', 'Ableitung von Handlungsempfehlungen', 'Unterstützung bei der Erarbeitung von Kanalstrategien und strategischen Empfehlungen für Kunden im Bereich Datenanalyse/Reporting', 'Unterstützung der Kolleg*innen bei der Interpretation von Daten', 'Aufbau und Vorantreiben der Dateninfrastruktur des Teams'], 'Profil': ['Du hast bereits Berufserfahrung im Bereich Social Media in einer digitalen Agentur mit dem Schwerpunkt Datenanalyse gesammelt (Performance Agentur, Mediaagentur)', 'Du bringst Kenntnisse in der Social Media Analyse von Tools mit (zB Facebook Business Manager, Twitter Analytics, Instagram Analytics, Emplifi etc.), sowie Kenntnisse in der Visualisierung von Tools und Datenbanken (zB Tableau, Looker Studio, Power BI, IBM Kognos / SQL, MS SQL, P SQL)', 'Du hast bereits fundierte Erfahrung mit Web-

In [64]:
parent = "hello"
for key, value in replacements.items():
    if key in parent:
        # Replace the entire original string with the corresponding replacement value
        parent = value
        # Break the loop after the first replacement is done
        break

In [74]:
uls[1].parent.find_previous_sibling().find_previous_sibling().find_previous_sibling().text.strip()
uls[1].parent.find_previous_sibling().find_previous_sibling().text.strip()
uls[1].text.strip().replace('\n', ' ')

'Du  hast bereits Berufserfahrung im Bereich Social Media in einer digitalen  Agentur mit dem Schwerpunkt Datenanalyse gesammelt (Performance  Agentur, Mediaagentur)Du bringst  Kenntnisse in der Social Media Analyse von Tools mit (zB Facebook  Business Manager, Twitter Analytics, Instagram Analytics, Emplifi etc.),  sowie Kenntnisse in der Visualisierung von Tools und Datenbanken (zB  Tableau, Looker Studio, Power BI, IBM Kognos / SQL, MS SQL, P SQL)Du  hast bereits fundierte Erfahrung mit Web- und Marketing Analytics (zB  Google Analytics, SAS, Salesforce, Adobe, Microsoft etc.)Du besitzt fachliches Know How in Global Web Index oder Best4PlanningBesonders deine Erfahrungen mit Looker Studio und Supermetrics bringst du erfolgreich bei uns einDu  besitzt die Fähigkeit Social Media KPIs und ihre Abhängigkeit zu  verstehen (Paid und Organic) sowie das Verständnis, Erkenntnis in  ansprechende und verständliche Darstellungen zu übertragenDurch  dein Verständnis für Prozesse der Social Media

In [30]:
%%python

import tkinter as tk
from tkinter import simpledialog

def get_selected_option(loop_number=None):
    """Ask the user to pick a section name from a dropdown and return it.

    Opens a ``Toplevel`` window on the module-level ``root`` and blocks until
    that window is closed.

    Args:
        loop_number: Optional value appended to the window title. When
            omitted, the title carries no suffix (previously a literal,
            unfilled ``{}`` was displayed).

    Returns:
        The section name selected in the dropdown; the first entry when the
        user does not change the selection.
    """
    # Create a Toplevel window
    dialog = tk.Toplevel(root)
    if loop_number is None:
        dialog.title("Select Option")
    else:
        dialog.title("Select Option - Loop {}".format(loop_number))

    # Define a list of options
    options = ["Aufgaben", "Profil", "Benefits", "Jobbeschreibung", "Firmenprofil"]

    # Create a variable to store the selected option
    selected_option = tk.StringVar(dialog)
    selected_option.set(options[0])  # Set default option

    # Create a dropdown menu (OptionMenu) for selecting an option
    option_menu = tk.OptionMenu(dialog, selected_option, *options)
    option_menu.pack()

    # Create an "OK" button that confirms the selection by closing the dialog
    ok_button = tk.Button(dialog, text="Auswählen", command=dialog.destroy)
    ok_button.pack()

    # Set the size of the dialog window
    dialog.geometry("350x200")  # Adjust the width and height as needed

    # Wait for the dialog window to be closed
    dialog.wait_window()

    # Return the selected option
    return selected_option.get()

# Create a Tkinter root window
root = tk.Tk()
root.withdraw()  # Hide the root window

# Prompt the user to select an option
selected_option = get_selected_option()


# Display the selected option
if selected_option:
    print("Selected option:", selected_option)

# Close the Tkinter root window
root.destroy()

Selected option: Benefits


In [22]:
uls[1]
uls[0].parent.find_previous_sibling().find_previous_sibling().find_previous_sibling().text.strip()
uls[0].parent.find_previous_sibling().text.strip()
uls[0].parent.find_previous_sibling().find_previous_sibling().text.strip()

<ul><span><li><!-- -->Wir ermöglichen Mobile Office und flexible Arbeitszeiten<span class="white-space-pre"> </span></li></span><span><li><!-- -->Arbeiten aus dem EU-Ausland | bis zu 45 Tage<span class="white-space-pre"> </span></li></span><span><li><!-- -->Discounts bei Dienstleistern und Fitnessstudios im Umkreis unserer Standorte<span class="white-space-pre"> </span></li></span><span><li><!-- -->Möglichkeit
 der Mitgestaltung der Agenturkultur durch Round Tables und 
Mitarbeitendeninitiativen, wie z. B. unserer &amp;Queer*-Community<span class="white-space-pre"> </span></li></span><span><li><!-- -->Starte bei uns mit einem erfahrenen Buddy an deiner Seite - wir nennen sie „First Friend“<!-- --></li></span><span><li><!-- -->Scholz
 &amp; Friends Academy: offene Weiterbildungsangebote zu Fachthemen, 
Impulsgebungen, Diversity, Equity &amp; Inclusion (D.E.I.) und vielem 
mehr<span class="white-space-pre"> </span></li></span><span><li><!-- -->Standort-übergreifende Mentoring-Programme f

In [None]:
linkedin_dict

### Extract Jobbeschreibung

In [475]:
### ONLY TESTING; SEE DIFFERENT APPROACH BELOW
# ## WORKS ONLY FOR FUNKE => Get the job description

# # Find the target <ul> element
# target_ul = job_description.find('ul')

# # Find the parent <span> element of the target <ul> element
# parent_span = target_ul.find_parent('span')

# # Initialize an empty string to store concatenated text
# concatenated_description = ''

# # Iterate over previous siblings of the parent <span> element
# for sibling in parent_span.previous_siblings:
#     # Check if the sibling is a <span> element
#     if sibling.name == 'span':
#         # Prepend the text of the <span> element to the string
#         concatenated_description = sibling.get_text(strip=True) + ' ' + concatenated_description
#     # Stop iteration if we encounter the target <span> element
#     if sibling == soup.find('span'):
#         break
# linkedin_dict["job_description"] = concatenated_description


In [476]:
# # Reset
# linkedin_dict= {}

In [477]:
## ONLY TESTING

# original_string = uls[0].parent.find_previous_sibling().text.strip()
# # Iterate over key-value pairs in replacements
# for key, value in replacements.items():
#     # Check if the key is present in the original string
#     if key in original_string:
#         # Replace the entire original string with the corresponding replacement value
#         new_string = value
#         # Break the loop after the first replacement is done
#         break
# else:
#     # If none of the keys are found in the original string, keep the original string unchanged
#     new_string = original_string

In [478]:
# if uls[0].parent.find_previous_sibling().text.strip() != '':
#     keyword = uls[0].parent.find_previous_sibling().text.strip()
# elif uls[0].parent.find_previous_sibling().find_previous_sibling().text.strip() != '':
#     keyword = uls[0].parent.find_previous_sibling().find_previous_sibling().text.strip()
# else:
#     keyword = uls[0].parent.find_previous_sibling().find_previous_sibling().find_previous_sibling().text.strip()

In [21]:
## Get job description => WORKS

# Text of the container two levels above the first list, newlines removed.
# Everything in it that precedes the first section heading is the description.
parent_element = uls[0].parent.parent.text.replace('\n', '').strip()

# `keyword` marks where the description ends. Newlines are removed from it as
# well, because the text being searched no longer contains any; otherwise a
# marker with a line break would never match.
if keyword == '':
    # No heading known: fall back to the start of the first list itself.
    keyword = uls[0].text.replace('\n', '').strip()[:50]
else:
    keyword = keyword.replace('\n', '')

# str.split raises ValueError on an empty separator, so keep the whole text
# when no marker is available.
if keyword:
    linkedin_dict["job_description"] = parent_element.split(keyword)[0]
else:
    linkedin_dict["job_description"] = parent_element


In [22]:
linkedin_dict

{'title': 'Data Analyst - Mid or Senior',
 'company': 'Adjust',
 'pub_date': 'Vor 2 Wochen',
 'jobtyp': '',
 'Bestehende Kenntnisse': 'Data Science, Datenanalytik, Datenvisualisierung',
 'Fehlende Kenntnisse': 'Analytik, Business Intelligence (BI), Dashboard, Data-Mining, Pandas (Software), Storytelling, Visualisierung',
 'Aufgaben': ['Play a major role in the Data & Insights team by scoping analytics projects and shaping stakeholder decisions',
  'Identify, analyze, and interpret complex data sets and present them in a clear, easy and understandable way',
  'Able to handle difficult queries (large dataset, longer query times, optimization of approach)',
  'Able to work with Pandas DataFrames to organize and analyze data, and using statistics to find important information',
  'Familiarity with data visualization, skill in storytelling, and proficiency in developing user-friendly, self-service dashboards',
  'Help our teams to ask the right questions, uncovering novel insights and oppor

## Write to markdown

In [23]:
def get_job_title_linkedin():
    """Read the job title from the parsed LinkedIn page and record it.

    Reads the module-level ``soup`` and stores the result in the module-level
    ``linkedin_dict`` under ``"title"``.

    Returns:
        The stripped text of the top card's ``<h1>`` job-title element.
    """
    heading = soup.find('h1', class_='t-24 t-bold job-details-jobs-unified-top-card__job-title')
    linkedin_dict["title"] = heading.text.strip()
    return linkedin_dict["title"]

In [45]:
## Assign filename from the job title (punctuation removed)
cleaned_title = re.sub(r'[^\w\s]', '', get_job_title_linkedin())
output_filename = f"jobsearch___{cleaned_title}.md"

In [27]:
# Get JobID of LinkedIn Job

# The "skill-match" link is the place on the saved page that carries the job
# id as a `jobId=` query parameter.
all_a_elements = soup.find_all("a", {"target": "_self"})
skill_match_elements = [element for element in all_a_elements if "skill-match" in element.get("href", "")]

# Extract the 'href' attribute from the first match; use "" when the page has
# no skill-match link instead of failing on an empty list.
href = skill_match_elements[0].get("href", "") if skill_match_elements else ""

# Define the pattern for the job ID using regular expression
pattern = r'jobId=(\d+)'

# Search for the pattern in the 'href' attribute
match = re.search(pattern, href)

# Extract the job ID as a string (None when it could not be found)
linkedin_dict["job_id"] = match.group(1) if match else None

# Build the posting URL only when an id exists; concatenating None to the
# base URL would raise TypeError.
if linkedin_dict["job_id"]:
    linkedin_dict["url"] = "https://www.linkedin.com/jobs/view/" + linkedin_dict["job_id"]
else:
    linkedin_dict["url"] = None


In [28]:
## Make sure all dict keys are set

# Fill in every key the tags and writer cells read directly, so a posting that
# lacks one of them does not end in a KeyError. setdefault leaves existing
# values untouched.
linkedin_dict.setdefault("Benefits", ["nicht gefunden"])
linkedin_dict.setdefault("Profil", ["nicht gefunden"])
linkedin_dict.setdefault("Aufgaben", ["nicht gefunden"])
linkedin_dict.setdefault("Bestehende Kenntnisse", "Keine")
linkedin_dict.setdefault("Fehlende Kenntnisse", "Keine")

In [29]:
## Tags section

tags = f"""\
type:: stellenausschreibung
institution:: {linkedin_dict["company"]}
Teilzeit:: {linkedin_dict["jobtyp"]}
status:: offen
starting_date:: offen
kommentar:: offen
deadline:: offen
ansprechpartner:: offen
website:: [Stellenausschreibung]({linkedin_dict["url"]})
berufserfahrung:: offen
kennziffer:: offen
publication:: {linkedin_dict["pub_date"]}
matching_skills:: {linkedin_dict["Bestehende Kenntnisse"]}
missing_skills:: {linkedin_dict["Fehlende Kenntnisse"]}

"""

## ToDos

In [33]:
### Create function to get future date starting from today and return it in the right format for Logseq's task management
### if future date falls on a weekend, assign task to Monday instead => ONLY assign tasks on weekdays

from datetime import datetime, timedelta

def get_date_logseq_format(days_from_today, today=None):
    """Return a date ``days_from_today`` days ahead as ``<YYYY-MM-DD Day>``.

    A result that would fall on a Saturday or Sunday is moved to the
    following Monday, so tasks are only scheduled on weekdays.

    Args:
        days_from_today: Number of days to add to the reference date.
        today: Reference ``datetime``; defaults to ``datetime.now()``. Passing
            it explicitly makes the result reproducible.

    Returns:
        A string such as ``"<2024-04-22 Mon>"``. The day abbreviation is
        always English, independent of the process locale.
    """
    if today is None:
        today = datetime.now()

    future_date = today + timedelta(days=days_from_today)

    # weekday(): Monday is 0 ... Saturday is 5, Sunday is 6
    if future_date.weekday() >= 5:
        # 7 - weekday gives 2 for Saturday and 1 for Sunday, i.e. next Monday
        future_date += timedelta(days=7 - future_date.weekday())

    # Fixed names instead of strftime('%a'), whose output follows the locale
    day_names = ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
    day_name = day_names[future_date.weekday()]

    formatted_date = future_date.strftime('%Y-%m-%d')
    return f'<{formatted_date} {day_name}>'



In [35]:
# Example usage: get the Logseq-formatted date four days from today
result = get_date_logseq_format(4)
print(result)

<2024-04-22 Mon>


In [44]:
# Write To Do Section in markdown file

todos = f"""\
	- TODO ergänze relevante Informationen für Stellenausschreibung "{linkedin_dict["title"]}" bei {linkedin_dict["company"]}
	  SCHEDULED: {get_date_logseq_format(0)}
		- insb. zu Bewerbungsprozess und Unterlagen
	- TODO mache stichpunktartige Notizen zu Details in Stellenausschreibung "{linkedin_dict["title"]}" bei {linkedin_dict["company"]}
	  SCHEDULED: {get_date_logseq_format(0)}
	- TODO setze Anschreiben auf für "{linkedin_dict["title"]}" bei {linkedin_dict["company"]}
	  SCHEDULED: {get_date_logseq_format(1)}
	- TODO passe Lebenslauf an für "{linkedin_dict["title"]}" bei {linkedin_dict["company"]}
	  SCHEDULED: {get_date_logseq_format(1)}
	- TODO stelle Anschreiben fertig für "{linkedin_dict["title"]}" bei {linkedin_dict["company"]}
	  SCHEDULED: {get_date_logseq_format(2)}
	- TODO schicke Bewerbung ab für "{linkedin_dict["title"]}" bei {linkedin_dict["company"]}
	  SCHEDULED: {get_date_logseq_format(2)}
"""

print(todos)

	- TODO ergänze relevante Informationen für Stellenausschreibung "Data Analyst - Mid or Senior" bei Adjust
	  SCHEDULED: <2024-04-18 Thu>
		- insb. zu Bewerbungsprozess und Unterlagen
	- TODO mache stichpunktartige Notizen zu Details in Stellenausschreibung "Data Analyst - Mid or Senior" bei Adjust
	  SCHEDULED: <2024-04-18 Thu>
	- TODO setze Anschreiben auf für "Data Analyst - Mid or Senior" bei Adjust
	  SCHEDULED: <2024-04-19 Fri>
	- TODO passe Lebenslauf an für "Data Analyst - Mid or Senior" bei Adjust
	  SCHEDULED: <2024-04-19 Fri>
	- TODO stelle Anschreiben fertig für "Data Analyst - Mid or Senior" bei Adjust
	  SCHEDULED: <2024-04-22 Mon>
	- TODO schicke Bewerbung ab für "Data Analyst - Mid or Senior" bei Adjust
	  SCHEDULED: <2024-04-22 Mon>



In [46]:
with open(output_filename, 'w', encoding='utf-8') as markdown_file:
    # Write tags to markdown
    markdown_file.write(tags)
    # Add to do section
    markdown_file.write("- ## To Dos\n")
    # Not an f-string: the doubled braces are written literally.
    markdown_file.write("  {{renderer :todomaster}}")
    markdown_file.write("\n")
    markdown_file.write(todos)
    # Write the job description to markdown. Single quotes inside the
    # replacement field keep this line valid on Python versions before 3.12,
    # which reject reusing the enclosing quote character there.
    markdown_file.write("- ## Jobbeschreibung\n")
    markdown_file.write(f"\t- {linkedin_dict['job_description']}\n")
    # Write Profil / Requirements to markdown
    markdown_file.write("- ## Profil\n")
    for item in linkedin_dict["Profil"]:
        markdown_file.write(f"\t- {item}\n")
    # Write Tasks / Aufgaben to markdown
    markdown_file.write("- ## Aufgaben\n")
    for item in linkedin_dict["Aufgaben"]:
        markdown_file.write(f"\t- {item}\n")
    # Write Benefits to markdown
    markdown_file.write("- ## Benefits\n")
    for item in linkedin_dict["Benefits"]:
        markdown_file.write(f"\t- {item}\n")