## Import Libraries

In [121]:
import pandas as pd
import numpy as np
import re

# Extract

LinkedIn Datasets DACH Region: https://www.kaggle.com/datasets/czobii/linkedin-dataset

In [122]:
df_jobs_linkedin = pd.read_csv("../data/LinkedInJobsDACH.csv", encoding="utf-8", sep=",")

## Understand the Data

View the first few rows, get summary statistics and check data types

In [123]:
df_jobs_linkedin.head(5)

Unnamed: 0.1,Unnamed: 0,job_title,company_name,company_location,post_date,job_description,applicant_number,applicants,country,city,...,Spark or Hive,PowerBI,Scikit-learn,Tableau,BigData,ML,ETL,ABtest,Senior,Remote
0,0,Software Quality Engineer (m/f/d),IFCO SYSTEMS,"Gmunden, Upper Austria, Austria",2 weeks ago,How is IFCO making the world more sustainable?...,2 weeks ago Be among the first 25 applicants,25,Austria,Gmunden,...,0,0,0,0,0,0,0,0,0,0
1,1,DevOps Engineer (m/f/d) – Innovation and Techn...,ABB,"Eggelsberg, Upper Austria, Austria",2 weeks ago,Take your next career step at B&R – a member o...,2 weeks ago Be among the first 25 applicants,25,Austria,Eggelsberg,...,0,0,0,0,0,0,0,0,0,0
2,3,Embedded QA Developer DAS EMEIA,ASSA ABLOY Group,"Graz, Styria, Austria",2 days ago,"Location; Austria (Graz), Germany (Berlin), Po...",2 days ago Be among the first 25 applicants,25,Austria,Graz,...,0,0,0,0,0,0,0,0,0,0
3,15,App Developer (React Native),Biolyz,"Tulln an der Donau, Lower Austria, Austria",1 week ago,Direct message the job poster from Biolyz\nMar...,1 week ago 70 applicants,70,Austria,Tulln an der Donau,...,0,0,0,0,0,1,0,0,0,0
4,16,Workplace Analytics Engineer,Canonical,"Vienna, Vienna, Austria",1 month ago,"Bring your people analytics, social science re...",1 month ago 25 applicants,25,Austria,Vienna,...,0,0,0,1,0,0,0,0,0,1


In [124]:
df_jobs_linkedin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2228 entries, 0 to 2227
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        2228 non-null   int64 
 1   job_title         2228 non-null   object
 2   company_name      2228 non-null   object
 3   company_location  2228 non-null   object
 4   post_date         2228 non-null   object
 5   job_description   2228 non-null   object
 6   applicant_number  2228 non-null   object
 7   applicants        2228 non-null   int64 
 8   country           2228 non-null   object
 9   city              2228 non-null   object
 10  unit              2228 non-null   int64 
 11  time_unit         2228 non-null   object
 12  job_title_clean   2228 non-null   object
 13  SQL               2228 non-null   int64 
 14  Python            2228 non-null   int64 
 15  Spark or Hive     2228 non-null   int64 
 16  PowerBI           2228 non-null   int64 
 17  Scikit-learn  

# Transform

In [125]:
df_jobs_linkedin = df_jobs_linkedin.drop(columns=['Unnamed: 0', 'company_name', 'post_date', 'applicant_number', 'unit', 'time_unit'])
df_jobs_linkedin['id'] = df_jobs_linkedin.index + 1

In [127]:
# ------------------------------------------------------------
# 1) Hilfs-Regexe und -Funktionen
# ------------------------------------------------------------
NUM_RE   = re.compile(r'(\d+[kK]|\d{1,3}(?:[.,\s]\d{3})+|\d{1,6}(?:[.,]\d+)?)')
MONTH_RE = re.compile(r'(monat|monate|monatlich|month|per month|/month)', re.I)
YEAR_RE  = re.compile(r'(jahr|jährlich|year|yearly|per year|/year|p\.a\.|pa|annum|annual|annually)', re.I)

def _clean_amount(raw: str) -> float:
    """Wandelt '3.400', '60,000', '50k', '65.000,00' → 3400.0 … um"""
    raw = raw.replace(' ', '')
    if raw.lower().endswith('k'):                       # 50k → 50000
        return float(raw[:-1].replace(',', '.')) * 1_000
    # Tausender-Trenner entfernen (.,) **nur** wenn genau 3 Ziffern folgen
    raw = re.sub(r'(?<=\d)[.,](?=\d{3}\b)', '', raw)
    return float(raw.replace(',', '.'))

def _extract_salary(txt: str, window: int = 60):
    """
    Liefert (raw_amount, period, salary_annual) oder (NaN, NaN, NaN),
    indem rund um jede gefundene Zahl nach €/EUR/k & Monats-/Jahres-Wörtern gesucht wird.
    """
    for m in NUM_RE.finditer(txt):
        raw = m.group(1)
        s, e = m.span()

        vicinity = txt[max(0, s-10):e+10].lower()          # 10 Zeichen links/rechts
        if '€' not in vicinity and 'eur' not in vicinity and not raw.lower().endswith('k'):
            continue                                       # keine Währung in der Nähe → skip

        context = txt[max(0, s-window):e+window].lower()   # größerer Kontext
        period  = 'monat' if MONTH_RE.search(context) else 'jahr'

        amount  = _clean_amount(raw)
        annual  = amount * 12 if period == 'monat' else amount
        return pd.Series([amount, period, annual])

    # nichts Geeignetes gefunden
    return pd.Series([np.nan, np.nan, np.nan])

# ------------------------------------------------------------
# 2) Auf das DataFrame anwenden
# ------------------------------------------------------------
df_jobs_linkedin[['raw_amount', 'period', 'salary_annual']] = (
    df_jobs_linkedin['job_description'].apply(_extract_salary)
)


In [129]:
mask = (df_jobs_linkedin['salary_annual'] < 20_000) | (df_jobs_linkedin['salary_annual'] > 160_000)
df_jobs_linkedin.loc[mask, ['raw_amount', 'period', 'salary_annual']] = np.nan

In [130]:
df_jobs_with_salary      = df_jobs_linkedin[df_jobs_linkedin['salary_annual'].notna()].copy()  # mit Gehaltswert
df_jobs_without_salary   = df_jobs_linkedin[df_jobs_linkedin['salary_annual'].isna()].copy()   # ohne Gehaltswert

In [131]:
df_jobs_with_salary.head(2500)

Unnamed: 0,job_title,company_location,job_description,applicants,country,city,job_title_clean,SQL,Python,Spark or Hive,...,BigData,ML,ETL,ABtest,Senior,Remote,id,raw_amount,period,salary_annual
1,DevOps Engineer (m/f/d) – Innovation and Techn...,"Eggelsberg, Upper Austria, Austria",Take your next career step at B&R – a member o...,25,Austria,Eggelsberg,Engineer,0,0,0,...,0,0,0,0,0,0,2,3409.0,monat,40908.0
5,Solution Engineer - Reporting,"Vienna, Austria",Wir wachsen! Deshalb suchen wir für unser Büro...,25,Austria,Vienna,Engineer,1,0,0,...,0,1,0,0,0,0,6,49000.0,jahr,49000.0
6,Data Engineer (m/w/d),"Wels, Upper Austria, Austria",Zurück\n\nData Engineer (m/w/d)\n\nIhr Aufgabe...,32,Austria,Wels,Engineer,1,0,0,...,0,1,0,0,0,0,7,4000.0,monat,48000.0
7,DevOps Engineer - m/f/d,"Vienna, Austria",Direct message the job poster from SQUARS\nIvo...,55,Austria,Vienna,Engineer,0,1,0,...,0,0,0,0,0,0,8,60000.0,jahr,60000.0
8,Software Test Engineer (m/f/x),"Vienna, Vienna, Austria",BOC Group entwickelt Softwareprodukte und Serv...,36,Austria,Vienna,Engineer,0,0,0,...,0,0,0,0,0,0,9,2800.0,monat,33600.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2166,Senior Statistical Programmer - Sponsor Dedica...,Germany,Are you ready to elevate your career in the dy...,25,Germany,Germany,Other,0,0,0,...,0,0,0,0,1,1,2167,75000.0,jahr,75000.0
2168,Mid/Senior Python Full Stack Developer with Ba...,"Munich, Bavaria, Germany",Direct message the job poster from Optimus Sea...,62,Germany,Munich,Other,1,1,0,...,0,0,0,0,1,0,2169,80000.0,jahr,80000.0
2187,Expert Biostatistician in HEOR/HTA - Driving E...,Germany,Are you ready to elevate your career in the dy...,28,Germany,Germany,Other,0,0,0,...,0,0,0,0,0,1,2188,95000.0,jahr,95000.0
2191,.NET Software Engineer - Munich,"Munich, Bavaria, Germany",Direct message the job poster from Noir\nBen K...,26,Germany,Munich,Engineer,1,0,0,...,0,0,0,0,0,1,2192,45000.0,jahr,45000.0


In [132]:
df_jobs_without_salary.head(2500)

Unnamed: 0,job_title,company_location,job_description,applicants,country,city,job_title_clean,SQL,Python,Spark or Hive,...,BigData,ML,ETL,ABtest,Senior,Remote,id,raw_amount,period,salary_annual
0,Software Quality Engineer (m/f/d),"Gmunden, Upper Austria, Austria",How is IFCO making the world more sustainable?...,25,Austria,Gmunden,Engineer,0,0,0,...,0,0,0,0,0,0,1,,,
2,Embedded QA Developer DAS EMEIA,"Graz, Styria, Austria","Location; Austria (Graz), Germany (Berlin), Po...",25,Austria,Graz,Other,0,1,0,...,0,0,0,0,0,0,3,,,
3,App Developer (React Native),"Tulln an der Donau, Lower Austria, Austria",Direct message the job poster from Biolyz\nMar...,70,Austria,Tulln an der Donau,Other,0,0,0,...,0,1,0,0,0,0,4,,,
4,Workplace Analytics Engineer,"Vienna, Vienna, Austria","Bring your people analytics, social science re...",25,Austria,Vienna,Data Analyst,1,1,0,...,0,0,0,0,0,1,5,,,
13,DevOps Engineer for PKI Services (m/f/d),"Innere Stadt, Vienna, Austria",What are the Siemens Encryption Technology (PK...,25,Austria,Innere Stadt,Engineer,0,1,0,...,0,0,0,0,0,0,14,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2223,Senior Quality Assurance Engineer,"Munich, Bavaria, Germany",Direct message the job poster from Zero to One...,25,Germany,Munich,Engineer,0,0,0,...,0,0,0,0,1,1,2224,,,
2224,Projektmanager Kundenmedien (m/w/d),"Stuttgart, Baden-Württemberg, Germany",Im Bereich Marketing & Kommunikation besetzen ...,25,Germany,Stuttgart,Other,0,0,0,...,0,0,0,0,0,0,2225,,,
2225,(Senior) Project Manager (m/f/d) - Industrial ...,"Hamburg, Hamburg, Germany",Nexperia is a world-class company in semicondu...,25,Germany,Hamburg,Engineer,0,0,0,...,0,0,0,0,1,0,2226,,,
2226,Piping professional with affinity for SQL and ...,"Oberhausen, North Rhine-Westphalia, Germany",Stellenbeschreibung:\n\nAufgaben:\nDesign pipi...,25,Germany,Oberhausen,Other,1,0,0,...,0,0,0,0,0,0,2227,,,


# Load

In [133]:
df_jobs_linkedin.to_csv('../output/linkedin.csv', index=False, encoding='utf-8')
df_jobs_with_salary.to_csv('../output/linkedin_with_salary.csv', index=False, encoding='utf-8')
df_jobs_without_salary.to_csv('../output/linkedin_no_salary.csv',  index=False, encoding='utf-8')

In [142]:
df_jobs_linkedin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2228 entries, 0 to 2227
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   job_title         2228 non-null   object 
 1   company_location  2228 non-null   object 
 2   job_description   2228 non-null   object 
 3   applicants        2228 non-null   int64  
 4   country           2228 non-null   object 
 5   city              2228 non-null   object 
 6   job_title_clean   2228 non-null   object 
 7   SQL               2228 non-null   int64  
 8   Python            2228 non-null   int64  
 9   Spark or Hive     2228 non-null   int64  
 10  PowerBI           2228 non-null   int64  
 11  Scikit-learn      2228 non-null   int64  
 12  Tableau           2228 non-null   int64  
 13  BigData           2228 non-null   int64  
 14  ML                2228 non-null   int64  
 15  ETL               2228 non-null   int64  
 16  ABtest            2228 non-null   int64  


In [143]:
df_jobs_with_salary.info()

<class 'pandas.core.frame.DataFrame'>
Index: 494 entries, 1 to 2207
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   job_title         494 non-null    object 
 1   company_location  494 non-null    object 
 2   job_description   494 non-null    object 
 3   applicants        494 non-null    int64  
 4   country           494 non-null    object 
 5   city              494 non-null    object 
 6   job_title_clean   494 non-null    object 
 7   SQL               494 non-null    int64  
 8   Python            494 non-null    int64  
 9   Spark or Hive     494 non-null    int64  
 10  PowerBI           494 non-null    int64  
 11  Scikit-learn      494 non-null    int64  
 12  Tableau           494 non-null    int64  
 13  BigData           494 non-null    int64  
 14  ML                494 non-null    int64  
 15  ETL               494 non-null    int64  
 16  ABtest            494 non-null    int64  
 17  S

In [144]:
df_jobs_without_salary.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1734 entries, 0 to 2227
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   job_title         1734 non-null   object 
 1   company_location  1734 non-null   object 
 2   job_description   1734 non-null   object 
 3   applicants        1734 non-null   int64  
 4   country           1734 non-null   object 
 5   city              1734 non-null   object 
 6   job_title_clean   1734 non-null   object 
 7   SQL               1734 non-null   int64  
 8   Python            1734 non-null   int64  
 9   Spark or Hive     1734 non-null   int64  
 10  PowerBI           1734 non-null   int64  
 11  Scikit-learn      1734 non-null   int64  
 12  Tableau           1734 non-null   int64  
 13  BigData           1734 non-null   int64  
 14  ML                1734 non-null   int64  
 15  ETL               1734 non-null   int64  
 16  ABtest            1734 non-null   int64  
 17  