# Data Staging
Extract, Transform, Load (ETL)

## Extract datasets into dataframes

Extract job posting information

In [109]:
import pandas as pd
# Load the raw job postings; the bare `df` on the last line renders a preview.
df = pd.read_csv(filepath_or_buffer='job_descriptions.csv')
df

Unnamed: 0,Job Id,Experience,Qualifications,Salary Range,location,Country,latitude,longitude,Work Type,Company Size,...,Contact,Job Title,Role,Job Portal,Job Description,Benefits,skills,Responsibilities,Company,Company Profile
0,1089843540111562,5 to 15 Years,M.Tech,$59K-$99K,Douglas,Isle of Man,54.2361,-4.5481,Intern,26801,...,001-381-930-7517x737,Digital Marketing Specialist,Social Media Manager,Snagajob,Social Media Managers oversee an organizations...,"{'Flexible Spending Accounts (FSAs), Relocatio...","Social media platforms (e.g., Facebook, Twitte...","Manage and grow social media accounts, create ...",Icahn Enterprises,"{""Sector"":""Diversified"",""Industry"":""Diversifie..."
1,398454096642776,2 to 12 Years,BCA,$56K-$116K,Ashgabat,Turkmenistan,38.9697,59.5563,Intern,100340,...,461-509-4216,Web Developer,Frontend Web Developer,Idealist,Frontend Web Developers design and implement u...,"{'Health Insurance, Retirement Plans, Paid Tim...","HTML, CSS, JavaScript Frontend frameworks (e.g...","Design and code user interfaces for websites, ...",PNC Financial Services Group,"{""Sector"":""Financial Services"",""Industry"":""Com..."
2,481640072963533,0 to 12 Years,PhD,$61K-$104K,Macao,"Macao SAR, China",22.1987,113.5439,Temporary,84525,...,9687619505,Operations Manager,Quality Control Manager,Jobs2Careers,Quality Control Managers establish and enforce...,"{'Legal Assistance, Bonuses and Incentive Prog...",Quality control processes and methodologies St...,Establish and enforce quality control standard...,United Services Automobile Assn.,"{""Sector"":""Insurance"",""Industry"":""Insurance: P..."
3,688192671473044,4 to 11 Years,PhD,$65K-$91K,Porto-Novo,Benin,9.3077,2.3158,Full-Time,129896,...,+1-820-643-5431x47576,Network Engineer,Wireless Network Engineer,FlexJobs,"Wireless Network Engineers design, implement, ...","{'Transportation Benefits, Professional Develo...",Wireless network design and architecture Wi-Fi...,"Design, configure, and optimize wireless netwo...",Hess,"{""Sector"":""Energy"",""Industry"":""Mining, Crude-O..."
4,117057806156508,1 to 12 Years,MBA,$64K-$87K,Santiago,Chile,-35.6751,-71.5429,Intern,53944,...,343.975.4702x9340,Event Manager,Conference Manager,Jobs2Careers,A Conference Manager coordinates and manages c...,"{'Flexible Spending Accounts (FSAs), Relocatio...",Event planning Conference logistics Budget man...,Specialize in conference and convention planni...,Cairn Energy,"{""Sector"":""Energy"",""Industry"":""Energy - Oil & ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1615935,134563577088850,0 to 12 Years,B.Tech,$64K-$114K,"Malabo (de jure),",Equatorial Guinea,1.6508,10.2679,Full-Time,18281,...,950-451-5843,Mechanical Engineer,Mechanical Design Engineer,ZipRecruiter,Mechanical Design Engineers create and develop...,"{'Employee Assistance Programs (EAP), Tuition ...","Mechanical engineering CAD software (e.g., Sol...","Design mechanical systems, components, and pro...",The Hershey Company,"{""Sector"":""Food and Beverage/Confectionery"",""I..."
1615936,618604818190827,2 to 14 Years,M.Tech,$62K-$130K,Warsaw,Poland,51.9194,19.1451,Intern,63621,...,676.387.1572x71877,IT Manager,IT Director,USAJOBS,An IT Director oversees an organizations IT de...,"{'Health Insurance, Retirement Plans, Paid Tim...",Strategic IT planning Leadership and managemen...,Provide strategic leadership for IT department...,EQT,"{""Sector"":""Energy"",""Industry"":""Energy"",""City"":..."
1615937,615471367712200,4 to 15 Years,BCA,$60K-$96K,Ashgabat,Turkmenistan,38.9697,59.5563,Part-Time,114287,...,537.384.6193x5284,Mechanical Engineer,Mechanical Design Engineer,Indeed,Mechanical Design Engineers create and develop...,"{'Tuition Reimbursement, Stock Options or Equi...","Mechanical engineering CAD software (e.g., Sol...","Design mechanical systems, components, and pro...",KLA,"{""Sector"":""Technology"",""Industry"":""Semiconduct..."
1615938,804137342023945,5 to 15 Years,BCA,$65K-$103K,Ouagadougou,Burkina Faso,12.2383,-1.5616,Full-Time,45009,...,(484)257-4755x5346,HR Coordinator,Training Coordinator,Stack Overflow Jobs,Training Coordinators design and implement emp...,"{'Casual Dress Code, Social and Recreational A...",Training program coordination Training materia...,"Coordinate employee training programs, track t...",Mahindra & Mahindra,"{""Sector"":""Automotive"",""Industry"":""Automotive""..."


Extract City Population information

In [110]:
# Load the city population lookup (Country, City, City Population).
df_population = pd.read_csv(filepath_or_buffer='CityPopulation.csv')
df_population

Unnamed: 0,Country,City,City Population
0,USA,"Washington, D.C.",678972
1,UK,London,9748000
2,Canada,Ottawa,1077900
3,France,Paris,11208000
4,Belgium,Brussels,1222657
5,Australia,Canberra,472000
6,Spain,Madrid,6751000
7,India,New Delhi,32941000
8,Germany,Berlin,3574000
9,Singapore,Singapore,6014723


Extract company headquarters country location and company size (number of employees)

In [111]:
# Load the company lookup (Company, Country, Company Size).
df_company = pd.read_csv(filepath_or_buffer='CompanyInformation.csv')
df_company

Unnamed: 0,Company,Country,Company Size
0,3i Group,UK,249
1,3M,USA,95000
2,A-Mark Precious Metals,USA,429
3,Abbott Laboratories,USA,113000
4,AbbVie,USA,50000
...,...,...,...
883,XPO,USA,42000
884,Yum China Holdings,USA,450000
885,Zee Entertainment Enterprises,India,3400
886,Zoetis,USA,11700


## Transform

In [112]:
# Turn on the pandas `mode.copy_on_write` option for the rest of the notebook.
# Background: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
pd.set_option('mode.copy_on_write', True)

In [113]:
# Show the dtype pandas inferred for each raw column.
column_dtypes = df.dtypes
print(column_dtypes)

Job Id                int64
Experience           object
Qualifications       object
Salary Range         object
location             object
Country              object
latitude            float64
longitude           float64
Work Type            object
Company Size          int64
Job Posting Date     object
Preference           object
Contact Person       object
Contact              object
Job Title            object
Role                 object
Job Portal           object
Job Description      object
Benefits             object
skills               object
Responsibilities     object
Company              object
Company Profile      object
dtype: object


Keep only the countries we want for our analysis

In [114]:
# Countries retained for the analysis; postings from anywhere else are discarded.
desired_countries = [
    'USA', 'UK', 'Canada', 'France', 'Japan', 'Belgium', 'Australia', 'Spain',
    'India', 'Germany', 'Singapore', 'Thailand', 'China', 'Portugal', 'Vietnam',
    'Mauritius',
]

# Boolean mask: True where the posting's Country is one of the values above.
in_desired_country = df['Country'].isin(desired_countries)
df = df.loc[in_desired_country]

Drop unrequired columns

In [115]:
# Columns that are not used downstream. 'Company Size' is dropped here; a column
# with the same name is joined in later from df_company.
unrequired_columns = [
    'latitude',
    'longitude',
    'Contact Person',
    'Contact',
    'Job Description',
    'Company Size',
]
df = df.drop(labels=unrequired_columns, axis='columns')
df

Unnamed: 0,Job Id,Experience,Qualifications,Salary Range,location,Country,Work Type,Job Posting Date,Preference,Job Title,Role,Job Portal,Benefits,skills,Responsibilities,Company,Company Profile
5,116831420231957,4 to 12 Years,MCA,$59K-$93K,Brussels,Belgium,Full-Time,2023-07-25,Male,Software Tester,Quality Assurance Analyst,Snagajob,"{'Life and Disability Insurance, Stock Options...",Quality assurance processes Testing methodolog...,Test software applications and systems to iden...,Adani Ports and Special Economic Zone,"{""Sector"":""Infrastructure"",""Industry"":""Ports a..."
34,1889088294438825,0 to 8 Years,M.Com,$62K-$130K,Canberra,Australia,Temporary,2022-11-02,Both,Electrical Designer,Lighting Designer,Idealist,"{'Life and Disability Insurance, Stock Options...",Lighting design Architectural lighting Lightin...,"Specialize in lighting design, creating lighti...",CBRE Group,"{""Sector"":""Real Estate"",""Industry"":""Real Estat..."
46,1373513885523551,3 to 9 Years,B.Tech,$56K-$109K,Madrid,Spain,Temporary,2023-05-29,Male,Data Entry Clerk,Record Keeper,Jobs2Careers,"{'Legal Assistance, Bonuses and Incentive Prog...",Records management Data entry and retrieval At...,"Maintain records, files, and documentation in ...",Arconic,"{""Sector"":""Manufacturing"",""Industry"":""Metals"",..."
61,401560922349533,5 to 12 Years,PhD,$57K-$125K,New Delhi,India,Full-Time,2022-09-28,Female,Account Executive,Sales Account Executive,Internships.com,"{'Flexible Spending Accounts (FSAs), Relocatio...",Sales strategies and tactics Account managemen...,Identify and pursue sales opportunities with n...,GlaxoSmithKline,"{""Sector"":""Pharmaceuticals"",""Industry"":""Pharma..."
69,2853459439973158,0 to 9 Years,B.Tech,$56K-$125K,London,UK,Intern,2022-08-03,Male,Data Engineer,Data Architect,Indeed,"{'Childcare Assistance, Paid Time Off (PTO), R...",Data architecture design Database management s...,Design data architecture and systems to meet b...,Equitable Holdings,"{""Sector"":""Insurance"",""Industry"":""Insurance: L..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1615879,934065317714728,0 to 15 Years,B.Com,$64K-$113K,London,UK,Contract,2023-04-18,Male,Brand Manager,Product Brand Manager,USAJOBS,"{'Legal Assistance, Bonuses and Incentive Prog...",Product branding and marketing Product lifecyc...,Manage the branding of specific products or pr...,Kinder Morgan,"{""Sector"":""Energy"",""Industry"":""Pipelines"",""Cit..."
1615885,2072349359037720,2 to 13 Years,PhD,$65K-$126K,Bangkok,Thailand,Temporary,2022-04-14,Male,Event Coordinator,Event Planner,Idealist,"{'Legal Assistance, Bonuses and Incentive Prog...",Event planning and coordination Budget managem...,"Plan and organize events, such as weddings, co...",Chesapeake Energy,"{""Sector"":""Energy"",""Industry"":""Mining, Crude-O..."
1615911,2800492340115484,4 to 9 Years,MBA,$60K-$87K,London,UK,Contract,2022-02-18,Both,Sales Consultant,Sales Advisor,Snagajob,"{'Childcare Assistance, Paid Time Off (PTO), R...",Sales strategies Customer relationship managem...,Provide expert advice and guidance to customer...,Unum Group,"{""Sector"":""Insurance"",""Industry"":""Insurance: L..."
1615930,690948368250854,1 to 14 Years,BCA,$65K-$125K,Madrid,Spain,Intern,2022-07-10,Both,Environmental Consultant,Sustainability Consultant,Internships.com,"{'Employee Assistance Programs (EAP), Tuition ...",Sustainability consulting Sustainability asses...,Focus on sustainable practices and initiatives...,J.M. Smucker,"{""Sector"":""Food & Beverage"",""Industry"":""Food C..."


Renaming columns

In [116]:
# New names for columns of the job postings frame.
columns_to_rename = {
    'skills': 'Skills',
    'location': 'City',
    'Preference': 'Gender Preference',
    'Role': 'Specialization',
}

df = df.rename(columns=columns_to_rename)

# Rename lookup-frame columns so their names say which entity they describe.
# Reassignment (instead of inplace=True) matches how `df` is handled above and
# keeps the cell free of in-place mutation.
df_population = df_population.rename(columns={'City Population': 'Job City Population'})
df_company = df_company.rename(columns={'Country': 'Company HQ Country'})

Processing date values

In [117]:
# Parse the posting dates into datetime values so the .dt accessor can be used
# on the column afterwards.
posting_dates = pd.to_datetime(df['Job Posting Date'])
df['Job Posting Date'] = posting_dates
df.dtypes

Job Id                        int64
Experience                   object
Qualifications               object
Salary Range                 object
City                         object
Country                      object
Work Type                    object
Job Posting Date     datetime64[ns]
Gender Preference            object
Job Title                    object
Specialization               object
Job Portal                   object
Benefits                     object
Skills                       object
Responsibilities             object
Company                      object
Company Profile              object
dtype: object

In [118]:
# Break the posting date into separate Day, Month and Year columns.
posting_date_parts = df['Job Posting Date'].dt
df['Day'] = posting_date_parts.day
df['Month'] = posting_date_parts.month
df['Year'] = posting_date_parts.year

Processing Salary Range and Years of Experience

In [119]:
# Split ranges such as "$59K-$99K" at the dash into a lower and an upper bound.
df[['Minimum Salary', 'Maximum Salary']] = df['Salary Range'].str.split('-', expand=True)

# Strip every non-digit character ("$", "K") and convert to integers.
# The pattern is a raw string: '\d' / '\D' inside a normal string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
for salary_column in ('Minimum Salary', 'Maximum Salary'):
    df[salary_column] = df[salary_column].str.replace(r'\D', '', regex=True).astype(int)


In [120]:
# Salaries were parsed in thousands of dollars; scale them to whole dollars.
# Note: running this cell twice multiplies the values twice.
for salary_column in ('Minimum Salary', 'Maximum Salary'):
    df[salary_column] = df[salary_column] * 1000


In [121]:
# Split ranges such as "5 to 15 Years" at "to" into a lower and an upper bound.
df[['Minimum Experience (years)', 'Maximum Experience (years)']] = df['Experience'].str.split('to', expand=True)

# Strip every non-digit character (spaces, "Years") and convert to integers.
# The pattern is a raw string: '\d' / '\D' inside a normal string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
for experience_column in ('Minimum Experience (years)', 'Maximum Experience (years)'):
    df[experience_column] = df[experience_column].str.replace(r'\D', '', regex=True).astype(int)


In [122]:
# The raw range/date columns have been split into dedicated columns above,
# so the originals can go.
parsed_source_columns = ['Experience', 'Salary Range', 'Job Posting Date']
df = df.drop(labels=parsed_source_columns, axis='columns')
df

Unnamed: 0,Job Id,Qualifications,City,Country,Work Type,Gender Preference,Job Title,Specialization,Job Portal,Benefits,...,Responsibilities,Company,Company Profile,Day,Month,Year,Minimum Salary,Maximum Salary,Minimum Experience (years),Maximum Experience (years)
5,116831420231957,MCA,Brussels,Belgium,Full-Time,Male,Software Tester,Quality Assurance Analyst,Snagajob,"{'Life and Disability Insurance, Stock Options...",...,Test software applications and systems to iden...,Adani Ports and Special Economic Zone,"{""Sector"":""Infrastructure"",""Industry"":""Ports a...",25,7,2023,59000,93000,4,12
34,1889088294438825,M.Com,Canberra,Australia,Temporary,Both,Electrical Designer,Lighting Designer,Idealist,"{'Life and Disability Insurance, Stock Options...",...,"Specialize in lighting design, creating lighti...",CBRE Group,"{""Sector"":""Real Estate"",""Industry"":""Real Estat...",2,11,2022,62000,130000,0,8
46,1373513885523551,B.Tech,Madrid,Spain,Temporary,Male,Data Entry Clerk,Record Keeper,Jobs2Careers,"{'Legal Assistance, Bonuses and Incentive Prog...",...,"Maintain records, files, and documentation in ...",Arconic,"{""Sector"":""Manufacturing"",""Industry"":""Metals"",...",29,5,2023,56000,109000,3,9
61,401560922349533,PhD,New Delhi,India,Full-Time,Female,Account Executive,Sales Account Executive,Internships.com,"{'Flexible Spending Accounts (FSAs), Relocatio...",...,Identify and pursue sales opportunities with n...,GlaxoSmithKline,"{""Sector"":""Pharmaceuticals"",""Industry"":""Pharma...",28,9,2022,57000,125000,5,12
69,2853459439973158,B.Tech,London,UK,Intern,Male,Data Engineer,Data Architect,Indeed,"{'Childcare Assistance, Paid Time Off (PTO), R...",...,Design data architecture and systems to meet b...,Equitable Holdings,"{""Sector"":""Insurance"",""Industry"":""Insurance: L...",3,8,2022,56000,125000,0,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1615879,934065317714728,B.Com,London,UK,Contract,Male,Brand Manager,Product Brand Manager,USAJOBS,"{'Legal Assistance, Bonuses and Incentive Prog...",...,Manage the branding of specific products or pr...,Kinder Morgan,"{""Sector"":""Energy"",""Industry"":""Pipelines"",""Cit...",18,4,2023,64000,113000,0,15
1615885,2072349359037720,PhD,Bangkok,Thailand,Temporary,Male,Event Coordinator,Event Planner,Idealist,"{'Legal Assistance, Bonuses and Incentive Prog...",...,"Plan and organize events, such as weddings, co...",Chesapeake Energy,"{""Sector"":""Energy"",""Industry"":""Mining, Crude-O...",14,4,2022,65000,126000,2,13
1615911,2800492340115484,MBA,London,UK,Contract,Both,Sales Consultant,Sales Advisor,Snagajob,"{'Childcare Assistance, Paid Time Off (PTO), R...",...,Provide expert advice and guidance to customer...,Unum Group,"{""Sector"":""Insurance"",""Industry"":""Insurance: L...",18,2,2022,60000,87000,4,9
1615930,690948368250854,BCA,Madrid,Spain,Intern,Both,Environmental Consultant,Sustainability Consultant,Internships.com,"{'Employee Assistance Programs (EAP), Tuition ...",...,Focus on sustainable practices and initiatives...,J.M. Smucker,"{""Sector"":""Food & Beverage"",""Industry"":""Food C...",10,7,2022,65000,125000,1,14


Handling null values within company profile



In [123]:
# Count missing values per column to find out which columns need repair.
missing_per_column = df.isna().sum()
missing_per_column

Job Id                          0
Qualifications                  0
City                            0
Country                         0
Work Type                       0
Gender Preference               0
Job Title                       0
Specialization                  0
Job Portal                      0
Benefits                        0
Skills                          0
Responsibilities                0
Company                         0
Company Profile               407
Day                             0
Month                           0
Year                            0
Minimum Salary                  0
Maximum Salary                  0
Minimum Experience (years)      0
Maximum Experience (years)      0
dtype: int64

In [124]:
# Rows whose company profile is missing (NaN) or an empty string.
profile_is_missing = df['Company Profile'].isna() | df['Company Profile'].eq('')
null_company_profile_rows = df.loc[profile_is_missing]

# Distinct companies those rows belong to.
unique_companies_with_null_profile = null_company_profile_rows['Company'].unique()

print(unique_companies_with_null_profile)

['Estée Lauder' "Dunkin'Brands Group, Inc." 'Peter Kiewit Sons']


In [125]:
# Hand-written profiles for the companies whose 'Company Profile' is missing.
import json
value_mapping = {
    'Estée Lauder': {"Sector":"Consumer Goods","Industry":"Consumer Goods","City":"New York","State":"New York","Zip":"10001","Website":"www.elcompanies.com","Ticker":"EL","CEO":"Fabrizio Freda"},
    'Dunkin\'Brands Group, Inc.': {"Sector":"Restaurants","Industry":"Food Services","City":"Canton","State":"Massachusetts","Zip":"02021","Website":"www.dunkindonuts.com","Ticker":"DNKN","CEO":"Nigel Travis"},
    'Peter Kiewit Sons': {"Sector":"Construction/Infrastructure","Industry":"Construction/Infrastructure","City":"Omaha","State":"Nebraska","Zip":"68102","Website":"www.kiewit.com","Ticker":"N/A","CEO":"Rick Lanoha"},
}

# Serialise each profile once, so filled-in cells hold JSON text rather than dict objects.
company_profile_json = {
    company: json.dumps(profile) for company, profile in value_mapping.items()
}

# Fill only the missing profiles, looked up by company name. Companies that are
# not in value_mapping map to NaN and therefore stay missing. This replaces a
# Python-level iterrows() loop with one vectorised lookup.
df['Company Profile'] = df['Company Profile'].fillna(df['Company'].map(company_profile_json))

In [126]:
# Re-count missing values per column; 'Company Profile' should now report 0.
missing_after_fill = df.isna().sum()
missing_after_fill

Job Id                        0
Qualifications                0
City                          0
Country                       0
Work Type                     0
Gender Preference             0
Job Title                     0
Specialization                0
Job Portal                    0
Benefits                      0
Skills                        0
Responsibilities              0
Company                       0
Company Profile               0
Day                           0
Month                         0
Year                          0
Minimum Salary                0
Maximum Salary                0
Minimum Experience (years)    0
Maximum Experience (years)    0
dtype: int64

Cleaning data for Company Profile

In [127]:
# The Quanta Services profile contains a nickname wrapped in its own double
# quotes ('"Duke" Austin'); remove the quoted nickname so only 'Austin' remains.
# NOTE(review): presumably the nested quotes would break parsing of the profile
# text in a later cell - confirm.
rows_to_replace = df.loc[df['Company'] == 'Quanta Services']
quanta_index = rows_to_replace.index
quanta_profiles = df.loc[quanta_index, 'Company Profile']
df.loc[quanta_index, 'Company Profile'] = quanta_profiles.str.replace('"Duke" Austin', 'Austin')

Processing the Company Profile column

Each Company Profile value is a dictionary stored as text. For each key we need (Sector, Industry, City and Ticker), we create a column and fill it with the value associated with that key.

In [128]:
import ast

def extract_and_rename(dictionary_str, key, new_key):
    """Return the value stored under ``key`` in a dictionary given as text.

    Args:
        dictionary_str: Text of a dict literal; parsed with ``ast.literal_eval``.
        key: Key to look up in the parsed dictionary.
        new_key: Not used by the lookup; accepted so existing callers keep working.

    Returns:
        The value for ``key``, or ``None`` when the text cannot be parsed, does
        not evaluate to a dict, or does not contain ``key``.
    """
    try:
        dictionary = ast.literal_eval(dictionary_str)
    except (SyntaxError, ValueError, TypeError):
        return None
    # literal_eval can yield any literal (list, set, number, ...); only a dict
    # supports the key lookup below.
    if not isinstance(dictionary, dict):
        return None
    return dictionary.get(key)

# Profile keys to pull out, mapped to the column each value is stored under.
key_to_new_key_mapping = {
    'Sector': 'Company Sector',
    'Industry': 'Company Industry',
    'City': 'Company HQ City',
    'Ticker': 'Company Ticker'
}

# One new column per key; extract_and_rename parses the profile text of each row
# and returns the value for that key (or None).
for key, new_key in key_to_new_key_mapping.items():
    df[new_key] = df['Company Profile'].apply(extract_and_rename, args=(key, new_key))

# The raw profile text is no longer needed once its fields are columns.
# Reassign rather than drop in place, like the other drop cells in this notebook.
df = df.drop(columns=['Company Profile'])
df

Unnamed: 0,Job Id,Qualifications,City,Country,Work Type,Gender Preference,Job Title,Specialization,Job Portal,Benefits,...,Month,Year,Minimum Salary,Maximum Salary,Minimum Experience (years),Maximum Experience (years),Company Sector,Company Industry,Company HQ City,Company Ticker
5,116831420231957,MCA,Brussels,Belgium,Full-Time,Male,Software Tester,Quality Assurance Analyst,Snagajob,"{'Life and Disability Insurance, Stock Options...",...,7,2023,59000,93000,4,12,Infrastructure,Ports and Infrastructure,Mundra,ADANIPORTS
34,1889088294438825,M.Com,Canberra,Australia,Temporary,Both,Electrical Designer,Lighting Designer,Idealist,"{'Life and Disability Insurance, Stock Options...",...,11,2022,62000,130000,0,8,Real Estate,Real Estate,Dallas,CBRE
46,1373513885523551,B.Tech,Madrid,Spain,Temporary,Male,Data Entry Clerk,Record Keeper,Jobs2Careers,"{'Legal Assistance, Bonuses and Incentive Prog...",...,5,2023,56000,109000,3,9,Manufacturing,Metals,Pittsburgh,ARNC
61,401560922349533,PhD,New Delhi,India,Full-Time,Female,Account Executive,Sales Account Executive,Internships.com,"{'Flexible Spending Accounts (FSAs), Relocatio...",...,9,2022,57000,125000,5,12,Pharmaceuticals,Pharmaceuticals,London,GSK
69,2853459439973158,B.Tech,London,UK,Intern,Male,Data Engineer,Data Architect,Indeed,"{'Childcare Assistance, Paid Time Off (PTO), R...",...,8,2022,56000,125000,0,9,Insurance,"Insurance: Life, Health (Stock)",New York,EQH
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1615879,934065317714728,B.Com,London,UK,Contract,Male,Brand Manager,Product Brand Manager,USAJOBS,"{'Legal Assistance, Bonuses and Incentive Prog...",...,4,2023,64000,113000,0,15,Energy,Pipelines,Houston,KMI
1615885,2072349359037720,PhD,Bangkok,Thailand,Temporary,Male,Event Coordinator,Event Planner,Idealist,"{'Legal Assistance, Bonuses and Incentive Prog...",...,4,2022,65000,126000,2,13,Energy,"Mining, Crude-Oil Production",Oklahoma City,CHK
1615911,2800492340115484,MBA,London,UK,Contract,Both,Sales Consultant,Sales Advisor,Snagajob,"{'Childcare Assistance, Paid Time Off (PTO), R...",...,2,2022,60000,87000,4,9,Insurance,"Insurance: Life, Health (Stock)",Chattanooga,UNM
1615930,690948368250854,BCA,Madrid,Spain,Intern,Both,Environmental Consultant,Sustainability Consultant,Internships.com,"{'Employee Assistance Programs (EAP), Tuition ...",...,7,2022,65000,125000,1,14,Food & Beverage,Food Consumer Products,Orrville,SJM


Cleaning the Company HQ City column

We noticed that some values for the City key also include the country, for example "City": "London, UK". We only want the city part ("London") in that column.

In [129]:
# Split every HQ city at the comma; "London, UK" becomes ['London', ' UK'].
split_cities = df['Company HQ City'].str.split(',')

# Rows whose value has exactly two parts. .str.len() yields NaN for a missing
# city, which compares False here, whereas calling len() on it would raise.
has_two_parts = split_cities.str.len() == 2
split_cities_with_length_2 = split_cities[has_two_parts]

# Index labels of the rows to rewrite.
indices_with_length_2 = split_cities_with_length_2.index

# Keep only the first part (the city name) for those rows.
df.loc[indices_with_length_2, 'Company HQ City'] = split_cities_with_length_2.str[0]

In [130]:
# Verification: list any HQ city that still splits into exactly two parts at a
# comma. An empty Series means the cleanup above caught every such value.
split_cities = df['Company HQ City'].str.split(',')

# .str.len() is NaN-safe: a missing city compares False instead of raising.
split_cities_with_length_2 = split_cities[split_cities.str.len() == 2]

print(split_cities_with_length_2)

Series([], Name: Company HQ City, dtype: object)


Processing the Benefits values

Each Benefits value has the form {'Retirement Plans', 'Parental Leave', etc}. We want one column per benefit, holding True if the benefit appears in the value and False otherwise. For the example above, the Retirement Plans and Parental Leave columns will be True and the other benefit columns will be False.

In [131]:
# Benefit names to turn into boolean columns. Matching is by substring, so
# 'Employee Assistance Program' also matches 'Employee Assistance Programs (EAP)'.
benefit_columns = ['Retirement Plans','Stock Options or Equity Grants','Parental Leave','Paid Time Off (PTO)',
                   'Flexible Work Arrangements','Health Insurance','Life and Disability Insurance',
                   'Employee Assistance Program','Health and Wellness Facilities','Employee Referral Program',
                   'Transportation Benefits','Bonuses and Incentive Programs']


def set_benefit_values(row):
    """Set each benefit column of ``row`` to True when its name occurs in row['Benefits'].

    Row-wise equivalent of the vectorised loop below. The cell no longer calls
    it; it is kept so that any other caller of this name keeps working.

    Args:
        row: A row that has a 'Benefits' string and one entry per name in
            ``benefit_columns``.

    Returns:
        The same ``row`` object, modified in place.
    """
    benefits = row['Benefits']
    for column in benefit_columns:
        if column in benefits:
            row[column] = True
    return row


# One vectorised substring test per benefit instead of rebuilding every row with
# df.apply(axis=1). regex=False makes the match literal (needed for names such
# as 'Paid Time Off (PTO)'); na=False yields False for a missing Benefits value.
for column in benefit_columns:
    df[column] = df['Benefits'].str.contains(column, regex=False, na=False)

# The raw Benefits text is no longer needed once the flags exist.
df = df.drop(columns=['Benefits'])
df

Unnamed: 0,Job Id,Qualifications,City,Country,Work Type,Gender Preference,Job Title,Specialization,Job Portal,Skills,...,Parental Leave,Paid Time Off (PTO),Flexible Work Arrangements,Health Insurance,Life and Disability Insurance,Employee Assistance Program,Health and Wellness Facilities,Employee Referral Program,Transportation Benefits,Bonuses and Incentive Programs
5,116831420231957,MCA,Brussels,Belgium,Full-Time,Male,Software Tester,Quality Assurance Analyst,Snagajob,Quality assurance processes Testing methodolog...,...,False,False,False,True,True,False,False,False,False,False
34,1889088294438825,M.Com,Canberra,Australia,Temporary,Both,Electrical Designer,Lighting Designer,Idealist,Lighting design Architectural lighting Lightin...,...,False,False,False,True,True,False,False,False,False,False
46,1373513885523551,B.Tech,Madrid,Spain,Temporary,Male,Data Entry Clerk,Record Keeper,Jobs2Careers,Records management Data entry and retrieval At...,...,False,False,False,False,False,False,False,False,False,True
61,401560922349533,PhD,New Delhi,India,Full-Time,Female,Account Executive,Sales Account Executive,Internships.com,Sales strategies and tactics Account managemen...,...,False,False,False,False,False,False,False,False,False,False
69,2853459439973158,B.Tech,London,UK,Intern,Male,Data Engineer,Data Architect,Indeed,Data architecture design Database management s...,...,False,True,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1615879,934065317714728,B.Com,London,UK,Contract,Male,Brand Manager,Product Brand Manager,USAJOBS,Product branding and marketing Product lifecyc...,...,False,False,False,False,False,False,False,False,False,True
1615885,2072349359037720,PhD,Bangkok,Thailand,Temporary,Male,Event Coordinator,Event Planner,Idealist,Event planning and coordination Budget managem...,...,False,False,False,False,False,False,False,False,False,True
1615911,2800492340115484,MBA,London,UK,Contract,Both,Sales Consultant,Sales Advisor,Snagajob,Sales strategies Customer relationship managem...,...,False,True,True,False,False,False,False,False,False,False
1615930,690948368250854,BCA,Madrid,Spain,Intern,Both,Environmental Consultant,Sustainability Consultant,Internships.com,Sustainability consulting Sustainability asses...,...,True,False,False,False,False,True,False,False,True,False


In [132]:
# Count missing values per column after the profile and benefit columns were added.
missing_after_benefits = df.isna().sum()
missing_after_benefits

Job Id                            0
Qualifications                    0
City                              0
Country                           0
Work Type                         0
Gender Preference                 0
Job Title                         0
Specialization                    0
Job Portal                        0
Skills                            0
Responsibilities                  0
Company                           0
Day                               0
Month                             0
Year                              0
Minimum Salary                    0
Maximum Salary                    0
Minimum Experience (years)        0
Maximum Experience (years)        0
Company Sector                    0
Company Industry                  0
Company HQ City                   0
Company Ticker                    0
Retirement Plans                  0
Stock Options or Equity Grants    0
Parental Leave                    0
Paid Time Off (PTO)               0
Flexible Work Arrangements  

Replace "N/A" values in the Company Ticker column with an empty string. Only a very small percentage of our rows have no ticker, so we do not handle these missing values any further.

In [133]:
# Replace the 'N/A' placeholder with an empty string. Only cells that are
# exactly 'N/A' are affected.
ticker_without_placeholder = df['Company Ticker'].replace({'N/A': ''})
df['Company Ticker'] = ticker_without_placeholder

Cleaning strings in Company HQ City column that have special characters and aren't being displayed. This is happening for German cities.

In [134]:
# Garbled spelling found in the data -> intended German city name.
replacements = {
    'G ttingen': 'Göttingen',
    'Bad Homburg vor der H he': 'Bad Homburg vor der Höhe',
    'Unterf hring': 'Unterföhring',
    'Unterschlei heim': 'Unterschleißheim',
    'D sseldorf': 'Düsseldorf'
}

# regex=True treats each key as a pattern, so it is replaced wherever it occurs
# inside a value rather than only when the whole value matches.
hq_city = df['Company HQ City']
df['Company HQ City'] = hq_city.replace(to_replace=replacements, regex=True)

### Dataset Integrations

We integrate our 3 datasets together: job_descriptions.csv, CityPopulation.csv and CompanyInformation.csv

In [135]:
# Left joins keep every job posting, whether or not a lookup row matches:
#   1. city population, matched on the posting's City and Country;
#   2. company information, matched on Company.
df = (
    df
    .merge(df_population, how='left', on=['City', 'Country'])
    .merge(df_company, how='left', on=['Company'])
)

In [136]:
# Strip commas from the population text so it can be cast to an integer.
df['Job City Population'] = df['Job City Population'].str.replace(',', '')

# Cast both joined-in numeric columns to 64-bit integers in one step.
df = df.astype({'Job City Population': 'int64', 'Company Size': 'int64'})

df.head(n=5)

Unnamed: 0,Job Id,Qualifications,City,Country,Work Type,Gender Preference,Job Title,Specialization,Job Portal,Skills,...,Health Insurance,Life and Disability Insurance,Employee Assistance Program,Health and Wellness Facilities,Employee Referral Program,Transportation Benefits,Bonuses and Incentive Programs,Job City Population,Company HQ Country,Company Size
0,116831420231957,MCA,Brussels,Belgium,Full-Time,Male,Software Tester,Quality Assurance Analyst,Snagajob,Quality assurance processes Testing methodolog...,...,True,True,False,False,False,False,False,1222657,India,12000
1,1889088294438825,M.Com,Canberra,Australia,Temporary,Both,Electrical Designer,Lighting Designer,Idealist,Lighting design Architectural lighting Lightin...,...,True,True,False,False,False,False,False,472000,USA,105000
2,1373513885523551,B.Tech,Madrid,Spain,Temporary,Male,Data Entry Clerk,Record Keeper,Jobs2Careers,Records management Data entry and retrieval At...,...,False,False,False,False,False,False,True,6751000,USA,41400
3,401560922349533,PhD,New Delhi,India,Full-Time,Female,Account Executive,Sales Account Executive,Internships.com,Sales strategies and tactics Account managemen...,...,False,False,False,False,False,False,False,32941000,UK,94000
4,2853459439973158,B.Tech,London,UK,Intern,Male,Data Engineer,Data Architect,Indeed,Data architecture design Database management s...,...,False,False,False,False,False,False,False,9748000,USA,7500


In [137]:
# Show the dtype of every column after the transformations and merges.
final_dtypes = df.dtypes
print(final_dtypes)

Job Id                             int64
Qualifications                    object
City                              object
Country                           object
Work Type                         object
Gender Preference                 object
Job Title                         object
Specialization                    object
Job Portal                        object
Skills                            object
Responsibilities                  object
Company                           object
Day                                int64
Month                              int64
Year                               int64
Minimum Salary                     int64
Maximum Salary                     int64
Minimum Experience (years)         int64
Maximum Experience (years)         int64
Company Sector                    object
Company Industry                  object
Company HQ City                   object
Company Ticker                    object
Retirement Plans                    bool
Stock Options or

Reorder Columns

In [138]:
# Final column layout for the staged dataset, listed one name per line.
# Selecting with this list both reorders the columns and drops any column
# that is not named here.
desired_order = [
    # experience, qualification and salary
    'Job Id',
    'Minimum Experience (years)',
    'Maximum Experience (years)',
    'Qualifications',
    'Minimum Salary',
    'Maximum Salary',
    # job location
    'City',
    'Country',
    'Job City Population',
    # posting details
    'Work Type',
    'Day',
    'Month',
    'Year',
    'Gender Preference',
    'Job Title',
    'Specialization',
    'Job Portal',
    'Skills',
    'Responsibilities',
    # company details
    'Company',
    'Company Size',
    'Company Sector',
    'Company Industry',
    'Company HQ City',
    'Company HQ Country',
    'Company Ticker',
    # benefit columns
    'Retirement Plans',
    'Stock Options or Equity Grants',
    'Parental Leave',
    'Paid Time Off (PTO)',
    'Flexible Work Arrangements',
    'Health Insurance',
    'Life and Disability Insurance',
    'Employee Assistance Program',
    'Health and Wellness Facilities',
    'Employee Referral Program',
    'Transportation Benefits',
    'Bonuses and Incentive Programs',
]

df = df[desired_order]
df.head()

Unnamed: 0,Job Id,Minimum Experience (years),Maximum Experience (years),Qualifications,Minimum Salary,Maximum Salary,City,Country,Job City Population,Work Type,...,Parental Leave,Paid Time Off (PTO),Flexible Work Arrangements,Health Insurance,Life and Disability Insurance,Employee Assistance Program,Health and Wellness Facilities,Employee Referral Program,Transportation Benefits,Bonuses and Incentive Programs
0,116831420231957,4,12,MCA,59000,93000,Brussels,Belgium,1222657,Full-Time,...,False,False,False,True,True,False,False,False,False,False
1,1889088294438825,0,8,M.Com,62000,130000,Canberra,Australia,472000,Temporary,...,False,False,False,True,True,False,False,False,False,False
2,1373513885523551,3,9,B.Tech,56000,109000,Madrid,Spain,6751000,Temporary,...,False,False,False,False,False,False,False,False,False,True
3,401560922349533,5,12,PhD,57000,125000,New Delhi,India,32941000,Full-Time,...,False,False,False,False,False,False,False,False,False,False
4,2853459439973158,0,9,B.Tech,56000,125000,London,UK,9748000,Intern,...,False,True,True,False,False,False,False,False,False,False


## Load

### Surrogate Key Generation

In [139]:
# Add a 'Surrogate Keys' column numbering the rows sequentially from 1
df['Surrogate Keys'] = range(1, df.shape[0] + 1)

In [140]:
# A newly assigned column is appended as the last column;
# rebuild the column order so 'Surrogate Keys' comes first and the
# remaining columns keep their existing relative order.
df = df.reindex(
    columns=['Surrogate Keys', *(col for col in df.columns if col != 'Surrogate Keys')]
)

In [141]:
# Preview the first five rows (head() defaults to n=5)
df.head()

Unnamed: 0,Surrogate Keys,Job Id,Minimum Experience (years),Maximum Experience (years),Qualifications,Minimum Salary,Maximum Salary,City,Country,Job City Population,...,Parental Leave,Paid Time Off (PTO),Flexible Work Arrangements,Health Insurance,Life and Disability Insurance,Employee Assistance Program,Health and Wellness Facilities,Employee Referral Program,Transportation Benefits,Bonuses and Incentive Programs
0,1,116831420231957,4,12,MCA,59000,93000,Brussels,Belgium,1222657,...,False,False,False,True,True,False,False,False,False,False
1,2,1889088294438825,0,8,M.Com,62000,130000,Canberra,Australia,472000,...,False,False,False,True,True,False,False,False,False,False
2,3,1373513885523551,3,9,B.Tech,56000,109000,Madrid,Spain,6751000,...,False,False,False,False,False,False,False,False,False,True
3,4,401560922349533,5,12,PhD,57000,125000,New Delhi,India,32941000,...,False,False,False,False,False,False,False,False,False,False
4,5,2853459439973158,0,9,B.Tech,56000,125000,London,UK,9748000,...,False,True,True,False,False,False,False,False,False,False


### Saving the fully staged data to a CSV file for loading

In [142]:
 # Write the staged dataframe to disk as CSV; with to_csv's defaults the
 # dataframe index is written too, as the first (unnamed) column.
df.to_csv(path_or_buf='Staged_data.csv')