In [1]:
import pandas as pd

# Load the three raw CSV exports (skills, summaries, postings) into DataFrames.
job_skills_df = pd.read_csv("../data/jobs/job_skills.csv")
job_summary_df = pd.read_csv("../data/jobs/job_summary.csv")
linkedin_job_postings_df = pd.read_csv("../data/jobs/linkedin_job_postings.csv")

# Preview each frame: a heading line followed by the first five rows.
# A single print with sep="\n" emits the same text as two consecutive prints.
print("Job Skills DataFrame:", job_skills_df.head(), sep="\n")
print("\nJob Summary DataFrame:", job_summary_df.head(), sep="\n")
print("\nLinkedIn Job Postings DataFrame:", linkedin_job_postings_df.head(), sep="\n")

Job Skills DataFrame:
                                            job_link  \
0  https://www.linkedin.com/jobs/view/housekeeper...   
1  https://www.linkedin.com/jobs/view/assistant-g...   
2  https://www.linkedin.com/jobs/view/school-base...   
3  https://www.linkedin.com/jobs/view/electrical-...   
4  https://www.linkedin.com/jobs/view/electrical-...   

                                          job_skills  
0  Building Custodial Services, Cleaning, Janitor...  
1  Customer service, Restaurant management, Food ...  
2  Applied Behavior Analysis (ABA), Data analysis...  
3  Electrical Engineering, Project Controls, Sche...  
4  Electrical Assembly, Point to point wiring, St...  

Job Summary DataFrame:
                                            job_link  \
0  https://www.linkedin.com/jobs/view/restaurant-...   
1  https://www.linkedin.com/jobs/view/med-surg-re...   
2  https://www.linkedin.com/jobs/view/registered-...   
3  https://uk.linkedin.com/jobs/view/commercial-a...   
4  http

In [2]:
# Column labels of each source frame (heading, then the Index repr).
print("\nJob Skills DataFrame Columns:", job_skills_df.columns, sep="\n")
print("\nJob Summary DataFrame Columns:", job_summary_df.columns, sep="\n")
print("\nLinkedIn Job Postings DataFrame Columns:", linkedin_job_postings_df.columns, sep="\n")
# Row count of each source frame, taken from the length of its index.
print("\nJob Skills DataFrame Rows:", len(job_skills_df.index), sep="\n")
print("\nJob Summary DataFrame Rows:", len(job_summary_df.index), sep="\n")
print("\nLinkedIn Job Postings DataFrame Rows:", len(linkedin_job_postings_df.index), sep="\n")


Job Skills DataFrame Columns:
Index(['job_link', 'job_skills'], dtype='object')

Job Summary DataFrame Columns:
Index(['job_link', 'job_summary'], dtype='object')

LinkedIn Job Postings DataFrame Columns:
Index(['job_link', 'last_processed_time', 'got_summary', 'got_ner',
       'is_being_worked', 'job_title', 'company', 'job_location', 'first_seen',
       'search_city', 'search_country', 'search_position', 'job_level',
       'job_type'],
      dtype='object')

Job Skills DataFrame Rows:
1296381

Job Summary DataFrame Rows:
1297332

LinkedIn Job Postings DataFrame Rows:
1348454


In [3]:
# Inner-join the postings with the skills table and then with the summaries
# table, both keyed on 'job_link'; rows whose link is missing from any of the
# three tables are not kept.
merged_df = linkedin_job_postings_df.merge(
    job_skills_df, on='job_link', how='inner'
).merge(
    job_summary_df, on='job_link', how='inner'
)
# Plain-text preview of the first five merged rows.
print("\nMerged DataFrame:", merged_df.head(), sep="\n")


Merged DataFrame:
                                            job_link  \
0  https://www.linkedin.com/jobs/view/account-exe...   
1  https://www.linkedin.com/jobs/view/registered-...   
2  https://www.linkedin.com/jobs/view/restaurant-...   
3  https://www.linkedin.com/jobs/view/independent...   
4  https://www.linkedin.com/jobs/view/registered-...   

             last_processed_time got_summary got_ner is_being_worked  \
0   2024-01-21 07:12:29.00256+00           t       t               f   
1   2024-01-21 07:39:58.88137+00           t       t               f   
2  2024-01-21 07:40:00.251126+00           t       t               f   
3  2024-01-21 07:40:00.308133+00           t       t               f   
4  2024-01-21 08:08:19.663033+00           t       t               f   

                                           job_title  \
0  Account Executive - Dispensing (NorCal/Norther...   
1                 Registered Nurse - RN Care Manager   
2               RESTAURANT SUPERVISOR - THE

In [4]:
# Report the number of rows in the merged frame (length of its index).
print("\nMerged DataFrame Rows:", len(merged_df.index), sep="\n")


Merged DataFrame Rows:
1296381


In [5]:
# Per-column count of missing values in the merged frame.
missing_per_column = merged_df.isna().sum()
print("\nMerged DataFrame Missing Values:", missing_per_column, sep="\n")
# Number of rows that exactly repeat an earlier row across all columns.
duplicate_row_count = merged_df.duplicated().sum()
print("\nMerged DataFrame Duplicates:", duplicate_row_count, sep="\n")


Merged DataFrame Missing Values:
job_link                  0
last_processed_time       0
got_summary               0
got_ner                   0
is_being_worked           0
job_title                 0
company                   9
job_location             19
first_seen                0
search_city               0
search_country            0
search_position           0
job_level                 0
job_type                  0
job_skills             2085
job_summary               0
dtype: int64

Merged DataFrame Duplicates:
0


In [6]:
# Remove every row that has a missing value in at least one column.
merged_df = merged_df.dropna(axis=0, how="any")
# Re-count missing values per column to confirm none remain.
print(
    "\nMerged DataFrame After Dropping Rows with Missing Values:",
    merged_df.isna().sum(),
    sep="\n",
)

# Report how many rows survived the drop.
print("\nMerged DataFrame Rows:", len(merged_df.index), sep="\n")


Merged DataFrame After Dropping Rows with Missing Values:
job_link               0
last_processed_time    0
got_summary            0
got_ner                0
is_being_worked        0
job_title              0
company                0
job_location           0
first_seen             0
search_city            0
search_country         0
search_position        0
job_level              0
job_type               0
job_skills             0
job_summary            0
dtype: int64

Merged DataFrame Rows:
1294268


In [7]:
# Render the first five rows with the notebook's rich display (HTML table)
# rather than the plain-text repr that print() would give.
print("\nMerged DataFrame:")
merged_preview = merged_df.head(n=5)
display(merged_preview)


Merged DataFrame:


Unnamed: 0,job_link,last_processed_time,got_summary,got_ner,is_being_worked,job_title,company,job_location,first_seen,search_city,search_country,search_position,job_level,job_type,job_skills,job_summary
0,https://www.linkedin.com/jobs/view/account-exe...,2024-01-21 07:12:29.00256+00,t,t,f,Account Executive - Dispensing (NorCal/Norther...,BD,"San Diego, CA",2024-01-15,Coronado,United States,Color Maker,Mid senior,Onsite,"Medical equipment sales, Key competitors, Term...",Responsibilities\nJob Description Summary\nJob...
1,https://www.linkedin.com/jobs/view/registered-...,2024-01-21 07:39:58.88137+00,t,t,f,Registered Nurse - RN Care Manager,Trinity Health MI,"Norton Shores, MI",2024-01-14,Grand Haven,United States,Director Nursing Service,Mid senior,Onsite,"Nursing, Bachelor of Science in Nursing, Maste...",Employment Type:\nFull time\nShift:\nDescripti...
2,https://www.linkedin.com/jobs/view/restaurant-...,2024-01-21 07:40:00.251126+00,t,t,f,RESTAURANT SUPERVISOR - THE FORKLIFT,Wasatch Adaptive Sports,"Sandy, UT",2024-01-14,Tooele,United States,Stand-In,Mid senior,Onsite,"Restaurant Operations Management, Inventory Ma...",Job Details\nDescription\nWhat You'll Do\nAs a...
3,https://www.linkedin.com/jobs/view/independent...,2024-01-21 07:40:00.308133+00,t,t,f,Independent Real Estate Agent,Howard Hanna | Rand Realty,"Englewood Cliffs, NJ",2024-01-16,Pinehurst,United States,Real-Estate Clerk,Mid senior,Onsite,"Real Estate, Customer Service, Sales, Negotiat...",Who We Are\nRand Realty is a family-owned brok...
4,https://www.linkedin.com/jobs/view/registered-...,2024-01-21 08:08:19.663033+00,t,t,f,Registered Nurse (RN),Trinity Health MI,"Muskegon, MI",2024-01-14,Muskegon,United States,Nurse Practitioner,Mid senior,Onsite,"Nursing, BSN, Medical License, Virtual RN, Nur...",Employment Type:\nFull time\nShift:\n12 Hour N...


In [None]:
# Drop the columns judged irrelevant for the rest of the notebook:
# 'last_processed_time', 'got_summary', 'got_ner', 'is_being_worked'
# and 'first_seen'.
#
# errors="ignore" makes this cell safe to re-run: because the result is
# assigned back to `merged_df`, a second execution would otherwise raise
# KeyError (the columns are already gone). With errors="ignore" only the
# labels that still exist are dropped.
merged_df = merged_df.drop(
    columns=[
        'last_processed_time',
        'got_summary',
        'got_ner',
        'is_being_worked',
        'first_seen',
    ],
    errors="ignore",
)
# Show the remaining column labels.
print("\nMerged DataFrame Columns After Dropping Irrelevant Columns:")
print(merged_df.columns)



Merged DataFrame Columns After Dropping Irrelevant Columns:
Index(['job_link', 'job_title', 'company', 'job_location', 'search_city',
       'search_country', 'search_position', 'job_level', 'job_type',
       'job_skills', 'job_summary'],
      dtype='object')


In [10]:
# Count the distinct values in every column of the merged frame.
unique_counts = merged_df.nunique()
print("\nNumber of Unique Values in Each Column:", unique_counts, sep="\n")



Number of Unique Values in Each Column:
job_link           1294268
job_title           564775
company              88932
job_location         28776
search_city           1018
search_country           4
search_position       1922
job_level                2
job_type                 3
job_skills         1287073
job_summary         955934
dtype: int64


In [None]:
# Persist the merged frame as CSV in the working directory; index=False
# leaves the row index out of the file.
merged_df.to_csv(path_or_buf="merged_job_data.csv", index=False)

In [1]:
import pandas as pd

# Read back the merged CSV.
merged_job_data = pd.read_csv("merged_job_data.csv")

# Build a copy of the data without the 'job_summary' column.
merged_no_summary_df = merged_job_data.drop('job_summary', axis="columns")

# Plain-text preview of the first five rows of the reduced frame.
print(merged_no_summary_df.head(n=5))

# Write the reduced frame to its own CSV, leaving the row index out.
merged_no_summary_df.to_csv(path_or_buf="merged_job_data_no_summary.csv", index=False)

                                            job_link  \
0  https://www.linkedin.com/jobs/view/account-exe...   
1  https://www.linkedin.com/jobs/view/registered-...   
2  https://www.linkedin.com/jobs/view/restaurant-...   
3  https://www.linkedin.com/jobs/view/independent...   
4  https://www.linkedin.com/jobs/view/registered-...   

                                           job_title  \
0  Account Executive - Dispensing (NorCal/Norther...   
1                 Registered Nurse - RN Care Manager   
2               RESTAURANT SUPERVISOR - THE FORKLIFT   
3                      Independent Real Estate Agent   
4                              Registered Nurse (RN)   

                      company          job_location  search_city  \
0                          BD         San Diego, CA     Coronado   
1           Trinity Health MI     Norton Shores, MI  Grand Haven   
2     Wasatch Adaptive Sports             Sandy, UT       Tooele   
3  Howard Hanna | Rand Realty  Englewood Cliffs, NJ   