In [37]:
import json
import os
from langdetect import detect
import pandas as pd

In [29]:
# Load a single sample posting to understand the structure of the JSON file.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default (the postings contain non-ASCII text).
with open("../data/raw/2024-07-09/3809446969/rawJobDetails.json", 'r', encoding='utf-8') as file:
    data = json.load(file)

In [21]:
def print_structure(d, indent=0):
    """Print the keys of a nested dictionary as an indented outline.

    Each key is printed at the current indentation level (two spaces per
    level). Nested dictionaries are walked recursively. For any other value,
    one line is printed one level deeper: a fixed marker when the value is a
    non-empty list whose first element is a dictionary, otherwise the value's
    type.

    Args:
        d: Dictionary to describe.
        indent: Current nesting depth; callers normally leave the default.
    """
    key_pad = '  ' * indent
    leaf_pad = '  ' * (indent + 1)
    for key, value in d.items():
        print(key_pad + str(key))
        if isinstance(value, dict):
            print_structure(value, indent + 1)
            continue
        # Only the first element is inspected to classify a list.
        holds_dicts = isinstance(value, list) and bool(value) and isinstance(value[0], dict)
        summary = 'List of Dictionaries...' if holds_dicts else str(type(value))
        print(leaf_pad + summary)

In [22]:
# Outline the key hierarchy of the sample posting, starting at the top level.
print_structure(data, indent=0)

dashEntityUrn
  <class 'str'>
companyDetails
  com.linkedin.voyager.deco.jobs.web.shared.WebCompactJobPostingCompany
    companyResolutionResult
      entityUrn
        <class 'str'>
      name
        <class 'str'>
      logo
        image
          com.linkedin.common.VectorImage
            artifacts
              List of Dictionaries...
            rootUrl
              <class 'str'>
        type
          <class 'str'>
      $recipeType
        <class 'str'>
      universalName
        <class 'str'>
      url
        <class 'str'>
    company
      <class 'str'>
    $recipeType
      <class 'str'>
jobState
  <class 'str'>
description
  attributes
    List of Dictionaries...
  text
    <class 'str'>
title
  <class 'str'>
entityUrn
  <class 'str'>
workRemoteAllowed
  <class 'bool'>
applyMethod
  com.linkedin.voyager.jobs.OffsiteApply
    applyStartersPreferenceVoid
      <class 'bool'>
    companyApplyUrl
      <class 'str'>
    inPageOffsiteApply
      <class 'bool'>
talentHubJob
 

# Extract useful information from the JSON files

In [60]:
def _workplace_type(data):
    '''Best-effort lookup of the readable workplace type; returns None when absent.'''
    # Schema follows the lookup this function previously had commented out:
    # the first entry of 'workplaceTypes' is used as a key into
    # 'workplaceTypesResolutionResults', whose entry carries 'localizedName'.
    # NOTE(review): confirm this layout against the raw postings.
    type_keys = data.get('workplaceTypes')
    if not isinstance(type_keys, list) or not type_keys:
        return None
    resolutions = data.get('workplaceTypesResolutionResults')
    if not isinstance(resolutions, dict):
        return None
    resolved = resolutions.get(type_keys[0])
    if not isinstance(resolved, dict):
        return None
    return resolved.get('localizedName')


def extract_data(data):
    '''Extract the fields of interest from one job-posting dictionary.

    Optional fields ('jobState', 'title', 'formattedLocation' and the
    workplace type) fall back to None when missing. 'listedAt',
    'jobPostingId' and 'description' -> 'text' are required.

    Args:
        data: Dictionary parsed from a job-details JSON file.

    Returns:
        A dict with keys 'id', 'list_time', 'state', 'job_title', 'location',
        'workplace_type' and 'description', or None (after printing a
        message) when a required key is missing.
    '''
    try:
        state = data.get('jobState')
        job_title = data.get('title')
        location = data.get('formattedLocation')
        list_time = data['listedAt']
        # Local is named job_id so the builtin `id` is not shadowed; the
        # output key stays 'id'.
        job_id = data['jobPostingId']
        # The return value used `workplace_type` while its assignment was
        # commented out, so every call raised NameError. It is resolved
        # tolerantly here instead.
        workplace_type = _workplace_type(data)
        description = data['description']['text']
        return {'id': job_id, 'list_time': list_time, 'state': state, 'job_title': job_title,
                'location': location, 'workplace_type': workplace_type, 'description': description}
    except KeyError as e:
        print(f'Missing key{e} in the JSON structure')
        # Explicit: callers receive None for records lacking a required key.
        return None


In [61]:
# Walk the raw-data directory and collect one record per rawJobDetails.json.
base_path = '../data/raw/2024-07-09/'
all_jobs = []
for root, dirs, files in os.walk(base_path):
    for file in files:
        if file != 'rawJobDetails.json':
            continue
        full_path = os.path.join(root, file)
        # A separate handle name keeps `file` (the filename) from being rebound;
        # the explicit encoding avoids relying on the platform's locale default.
        with open(full_path, 'r', encoding='utf-8') as fh:
            data = json.load(fh)
        extracted_data = extract_data(data)
        # extract_data returns None when a required key is missing; skip those
        # records so no None entries reach the DataFrame constructor.
        if extracted_data is not None:
            all_jobs.append(extracted_data)

df = pd.DataFrame(all_jobs)

print(df.head())

           id      list_time   state  \
0  3970706732  1720508350000  LISTED   
1  3969317415  1720486857000  LISTED   
2  3964393203  1720541335000  LISTED   
3  3970162583  1720537140000  LISTED   
4  3970738450  1720515754000  LISTED   

                                           job_title  \
0                         Data Analytics Traineeship   
1                                      Data Engineer   
2      Onderzoeksassistent Gynaecologische oncologie   
3  Arts promovendus voor een klinisch promotiepro...   
4                                            Analyst   

                                location workplace_type  \
0                   Utrecht, Netherlands        On-site   
1  Rotterdam, South Holland, Netherlands        On-site   
2  Amsterdam, North Holland, Netherlands        On-site   
3  Rotterdam, South Holland, Netherlands        On-site   
4                         Amsterdam Area        On-site   

                                         description  
0  ErasmusTa

In [58]:
# Show the row for one specific posting id.
matches_id = df['id'] == 3806724725
print(df.loc[matches_id])

            id      list_time state     job_title  \
94  3806724725  1720519343000  None  R&D Engineer   

                                    location workplace_type  \
94  Alblasserdam, South Holland, Netherlands        On-site   

                                          description  
94  We zijn op zoek naar een Research & Developmen...  


In [31]:
def lang_detect(text):
    """Return the language code detected for ``text``, or None on failure.

    Args:
        text: The text to classify.

    Returns:
        Whatever ``detect`` returns for ``text``, or None when ``text`` is
        empty, whitespace-only, not a string, or detection raises.
    """
    # Nothing to classify: return early instead of relying on the detector
    # raising for such input.
    if not isinstance(text, str) or not text.strip():
        return None
    try:
        return detect(text)
    except Exception:
        # Best-effort: any detection error maps to None. Unlike a bare
        # `except:`, this lets KeyboardInterrupt and SystemExit propagate.
        return None

# NOTE(review): `text` is not assigned in any cell visible here — presumably it
# was left over from an earlier kernel session; confirm and define it explicitly
# so this cell survives Restart & Run All.
lang_detect(text)

'en'