In [1]:
import os
import json
import copy
import pandas as pd
import petl as etl
from pathlib import Path
# from tqdm import tqdm
from tqdm.notebook import tqdm


root_dir = "./dataset/"

In [2]:
columns_og = ['latitude', 'multilocation', 'longitude', 'remote', 'address_text', 'company_logo_url', 'published_at', 'street', 'company_name', 'skills', 'marker_icon', 'display_offer', 'country_code', 'way_of_apply', 'experience_level', 'id', 'employment_types', 'remote_interview', 'company_url', 'company_size', 'workplace_type', 'open_to_hire_ukrainians', 'title', 'city']

columns_to_check = ('latitude', 'multilocation', 'longitude', 'remote', 'address_text', 'published_at', 'street', 'company_name', 'skills', 'marker_icon', 'display_offer', 'country_code', 'way_of_apply', 'experience_level', 'id', 'remote_interview', 'company_url', 'company_size', 'workplace_type', 'open_to_hire_ukrainians', 'title', 'city')

In [3]:
def degerate_data(data):
    tmp = []
    # zdegenerowane dane
    # see every offer
    for offer in data:
        offer_cp = copy.copy(offer)
        offer_cp.pop("company_logo_url")
        del offer_cp["employment_types"]
        # cross-check to ensure that all columns all present even with None
        
        cols_to_add = set(columns_to_check) - set(offer_cp.keys())
        
        for col in cols_to_add:
            offer_cp[col] = None
        
        for empl_types in offer["employment_types"]:
            offer_cp["employment_type"] = empl_types["type"]
            offer_cp["salary"] = "Disclosed"
            if empl_types["salary"] is not None:
                offer_cp["from"] = empl_types["salary"]["from"]
                offer_cp["to"] = empl_types["salary"]["to"]
                offer_cp["currency"] = empl_types["salary"]["currency"]
            else:
                offer_cp["from"] = None 
                offer_cp["to"] = None
                offer_cp["currency"] = None
               
                tmp.append(offer_cp)
    return tmp

In [4]:
from etl_tools import *
FLAG = True

In [None]:
def przetworz_pliki_w_folderze(folder_path, dim_company, dim_location, workplace_type, employment_type):
    global FLAG
    all_files = []
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as f:
            json_data = json.load(f)
            onefile = degerate_data(json_data)
            all_files.extend(onefile)
    all_files = pd.DataFrame(all_files)
    
    if FLAG:
        dim_company =  create_dim_company(all_files)
        dim_location = create_dim_location(all_files)
        workplace_type = create_workplace_dim(all_files)
        employment_type = create_employmet_dim(all_files)
        FLAG = False
        
        final_fact_table = create_fact_table(all_files, dim_company, dim_location, workplace_type, employment_type)
        
        return final_fact_table, dim_company, dim_location, workplace_type, employment_type
    
    dim_company = pd.concat([dim_company, create_dim_company(all_files)]).drop_duplicates(subset=['company_name', 'company_url', 'company_size'], keep='first').reset_index(drop=True)
    dim_company['company_id'] = dim_company.index + 1
    
    dim_location = pd.concat([dim_location,create_dim_location(all_files)]).drop_duplicates(subset=['street', 'city', 'country_code', 'latitude', 'longitude'], keep='first').reset_index(drop=True)
    dim_location["location_id"] = dim_location.index + 1
    
    workplace_type = pd.concat([workplace_type, create_workplace_dim(all_files)]).drop_duplicates(subset=["workplace_type"], keep='first').reset_index(drop=True)
    workplace_type["workplace_type_id"] = workplace_type.index + 1
    employment_type = pd.concat([employment_type, create_employmet_dim(all_files)]).drop_duplicates(subset=["employment_type"], keep='first').reset_index(drop=True) 
    employment_type["employment_type_id"] = employment_type.index + 1
    
    final_fact_table = create_fact_table(all_files, dim_company, dim_location, workplace_type, employment_type) 
    return final_fact_table, dim_company, dim_location, workplace_type, employment_type

def przetworz_wszystkie_foldery(root_folder):
    final_fact_table = pd.DataFrame()
    dim_company = pd.DataFrame
    dim_location = pd.DataFrame
    workplace_type = pd.DataFrame
    employment_type = pd.DataFrame
    
    for folder_name in os.listdir(root_folder):
        folder_path = os.path.join(root_folder, folder_name)
        if os.path.isdir(folder_path):            
            ft, dim_company,dim_location, workplace_type, employment_type = przetworz_pliki_w_folderze(folder_path, dim_company, dim_location, workplace_type, employment_type)
            if final_fact_table.empty:
                final_fact_table = ft
            else:
                final_fact_table = pd.concat([final_fact_table, ft], ignore_index=True)
    
    final_fact_table.to_csv("final_fact_table.csv")
    dim_company.to_csv("dim_company.csv")
    dim_location.to_csv("dim_location.csv")
    workplace_type.to_csv("workplace_type.csv")
    employment_type.to_csv("employment_type.csv")

# Ścieżka do głównego katalogu
main_folder = "./dataset/"

# Wywołujemy funkcję przetwarzającą wszystkie foldery
final_fact_table, dim_company, dim_location, workplace_type, employment_type = przetworz_wszystkie_foldery(main_folder)

                                     title                   street      city  \
0               Android Developer  (FQWO1)  ul.Marsz.J.Piłsudskiego   Wrocław   
1  Senior System Integration Test Engineer               Łużycka 8C    Gdynia   
2                         Programista Java               1 Maja 133  Katowice   
3                         Programista Java               1 Maja 133  Katowice   
4                             Scrum Master     al. Jana Pawła II 22  Warszawa   

  country_code                      address_text marker_icon workplace_type  \
0           PL  ul.Marsz.J.Piłsudskiego, Wrocław        java  partly_remote   
1           PL                Łużycka 8C, Gdynia     testing         remote   
2           PL              1 Maja 133, Katowice        java         remote   
3           PL              1 Maja 133, Katowice        java         remote   
4           PL    al. Jana Pawła II 22, Warszawa          pm  partly_remote   

                         company_name 

In [None]:
print(final_fact_table.head())
print(dim_company.head())
print(dim_location.head())
print(workplace_type.head())
print(employment_type.head())