In [None]:
import pandas as pd
import os
from glob import glob
import json

def load_data_from_files(folder_path):
    """Load and concatenate the records of every ``*.json`` file in a folder.

    Only files directly inside ``folder_path`` are read (no recursion). Each
    file's parsed content is passed to ``list.extend``, so every file is
    expected to hold a top-level JSON array.

    Parameters
    ----------
    folder_path : str
        Directory to scan for ``*.json`` files.

    Returns
    -------
    list
        The items of all files, concatenated in file-name order. Empty when
        the folder contains no ``*.json`` file.
    """
    data_list = []
    # glob returns paths in arbitrary, OS-dependent order; sorting keeps the
    # record order (and any ids derived from it later) reproducible.
    for file in sorted(glob(os.path.join(folder_path, '*.json'))):
        # Decode explicitly as UTF-8: the platform default encoding (e.g. a
        # legacy code page on Windows) would garble or reject non-ASCII text.
        with open(file, 'r', encoding='utf-8') as f:
            data_list.extend(json.load(f))
    return data_list

In [None]:
def create_dim_company(df):
    """Build the company dimension from the offers frame.

    Takes the distinct ``(company_name, company_url, company_size)``
    combinations found in ``df`` (first occurrence wins, original order kept)
    and numbers them with a 1-based ``company_id``. ``df`` is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``company_name``, ``company_url``, ``company_size``,
        ``company_id``.
    """
    company_columns = ['company_name', 'company_url', 'company_size']
    unique_companies = df[company_columns].drop_duplicates().reset_index(drop=True)
    # The fresh 0..n-1 index shifted by one serves as the surrogate key.
    return unique_companies.assign(company_id=unique_companies.index + 1)

def create_dim_location(df):
    """Build the location dimension from the offers frame.

    Takes the distinct ``(street, city, country_code, latitude, longitude)``
    combinations found in ``df`` (first occurrence wins, original order kept)
    and numbers them with a 1-based ``location_id``. ``df`` is not modified.

    Returns
    -------
    pandas.DataFrame
        The five location columns followed by ``location_id``.
    """
    location_columns = ['street', 'city', 'country_code', 'latitude', 'longitude']
    unique_locations = df[location_columns].drop_duplicates().reset_index(drop=True)
    # The fresh 0..n-1 index shifted by one serves as the surrogate key.
    return unique_locations.assign(location_id=unique_locations.index + 1)

def create_dim_skills(df):
    """Build the skills dimension and the offer-to-skill link table.

    ``df['skills']`` is exploded to one row per (offer, skill) and every skill
    record is spread into columns; its ``name`` / ``level`` fields become
    ``skill_name`` / ``skill_level``.

    NOTE(review): an offer whose ``skills`` list is empty explodes to a NaN
    cell and therefore ends up as an all-NaN skill row in both results —
    confirm whether that is intended.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``dim_skills`` — distinct ``(skill_name, skill_level)`` pairs with a
        1-based ``skill_id``; and ``skills_expanded`` — de-duplicated rows of
        offer ``id`` plus the expanded skill fields.
    """
    # One row per (offer, skill); each 'skills' cell now holds a single record.
    per_offer_skill = df.explode('skills')[['id', 'skills']]
    offer_ids = per_offer_skill.drop(['skills'], axis=1)
    skill_fields = per_offer_skill['skills'].apply(pd.Series)

    # Both pieces carry the same (repeated) index, so they line up row by row.
    skills_expanded = pd.concat([offer_ids, skill_fields], axis=1)
    skills_expanded = skills_expanded.rename(
        columns={'name': 'skill_name', 'level': 'skill_level'}
    )
    skills_expanded = skills_expanded.drop_duplicates().reset_index(drop=True)

    unique_skills = skills_expanded[['skill_name', 'skill_level']]
    unique_skills = unique_skills.drop_duplicates().reset_index(drop=True)
    dim_skills = unique_skills.assign(skill_id=unique_skills.index + 1)
    return dim_skills, skills_expanded

In [None]:
def create_fact_table(df, dim_company, dim_location, skills_expanded, dim_skills):
    """Assemble the offers fact table keyed by dimension identifiers.

    Every merge uses pandas' default inner join, so an offer that has no match
    in ``dim_company``, ``dim_location`` or the grouped skills is absent from
    the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw offers, including the company / location key columns and ``id``.
    dim_company, dim_location : pandas.DataFrame
        Dimensions carrying ``company_id`` / ``location_id`` next to their
        natural key columns.
    skills_expanded : pandas.DataFrame
        One row per (offer ``id``, ``skill_name``, ``skill_level``).
    dim_skills : pandas.DataFrame
        Skill dimension carrying ``skill_id``.

    Returns
    -------
    pandas.DataFrame
        Offer attributes plus ``company_id``, ``location_id`` and ``skill_id``
        (a list of skill ids per offer).
    """
    company_keys = ['company_name', 'company_url', 'company_size']
    location_keys = ['street', 'city', 'country_code', 'latitude', 'longitude']
    fact_columns = [
        'id', 'title', 'workplace_type', 'experience_level', 'published_at',
        'remote_interview', 'open_to_hire_ukrainians', 'remote',
        'employment_type', 'salary', 'from', 'to', 'currency',
        'company_id', 'location_id', 'skill_id',
    ]

    # Attach the company and location identifiers to each offer.
    offers = df.merge(dim_company, on=company_keys).merge(dim_location, on=location_keys)

    # Resolve every (offer, skill) pair to its skill_id, then collect the ids
    # of each offer into a single list.
    skill_ids_per_offer = (
        skills_expanded
        .merge(dim_skills, on=['skill_name', 'skill_level'])
        .groupby('id')['skill_id']
        .apply(list)
        .reset_index()
    )

    # Join the skill lists onto the offers and keep only the fact columns.
    offers = offers.merge(skill_ids_per_offer, on='id')
    return offers[fact_columns]

In [None]:
def process_all_folders(base_folder_path):
    """Build one fact table and three dimensions from all data sub-folders.

    Records from every immediate sub-directory of ``base_folder_path`` are
    gathered first, and the dimensions and the fact table are then built once
    over the combined data. Building them per folder and concatenating (as
    done previously) restarted the id numbering at 1 in every folder, so the
    same ``company_id`` / ``location_id`` / ``skill_id`` pointed at different
    entities and the fact rows referenced ambiguous keys.

    Parameters
    ----------
    base_folder_path : str
        Directory whose sub-directories contain the ``*.json`` data files.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(fact_table, dim_company, dim_location, dim_skills)``.

    Raises
    ------
    ValueError
        If no records could be loaded from any sub-folder.
    """
    records = []
    # os.listdir order is arbitrary; sorting keeps the generated ids stable
    # from run to run.
    for folder in sorted(os.listdir(base_folder_path)):
        folder_path = os.path.join(base_folder_path, folder)
        if os.path.isdir(folder_path):
            records.extend(load_data_from_files(folder_path))

    if not records:
        raise ValueError(
            "No records found in the sub-folders of {!r}".format(base_folder_path)
        )

    df = pd.DataFrame(records)

    # Dimensions are derived from the combined data so that every surrogate
    # key identifies exactly one entity across all folders.
    dim_company = create_dim_company(df)
    dim_location = create_dim_location(df)
    dim_skills, skills_expanded = create_dim_skills(df)

    fact_table = create_fact_table(df, dim_company, dim_location, skills_expanded, dim_skills)

    return fact_table.reset_index(drop=True), dim_company, dim_location, dim_skills


In [None]:
def przetworz_pliki_w_folderze(folder_path):
    """Walk the entries of ``folder_path`` and print a completion message.

    NOTE(review): this is an unfinished placeholder. The loop only builds each
    entry's path and does nothing with it, and nothing is written anywhere
    even though the printed message says the results were saved.
    """
    for entry_name in os.listdir(folder_path):
        entry_path = os.path.join(folder_path, entry_name)
        # TODO: per-file processing is not implemented. The original note
        # reads "we degenerate (sic — presumably deserialize) this and pack
        # it into a DataFrame".

    print("Zapisano wyniki dla folderu:", folder_path)

def przetworz_wszystkie_foldery(root_folder):
    """Call ``przetworz_pliki_w_folderze`` on each immediate sub-directory.

    Entries of ``root_folder`` that are not directories are skipped; nested
    directories are not descended into by this function.
    """
    for entry_name in os.listdir(root_folder):
        candidate = os.path.join(root_folder, entry_name)
        if not os.path.isdir(candidate):
            continue
        przetworz_pliki_w_folderze(candidate)

# Path to the root directory of the dataset
main_folder = "./dataset/"

# Run the processing routine over every sub-folder of the root directory
przetworz_wszystkie_foldery(main_folder)

In [None]:
base_folder_path = './dataset'
(
    final_fact_table,
    final_dim_company,
    final_dim_location,
    final_dim_skills,
) = process_all_folders(base_folder_path)

# Show the results: a heading followed by the first rows of each table
print("Tabela faktów:", final_fact_table.head(), sep="\n")
print("\nTabela wymiarów firmy:", final_dim_company.head(), sep="\n")
print("\nTabela wymiarów lokalizacji:", final_dim_location.head(), sep="\n")
print("\nTabela wymiarów umiejętności:", final_dim_skills.head(), sep="\n")