In [None]:
import pandas as pd
import os
from glob import glob
import json

# Every column that may appear in the offer data. The loader appends any of
# these that a batch is missing, in the order listed here.
columns_to_check = [
    'latitude',
    'multilocation',
    'longitude',
    'remote',
    'address_text',
    'published_at',
    'street',
    'company_name',
    'skills',
    'marker_icon',
    'display_offer',
    'country_code',
    'way_of_apply',
    'experience_level',
    'id',
    'remote_interview',
    'company_url',
    'company_size',
    'workplace_type',
    'open_to_hire_ukrainians',
    'title',
    'city',
]

def load_data_from_files(folder_path):
    """Load every ``*.json`` file in ``folder_path`` into a single DataFrame.

    Each file may contain either a JSON array of records or a single JSON
    object (treated as one record). Files are read in sorted name order so the
    resulting row order is reproducible between runs.

    Parameters
    ----------
    folder_path : str
        Directory that is searched (non-recursively) for ``*.json`` files.

    Returns
    -------
    pandas.DataFrame
        One row per loaded record. Every name in the module-level
        ``columns_to_check`` list that the data lacks is added as a column
        filled with ``None``. A folder without JSON files yields an empty
        frame that still carries those columns.
    """
    # glob returns files in arbitrary, filesystem-dependent order.
    all_files = sorted(glob(os.path.join(folder_path, '*.json')))
    data_list = []
    for file in all_files:
        # Explicit UTF-8: the locale default is not UTF-8 on every platform.
        with open(file, 'r', encoding='utf-8') as f:
            payload = json.load(f)
        if isinstance(payload, dict):
            # extend() on a dict would add its keys as rows; keep it as one record.
            data_list.append(payload)
        else:
            data_list.extend(payload)

    # Convert to a DataFrame and add the missing columns with a None value.
    df = pd.DataFrame(data_list)
    for column in columns_to_check:
        if column not in df.columns:
            df[column] = None

    return df

In [None]:
def create_dim_company(df):
    """Build the company dimension table.

    Keeps one row per distinct (company_name, company_url, company_size)
    combination found in ``df`` and numbers the rows from 1 in a new
    ``company_id`` column.
    """
    company_fields = ['company_name', 'company_url', 'company_size']
    unique_companies = df.loc[:, company_fields].drop_duplicates()
    return (
        unique_companies
        .reset_index(drop=True)
        .assign(company_id=lambda companies: companies.index + 1)
    )

def create_dim_location(df):
    """Build the location dimension table.

    Keeps one row per distinct (street, city, country_code, latitude,
    longitude) combination found in ``df`` and numbers the rows from 1 in a
    new ``location_id`` column.
    """
    location_fields = ['street', 'city', 'country_code', 'latitude', 'longitude']
    unique_locations = df.loc[:, location_fields].drop_duplicates()
    numbered = unique_locations.reset_index(drop=True)
    return numbered.assign(location_id=lambda locations: locations.index + 1)

def create_dim_skills(df):
    """Build the skill dimension and the offer-to-skill bridge table.

    ``df['skills']`` is expected to hold, per offer, a list of dicts with
    ``name`` and ``level`` keys. Offers whose ``skills`` value is missing or an
    empty list contribute no rows, so the dimension never contains an empty
    "skill" entry and the function still works when no offer has skills.

    Parameters
    ----------
    df : pandas.DataFrame
        Offers; must contain the ``id`` and ``skills`` columns.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``dim_skills``: one row per distinct (skill_name, skill_level) pair
        with a 1-based ``skill_id``.
        ``skills_expanded``: one row per distinct (id, skill) pair, with the
        skill's ``name``/``level`` keys renamed to ``skill_name``/``skill_level``.
    """
    # One row per (offer, skill); empty or missing skill lists become NaN/None.
    exploded = df[['id', 'skills']].explode('skills')

    # Keep only real skill records. astype(bool) keeps the mask boolean even
    # when the frame is empty.
    is_record = exploded['skills'].map(lambda value: isinstance(value, dict)).astype(bool)
    exploded = exploded[is_record].reset_index(drop=True)

    # Build the attribute columns in one shot instead of a Series per row.
    details = pd.DataFrame(exploded['skills'].tolist(), index=exploded.index)
    for key in ('name', 'level'):
        if key not in details.columns:
            details[key] = None

    skills_expanded = (
        pd.concat([exploded[['id']], details], axis=1)
        .rename(columns={'name': 'skill_name', 'level': 'skill_level'})
        .drop_duplicates()
        .reset_index(drop=True)
    )

    dim_skills = skills_expanded[['skill_name', 'skill_level']].drop_duplicates().reset_index(drop=True)
    dim_skills['skill_id'] = dim_skills.index + 1
    return dim_skills, skills_expanded

def create_employmet_dim(df, column="workplace_type"):
    """Build a single-column dimension table with a 1-based surrogate key.

    Parameters
    ----------
    df : pandas.DataFrame
        Offers; must contain ``column``.
    column : str, optional
        Source column for the dimension. Defaults to ``"workplace_type"``,
        which is the column this function has always read.
        NOTE(review): the function name suggests an employment-type
        dimension; confirm the intended column and pass it explicitly
        (e.g. ``column="employment_type"``) once the data carries it.

    Returns
    -------
    pandas.DataFrame
        The distinct values of ``column`` plus a ``<column>_id`` column
        numbered from 1.
    """
    # Double brackets keep a DataFrame. With a Series, the assignment below
    # would set an element labelled '<column>_id' instead of adding a column.
    dim_emp = df[[column]].drop_duplicates().reset_index(drop=True)
    dim_emp[column + '_id'] = dim_emp.index + 1
    return dim_emp

def create_workplace_dim(df):
    """Build the workplace-type dimension table.

    Parameters
    ----------
    df : pandas.DataFrame
        Offers; must contain the ``workplace_type`` column.

    Returns
    -------
    pandas.DataFrame
        The distinct ``workplace_type`` values plus a ``workplace_type_id``
        column numbered from 1.
    """
    # Double brackets keep a DataFrame. With a Series, the assignment below
    # would set an element labelled 'workplace_type_id' instead of adding a
    # column.
    dim_workplace = df[["workplace_type"]].drop_duplicates().reset_index(drop=True)
    dim_workplace['workplace_type_id'] = dim_workplace.index + 1
    return dim_workplace

In [None]:
def create_fact_table(df, dim_company, dim_location, skills_expanded, dim_skills):
    """Build the offer fact table with surrogate keys into the dimensions.

    Parameters
    ----------
    df : pandas.DataFrame
        Offers, one row per offer record.
    dim_company : pandas.DataFrame
        Company dimension carrying ``company_name``, ``company_url``,
        ``company_size`` and ``company_id``.
    dim_location : pandas.DataFrame
        Location dimension carrying ``street``, ``city``, ``country_code``,
        ``latitude``, ``longitude`` and ``location_id``.
    skills_expanded : pandas.DataFrame
        Offer-to-skill rows carrying ``id``, ``skill_name`` and ``skill_level``.
    dim_skills : pandas.DataFrame
        Skill dimension carrying ``skill_name``, ``skill_level`` and ``skill_id``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` with the fact columns listed below.
        ``skill_id`` holds the list of skill ids of the offer (missing when
        the offer has no rows in ``skills_expanded``). A fact column that is
        not present after the joins is returned as an all-missing column
        instead of raising ``KeyError``.
    """
    # Attach the company and location identifiers to the offers.
    df = df.merge(dim_company, on=['company_name', 'company_url', 'company_size'], how='left')
    df = df.merge(dim_location, on=['street', 'city', 'country_code', 'latitude', 'longitude'], how='left')

    # Skills cannot be joined on df directly: df has no skill_name or
    # skill_level columns. Resolve skill ids on the offer-to-skill rows first.
    skills_with_ids = skills_expanded.merge(dim_skills, on=['skill_name', 'skill_level'], how='left')

    # Collect the skill ids of each offer into one list per offer id.
    skills_grouped = skills_with_ids.groupby('id')['skill_id'].apply(list).reset_index()

    # Attach the per-offer skill id lists to the offers.
    df = df.merge(skills_grouped, on='id', how='left')

    # Keep only the identifiers and the other relevant fact columns.
    fact_columns = [
        'id', 'title', 'workplace_type', 'experience_level', 'published_at', 'remote_interview',
        'open_to_hire_ukrainians', 'remote', 'employment_type', 'salary', 'from', 'to', 'currency',
        'company_id', 'location_id', 'skill_id', 'multilocation', 'address_text', 'marker_icon',
        'display_offer', 'way_of_apply'
    ]
    # reindex instead of df[[...]]: absent columns become all-missing rather
    # than raising KeyError.
    fact_df = df.reindex(columns=fact_columns)

    return fact_df

In [None]:
def process_all_folders(base_folder_path):
    """Build the fact table and dimensions from every sub-folder of a base folder.

    The raw offers of all sub-folders are combined first, and the dimensions
    and the fact table are then built once from the combined data. Building
    them per folder would restart every surrogate key at 1 in each folder, so
    the same id would denote different companies, locations or skills
    depending on the folder, and the combined fact table could not be joined
    back to the combined dimensions.

    Because skills are resolved by offer ``id`` over the combined data, an
    offer id that occurs in several folders gets the union of the skills
    recorded for it.

    Parameters
    ----------
    base_folder_path : str
        Directory whose immediate sub-directories contain the JSON files.
        Plain files directly inside it are ignored.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(final_fact_table, final_dim_company, final_dim_location,
        final_dim_skills)``.

    Raises
    ------
    ValueError
        If no sub-folder yields any offer data.
    """
    raw_frames = []

    # Sorted so row order, and therefore the surrogate keys, are reproducible.
    for folder in sorted(os.listdir(base_folder_path)):
        folder_path = os.path.join(base_folder_path, folder)
        if not os.path.isdir(folder_path):
            continue
        folder_df = load_data_from_files(folder_path)
        if not folder_df.empty:
            raw_frames.append(folder_df)

    if not raw_frames:
        raise ValueError(
            "No offer data found in the sub-folders of {!r}".format(base_folder_path)
        )

    # Combine the offers of all folders before assigning any surrogate key.
    offers = pd.concat(raw_frames, ignore_index=True)

    final_dim_company = create_dim_company(offers)
    final_dim_location = create_dim_location(offers)
    final_dim_skills, skills_expanded = create_dim_skills(offers)

    final_fact_table = create_fact_table(
        offers, final_dim_company, final_dim_location, skills_expanded, final_dim_skills
    ).reset_index(drop=True)

    return final_fact_table, final_dim_company, final_dim_location, final_dim_skills


In [None]:
# Placeholder: point this at the directory whose sub-folders hold the JSON files.
base_folder_path = '/path/to/your/base/folder'

# Run the whole pipeline; the four resulting tables stay available to later cells.
(
    final_fact_table,
    final_dim_company,
    final_dim_location,
    final_dim_skills,
) = process_all_folders(base_folder_path)
