In [24]:
import json
import os
import yaml
from typing import Dict, List, Any, Optional
from datetime import datetime
from collections import defaultdict
from pprint import pprint



def move_working_dir_to_repo_root(repo_name="orgsync"):
    """
    Move the current working directory to the root of the repository.

    Starting at the current working directory, walk up through the parent
    directories until one is found whose name equals ``repo_name`` (compared
    case-insensitively), change into it and print the new location.

    Args:
        repo_name: Name of the repository's root directory. Letter case is
            ignored on both sides of the comparison.

    Raises:
        FileNotFoundError: If neither the current directory nor any of its
            ancestors is named ``repo_name``. The working directory is left
            unchanged in that case.
    """
    start_dir = os.getcwd()
    # Lower-case the requested name too, otherwise a mixed-case argument could
    # never equal the lower-cased directory name.
    wanted_name = repo_name.lower()

    current_dir = start_dir
    while os.path.basename(current_dir).lower() != wanted_name:
        parent_dir = os.path.dirname(current_dir)
        # dirname() of the filesystem root is the root itself: without this
        # check the loop would spin forever when no ancestor matches.
        if parent_dir == current_dir:
            raise FileNotFoundError(
                f"No directory named {repo_name!r} found at or above {start_dir!r}"
            )
        current_dir = parent_dir

    os.chdir(current_dir)
    print("Current working directory: ", os.getcwd())

# Change into the "orgsync" directory; relative paths used after this call are
# resolved against it.
move_working_dir_to_repo_root(repo_name="orgsync")


Current working directory:  c:\Users\dec2g\GitHub\OrgSync


In [20]:
# Paths are relative to the current working directory. The script-relative
# alternative is kept here for reference only:
# script_directory = os.path.dirname(os.path.abspath(__file__))

# Base folder that every data and schema path below is joined onto.
input_path = "data/raw/all_scraped/"

# Data file for each entity type (2024_07 scrape).
file_paths = {
    entity: os.path.join(input_path, relative_file)
    for entity, relative_file in (
        ("organisations", "gtr/scraped/2024_07/organisations.json"),
        ("projects", "gtr/scraped/2024_07/projects.json"),
        ("persons", "gtr/scraped/2024_07/persons.json"),
    )
}

# Schema file for each entity type; the schema file names are singular.
schema_paths = {
    entity: os.path.join(input_path, relative_file)
    for entity, relative_file in (
        ("organisations", "gtr/scraped/schemas/organisation.json"),
        ("projects", "gtr/scraped/schemas/project.json"),
        ("persons", "gtr/scraped/schemas/person.json"),
    )
}



In [21]:
class SchemaProcessor:
    """Extract schema-selected fields, nested arrays and links from JSON records.

    The schema file is loaded and analysed once, at construction time. The
    analysis fills three attributes that later drive :meth:`process_data`:

    * ``processed_fields`` -- bare names of properties that appear in a
      ``required`` list of the schema object that declares them;
    * ``nested_arrays`` -- dotted property path -> the ``properties`` mapping
      of the array's items, for every array-of-objects property;
    * ``links`` -- dotted property path -> schema fragment, for every property
      literally named ``links``.
    """

    def __init__(self, schema_path: str):
        """Initialize processor with a schema file path."""
        self.schema = self._load_json(schema_path)
        self.processed_fields = set()
        self.nested_arrays = {}
        self.links = {}
        self._analyze_schema(self.schema)

    @staticmethod
    def _load_json(filepath: str) -> Any:
        """Load JSON data from file.

        The file is decoded as UTF-8 explicitly. Without an ``encoding``
        argument ``open`` falls back to the platform's locale encoding, which
        mis-decodes (or fails on) non-ASCII JSON on systems whose default is
        not UTF-8.

        Returns:
            Whatever the document's top-level value is (callers in this file
            handle both a dict and a list).
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)

    def _analyze_schema(self, schema: Dict, parent_path: str = "") -> None:
        """Recursively analyze schema to identify nested structures and links.

        Args:
            schema: Schema fragment to inspect; anything that is not a dict is
                ignored.
            parent_path: Dotted path of the property that owns ``schema``
                ("" at the top level).
        """
        if not isinstance(schema, dict):
            return

        # An array schema is described by its items: analyse those instead,
        # under the same path.
        if "items" in schema:
            self._analyze_schema(schema["items"], parent_path)
            return

        properties = schema.get("properties") or {}
        # Looked up once per schema object rather than once per property.
        required = schema.get("required") or []

        for field, details in properties.items():
            current_path = f"{parent_path}.{field}" if parent_path else field
            # A property schema is not guaranteed to be an object (e.g. a
            # bare ``true``); such a property simply has no type to act on.
            field_type = details.get("type") if isinstance(details, dict) else None

            if field_type == "array":
                array_items = details.get("items", {})
                # ``items`` may be a list of schemas; only a single
                # object-typed item schema describes an array of records.
                if isinstance(array_items, dict) and array_items.get("type") == "object":
                    self.nested_arrays[current_path] = array_items.get("properties", {})

            elif field_type == "object":
                self._analyze_schema(details, current_path)

            # Required fields are recorded by bare name, not by dotted path.
            if field in required:
                self.processed_fields.add(field)

            # Special handling for links
            if field == "links":
                self.links[current_path] = details

    def process_data(self, data: Dict, entity_type: str) -> Dict:
        """Process a single data entry according to schema.

        Args:
            data: One raw record.
            entity_type: Passed through to :meth:`_process_links`.

        Returns:
            A new dict that may contain:

            * every name in ``processed_fields`` that is a key of ``data``
              (``created`` is run through :meth:`_convert_timestamp`);
            * for every path in ``nested_arrays`` that resolves to a non-empty
              value, a list of dicts restricted to the schema's item
              properties, stored under the last path component;
            * ``relationships`` when ``data["links"]`` is a dict with a
              ``link`` entry.
        """
        processed = {}

        # Basic fields
        for field in self.processed_fields:
            if field in data:
                if field == "created":
                    processed[field] = self._convert_timestamp(data[field])
                else:
                    processed[field] = data[field]

        # Nested arrays
        for path, properties in self.nested_arrays.items():
            parts = path.split(".")
            current_data = data
            for part in parts:
                # A non-dict value part-way down the path cannot be descended
                # into; treat the path as absent instead of raising.
                current_data = current_data.get(part) if isinstance(current_data, dict) else None
                if not current_data:
                    break

            if current_data:
                if isinstance(current_data, list):
                    # Non-dict items are skipped, mirroring the dict branch.
                    processed[parts[-1]] = [
                        {k: item.get(k) for k in properties}
                        for item in current_data
                        if isinstance(item, dict)
                    ]
                elif isinstance(current_data, dict):
                    processed[parts[-1]] = [
                        {k: v.get(k) for k in properties}
                        for v in current_data.values()
                        if isinstance(v, dict)
                    ]

        # Links: only a dict can carry a "link" entry; null or other shapes
        # are ignored rather than crashing the membership test / lookup.
        link_container = data["links"] if "links" in data else None
        if isinstance(link_container, dict) and "link" in link_container:
            processed["relationships"] = self._process_links(link_container["link"], entity_type)

        return processed

    def _process_links(self, links: List[Dict], entity_type: str) -> Dict[str, List[Dict[str, Any]]]:
        """Process links into categorized relationships.

        Args:
            links: Link objects, each read for ``rel``, ``href``, ``start``
                and ``end``. A single link dict is accepted as a one-element
                list; ``None`` is treated as empty.
            entity_type: Not used by this method.

        Returns:
            Lower-cased ``rel`` -> list of ``{"id", "type", "start", "end"}``
            dicts, where ``id`` is the last ``/``-separated segment of
            ``href``. Links without an ``href`` are dropped.
        """
        relationships = defaultdict(list)

        if isinstance(links, dict):
            links = [links]

        for link in links or []:
            if not isinstance(link, dict):
                continue
            # ``or ""`` also covers keys that are present with a null value.
            rel_type = (link.get("rel") or "").lower()
            href = link.get("href") or ""
            if href:
                entity_id = href.split("/")[-1]
                relationships[rel_type].append({
                    "id": entity_id,
                    "type": rel_type,
                    "start": self._convert_timestamp(link.get("start")),
                    "end": self._convert_timestamp(link.get("end"))
                })

        return dict(relationships)

    @staticmethod
    def _convert_timestamp(ms: Optional[int]) -> Optional[str]:
        """Convert millisecond timestamp to ISO format date string.

        Any falsy value (``None`` and also ``0``) yields ``None``.

        NOTE: ``datetime.fromtimestamp`` is called without a ``tz`` argument,
        so the result is a naive datetime in the machine's local time zone.
        """
        if not ms:
            return None
        return datetime.fromtimestamp(ms/1000).isoformat()

def process_dataset(schema_path: str, data_path: str, entity_type: str) -> List[Dict]:
    """Run every record of a data file through a schema-built processor.

    Args:
        schema_path: Path of the JSON schema used to build the processor.
        data_path: Path of the JSON data file to process.
        entity_type: Forwarded unchanged to ``process_data`` for each record.

    Returns:
        One processed dict per record. A file whose top-level value is not a
        list is treated as a single record.
    """
    processor = SchemaProcessor(schema_path)
    loaded = processor._load_json(data_path)

    # Normalise to a list first so there is a single processing path.
    records = loaded if isinstance(loaded, list) else [loaded]
    return [processor.process_data(record, entity_type) for record in records]

def _index_records_by_id(records: List[Dict]) -> Dict[Any, List]:
    """Map each record's 'id' to the (position, record) pairs that carry it."""
    index = defaultdict(list)
    for position, record in enumerate(records):
        index[record.get('id')].append((position, record))
    return index


def _records_for_ids(index: Dict[Any, List], wanted_ids) -> List[Dict]:
    """Return the indexed records whose id is in wanted_ids, in original list order."""
    hits = [hit for wanted in wanted_ids for hit in index.get(wanted, ())]
    hits.sort(key=lambda hit: hit[0])
    return [record for _, record in hits]


def combine_datasets(processed_data: Dict[str, List[Dict]]) -> List[Dict]:
    """Combine processed datasets into a single coherent structure.

    Args:
        processed_data: Processed records per entity type. Organisations are
            read from ``'organizations'`` when that key is present, otherwise
            from ``'organisations'`` (the spelling used by the rest of this
            notebook). ``'projects'`` and ``'persons'`` are optional and
            default to empty.

    Returns:
        One dict per organisation with the keys ``'organization'`` (id, name,
        created), ``'relationships'``, ``'projects'`` and ``'persons'``.
        Projects are matched through the organisation's ``'project'``
        relationships and persons through its ``'employee'`` relationships;
        both lists keep the order of the input lists.

    Raises:
        KeyError: If neither organisations key is present.
    """
    # The original key wins when both exist, so existing callers see no change.
    if 'organizations' in processed_data:
        organisations = processed_data['organizations']
    else:
        organisations = processed_data['organisations']

    # Index once instead of rescanning every project/person per organisation.
    projects_by_id = _index_records_by_id(processed_data.get('projects') or [])
    persons_by_id = _index_records_by_id(processed_data.get('persons') or [])

    combined = []
    for org in organisations:
        relationships = org.get('relationships', {})
        project_refs = {rel['id'] for rel in relationships.get('project', [])}
        person_refs = {rel['id'] for rel in relationships.get('employee', [])}

        combined.append({
            'organization': {
                'id': org.get('id'),
                'name': org.get('name'),
                'created': org.get('created')
            },
            'relationships': relationships,
            'projects': _records_for_ids(projects_by_id, project_refs),
            'persons': _records_for_ids(persons_by_id, person_refs)
        })

    return combined



In [27]:

# Schema file for each entity type.
schemas = {
    entity: schema_paths[entity]
    for entity in ('organisations', 'projects', 'persons')
}

# Data file for each entity type.
data_files = {
    entity: file_paths[entity]
    for entity in ('organisations', 'projects', 'persons')
}

# Processor built from the organisations schema.
org_processor = SchemaProcessor(schema_path=schema_paths['organisations'])

# org_processor._load_json(file_paths['organisations'])

# Process each dataset
# processed_data = {}
# for entity_type, schema_path in schemas.items():
#     pprint(entity_type)
#     pprint(schema_path)
    # processed_data[entity_type] = process_dataset(
    #     schema_path,
    #     data_files[entity_type],
    #     entity_type
    # )

# # Combine datasets
# combined_data = combine_datasets(processed_data)

# # Save results
# with open('combined_data.json', 'w') as f:
#     json.dump(combined_data, f, indent=2)

# combined_data

[{'links': {'link': [{'href': 'http://gtr.ukri.org/gtr/api/projects/0D5DF2FF-B732-4218-B0E3-4FFBF3DDC906',
     'rel': 'PROJECT',
     'start': None,
     'end': None,
     'otherAttributes': {}},
    {'href': 'http://gtr.ukri.org/gtr/api/projects/0D0F72CC-0163-47CE-A462-5FDDBA4C1C38',
     'rel': 'PROJECT',
     'start': None,
     'end': None,
     'otherAttributes': {}},
    {'href': 'http://gtr.ukri.org/gtr/api/projects/0C6849FD-CA2D-49A9-80D4-75F710980208',
     'rel': 'PROJECT',
     'start': None,
     'end': None,
     'otherAttributes': {}},
    {'href': 'http://gtr.ukri.org/gtr/api/projects/C4059685-9263-44E6-B89A-D8609FCC1360',
     'rel': 'PROJECT',
     'start': None,
     'end': None,
     'otherAttributes': {}},
    {'href': 'http://gtr.ukri.org/gtr/api/projects/F0E04953-58F2-46FB-9837-3C341A3D3165',
     'rel': 'PROJECT',
     'start': None,
     'end': None,
     'otherAttributes': {}},
    {'href': 'http://gtr.ukri.org/gtr/api/projects/D9A401D9-6D91-4DA6-9012-7137774B

In [None]:
# Load the organisations data file with the processor's JSON loader.
raw_data = org_processor._load_json(file_paths['organisations'])

# A top-level value that is not a list is wrapped, so a single object and a
# list of objects go through the same processing expression.
print([
    org_processor.process_data(item, "organisations")
    for item in (raw_data if isinstance(raw_data, list) else [raw_data])
])