# 1. PATH CONFIGURATION

In [13]:
import os
import json
import glob
import re
import pandas as pd

# --- 1. PATH CONFIGURATION ---

# Bare file names: configuration inputs and the Tableau output are resolved
# against the working directory when they are opened later on.
FILE_ROLES = 'SOC_Analyst_Roles.json'
FILE_SKILLS = 'Skills.json'
FILE_LOCATIONS_FIX = 'Locations.json'
FINAL_OUTPUT = 'SOC_Analyst_Master_Dataset.json'

# The working directory is the anchor for every other path.
CURRENT_DIR = os.getcwd()

# The raw data lives in a sibling folder, one level above the working directory.
RAW_DATA_DIR = os.path.join(CURRENT_DIR, os.pardir, '1. Raw_Data_Set')

# One sub-folder per data source inside the raw data folder.
PATH_LINKEDIN, PATH_GLASSDOOR = (
    os.path.join(RAW_DATA_DIR, source_folder)
    for source_folder in ('LinkedIn', 'GlassDoor')
)

# Echo the resolved locations so a wrong working directory is spotted early.
print(f"üìÇ Working Directory: {CURRENT_DIR}")
print(f"üìÇ Raw Data Directory detected at: {os.path.abspath(RAW_DATA_DIR)}")

üìÇ Working Directory: C:\Users\iurie\3. Tableau Projects\02. Security Operation Center Analyst\2. Data_Processing_Process
üìÇ Raw Data Directory detected at: C:\Users\iurie\3. Tableau Projects\02. Security Operation Center Analyst\1. Raw_Data_Set


# 2. LOAD CONFIGURATION FILES

In [14]:
# --- 2. LOAD CONFIGURATION FILES ---
#
# Defines ROLES_DEF, SKILLS_DEF and LOC_FIX_MAP from the three JSON files named
# by FILE_ROLES, FILE_SKILLS and FILE_LOCATIONS_FIX.
# Any failure is reported AND re-raised: swallowing it would leave these names
# undefined (or stale from a previous run of this cell) and the notebook would
# only break later, far away from the real cause.

try:
    # Roles definition; the mapping may be nested under "all roles SOC"
    with open(FILE_ROLES, 'r', encoding='utf-8') as f:
        roles_data = json.load(f)
    ROLES_DEF = roles_data.get("all roles SOC", roles_data)

    # Skills definition; the mapping may be nested under "Skills"
    with open(FILE_SKILLS, 'r', encoding='utf-8') as f:
        skills_data = json.load(f)
    SKILLS_DEF = skills_data.get("Skills", skills_data)

    # Location fixes: a list of entries, indexed here by their 'location' value
    # so a raw location string can be looked up directly
    with open(FILE_LOCATIONS_FIX, 'r', encoding='utf-8') as f:
        loc_list = json.load(f)
    LOC_FIX_MAP = {item['location']: item for item in loc_list}

    print("‚úÖ Configuration files (Roles, Skills, Locations) loaded successfully.")

except FileNotFoundError as e:
    print(f"‚ùå ERROR: Configuration file not found: {e}")
    raise
except Exception as e:
    # Covers invalid JSON and unexpected file structure (e.g. missing 'location' key)
    print(f"‚ùå UNEXPECTED ERROR while reading config: {e}")
    raise

‚úÖ Configuration files (Roles, Skills, Locations) loaded successfully.


# 3. DATA NORMALIZATION FUNCTION

In [15]:
# --- 3. DATA NORMALIZATION FUNCTION ---

def normalize_job_structure(job_data, source_name):
    """
    Convert one raw job record into the unified schema used by the pipeline.

    Args:
        job_data (dict): Raw job object read from a source JSON file.
        source_name (str): Either "LinkedIn" or "GlassDoor"; selects which
            source-specific keys are read from `job_data`.

    Returns:
        dict: A new dictionary with the keys title, companyName, location,
        jobUrl, contractType, description and source. Fields missing from
        `job_data` are None, except the GlassDoor location which defaults to
        an empty string. A string location is stripped of surrounding
        whitespace; any other value is passed through unchanged.

    Raises:
        ValueError: If `source_name` is not one of the supported sources.
    """
    if source_name == "LinkedIn":
        normalized = {
            "title": job_data.get("title"),
            "companyName": job_data.get("companyName"),
            "location": job_data.get("location"),
            "jobUrl": job_data.get("jobUrl"),
            "contractType": job_data.get("contractType"),
            "description": job_data.get("description"), # Sometimes 'descriptionHtml'
            "source": "LinkedIn"
        }
    elif source_name == "GlassDoor":
        # Glassdoor location can be a dictionary or a string
        loc_obj = job_data.get("job_location")
        loc_str = ""
        if isinstance(loc_obj, dict):
            # Keep only the parts that are present, e.g. "city, country" or just "country"
            parts = [p for p in [loc_obj.get("city"), loc_obj.get("country")] if p]
            loc_str = ", ".join(parts)
        elif isinstance(loc_obj, str):
            loc_str = loc_obj

        normalized = {
            "title": job_data.get("job_title"),
            "companyName": job_data.get("company_name"),
            "location": loc_str,
            "jobUrl": job_data.get("job_url"),
            "contractType": job_data.get("job_job_types"),
            "description": job_data.get("job_description"),
            "source": "GlassDoor"
        }
    else:
        # Fail with a clear message instead of an opaque KeyError on 'location'
        raise ValueError(
            f"Unsupported source_name {source_name!r}; expected 'LinkedIn' or 'GlassDoor'"
        )

    # Basic string cleaning for location (only strings can be stripped)
    location = normalized["location"]
    if isinstance(location, str):
        normalized["location"] = location.strip()

    return normalized

# 4. DATA PROCESSING PIPELINE (ETL)

In [16]:
# --- 4. DATA PROCESSING PIPELINE (ETL) ---
#
# Step A reads every raw JSON export and normalizes each record into `all_jobs`.
# Step B filters, de-duplicates and enriches those records into `processed_jobs`.
# Relies on names defined in earlier cells: PATH_LINKEDIN, PATH_GLASSDOOR,
# ROLES_DEF, SKILLS_DEF, LOC_FIX_MAP and normalize_job_structure().

all_jobs = []
processed_jobs = []
seen_ids = set()

# Statistics for debugging and quality assurance
stats = {
    "total_read": 0,
    "skipped_malformed": 0,
    "excluded_filter": 0,
    "excluded_duplicate": 0,
    "final_count": 0
}

# Extended keyword list for initial relevance filtering
# NOTE(review): keywords are matched as plain substrings, so short ones such as
# "soc" also hit words like "associate" or "social" - confirm this is intended.
KEYWORDS_FILTER = ["soc", "security operation", "incident response", "threat intelligence", "cyber defense", "siem", "blue team"]

# Extended dictionary to catch specific roles not in the main JSON (Includes German terms & Niche roles).
# Categories are tried in this order and the first one with a matching keyword wins.
# NOTE(review): substring matching keeps German compounds working ("sicherheit" in
# "sicherheitsexpert"), but short keywords such as "ir" also match inside words
# like "director" - confirm whether those should be whole-word matches.
extended_roles = {
    "Direct SOC Analyst": [
        "soc", "security analyst", "analyst", "operator", "monitoring",
        "intern", "student", "trainee", "junior", "graduate", "stage",
        "werkstudent", "watch", "handler", "analyste",
        "duales", "placement", "apprentice"
    ],
    "SOC Technology Specialists": [
        "engineer", "network", "system", "admin", "integrator", "architect",
        "support", "technician", "embedded", "infrastructure", "developer",
        "iam", "pki", "access", "identity", "cloud", "devops", "platform",
        "pentest", "hacker", "offensive", "vulnerability", "tester",
        "spezialist", "specialist", "expert", "sicherheit", "security professional",
        "ingenieur", "product owner", "scientist"
    ],
    "Incident Response and Threat Intelligence": [
        "incident", "response", "threat", "intelligence", "forensic",
        "hunter", "cert", "csirt", "ir", "malware",
        "investigator", "crypto"
    ],
    "Sales, Management, Training & Consulting": [
        "manager", "head", "lead", "sales", "consultant", "trainer",
        "director", "coordinator", "officer", "ciso", "presales", "account",
        "risk", "compliance", "audit", "governance", "awareness", "legal",
        "advisor", "leiter", "vp", "vice president", "president",
        "professor", "lecturer", "teacher", "academic", "research",
        "advocate", "claims"
    ],
    "Cyber Defense and Operations": [
        "defense", "defence", "blue", "operations", "ops", "secops", "protect", "resilience"
    ]
}


def _as_text(value):
    """Return `value` as a string, with None mapped to '' instead of the literal 'None'."""
    return "" if value is None else str(value)


def _compile_skill_pattern(skill):
    r"""
    Build a case-insensitive whole-word pattern for one skill name.

    `\b` is only added on a side where the skill starts/ends with a word
    character. A `\b` placed after a symbol (as in "C++" or "C#") can only
    match when a letter, digit or underscore follows, so such skills would
    almost never be found in running text.
    """
    lead = r'\b' if re.match(r'\w', skill[:1]) else ''
    trail = r'\b' if re.match(r'\w', skill[-1:]) else ''
    return re.compile(lead + re.escape(skill) + trail, re.IGNORECASE)


def _split_location(location_text):
    """
    Split a comma-separated location into (city, region, country).

    Three or more parts: first is the city, last the country, everything in
    between the region. Two parts: city and country. One part: country only.
    An empty string yields (None, None, None).
    """
    if not location_text:
        return None, None, None
    parts = [p.strip() for p in location_text.split(',')]
    if len(parts) >= 3:
        return parts[0], ", ".join(parts[1:-1]), parts[-1]
    if len(parts) == 2:
        return parts[0], None, parts[1]
    return None, None, parts[0]


def _assign_role(title_lc, roles_def_lc):
    """Return the first role whose keyword occurs in the lowercased title, else 'Uncategorized'."""
    # A. First pass: the main JSON config file
    for role_name, keywords in roles_def_lc.items():
        if any(k in title_lc for k in keywords):
            return role_name
    # B. Second pass: the extended hardcoded list
    for role_name, keywords in extended_roles.items():
        if any(k in title_lc for k in keywords):
            return role_name
    return "Uncategorized"


def _extract_skills(desc_lc, skill_patterns):
    """Return {category: [skills found in the description]}, omitting categories without a hit."""
    found = {}
    for category, patterns in skill_patterns.items():
        hits = [skill for skill, pattern in patterns if pattern.search(desc_lc)]
        if hits:
            found[category] = hits
    return found


print("üöÄ Starting Data Processing Pipeline...")

# --- STEP A: INGESTION ---
# The source label comes from the folder that was globbed, not from a substring
# test on the full path (a parent folder containing "LinkedIn" would otherwise
# mislabel every GlassDoor file). Paths are sorted so that the "first seen wins"
# de-duplication below gives the same result on every run.
source_files = []
for source, folder in (("LinkedIn", PATH_LINKEDIN), ("GlassDoor", PATH_GLASSDOOR)):
    source_files.extend((source, path) for path in sorted(glob.glob(os.path.join(folder, "*.json"))))
files_list = [path for _, path in source_files]

print(f"üìÇ Found {len(files_list)} JSON files in total.")

for source, file in source_files:
    try:
        with open(file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Handle both List and Dict structures in JSON
        if isinstance(data, list):
            records = data
        elif isinstance(data, dict):
            records = [data]
        else:
            records = []

        for record in records:
            # Skip a malformed entry instead of losing the rest of the file
            if not isinstance(record, dict):
                stats["skipped_malformed"] += 1
                continue
            all_jobs.append(normalize_job_structure(record, source))

    except Exception as e:
        # Best effort per file: report the problem and continue with the next file
        print(f"‚ö†Ô∏è Error reading file {os.path.basename(file)}: {e}")

stats["total_read"] = len(all_jobs)
print(f"üìä Total raw jobs extracted: {stats['total_read']}")
if stats["skipped_malformed"]:
    print(f"Skipped malformed (non-object) records: {stats['skipped_malformed']}")


# --- STEP B: TRANSFORMATION (Filtering, Deduplication, Enrichment) ---

# Loop-invariant lookups, prepared once instead of once per job
roles_def_lc = {role_name: [k.lower() for k in keywords] for role_name, keywords in ROLES_DEF.items()}
skill_patterns = {
    category: [(skill, _compile_skill_pattern(skill)) for skill in skill_list]
    for category, skill_list in SKILLS_DEF.items()
}

for job in all_jobs:
    # Missing values become '' (str(None) would inject the word "none")
    title = _as_text(job.get("title")).lower()
    desc = _as_text(job.get("description")).lower()

    # 1. RELEVANCE FILTERING: Check Title OR Description
    is_relevant = any(kw in title or kw in desc for kw in KEYWORDS_FILTER)

    if not is_relevant:
        stats["excluded_filter"] += 1
        continue

    # 2. DEDUPLICATION: Create a composite key (Title + Company + Location)
    # Removing spaces and commas to handle slight variations in formatting
    comp = _as_text(job.get("companyName")).lower().replace(" ", "").replace(",", "")
    loc = _as_text(job.get("location")).lower().replace(" ", "").replace(",", "")
    tit_simple = title.replace(" ", "")

    unique_id = f"{tit_simple}|{comp}|{loc}"

    if unique_id in seen_ids:
        stats["excluded_duplicate"] += 1
        continue
    seen_ids.add(unique_id)

    # 3. GEOGRAPHIC PARSING (Split Location into City/Region/Country)
    raw_loc = job.get("location")
    location_text = raw_loc if isinstance(raw_loc, str) else ""
    job['city'], job['region'], job['country'] = _split_location(location_text)

    # 4. LOCATION FIXES (Apply manual corrections from Locations.json)
    # Only locations for which no region could be parsed are overridden.
    if job['region'] is None and isinstance(raw_loc, str) and raw_loc in LOC_FIX_MAP:
        fix_data = LOC_FIX_MAP[raw_loc]
        job['country'] = fix_data.get('country')
        job['region'] = fix_data.get('region')
        job['city'] = fix_data.get('city')

    # 5. ROLE ASSIGNMENT (Using Rule-based Classification)
    job['role'] = _assign_role(title, roles_def_lc)

    # 6. SKILLS EXTRACTION (Keyword matching in Description)
    job['skills'] = _extract_skills(desc, skill_patterns)

    processed_jobs.append(job)

stats["final_count"] = len(processed_jobs)

print("\n--- FINAL EXECUTION REPORT ---")
print(f"1. Total Jobs Scanned: {stats['total_read']}")
print(f"2. Excluded (Irrelevant): -{stats['excluded_filter']}")
print(f"3. Excluded (Duplicates): -{stats['excluded_duplicate']}")
print("==========================================")
print(f"‚úÖ FINAL DATASET FOR TABLEAU: {stats['final_count']} jobs")

üöÄ Starting Data Processing Pipeline...
üìÇ Found 42 JSON files in total.
üìä Total raw jobs extracted: 4096

--- FINAL EXECUTION REPORT ---
1. Total Jobs Scanned: 4096
2. Excluded (Irrelevant): -1692
3. Excluded (Duplicates): -61
‚úÖ FINAL DATASET FOR TABLEAU: 2343 jobs


# 5. CHECKING DATA SET

In [23]:
# --- 5. CHECKING DATA SET (Quality Assurance) ---
#
# Builds a DataFrame from `processed_jobs` (produced by the ETL cell), prints
# the role distribution and shows a small sample of the cleaned records.

from IPython.display import display # Import necessary for nice HTML tables

PREVIEW_ROWS = 5    # upper bound on the number of sampled rows
PREVIEW_SEED = 42   # fixed seed so re-running the cell shows the same rows

print("üîç Starting Data Quality Check...")

# Convert the list of processed jobs into a Pandas DataFrame
df = pd.DataFrame(processed_jobs)

# Create a clean view by dropping the heavy text and URL columns just for this
# preview; errors='ignore' skips any listed column that is not present.
cols_to_hide = ['description', 'descriptionHtml', 'companyUrl', 'jobUrl']
df_display = df.drop(columns=cols_to_hide, errors='ignore')

if df.empty:
    # An empty result has no 'role' column and nothing to sample
    print("\nNo processed jobs available - nothing to inspect.")
else:
    # 1. Analyze Job Role Distribution
    print("\n--- üìä Job Role Distribution ---")
    print(df['role'].value_counts())

    # 2. Visual Inspection (HTML Table)
    print("\n--- üé≤ Random Sample of 5 Jobs (Clean Data Preview) ---")

    # Limit the column width for this preview only, without changing the
    # notebook-wide pandas display options.
    with pd.option_context('display.max_colwidth', 50):
        # display() renders an HTML table in Jupyter; never request more rows
        # than the frame holds, otherwise sample() raises.
        display(df_display.sample(n=min(PREVIEW_ROWS, len(df_display)), random_state=PREVIEW_SEED))

print(f"\n‚úÖ Data Check Complete. Total Records: {len(df)}")

üîç Starting Data Quality Check...

--- üìä Job Role Distribution ---
role
SOC Technology Specialists                   1184
Direct SOC Analyst                            746
Sales, Management, Training & Consulting      201
Uncategorized                                 116
Incident Response and Threat Intelligence      76
Cyber Defense and Operations                   20
Name: count, dtype: int64

--- üé≤ Random Sample of 5 Jobs (Clean Data Preview) ---


Unnamed: 0,title,companyName,location,contractType,source,city,region,country,role,skills
2010,SOC Analyst Tier 2 (m/w/d) in Leipzig,WBS IT-Service,"leipzig, germany",[],GlassDoor,leipzig,,germany,Direct SOC Analyst,"{'Programming languages': ['Python'], 'Ability..."
1933,Cybersecurity enthusiast,Thales Cyber Solutions Luxembourg,"Contern, Luxembourg, Luxembourg",Full-time,LinkedIn,Contern,Luxembourg,Luxembourg,Direct SOC Analyst,"{'Ability': ['Cybersecurity', 'English', 'Fren..."
1769,Active Directory Expert,ENGIE,"Bucharest, Bucharest, Romania",Full-time,LinkedIn,Bucharest,Bucharest,Romania,SOC Technology Specialists,"{'Programming languages': ['PowerShell'], 'Abi..."
202,Sicherheitsexpert:in,pester pac automation GmbH,"Wolfertschwenden, Bavaria, Germany",Full-time,LinkedIn,Wolfertschwenden,Bavaria,Germany,SOC Technology Specialists,"{'Ability': ['Cybersecurity', 'Software develo..."
1655,Product Cybersecurity Expert,Roche,"Sant Cugat del Vall√®s, Catalonia, Spain",Full-time,LinkedIn,Sant Cugat del Vall√®s,Catalonia,Spain,SOC Technology Specialists,"{'Ability': ['Computer Science', 'Work experie..."



‚úÖ Data Check Complete. Total Records: 2343


# 6. FINAL EXPORT

In [24]:
# --- 6. FINAL EXPORT ---
#
# Serializes `processed_jobs` to FINAL_OUTPUT as indented UTF-8 JSON
# (ensure_ascii=False keeps non-ASCII characters readable in the file).
# The data is written to a temporary file first and only moved over
# FINAL_OUTPUT once the dump succeeded, so a failure halfway through cannot
# leave a truncated file behind for Tableau to pick up.

tmp_output = FINAL_OUTPUT + '.tmp'

try:
    with open(tmp_output, 'w', encoding='utf-8') as f:
        json.dump(processed_jobs, f, indent=4, ensure_ascii=False)
    os.replace(tmp_output, FINAL_OUTPUT)

    print(f"üéâ Success! The file '{FINAL_OUTPUT}' has been generated.")
    print("This file is now ready for Tableau ingestion.")

except Exception as e:
    # Best-effort removal of the partial temporary file
    try:
        os.remove(tmp_output)
    except OSError:
        pass
    print(f"‚ùå Error saving file: {e}")
    # Re-raise so a failed export is not mistaken for a completed run
    raise

üéâ Success! The file 'SOC_Analyst_Master_Dataset.json' has been generated.
This file is now ready for Tableau ingestion.
