# 02 - Skill Extraction & Normalization

This notebook extracts skills from all cleaned job datasets, normalizes them into a unified taxonomy, and creates the job-skill mapping used for EDA and ML feature engineering.

**Inputs:** Cleaned parquet files from Notebook 01

**Outputs:**
- `skill_taxonomy.parquet` - Master skill vocabulary
- `skill_synonyms.json` - Synonym mapping dictionary
- `job_skill_mapping.parquet` - Exploded (job, skill) pairs with metadata

## 2.1 Setup & Load Cleaned Data

In [1]:
import pandas as pd
import numpy as np
import re
import json
from pathlib import Path
from collections import Counter
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

RAW_DIR = Path('../data/raw')
PROCESS_DIR = Path('../data/process')

# Load cleaned datasets
df_future = pd.read_parquet(PROCESS_DIR / 'cleaned_future_jobs.parquet')
df_india = pd.read_parquet(PROCESS_DIR / 'cleaned_india_jobs.parquet')
df_market = pd.read_parquet(PROCESS_DIR / 'cleaned_job_market.parquet')
df_jobstreet = pd.read_parquet(PROCESS_DIR / 'cleaned_jobstreet.parquet')
df_linkedin_skills = pd.read_parquet(PROCESS_DIR / 'linkedin_skills_taxonomy.parquet')
df_linkedin_job_skills = pd.read_parquet(PROCESS_DIR / 'linkedin_job_skills.parquet')

print('Loaded datasets:')
for name, df in [('future_jobs', df_future), ('india_jobs', df_india), 
                  ('job_market', df_market), ('jobstreet', df_jobstreet),
                  ('linkedin_job_skills', df_linkedin_job_skills)]:
    print(f'  {name:25s} {len(df):>10,} rows')

Loaded datasets:
  future_jobs                   10,000 rows
  india_jobs                        32 rows
  job_market                       250 rows
  jobstreet                     18,204 rows
  linkedin_job_skills          213,768 rows


## 2.2 Skill Synonym Dictionary

In [2]:
SKILL_SYNONYMS = {
    # AI & ML
    'ml': 'machine learning',
    'ai': 'artificial intelligence',
    'dl': 'deep learning',
    'nlp': 'natural language processing',
    'cv': 'computer vision',
    'gen ai': 'generative ai',
    'genai': 'generative ai',
    'generative artificial intelligence': 'generative ai',
    'rag': 'retrieval augmented generation',
    'llm': 'large language model',
    'llms': 'large language model',
    'large language models': 'large language model',
    
    # Programming Languages
    'js': 'javascript',
    'ts': 'typescript',
    'py': 'python',
    'golang': 'go',
    'c#': 'csharp',
    'c++': 'cpp',
    
    # Frameworks & Libraries
    'react.js': 'react',
    'reactjs': 'react',
    'react js': 'react',
    'node.js': 'nodejs',
    'node js': 'nodejs',
    'vue.js': 'vue',
    'vuejs': 'vue',
    'angular.js': 'angular',
    'angularjs': 'angular',
    'next.js': 'nextjs',
    'next js': 'nextjs',
    'express.js': 'expressjs',
    'sci-kit learn': 'scikit-learn',
    'sklearn': 'scikit-learn',
    'tf': 'tensorflow',
    
    # Cloud & DevOps
    'aws': 'amazon web services',
    'gcp': 'google cloud platform',
    'google cloud': 'google cloud platform',
    'azure cloud': 'microsoft azure',
    'k8s': 'kubernetes',
    'ci/cd': 'cicd',
    'ci cd': 'cicd',
    
    # Databases
    'postgres': 'postgresql',
    'mongo': 'mongodb',
    'sql server': 'microsoft sql server',
    'mssql': 'microsoft sql server',
    
    # Data
    'bi': 'business intelligence',
    'power bi': 'powerbi',
    'data science on azure': 'data science',
    'machine learning on azure': 'machine learning',
    'azure machine learning': 'machine learning',
    'azure ai': 'artificial intelligence',
    
    # Other
    'rest apis': 'rest api',
    'restful api': 'rest api',
    'restful apis': 'rest api',
    'api management': 'api design',
}

print(f'Synonym dictionary: {len(SKILL_SYNONYMS)} entries')

Synonym dictionary: 54 entries


In [3]:
def normalize_skill(skill_str):
    """Normalize a single skill string."""
    if not isinstance(skill_str, str):
        return ''
    s = skill_str.strip().lower()
    # Remove trailing/leading punctuation (but keep internal ones like c++, c#)
    s = re.sub(r'^[\s\-\.\,]+|[\s\-\.\,]+$', '', s)
    # Apply synonym mapping
    s = SKILL_SYNONYMS.get(s, s)
    return s

def parse_skills_column(series, sep=','):
    """Parse a comma-separated skills column into lists of normalized skills."""
    def _parse(val):
        if not isinstance(val, str) or val.strip() == '':
            return []
        skills = [normalize_skill(s) for s in val.split(sep)]
        # Remove empty strings and deduplicate while preserving order
        seen = set()
        result = []
        for s in skills:
            if s and s not in seen:
                seen.add(s)
                result.append(s)
        return result
    return series.apply(_parse)

# Test
test_skills = 'NLP, Machine Learning on Azure, Data science on Azure, Python, ML'
print(f'Input:  {test_skills}')
print(f'Output: {parse_skills_column(pd.Series([test_skills]))[0]}')

Input:  NLP, Machine Learning on Azure, Data science on Azure, Python, ML
Output: ['natural language processing', 'machine learning', 'data science', 'python']


## 2.3 Parse Explicit Skills (3 datasets with skills columns)

In [4]:
# Future Jobs
df_future['skills_list'] = parse_skills_column(df_future['skills_required'])
future_skills = [s for skills in df_future['skills_list'] for s in skills]
print(f'Future Jobs: {len(set(future_skills))} unique skills from {len(future_skills)} total mentions')
print(f'Top 10: {Counter(future_skills).most_common(10)}')

Future Jobs: 11 unique skills from 20000 total mentions
Top 10: [('climate data analysis', 2490), ('energy modeling', 2490), ('quantum algorithms', 1690), ('tensorflow', 1690), ('ethereum', 1683), ('qiskit', 1682), ('solidity', 1669), ('linear algebra', 1666), ('python', 1657), ('rust', 1646)]


In [5]:
# India Jobs
df_india['skills_list'] = parse_skills_column(df_india['skills'])
india_skills = [s for skills in df_india['skills_list'] for s in skills]
print(f'India Jobs: {len(set(india_skills))} unique skills from {len(india_skills)} total mentions')
print(f'Top 10: {Counter(india_skills).most_common(10)}')

India Jobs: 138 unique skills from 253 total mentions
Top 10: [('python', 15), ('machine learning', 12), ('sql', 9), ('artificial intelligence', 7), ('natural language processing', 5), ('data science', 5), ('pyspark', 5), ('amazon web services', 5), ('project management', 5), ('data', 4)]


In [6]:
# Job Market
df_market['skills_list'] = parse_skills_column(df_market['skills'])
market_skills = [s for skills in df_market['skills_list'] for s in skills]
print(f'Job Market: {len(set(market_skills))} unique skills from {len(market_skills)} total mentions')
print(f'Top 10: {Counter(market_skills).most_common(10)}')

Job Market: 19 unique skills from 1032 total mentions
Top 10: [('machine learning', 71), ('python', 67), ('go', 60), ('git', 59), ('docker', 58), ('agile', 57), ('amazon web services', 56), ('cicd', 56), ('ruby', 55), ('typescript', 55)]


## 2.4 Build Seed Skill Vocabulary

Combine skills from:
1. The 3 job datasets with explicit skill columns
2. Coursera courses (50K+ courses with curated skill names)
3. LinkedIn skill taxonomy

In [7]:
# Collect all explicit skills from job datasets
all_explicit_skills = set(future_skills) | set(india_skills) | set(market_skills)
print(f'Unique skills from job datasets: {len(all_explicit_skills)}')

# Add Coursera skills
df_courses = pd.read_csv(RAW_DIR / 'courses_en.csv', usecols=['skills'])
coursera_skills = set()
for skills_str in df_courses['skills'].dropna():
    for s in skills_str.split(','):
        normalized = normalize_skill(s)
        if normalized and len(normalized) >= 2:
            coursera_skills.add(normalized)
print(f'Unique skills from Coursera: {len(coursera_skills)}')
del df_courses

# Add LinkedIn skill category names (broad categories)
linkedin_category_names = set(df_linkedin_skills['skill_name'].str.lower().tolist())
print(f'LinkedIn skill categories: {len(linkedin_category_names)}')

# Build seed vocabulary
seed_vocabulary = all_explicit_skills | coursera_skills
# Filter out overly generic or short terms
seed_vocabulary = {s for s in seed_vocabulary if len(s) >= 2 and s not in {
    'and', 'the', 'for', 'with', 'from', 'data', 'science', 'machine',
    'senior', 'junior', 'hiring', 'generation', 'management', 'associate',
}}

print(f'\nFinal seed vocabulary: {len(seed_vocabulary)} skills')
print(f'Sample: {sorted(list(seed_vocabulary))[:20]}')

Unique skills from job datasets: 160
Unique skills from Coursera: 2448
LinkedIn skill categories: 35

Final seed vocabulary: 2528 skills
Sample: ['3d assets', '3d modeling', 'a/b testing', 'ability to meet deadlines', 'about different approaches to the question of how was it humanly\xa0possible', 'about the development of nazi ideology as well as the early measures taken against the jews and others who were considered "undesirables"', 'about the implementation of the "final solution" throughout europe and the nature of local collaboration', 'about the key decisions and turning points leading to the "final solution"', 'academic advising', 'acceptance testing', 'accident prevention', 'accident reporting', 'account management', 'accountability', 'accounting', 'accounting and finance software', 'accounting records', 'accounting software', 'accounts payable', 'accounts payable and receivable']


## 2.5 Extract Skills from JobStreet Descriptions

Since jobstreet has no explicit skills column, we extract skills from job descriptions using keyword matching against the seed vocabulary.

In [8]:
# Build a more focused tech skill vocabulary for jobstreet extraction
# Use a curated subset to reduce false positives in free text
TECH_SKILL_VOCABULARY = {
    # Programming languages
    'python', 'java', 'javascript', 'typescript', 'csharp', 'cpp', 'go', 'rust',
    'ruby', 'php', 'swift', 'kotlin', 'scala', 'r', 'matlab', 'perl',
    'dart', 'lua', 'haskell', 'elixir', 'solidity',
    
    # Web frameworks
    'react', 'angular', 'vue', 'nextjs', 'nodejs', 'django', 'flask', 'fastapi',
    'spring boot', 'expressjs', 'laravel', 'rails', 'asp.net',
    
    # AI/ML
    'machine learning', 'deep learning', 'natural language processing',
    'computer vision', 'tensorflow', 'pytorch', 'scikit-learn', 'keras',
    'artificial intelligence', 'generative ai', 'large language model',
    'reinforcement learning', 'neural network', 'retrieval augmented generation',
    
    # Data
    'sql', 'nosql', 'postgresql', 'mysql', 'mongodb', 'redis', 'elasticsearch',
    'apache spark', 'hadoop', 'kafka', 'airflow', 'dbt',
    'data engineering', 'data analysis', 'data visualization',
    'pandas', 'numpy', 'powerbi', 'tableau', 'looker',
    'etl', 'data warehouse', 'data pipeline', 'data modeling',
    'big data', 'data mining', 'business intelligence',
    
    # Cloud & Infrastructure
    'amazon web services', 'microsoft azure', 'google cloud platform',
    'docker', 'kubernetes', 'terraform', 'ansible', 'jenkins',
    'cicd', 'devops', 'linux', 'unix', 'nginx',
    'microservices', 'serverless', 'cloud computing',
    
    # Security
    'cybersecurity', 'penetration testing', 'network security',
    'information security', 'encryption', 'firewall',
    
    # Mobile
    'android', 'ios', 'react native', 'flutter', 'mobile development',
    
    # Tools & Practices
    'git', 'github', 'gitlab', 'jira', 'agile', 'scrum',
    'rest api', 'graphql', 'grpc', 'websocket',
    'html', 'css', 'sass', 'webpack', 'vite',
    
    # Blockchain
    'blockchain', 'ethereum', 'smart contract', 'web3',
    'decentralized', 'defi', 'nft', 'cryptocurrency',
    
    # Emerging tech
    'quantum computing', 'quantum algorithms', 'qiskit',
    'iot', 'internet of things', 'edge computing',
    'augmented reality', 'virtual reality', 'metaverse',
    'robotics', '3d printing', 'autonomous vehicles',
    
    # Green tech
    'renewable energy', 'energy modeling', 'climate data analysis',
    'sustainability', 'carbon footprint', 'clean energy',
    
    # Soft skills (selected tech-relevant ones)
    'project management', 'product management', 'system design',
    'software architecture', 'technical leadership',
}

# Also add skills from the seed vocabulary that look like tech skills
# (longer multi-word terms are less likely to be false positives)
for skill in seed_vocabulary:
    if len(skill) >= 5:  # Only add skills with 5+ chars to reduce noise
        TECH_SKILL_VOCABULARY.add(skill)

print(f'Tech skill vocabulary for extraction: {len(TECH_SKILL_VOCABULARY)} skills')

Tech skill vocabulary for extraction: 2572 skills


In [9]:
# Pre-compile regex patterns sorted by length (longest first) for efficient matching
skill_patterns = []
for skill in sorted(TECH_SKILL_VOCABULARY, key=len, reverse=True):
    # Use word boundaries, but handle special characters in skill names
    pattern = r'\b' + re.escape(skill) + r'\b'
    try:
        skill_patterns.append((skill, re.compile(pattern, re.IGNORECASE)))
    except re.error:
        pass

def extract_skills_from_text(text):
    """Extract skills from free text using keyword matching."""
    if not isinstance(text, str) or len(text) < 10:
        return []
    text_lower = text.lower()
    found = []
    for skill, pattern in skill_patterns:
        if pattern.search(text_lower):
            found.append(skill)
    return found

# Test with a sample description
test_desc = "Looking for a Python developer with experience in machine learning, TensorFlow, and AWS. Knowledge of Docker and Kubernetes preferred."
print(f'Test extraction: {extract_skills_from_text(test_desc)}')

Test extraction: ['machine learning', 'kubernetes', 'tensorflow', 'python', 'docker']


In [10]:
# Extract skills from jobstreet descriptions
print(f'Extracting skills from {len(df_jobstreet):,} jobstreet descriptions...')

tqdm.pandas(desc='Extracting skills')
df_jobstreet['skills_list'] = df_jobstreet['descriptions'].progress_apply(extract_skills_from_text)

# Stats
skills_per_job = df_jobstreet['skills_list'].apply(len)
print(f'\nJobstreet skill extraction results:')
print(f'  Jobs with >= 1 skill found: {(skills_per_job > 0).sum():,} ({(skills_per_job > 0).mean()*100:.1f}%)')
print(f'  Average skills per job: {skills_per_job.mean():.1f}')
print(f'  Median skills per job: {skills_per_job.median():.0f}')
print(f'  Max skills per job: {skills_per_job.max()}')

jobstreet_skills = [s for skills in df_jobstreet['skills_list'] for s in skills]
print(f'\nTop 20 skills in jobstreet:')
for skill, count in Counter(jobstreet_skills).most_common(20):
    print(f'  {skill:35s} {count:,}')

Extracting skills from 18,204 jobstreet descriptions...


Extracting skills:   0%|          | 0/18204 [00:00<?, ?it/s]


Jobstreet skill extraction results:
  Jobs with >= 1 skill found: 18,094 (99.4%)
  Average skills per job: 14.6
  Median skills per job: 13
  Max skills per job: 66

Top 20 skills in jobstreet:
  company                             9,643
  engineering                         9,407
  development                         8,256
  communication                       7,882
  design                              7,506
  business                            7,395
  environment                         5,983
  product                             4,795
  maintenance                         4,663
  computer science                    4,225
  analysis                            4,140
  operations                          3,596
  analytical                          3,580
  planning                            3,360
  delivery                            2,964
  information technology              2,689
  project management                  2,252
  sales                               2,130
  leadership 

## 2.6 Build Unified Skill Taxonomy

In [11]:
# Aggregate all skills across all datasets
all_skill_counts = Counter()
all_skill_sources = {}

for name, df in [('future_jobs', df_future), ('india_jobs', df_india), 
                  ('job_market', df_market), ('jobstreet', df_jobstreet)]:
    for skills in df['skills_list']:
        for skill in skills:
            all_skill_counts[skill] += 1
            if skill not in all_skill_sources:
                all_skill_sources[skill] = set()
            all_skill_sources[skill].add(name)

# Add LinkedIn broad categories
for _, row in df_linkedin_job_skills.iterrows():
    skill_name = row.get('skill_name', '')
    if isinstance(skill_name, str):
        normalized = normalize_skill(skill_name)
        all_skill_counts[normalized] += 1
        if normalized not in all_skill_sources:
            all_skill_sources[normalized] = set()
        all_skill_sources[normalized].add('linkedin')

print(f'Total unique skills before filtering: {len(all_skill_counts)}')

# Filter noise: remove skills appearing fewer than 3 times
MIN_SKILL_COUNT = 3
filtered_skills = {k: v for k, v in all_skill_counts.items() if v >= MIN_SKILL_COUNT and len(k) >= 2}
print(f'Skills after filtering (>= {MIN_SKILL_COUNT} occurrences): {len(filtered_skills)}')

Total unique skills before filtering: 1771
Skills after filtering (>= 3 occurrences): 1498


In [12]:
# Create skill taxonomy DataFrame
taxonomy_rows = []
for i, (skill, count) in enumerate(sorted(filtered_skills.items(), key=lambda x: -x[1])):
    taxonomy_rows.append({
        'skill_id': i + 1,
        'skill_name': skill,
        'total_count': count,
        'source_count': len(all_skill_sources.get(skill, set())),
        'sources': ', '.join(sorted(all_skill_sources.get(skill, set()))),
    })

df_taxonomy = pd.DataFrame(taxonomy_rows)
print(f'Skill Taxonomy: {len(df_taxonomy)} skills')
print(f'\nTop 30 skills:')
display(df_taxonomy.head(30))

Skill Taxonomy: 1498 skills

Top 30 skills:


Unnamed: 0,skill_id,skill_name,total_count,source_count,sources
0,1,information technology,28827,3,"india_jobs, jobstreet, linkedin"
1,2,sales,24605,2,"jobstreet, linkedin"
2,3,engineering,22416,2,"jobstreet, linkedin"
3,4,management,20862,2,"india_jobs, linkedin"
4,5,manufacturing,18185,1,linkedin
5,6,health care provider,17369,1,linkedin
6,7,business development,14515,2,"jobstreet, linkedin"
7,8,other,12608,1,linkedin
8,9,design,9750,2,"jobstreet, linkedin"
9,10,company,9643,1,jobstreet


In [13]:
# Save taxonomy and synonyms
df_taxonomy.to_parquet(PROCESS_DIR / 'skill_taxonomy.parquet', index=False)
print(f'Saved: skill_taxonomy.parquet ({len(df_taxonomy)} skills)')

with open(PROCESS_DIR / 'skill_synonyms.json', 'w') as f:
    json.dump(SKILL_SYNONYMS, f, indent=2)
print(f'Saved: skill_synonyms.json ({len(SKILL_SYNONYMS)} mappings)')

Saved: skill_taxonomy.parquet (1498 skills)
Saved: skill_synonyms.json (54 mappings)


## 2.7 Create Exploded Job-Skill Mapping

Long-format DataFrame where each row represents one (job, skill) pair with metadata for analysis.

In [14]:
# Valid skills from taxonomy
valid_skills = set(df_taxonomy['skill_name'])

def create_exploded_mapping(df, source_name, id_col, title_col, date_col=None,
                            industry_col=None, location_col=None, salary_col=None):
    """Explode skills_list and create standardized mapping rows."""
    rows = []
    for _, row in df.iterrows():
        skills = row.get('skills_list', [])
        if not skills:
            continue
        for skill in skills:
            if skill not in valid_skills:
                continue
            entry = {
                'job_id': str(row.get(id_col, '')),
                'job_title': row.get(title_col, ''),
                'skill': skill,
                'source': source_name,
                'posting_date': row.get(date_col) if date_col else pd.NaT,
                'industry': row.get(industry_col, '') if industry_col else '',
                'location': row.get(location_col, '') if location_col else '',
                'salary': row.get(salary_col) if salary_col else np.nan,
            }
            rows.append(entry)
    return rows

print('Creating job-skill mappings...')

all_mappings = []

# Future Jobs
mappings = create_exploded_mapping(
    df_future, 'future_jobs', 'job_id', 'job_title',
    date_col='posting_date', industry_col='industry',
    location_col='location', salary_col='salary_usd'
)
all_mappings.extend(mappings)
print(f'  future_jobs: {len(mappings):,} skill-job pairs')

# India Jobs
mappings = create_exploded_mapping(
    df_india, 'india_jobs', 'title', 'title',
    location_col='city_clean'
)
all_mappings.extend(mappings)
print(f'  india_jobs: {len(mappings):,} skill-job pairs')

# Job Market
mappings = create_exploded_mapping(
    df_market, 'job_market', 'job_title', 'job_title',
    date_col='publication_date', industry_col='category',
    location_col='location', salary_col='salary_min'
)
all_mappings.extend(mappings)
print(f'  job_market: {len(mappings):,} skill-job pairs')

Creating job-skill mappings...
  future_jobs: 20,000 skill-job pairs
  india_jobs: 226 skill-job pairs
  job_market: 1,032 skill-job pairs


In [15]:
# Jobstreet (larger, process in chunks for memory efficiency)
print('Processing jobstreet mappings...')
jobstreet_mappings = []

for _, row in tqdm(df_jobstreet.iterrows(), total=len(df_jobstreet), desc='Jobstreet'):
    skills = row.get('skills_list', [])
    if not skills:
        continue
    for skill in skills:
        if skill not in valid_skills:
            continue
        jobstreet_mappings.append({
            'job_id': str(row.get('job_id', '')),
            'job_title': row.get('job_title', ''),
            'skill': skill,
            'source': 'jobstreet',
            'posting_date': row.get('listingDate', pd.NaT),
            'industry': str(row.get('category', '')),
            'location': row.get('location', ''),
            'salary': np.nan,
        })

all_mappings.extend(jobstreet_mappings)
print(f'  jobstreet: {len(jobstreet_mappings):,} skill-job pairs')

Processing jobstreet mappings...


Jobstreet:   0%|          | 0/18204 [00:00<?, ?it/s]

  jobstreet: 264,401 skill-job pairs


In [16]:
# Create final DataFrame
df_job_skills = pd.DataFrame(all_mappings)
df_job_skills['posting_date'] = pd.to_datetime(df_job_skills['posting_date'], errors='coerce')

print(f'\nFinal Job-Skill Mapping:')
print(f'  Total rows: {len(df_job_skills):,}')
print(f'  Unique jobs: {df_job_skills["job_id"].nunique():,}')
print(f'  Unique skills: {df_job_skills["skill"].nunique():,}')
print(f'\nRows by source:')
print(df_job_skills['source'].value_counts())
print(f'\nDate coverage:')
print(df_job_skills.groupby('source')['posting_date'].agg(['min', 'max', 'count']))


Final Job-Skill Mapping:
  Total rows: 285,659
  Unique jobs: 28,143
  Unique skills: 1,483

Rows by source:
source
jobstreet      264401
future_jobs     20000
job_market       1032
india_jobs        226
Name: count, dtype: int64

Date coverage:
                   min        max  count
source                                  
future_jobs 2025-01-01 2025-12-31  20000
india_jobs         NaT        NaT      0
job_market  2025-11-27 2025-11-27   1032
jobstreet          NaT        NaT      0


In [17]:
df_job_skills.to_parquet(PROCESS_DIR / 'job_skill_mapping.parquet', index=False)
print(f'Saved: job_skill_mapping.parquet ({len(df_job_skills):,} rows)')

# Also save the updated dataframes with skills_list
df_future.to_parquet(PROCESS_DIR / 'cleaned_future_jobs_skills.parquet', index=False)
df_india.to_parquet(PROCESS_DIR / 'cleaned_india_jobs_skills.parquet', index=False)
df_market.to_parquet(PROCESS_DIR / 'cleaned_job_market_skills.parquet', index=False)
df_jobstreet.to_parquet(PROCESS_DIR / 'cleaned_jobstreet_skills.parquet', index=False)
print('Saved all datasets with skills_list column')

Saved: job_skill_mapping.parquet (285,659 rows)
Saved all datasets with skills_list column


In [18]:
# Final summary
print('\n' + '=' * 60)
print('SKILL EXTRACTION SUMMARY')
print('=' * 60)
print(f'Skill taxonomy size: {len(df_taxonomy)} unique normalized skills')
print(f'Total job-skill pairs: {len(df_job_skills):,}')
print(f'\nOutput files:')
for f in sorted(PROCESS_DIR.glob('*')):
    size_mb = f.stat().st_size / 1024**2
    print(f'  {f.name:45s} {size_mb:.2f} MB')


SKILL EXTRACTION SUMMARY
Skill taxonomy size: 1498 unique normalized skills
Total job-skill pairs: 285,659

Output files:
  cleaned_future_jobs.parquet                   0.16 MB
  cleaned_future_jobs_skills.parquet            0.17 MB
  cleaned_india_jobs.parquet                    0.01 MB
  cleaned_india_jobs_skills.parquet             0.02 MB
  cleaned_job_market.parquet                    0.02 MB
  cleaned_job_market_skills.parquet             0.02 MB
  cleaned_jobstreet.parquet                     20.86 MB
  cleaned_jobstreet_skills.parquet              21.26 MB
  job_skill_mapping.parquet                     1.16 MB
  linkedin_job_skills.parquet                   1.25 MB
  linkedin_skills_taxonomy.parquet              0.00 MB
  skill_synonyms.json                           0.00 MB
  skill_taxonomy.parquet                        0.03 MB
