In [1]:
import pandas as pd
from rake_nltk import Rake
import json

# Load your data
data = pd.read_csv('C:/Users/apoorva reddy/Downloads/combined_table2_data_with_industry_new (2).csv')
tasks_activities = data[['Tasks', 'Detailed Work Activities']].dropna()  # Drop rows with NaN values

# Initialize Rake
r = Rake()

# Combine text from "Tasks" and "Detailed Work Activities"
all_text_combined = ' '.join(tasks_activities.apply(lambda x: ' '.join(x), axis=1))

# Extract phrases from the combined text
r.extract_keywords_from_text(all_text_combined)
global_ranked_phrases = r.get_ranked_phrases()

# Define keyword lists for each category
category_keywords = {
    'Image': [
        'photo', 'scan', 'visual', 'inspect', 'camera', 'view', 'see', 'observe', 
        'picture', 'image', 'display', 'diagram', 'color', 'illustrate', 'examine',
        'photograph', 'detect', 'look', 'screen', 'x-ray', 'viewfinder', 'microscope'
    ],
    'Alphanumeric': [
        'data', 'record', 'number', 'transaction', 'account', 'document', 'calculate', 
        'analyze', 'statistics', 'text', 'report', 'log', 'inventory', 'file', 
        'input', 'code', 'digit', 'sheet', 'count', 'figure', 'sort', 'catalog', 
        'classify', 'register', 'alphabet', 'numeric', 'balance', 'sum', 'ledger'
    ],
    'Audio': [
        'listen', 'record', 'transcribe', 'speech', 'audio', 'hear', 'sound', 'voice', 
        'music', 'interview', 'call', 'broadcast', 'microphone', 'tape', 'radio', 
        'noise', 'ring', 'echo', 'soundtrack', 'song', 'volume', 'frequency', 'audio clip',
        'podcast', 'replay', 'speak', 'recording', 'dialogue', 'conference'
    ],
    'Geographic': [
        'location', 'map', 'address', 'region', 'navigate', 'site', 'place', 'route', 
        'area', 'boundary', 'country', 'state', 'terrain', 'district', 'zone', 'vicinity', 
        'latitude', 'longitude', 'coordinate', 'survey', 'land', 'geography', 'region', 
        'navigate', 'landmark', 'compass', 'city', 'province', 'urban', 'rural', 'location data'
    ]
}

# Function to categorize a phrase based on keyword matching
def categorize_phrase(phrase, keywords):
    for category, words in keywords.items():
        if any(word in phrase.lower() for word in words):
            return category
    return None

# Categorize extracted phrases and remove duplicates
global_categories = { 'Image': set(), 'Alphanumeric': set(), 'Audio': set(), 'Geographic': set() }

for phrase in global_ranked_phrases:
    category = categorize_phrase(phrase, category_keywords)
    if category:
        global_categories[category].add(phrase)

# Convert sets back to lists for JSON serialization and output results
global_categories = {category: list(phrases) for category, phrases in global_categories.items()}
print("Extracted Keywords by Category:", json.dumps(global_categories, indent=4))

# Function to calculate DTU level based on Job Zone with increased thresholds for each data type separately
def calculate_dtu_level_for_type(text, keywords, job_zone):
    count = sum(word in text.lower() for word in keywords)
    
  # Set increased thresholds for each data type based on Job Zone
    if job_zone in [4, 5]:  # Higher preparation levels
        if count > 8:
            return "High"
        elif 4 <= count <= 8:
            return "Medium"
        else:
            return "Low"
    elif job_zone == 3:  # Medium preparation level
        if count > 10:
            return "High"
        elif 5 <= count <= 10:
            return "Medium"
        else:
            return "Low"
    else:  # Lower preparation levels (1 and 2)
        if count > 12:
            return "High"
        elif 6 <= count <= 12:
            return "Medium"
        else:
            return "Low"

# Apply the DTU calculation for each data type separately in each row
for data_type, keywords in global_categories.items():
    data[f'{data_type} DTU'] = data.apply(lambda row: calculate_dtu_level_for_type(
        str(row['Tasks']) + " " + str(row['Detailed Work Activities']), 
        keywords, 
        row['Job Zone']
    ), axis=1)

# Save the final results into a CSV file
output_path = 'C:\Users/apoorva reddy/Downloads/dtu_job_zone_results_separate.csv'
data[['Industry', 'Occupation', 'Detailed Work Activities', 'Tasks','Job Zone', 'Image DTU', 'Audio DTU', 'Alphanumeric DTU', 'Geographic DTU']].to_csv(output_path, index=False)



Extracted Keywords by Category: {
    "Image": [
        "display screen",
        "review project plans",
        "inspect materials",
        "related occupations review orders",
        "perform basic screening procedures",
        "examine chemical",
        "observe display screens",
        "related occupations inspect metal",
        "compare colors",
        "related occupations examine shipment contents",
        "physical inspections",
        "specific color densities",
        "review patients",
        "examine photographic images",
        "looking",
        "organizations seeking funds",
        "personal interviews",
        "multimedia visual aids",
        "related occupations examine completed work",
        "inspect finished dies",
        "block diagrams",
        "examine unexposed photographic plates",
        "monitor video displays",
        "review employee benefit programs",
        "review loan agreements",
        "examine medical equipment",
        "prepa