In [2]:
import os
import subprocess
import pandas as pd

def geotopicparser(file_name_include=None,extension='.geot',geot_files_dir='geotopic-mime', tika_app_jar='tika-app-2.6.0.jar', tika_nlp_jar='tika-parser-nlp-package-2.6.0.jar', ner_model_dir='ner-models'):
    """Run the Tika CLI with ``-m`` on matching files and collect the parsed output.

    All locations are resolved relative to ``<project_root>/src``, where the
    project root is taken to be two directory levels above the current
    working directory.

    Parameters
    ----------
    file_name_include : str or None
        When given, only files whose name contains this substring are processed.
    extension : str
        Only files whose name ends with this suffix are processed.
    geot_files_dir : str
        Directory (under ``src``) holding the input files; also put on the classpath.
    tika_app_jar, tika_nlp_jar : str
        Jar file names, looked up under ``src/tika``.
    ner_model_dir : str
        Directory (under ``src``) added to the classpath.

    Returns
    -------
    list of dict
        One dict per successfully processed file, in sorted file-name order.
        Each dict has a ``"filename"`` key plus one entry per ``"key: value"``
        line printed by the command.

    Raises
    ------
    FileNotFoundError
        If the computed project root is not a directory, or (raised by
        ``os.listdir``) if the input directory does not exist.
    """
    # 1. Resolve directories. The working directory (not the script location)
    #    is the anchor, so the notebook must be started two levels below the root.
    current_dir = os.getcwd()
    project_root = os.path.abspath(os.path.join(current_dir, '..', '..'))

    if not os.path.isdir(project_root):
        raise FileNotFoundError(f"❌ Project root not found at: {project_root}")
    print(f"-- Using project root: {project_root}")

    src_dir = os.path.join(project_root, 'src')
    geot_files_dir = os.path.join(src_dir, geot_files_dir)
    tika_app_jar = os.path.join(src_dir, 'tika', tika_app_jar)
    tika_nlp_jar = os.path.join(src_dir, 'tika', tika_nlp_jar)
    ner_model_dir = os.path.join(src_dir, ner_model_dir)

    # 2. Build the base Tika CLI command. os.pathsep is the platform's
    #    path-list separator (':' on POSIX, ';' on Windows), which is what
    #    the JVM expects between classpath entries.
    classpath = os.pathsep.join([tika_app_jar, tika_nlp_jar, ner_model_dir, geot_files_dir])
    tika_cmd_base = [
        "java", "-classpath", classpath,
        "org.apache.tika.cli.TikaCLI", "-m"
    ]

    # 3. Process every matching file. Sorting makes the result order
    #    reproducible, since os.listdir returns entries in arbitrary order.
    results = []
    for filename in sorted(os.listdir(geot_files_dir)):
        if not filename.endswith(extension):
            continue
        if file_name_include is not None and file_name_include not in filename:
            continue

        filepath = os.path.join(geot_files_dir, filename)

        try:
            # Pass the argument list directly (no shell) so paths containing
            # spaces or shell metacharacters reach java as single arguments.
            output = subprocess.check_output(tika_cmd_base + [filepath], text=True)
        except (subprocess.CalledProcessError, OSError) as e:
            # Best effort: report the failure (non-zero exit, or java not
            # launchable) and move on to the next file.
            print(f" Error processing {filename}: {e}")
            continue

        # Parse "key: value" lines; split on the first ": " only, so values
        # that themselves contain ": " are kept intact.
        metadata = {"filename": filename}
        for line in output.splitlines():
            key, separator, value = line.partition(": ")
            if separator:
                metadata[key.strip()] = value.strip()

        results.append(metadata)

    return results



In [4]:
# Limit the run to files whose name contains "polar", then display the parsed metadata.
result = geotopicparser(file_name_include="polar")
result

-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 05:00:10,798 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin


[{'filename': 'polar.geot',
  'Content-Length': '881',
  'Content-Type': 'application/geotopic',
  'X-TIKA:Parsed-By': 'org.apache.tika.parser.geo.GeoParser',
  'X-TIKA:Parsed-By-Full-Set': 'org.apache.tika.parser.geo.GeoParser',
  'resourceName': 'polar.geot'}]

In [None]:
# Convert to json and save to file
import json
import pandas as pd

# Serialize the collected metadata as 4-space-indented JSON text, then persist it.
result_json = json.dumps(result, indent=4)
with open("geotopic_metadata.json", "w") as metadata_file:
    metadata_file.write(result_json)