## Geoparser

In [7]:
import os
import subprocess
import pandas as pd

# Get absolute path to this script
current_dir = os.getcwd()
# Assume repo root is 1 levels up 
project_root = os.path.abspath(os.path.join(current_dir, '..')) 

if not os.path.isdir(project_root):
    raise FileNotFoundError(f"❌ Project root not found at: {project_root}")
print(f"-- Using project root: {project_root}")

geot_files_dir = os.path.join(project_root, 'src', 'geotopic-mime')
tika_app_jar = os.path.join(project_root, 'src', 'tika', 'tika-app-2.6.0.jar')
tika_nlp_jar = os.path.join(project_root, 'src', 'tika', 'tika-parser-nlp-package-2.6.0.jar')
ner_model_dir = os.path.join(project_root, 'src', 'location-ner-model')


# 3. Construct Tika CLI Command Base
classpath = f"{tika_app_jar}:{tika_nlp_jar}:{ner_model_dir}:{geot_files_dir}"
tika_cmd_base = [
    "java", "-classpath", classpath,
    "org.apache.tika.cli.TikaCLI", "-m"
]

# 4. Loop through multiple .geot files
results = []
for filename in os.listdir(geot_files_dir):
    if filename.endswith(".geot"):
        filepath = os.path.join(geot_files_dir, filename)
        cmd = tika_cmd_base + [filepath]

        try:
            output = subprocess.check_output(" ".join(cmd), shell=True, text=True)
        except subprocess.CalledProcessError as e:
            print(f" Error processing {filename}: {e}")
            continue

        # Parse Tika metadata output
        metadata = {"filename": filename}
        for line in output.splitlines():
            if ": " in line:
                key, value = line.split(": ", 1)
                metadata[key.strip()] = value.strip()

        results.append(metadata)


# Convert to DataFrame
df = pd.DataFrame(results)
print(df.head())

-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 04:03:34,655 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 04:03:37,406 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin


     filename Content-Length          Content-Type Geographic_LATITUDE  \
0  polar.geot            881  application/geotopic               39.76   
1    cnn.geot           3164  application/geotopic             26.0112   

  Geographic_LONGITUDE Geographic_NAME Optional_LATITUDE1 Optional_LONGITUDE1  \
0                -98.5   United States               35.0               105.0   
1            -80.14949       Hollywood           40.92877           -74.96032   

                                 Optional_NAME1  \
0                    People’s Republic of China   
1  New Jersey State Police Troop B Hope Station   

                       X-TIKA:Parsed-By             X-TIKA:Parsed-By-Full-Set  \
0  org.apache.tika.parser.geo.GeoParser  org.apache.tika.parser.geo.GeoParser   
1  org.apache.tika.parser.geo.GeoParser  org.apache.tika.parser.geo.GeoParser   

  resourceName Optional_LATITUDE2 Optional_LONGITUDE2  Optional_NAME2  
0   polar.geot                NaN                 NaN          