## GeoTopicParser

Tutorial (CLI -> ipynb)

First, turn on the new terminal and move cd to 'lucene-geo-gazetteer'

```cd src/lucene-geo-gazetteer```

Next, run lucnene-geo-gazzetteer 

```lucene-geo-gazetteer -server```

Now Tika-app can utilize lucene-geo-gazetteer

In [11]:
import os
import subprocess
import pandas as pd

# 1. Directory setting
current_dir = os.getcwd() # Get absolute path to this script
project_root = os.path.abspath(os.path.join(current_dir, '..')) # Assume repo root is 1 levels up

if not os.path.isdir(project_root):
    raise FileNotFoundError(f"❌ Project root not found at: {project_root}")
print(f"-- Using project root: {project_root}")

# four directories of demand : 
geot_files_dir = os.path.join(project_root, 'src', 'geotopic-mime') 
tika_app_jar = os.path.join(project_root, 'src', 'tika', 'tika-app-2.6.0.jar')
tika_nlp_jar = os.path.join(project_root, 'src', 'tika', 'tika-parser-nlp-package-2.6.0.jar')
ner_model_dir = os.path.join(project_root, 'src', 'location-ner-model')


# 3. Construct Tika CLI Command Base
classpath = f"{tika_app_jar}:{tika_nlp_jar}:{ner_model_dir}:{geot_files_dir}"
tika_cmd_base = [
    "java", "-classpath", classpath,
    "org.apache.tika.cli.TikaCLI", "-m"
]

# 4. Loop through multiple .geot files
results = []
for filename in os.listdir(geot_files_dir):
    if filename.endswith(".geot"):
        filepath = os.path.join(geot_files_dir, filename)
        cmd = tika_cmd_base + [filepath]

        try:
            output = subprocess.check_output(" ".join(cmd), shell=True, text=True)
        except subprocess.CalledProcessError as e:
            print(f" Error processing {filename}: {e}")
            continue

        # Parse Tika metadata output
        metadata = {"filename": filename}
        for line in output.splitlines():
            if ": " in line:
                key, value = line.split(": ", 1)
                metadata[key.strip()] = value.strip()

        results.append(metadata)


# Convert to DataFrame
df = pd.DataFrame(results)
print(df.head())

-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 04:53:34,116 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 04:53:36,476 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin


     filename Content-Length          Content-Type Geographic_LATITUDE  \
0  polar.geot            881  application/geotopic               39.76   
1    cnn.geot           3164  application/geotopic            44.60715   

  Geographic_LONGITUDE Geographic_NAME Optional_LATITUDE1 Optional_LONGITUDE1  \
0                -98.5   United States               35.0               105.0   
1            -69.04576  Town of Monroe            26.0112           -80.14949   

               Optional_NAME1                      X-TIKA:Parsed-By  \
0  People’s Republic of China  org.apache.tika.parser.geo.GeoParser   
1                   Hollywood  org.apache.tika.parser.geo.GeoParser   

              X-TIKA:Parsed-By-Full-Set resourceName Optional_LATITUDE2  \
0  org.apache.tika.parser.geo.GeoParser   polar.geot                NaN   
1  org.apache.tika.parser.geo.GeoParser     cnn.geot           40.92877   

  Optional_LONGITUDE2                                Optional_NAME2  
0                 NaN  

In [39]:
# Convert each row into .geot file
import pandas as pd 
hp = pd.read_csv('../data/haunted_places.tsv', sep='\t')
temp = hp.description.to_list()

# lucene parser does not allow the input text ends with double quote, so remove the double quote at the end. 
def replace_rightmost_quote(text, replacement=''):
    index = text.rfind('"')
    if index == -1:
        return text  # No quote found
    return text[:index] + replacement + text[index+1:]

# generate folder if there wasn't 
#os.makedirs('../data/geot_files', exist_ok=True)
os.makedirs('../src/geotopic-mime', exist_ok=True)

for idx, text in enumerate(temp):
    with open(f'../src/geotopic-mime/{idx}.geot',"w") as f:     
        f.writelines(replace_rightmost_quote(str(text)))   

In [63]:
# Feed 10,991 geot files into geotopicparser 
import os
import subprocess
import pandas as pd

def extract_location(geot_file_num_max, geot_file_num_min=0):
    # 1. Directory setting
    current_dir = os.getcwd() # Get absolute path to this script
    project_root = os.path.abspath(os.path.join(current_dir, '..')) # Assume repo root is 1 levels up

    if not os.path.isdir(project_root):
        raise FileNotFoundError(f"❌ Project root not found at: {project_root}")
    print(f"-- Using project root: {project_root}")

    # four directories of demand : 
    geot_files_dir = os.path.join(project_root, 'src', 'geotopic-mime')  #'data', 'geot_files'
    tika_app_jar = os.path.join(project_root, 'src', 'tika', 'tika-app-2.6.0.jar')
    tika_nlp_jar = os.path.join(project_root, 'src', 'tika', 'tika-parser-nlp-package-2.6.0.jar')
    ner_model_dir = os.path.join(project_root, 'src', 'location-ner-model')


    # 3. Construct Tika CLI Command Base
    classpath = f"{tika_app_jar}:{tika_nlp_jar}:{ner_model_dir}:{geot_files_dir}"
    tika_cmd_base = [
        "java", "-classpath", classpath,
        "org.apache.tika.cli.TikaCLI", "-m"
    ]

    # 4. Loop through multiple .geot files
    results = []
    for idx, file_num in enumerate(range(geot_file_num_min, geot_file_num_max+1)):  #os.listdir(geot_files_dir)
        filename = str(file_num-1)+'.geot'
        if file_num == 0: 
            filename = 'polar.geot'

        if filename.endswith(".geot"):
            filepath = os.path.join(geot_files_dir, filename)
            cmd = tika_cmd_base + [filepath]

            try:
                output = subprocess.check_output(" ".join(cmd), shell=True, text=True)
            except subprocess.CalledProcessError as e:
                print(f" Error processing {filename}: {e}")
                continue

            # Parse Tika metadata output
            metadata = {"filename": filename}
            for line in output.splitlines():
                if ": " in line:
                    key, value = line.split(": ", 1)
                    metadata[key.strip()] = value.strip()

            results.append(metadata)
    return results

extract_location(geot_file_num_max=10, geot_file_num_min=0)

-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 12:30:45,635 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 12:30:47,533 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 12:30:49,143 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 12:30:50,615 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen

[{'filename': 'polar.geot',
  'Content-Length': '881',
  'Content-Type': 'application/geotopic',
  'Geographic_LATITUDE': '35.0',
  'Geographic_LONGITUDE': '105.0',
  'Geographic_NAME': 'People’s Republic of China',
  'Optional_LATITUDE1': '39.76',
  'Optional_LONGITUDE1': '-98.5',
  'Optional_NAME1': 'United States',
  'X-TIKA:Parsed-By': 'org.apache.tika.parser.geo.GeoParser',
  'X-TIKA:Parsed-By-Full-Set': 'org.apache.tika.parser.geo.GeoParser',
  'resourceName': 'polar.geot'},
 {'filename': '0.geot',
  'Content-Length': '1476',
  'Content-Type': 'application/geotopic',
  'Optional_LATITUDE1': '43.0125',
  'Optional_LATITUDE2': '43.00142',
  'Optional_LATITUDE3': '42.96336',
  'Optional_LONGITUDE1': '-85.50056',
  'Optional_LONGITUDE2': '-85.49169',
  'Optional_LONGITUDE3': '-85.66809',
  'Optional_NAME1': 'Egypt Valley Country Club',
  'Optional_NAME2': 'Findlay Cemetery',
  'Optional_NAME3': 'Grand Rapids',
  'X-TIKA:Parsed-By': 'org.apache.tika.parser.geo.GeoParser',
  'X-TIKA:Pa

In [65]:
# batch process (by 500)
df = pd.DataFrame()
batches = [] 
for i in range(10992//500 +1):
    file_num_max = min((i+1)*500 -1,10992)
    file_num_min = i*500 
    batches.append([file_num_max, file_num_min])

for idx, (num_max, num_min) in enumerate(batches):
    print(idx, num_max)

0 499
1 999
2 1499
3 1999
4 2499
5 2999
6 3499
7 3999
8 4499
9 4999
10 5499
11 5999
12 6499
13 6999
14 7499
15 7999
16 8499
17 8999
18 9499
19 9999
20 10499
21 10992


In [None]:
results = list() 
for idx, (num_max, num_min) in enumerate(batches):
    print("="*30)
    print(f"Batch_{f} started:{num_min} ~ {num_max}")
    results.append(extract_location(geot_file_num_max= num_max, geot_file_num_min= num_min))


# Convert to DataFrame
df = pd.DataFrame(results)
#print(df.head())
df

Batch_<_io.TextIOWrapper name='../src/geotopic-mime/10991.geot' mode='w' encoding='UTF-8'> started:0 ~ 499
-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 12:33:01,911 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 12:33:03,560 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 12:33:05,046 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 12:33:06,490 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen

Batch_<_io.TextIOWrapper name='../src/geotopic-mime/10991.geot' mode='w' encoding='UTF-8'> started:500 ~ 999
-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 12:45:18,395 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 12:45:19,912 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 12:45:21,368 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 12:45:22,857 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen

Batch_<_io.TextIOWrapper name='../src/geotopic-mime/10991.geot' mode='w' encoding='UTF-8'> started:1000 ~ 1499
-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 12:57:25,097 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 12:57:26,542 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 12:57:27,989 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 12:57:29,446 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen

Batch_<_io.TextIOWrapper name='../src/geotopic-mime/10991.geot' mode='w' encoding='UTF-8'> started:1500 ~ 1999
-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 13:09:36,338 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 13:09:37,815 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 13:09:39,287 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 13:09:40,753 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen

Batch_<_io.TextIOWrapper name='../src/geotopic-mime/10991.geot' mode='w' encoding='UTF-8'> started:2000 ~ 2499
-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 13:21:55,856 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 13:21:57,426 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 13:21:58,918 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 13:22:00,579 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen

Batch_<_io.TextIOWrapper name='../src/geotopic-mime/10991.geot' mode='w' encoding='UTF-8'> started:2500 ~ 2999
-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 13:34:19,406 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 13:34:20,895 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 13:34:22,364 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 13:34:24,079 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen

Batch_<_io.TextIOWrapper name='../src/geotopic-mime/10991.geot' mode='w' encoding='UTF-8'> started:3000 ~ 3499
-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 13:46:44,027 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 13:46:45,573 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 13:46:47,087 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 13:46:48,651 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen

Batch_<_io.TextIOWrapper name='../src/geotopic-mime/10991.geot' mode='w' encoding='UTF-8'> started:3500 ~ 3999
-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 13:59:06,209 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 13:59:07,702 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 13:59:09,241 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 13:59:10,741 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen

Batch_<_io.TextIOWrapper name='../src/geotopic-mime/10991.geot' mode='w' encoding='UTF-8'> started:4000 ~ 4499
-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 14:11:36,298 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 14:11:37,893 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 14:11:39,559 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 14:11:41,118 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen