## GeoTopicParser

Tutorial (CLI -> ipynb)

First, turn on the new terminal and move cd to 'lucene-geo-gazetteer'

```cd src/lucene-geo-gazetteer```

Next, run lucnene-geo-gazzetteer 

```lucene-geo-gazetteer -server```

Now Tika-app can utilize lucene-geo-gazetteer

In [11]:
import os
import subprocess
import pandas as pd

# 1. Directory setting
current_dir = os.getcwd() # Get absolute path to this script
project_root = os.path.abspath(os.path.join(current_dir, '..')) # Assume repo root is 1 levels up

if not os.path.isdir(project_root):
    raise FileNotFoundError(f"❌ Project root not found at: {project_root}")
print(f"-- Using project root: {project_root}")

# four directories of demand : 
geot_files_dir = os.path.join(project_root, 'src', 'geotopic-mime') 
tika_app_jar = os.path.join(project_root, 'src', 'tika', 'tika-app-2.6.0.jar')
tika_nlp_jar = os.path.join(project_root, 'src', 'tika', 'tika-parser-nlp-package-2.6.0.jar')
ner_model_dir = os.path.join(project_root, 'src', 'location-ner-model')


# 3. Construct Tika CLI Command Base
classpath = f"{tika_app_jar}:{tika_nlp_jar}:{ner_model_dir}:{geot_files_dir}"
tika_cmd_base = [
    "java", "-classpath", classpath,
    "org.apache.tika.cli.TikaCLI", "-m"
]

# 4. Loop through multiple .geot files
results = []
for filename in os.listdir(geot_files_dir):
    if filename.endswith(".geot"):
        filepath = os.path.join(geot_files_dir, filename)
        cmd = tika_cmd_base + [filepath]

        try:
            output = subprocess.check_output(" ".join(cmd), shell=True, text=True)
        except subprocess.CalledProcessError as e:
            print(f" Error processing {filename}: {e}")
            continue

        # Parse Tika metadata output
        metadata = {"filename": filename}
        for line in output.splitlines():
            if ": " in line:
                key, value = line.split(": ", 1)
                metadata[key.strip()] = value.strip()

        results.append(metadata)


# Convert to DataFrame
df = pd.DataFrame(results)
print(df.head())

-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 04:53:34,116 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 04:53:36,476 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin


     filename Content-Length          Content-Type Geographic_LATITUDE  \
0  polar.geot            881  application/geotopic               39.76   
1    cnn.geot           3164  application/geotopic            44.60715   

  Geographic_LONGITUDE Geographic_NAME Optional_LATITUDE1 Optional_LONGITUDE1  \
0                -98.5   United States               35.0               105.0   
1            -69.04576  Town of Monroe            26.0112           -80.14949   

               Optional_NAME1                      X-TIKA:Parsed-By  \
0  People’s Republic of China  org.apache.tika.parser.geo.GeoParser   
1                   Hollywood  org.apache.tika.parser.geo.GeoParser   

              X-TIKA:Parsed-By-Full-Set resourceName Optional_LATITUDE2  \
0  org.apache.tika.parser.geo.GeoParser   polar.geot                NaN   
1  org.apache.tika.parser.geo.GeoParser     cnn.geot           40.92877   

  Optional_LONGITUDE2                                Optional_NAME2  
0                 NaN  

In [39]:
# Convert each row into .geot file
import pandas as pd 
hp = pd.read_csv('../data/haunted_places.tsv', sep='\t')
temp = hp.description.to_list()

# lucene parser does not allow the input text ends with double quote, so remove the double quote at the end. 
def replace_rightmost_quote(text, replacement=''):
    index = text.rfind('"')
    if index == -1:
        return text  # No quote found
    return text[:index] + replacement + text[index+1:]

# generate folder if there wasn't 
#os.makedirs('../data/geot_files', exist_ok=True)
os.makedirs('../src/geotopic-mime', exist_ok=True)

for idx, text in enumerate(temp):
    with open(f'../src/geotopic-mime/{idx}.geot',"w") as f:     
        f.writelines(replace_rightmost_quote(str(text)))   

In [63]:
# Feed 10,991 geot files into geotopicparser 
import os
import subprocess
import pandas as pd

def extract_location(geot_file_num_max, geot_file_num_min=0):
    # 1. Directory setting
    current_dir = os.getcwd() # Get absolute path to this script
    project_root = os.path.abspath(os.path.join(current_dir, '..')) # Assume repo root is 1 levels up

    if not os.path.isdir(project_root):
        raise FileNotFoundError(f"❌ Project root not found at: {project_root}")
    print(f"-- Using project root: {project_root}")

    # four directories of demand : 
    geot_files_dir = os.path.join(project_root, 'src', 'geotopic-mime')  #'data', 'geot_files'
    tika_app_jar = os.path.join(project_root, 'src', 'tika', 'tika-app-2.6.0.jar')
    tika_nlp_jar = os.path.join(project_root, 'src', 'tika', 'tika-parser-nlp-package-2.6.0.jar')
    ner_model_dir = os.path.join(project_root, 'src', 'location-ner-model')


    # 3. Construct Tika CLI Command Base
    classpath = f"{tika_app_jar}:{tika_nlp_jar}:{ner_model_dir}:{geot_files_dir}"
    tika_cmd_base = [
        "java", "-classpath", classpath,
        "org.apache.tika.cli.TikaCLI", "-m"
    ]

    # 4. Loop through multiple .geot files
    results = []
    for idx, file_num in enumerate(range(geot_file_num_min, geot_file_num_max+1)):  #os.listdir(geot_files_dir)
        filename = str(file_num-1)+'.geot'
        if file_num == 0: 
            filename = 'polar.geot'

        if filename.endswith(".geot"):
            filepath = os.path.join(geot_files_dir, filename)
            cmd = tika_cmd_base + [filepath]

            try:
                output = subprocess.check_output(" ".join(cmd), shell=True, text=True)
            except subprocess.CalledProcessError as e:
                print(f" Error processing {filename}: {e}")
                continue

            # Parse Tika metadata output
            metadata = {"filename": filename}
            for line in output.splitlines():
                if ": " in line:
                    key, value = line.split(": ", 1)
                    metadata[key.strip()] = value.strip()

            results.append(metadata)
    return results

extract_location(geot_file_num_max=10, geot_file_num_min=0)

-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 12:30:45,635 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 12:30:47,533 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 12:30:49,143 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 12:30:50,615 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen

[{'filename': 'polar.geot',
  'Content-Length': '881',
  'Content-Type': 'application/geotopic',
  'Geographic_LATITUDE': '35.0',
  'Geographic_LONGITUDE': '105.0',
  'Geographic_NAME': 'People’s Republic of China',
  'Optional_LATITUDE1': '39.76',
  'Optional_LONGITUDE1': '-98.5',
  'Optional_NAME1': 'United States',
  'X-TIKA:Parsed-By': 'org.apache.tika.parser.geo.GeoParser',
  'X-TIKA:Parsed-By-Full-Set': 'org.apache.tika.parser.geo.GeoParser',
  'resourceName': 'polar.geot'},
 {'filename': '0.geot',
  'Content-Length': '1476',
  'Content-Type': 'application/geotopic',
  'Optional_LATITUDE1': '43.0125',
  'Optional_LATITUDE2': '43.00142',
  'Optional_LATITUDE3': '42.96336',
  'Optional_LONGITUDE1': '-85.50056',
  'Optional_LONGITUDE2': '-85.49169',
  'Optional_LONGITUDE3': '-85.66809',
  'Optional_NAME1': 'Egypt Valley Country Club',
  'Optional_NAME2': 'Findlay Cemetery',
  'Optional_NAME3': 'Grand Rapids',
  'X-TIKA:Parsed-By': 'org.apache.tika.parser.geo.GeoParser',
  'X-TIKA:Pa

In [65]:
# batch process (by 500)
df = pd.DataFrame()
batches = [] 
for i in range(10992//500 +1):
    file_num_max = min((i+1)*500 -1,10992)
    file_num_min = i*500 
    batches.append([file_num_max, file_num_min])

for idx, (num_max, num_min) in enumerate(batches):
    print(idx, num_max)

0 499
1 999
2 1499
3 1999
4 2499
5 2999
6 3499
7 3999
8 4499
9 4999
10 5499
11 5999
12 6499
13 6999
14 7499
15 7999
16 8499
17 8999
18 9499
19 9999
20 10499
21 10992


In [67]:
results = list() 
for idx, (num_max, num_min) in enumerate(batches):
    print("="*30)
    print(f"Batch_{f} started:{num_min} ~ {num_max}")
    results.append(extract_location(geot_file_num_max= num_max, geot_file_num_min= num_min))


# Convert to DataFrame
df = pd.DataFrame(results)
#print(df.head())
df

Batch_<_io.TextIOWrapper name='../src/geotopic-mime/10991.geot' mode='w' encoding='UTF-8'> started:0 ~ 499
-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 12:33:01,911 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 12:33:03,560 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 12:33:05,046 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 12:33:06,490 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen

Batch_<_io.TextIOWrapper name='../src/geotopic-mime/10991.geot' mode='w' encoding='UTF-8'> started:500 ~ 999
-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 12:45:18,395 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 12:45:19,912 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 12:45:21,368 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 12:45:22,857 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen

Batch_<_io.TextIOWrapper name='../src/geotopic-mime/10991.geot' mode='w' encoding='UTF-8'> started:1000 ~ 1499
-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 12:57:25,097 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 12:57:26,542 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 12:57:27,989 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 12:57:29,446 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen

Batch_<_io.TextIOWrapper name='../src/geotopic-mime/10991.geot' mode='w' encoding='UTF-8'> started:1500 ~ 1999
-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 13:09:36,338 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 13:09:37,815 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 13:09:39,287 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 13:09:40,753 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen

Batch_<_io.TextIOWrapper name='../src/geotopic-mime/10991.geot' mode='w' encoding='UTF-8'> started:2000 ~ 2499
-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 13:21:55,856 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 13:21:57,426 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 13:21:58,918 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 13:22:00,579 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen

Batch_<_io.TextIOWrapper name='../src/geotopic-mime/10991.geot' mode='w' encoding='UTF-8'> started:2500 ~ 2999
-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 13:34:19,406 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 13:34:20,895 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 13:34:22,364 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 13:34:24,079 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen

Batch_<_io.TextIOWrapper name='../src/geotopic-mime/10991.geot' mode='w' encoding='UTF-8'> started:3000 ~ 3499
-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 13:46:44,027 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 13:46:45,573 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 13:46:47,087 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 13:46:48,651 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen

Batch_<_io.TextIOWrapper name='../src/geotopic-mime/10991.geot' mode='w' encoding='UTF-8'> started:3500 ~ 3999
-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 13:59:06,209 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 13:59:07,702 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 13:59:09,241 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 13:59:10,741 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen

Batch_<_io.TextIOWrapper name='../src/geotopic-mime/10991.geot' mode='w' encoding='UTF-8'> started:4000 ~ 4499
-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 14:11:36,298 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 14:11:37,893 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 14:11:39,559 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 14:11:41,118 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen

Batch_<_io.TextIOWrapper name='../src/geotopic-mime/10991.geot' mode='w' encoding='UTF-8'> started:4500 ~ 4999
-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 14:24:08,238 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 14:24:09,690 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 14:24:11,197 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 14:24:12,689 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen

Batch_<_io.TextIOWrapper name='../src/geotopic-mime/10991.geot' mode='w' encoding='UTF-8'> started:5000 ~ 5499
-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 14:36:31,780 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 14:36:33,343 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 14:36:33,440 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 14:36:34,970 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen

Batch_<_io.TextIOWrapper name='../src/geotopic-mime/10991.geot' mode='w' encoding='UTF-8'> started:5500 ~ 5999
-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 14:49:03,215 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 14:49:04,818 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 14:49:06,363 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 14:49:07,929 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen

Batch_<_io.TextIOWrapper name='../src/geotopic-mime/10991.geot' mode='w' encoding='UTF-8'> started:6000 ~ 6499
-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 15:01:30,015 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 15:01:31,565 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 15:01:33,375 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 15:01:35,177 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen

Batch_<_io.TextIOWrapper name='../src/geotopic-mime/10991.geot' mode='w' encoding='UTF-8'> started:6500 ~ 6999
-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 15:13:58,927 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 15:14:00,482 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 15:14:02,258 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 15:14:03,824 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen

Batch_<_io.TextIOWrapper name='../src/geotopic-mime/10991.geot' mode='w' encoding='UTF-8'> started:7000 ~ 7499
-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 15:26:29,494 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 15:26:31,017 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 15:26:32,600 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 15:26:34,112 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen

Batch_<_io.TextIOWrapper name='../src/geotopic-mime/10991.geot' mode='w' encoding='UTF-8'> started:7500 ~ 7999
-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 15:38:55,489 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 15:38:57,099 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 15:38:58,666 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 15:39:00,289 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen

Batch_<_io.TextIOWrapper name='../src/geotopic-mime/10991.geot' mode='w' encoding='UTF-8'> started:8000 ~ 8499
-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 15:51:24,959 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 15:51:26,529 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 15:51:28,065 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 15:51:29,620 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen

Batch_<_io.TextIOWrapper name='../src/geotopic-mime/10991.geot' mode='w' encoding='UTF-8'> started:8500 ~ 8999
-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 16:03:58,934 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 16:04:00,496 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 16:04:02,259 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 16:04:03,835 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen

Batch_<_io.TextIOWrapper name='../src/geotopic-mime/10991.geot' mode='w' encoding='UTF-8'> started:9000 ~ 9499
-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 16:16:32,647 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 16:16:34,249 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 16:16:35,809 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 16:16:37,332 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen

Batch_<_io.TextIOWrapper name='../src/geotopic-mime/10991.geot' mode='w' encoding='UTF-8'> started:9500 ~ 9999
-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 16:29:04,445 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 16:29:06,050 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 16:29:07,651 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 16:29:09,238 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen

Batch_<_io.TextIOWrapper name='../src/geotopic-mime/10991.geot' mode='w' encoding='UTF-8'> started:10000 ~ 10499
-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 16:41:35,211 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 16:41:36,844 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 16:41:38,406 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 16:41:40,214 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen

Batch_<_io.TextIOWrapper name='../src/geotopic-mime/10991.geot' mode='w' encoding='UTF-8'> started:10500 ~ 10992
-- Using project root: /root/vscode/ds550/DSCI550-assignment2


INFO  [main] 16:54:09,192 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 16:54:10,971 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 16:54:12,575 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin
INFO  [main] 16:54:14,406 org.apache.tika.parser.sentiment.SentimentAnalysisParser Sentiment Model is at https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentimen

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,"{'filename': 'polar.geot', 'Content-Length': '...","{'filename': '0.geot', 'Content-Length': '1476...","{'filename': '1.geot', 'Content-Length': '918'...","{'filename': '2.geot', 'Content-Length': '1277...","{'filename': '3.geot', 'Content-Length': '1797...","{'filename': '4.geot', 'Content-Length': '360'...","{'filename': '5.geot', 'Content-Length': '61',...","{'filename': '6.geot', 'Content-Length': '848'...","{'filename': '7.geot', 'Content-Length': '989'...","{'filename': '8.geot', 'Content-Length': '110'...",...,"{'filename': '489.geot', 'Content-Length': '13...","{'filename': '490.geot', 'Content-Length': '16...","{'filename': '491.geot', 'Content-Length': '10...","{'filename': '492.geot', 'Content-Length': '27...","{'filename': '493.geot', 'Content-Length': '30...","{'filename': '494.geot', 'Content-Length': '96...","{'filename': '495.geot', 'Content-Length': '22...","{'filename': '496.geot', 'Content-Length': '38...","{'filename': '497.geot', 'Content-Length': '19...","{'filename': '498.geot', 'Content-Length': '32..."
1,"{'filename': '499.geot', 'Content-Length': '25...","{'filename': '500.geot', 'Content-Length': '23...","{'filename': '501.geot', 'Content-Length': '44...","{'filename': '502.geot', 'Content-Length': '19...","{'filename': '503.geot', 'Content-Length': '20...","{'filename': '504.geot', 'Content-Length': '79...","{'filename': '505.geot', 'Content-Length': '31...","{'filename': '506.geot', 'Content-Length': '49...","{'filename': '507.geot', 'Content-Length': '35...","{'filename': '508.geot', 'Content-Length': '51...",...,"{'filename': '989.geot', 'Content-Length': '35...","{'filename': '990.geot', 'Content-Length': '22...","{'filename': '991.geot', 'Content-Length': '50...","{'filename': '992.geot', 'Content-Length': '20...","{'filename': '993.geot', 'Content-Length': '43...","{'filename': '994.geot', 'Content-Length': '83...","{'filename': '995.geot', 'Content-Length': '16...","{'filename': '996.geot', 'Content-Length': '38...","{'filename': '997.geot', 'Content-Length': '21...","{'filename': '998.geot', 'Content-Length': '48..."
2,"{'filename': '999.geot', 'Content-Length': '17...","{'filename': '1000.geot', 'Content-Length': '2...","{'filename': '1001.geot', 'Content-Length': '6...","{'filename': '1002.geot', 'Content-Length': '2...","{'filename': '1003.geot', 'Content-Length': '5...","{'filename': '1004.geot', 'Content-Length': '3...","{'filename': '1005.geot', 'Content-Length': '5...","{'filename': '1006.geot', 'Content-Length': '6...","{'filename': '1007.geot', 'Content-Length': '1...","{'filename': '1008.geot', 'Content-Length': '2...",...,"{'filename': '1489.geot', 'Content-Length': '3...","{'filename': '1490.geot', 'Content-Length': '5...","{'filename': '1491.geot', 'Content-Length': '2...","{'filename': '1492.geot', 'Content-Length': '4...","{'filename': '1493.geot', 'Content-Length': '6...","{'filename': '1494.geot', 'Content-Length': '1...","{'filename': '1495.geot', 'Content-Length': '7...","{'filename': '1496.geot', 'Content-Length': '7...","{'filename': '1497.geot', 'Content-Length': '5...","{'filename': '1498.geot', 'Content-Length': '3..."
3,"{'filename': '1499.geot', 'Content-Length': '7...","{'filename': '1500.geot', 'Content-Length': '1...","{'filename': '1501.geot', 'Content-Length': '4...","{'filename': '1502.geot', 'Content-Length': '6...","{'filename': '1503.geot', 'Content-Length': '4...","{'filename': '1504.geot', 'Content-Length': '1...","{'filename': '1505.geot', 'Content-Length': '1...","{'filename': '1506.geot', 'Content-Length': '2...","{'filename': '1507.geot', 'Content-Length': '3...","{'filename': '1508.geot', 'Content-Length': '2...",...,"{'filename': '1989.geot', 'Content-Length': '1...","{'filename': '1990.geot', 'Content-Length': '1...","{'filename': '1991.geot', 'Content-Length': '3...","{'filename': '1992.geot', 'Content-Length': '4...","{'filename': '1993.geot', 'Content-Length': '1...","{'filename': '1994.geot', 'Content-Length': '4...","{'filename': '1995.geot', 'Content-Length': '1...","{'filename': '1996.geot', 'Content-Length': '5...","{'filename': '1997.geot', 'Content-Length': '1...","{'filename': '1998.geot', 'Content-Length': '5..."
4,"{'filename': '1999.geot', 'Content-Length': '2...","{'filename': '2000.geot', 'Content-Length': '3...","{'filename': '2001.geot', 'Content-Length': '3...","{'filename': '2002.geot', 'Content-Length': '1...","{'filename': '2003.geot', 'Content-Length': '2...","{'filename': '2004.geot', 'Content-Length': '1...","{'filename': '2005.geot', 'Content-Length': '3...","{'filename': '2006.geot', 'Content-Length': '2...","{'filename': '2007.geot', 'Content-Length': '2...","{'filename': '2008.geot', 'Content-Length': '4...",...,"{'filename': '2489.geot', 'Content-Length': '7...","{'filename': '2490.geot', 'Content-Length': '1...","{'filename': '2491.geot', 'Content-Length': '3...","{'filename': '2492.geot', 'Content-Length': '4...","{'filename': '2493.geot', 'Content-Length': '1...","{'filename': '2494.geot', 'Content-Length': '2...","{'filename': '2495.geot', 'Content-Length': '5...","{'filename': '2496.geot', 'Content-Length': '6...","{'filename': '2497.geot', 'Content-Length': '5...","{'filename': '2498.geot', 'Content-Length': '3..."
5,"{'filename': '2499.geot', 'Content-Length': '3...","{'filename': '2500.geot', 'Content-Length': '2...","{'filename': '2501.geot', 'Content-Length': '4...","{'filename': '2502.geot', 'Content-Length': '6...","{'filename': '2503.geot', 'Content-Length': '3...","{'filename': '2504.geot', 'Content-Length': '3...","{'filename': '2505.geot', 'Content-Length': '8...","{'filename': '2506.geot', 'Content-Length': '4...","{'filename': '2507.geot', 'Content-Length': '3...","{'filename': '2508.geot', 'Content-Length': '5...",...,"{'filename': '2989.geot', 'Content-Length': '4...","{'filename': '2990.geot', 'Content-Length': '1...","{'filename': '2991.geot', 'Content-Length': '1...","{'filename': '2992.geot', 'Content-Length': '3...","{'filename': '2993.geot', 'Content-Length': '8...","{'filename': '2994.geot', 'Content-Length': '9...","{'filename': '2995.geot', 'Content-Length': '3...","{'filename': '2996.geot', 'Content-Length': '7...","{'filename': '2997.geot', 'Content-Length': '1...","{'filename': '2998.geot', 'Content-Length': '1..."
6,"{'filename': '2999.geot', 'Content-Length': '2...","{'filename': '3000.geot', 'Content-Length': '2...","{'filename': '3001.geot', 'Content-Length': '2...","{'filename': '3002.geot', 'Content-Length': '5...","{'filename': '3003.geot', 'Content-Length': '3...","{'filename': '3004.geot', 'Content-Length': '4...","{'filename': '3005.geot', 'Content-Length': '8...","{'filename': '3006.geot', 'Content-Length': '9...","{'filename': '3007.geot', 'Content-Length': '2...","{'filename': '3008.geot', 'Content-Length': '6...",...,"{'filename': '3489.geot', 'Content-Length': '2...","{'filename': '3490.geot', 'Content-Length': '5...","{'filename': '3491.geot', 'Content-Length': '2...","{'filename': '3492.geot', 'Content-Length': '1...","{'filename': '3493.geot', 'Content-Length': '6...","{'filename': '3494.geot', 'Content-Length': '6...","{'filename': '3495.geot', 'Content-Length': '6...","{'filename': '3496.geot', 'Content-Length': '4...","{'filename': '3497.geot', 'Content-Length': '6...","{'filename': '3498.geot', 'Content-Length': '5..."
7,"{'filename': '3499.geot', 'Content-Length': '8...","{'filename': '3500.geot', 'Content-Length': '4...","{'filename': '3501.geot', 'Content-Length': '5...","{'filename': '3502.geot', 'Content-Length': '2...","{'filename': '3503.geot', 'Content-Length': '9...","{'filename': '3504.geot', 'Content-Length': '8...","{'filename': '3505.geot', 'Content-Length': '3...","{'filename': '3506.geot', 'Content-Length': '2...","{'filename': '3507.geot', 'Content-Length': '5...","{'filename': '3508.geot', 'Content-Length': '2...",...,"{'filename': '3989.geot', 'Content-Length': '4...","{'filename': '3990.geot', 'Content-Length': '7...","{'filename': '3991.geot', 'Content-Length': '1...","{'filename': '3992.geot', 'Content-Length': '3...","{'filename': '3993.geot', 'Content-Length': '9...","{'filename': '3994.geot', 'Content-Length': '9...","{'filename': '3995.geot', 'Content-Length': '2...","{'filename': '3996.geot', 'Content-Length': '2...","{'filename': '3997.geot', 'Content-Length': '1...","{'filename': '3998.geot', 'Content-Length': '4..."
8,"{'filename': '3999.geot', 'Content-Length': '1...","{'filename': '4000.geot', 'Content-Length': '6...","{'filename': '4001.geot', 'Content-Length': '3...","{'filename': '4002.geot', 'Content-Length': '2...","{'filename': '4003.geot', 'Content-Length': '7...","{'filename': '4004.geot', 'Content-Length': '2...","{'filename': '4005.geot', 'Content-Length': '4...","{'filename': '4006.geot', 'Content-Length': '2...","{'filename': '4007.geot', 'Content-Length': '1...","{'filename': '4008.geot', 'Content-Length': '2...",...,"{'filename': '4489.geot', 'Content-Length': '5...","{'filename': '4490.geot', 'Content-Length': '5...","{'filename': '4491.geot', 'Content-Length': '7...","{'filename': '4492.geot', 'Content-Length': '3...","{'filename': '4493.geot', 'Content-Length': '2...","{'filename': '4494.geot', 'Content-Length': '2...","{'filename': '4495.geot', 'Content-Length': '1...","{'filename': '4496.geot', 'Content-Length': '5...","{'filename': '4497.geot', 'Content-Length': '2...","{'filename': '4498.geot', 'Content-Length': '2..."
9,"{'filename': '4499.geot', 'Content-Length': '7...","{'filename': '4500.geot', 'Content-Length': '4...","{'filename': '4501.geot', 'Content-Length': '2...","{'filename': '4502.geot', 'Content-Length': '4...","{'filename': '4503.geot', 'Content-Length': '9...","{'filename': '4504.geot', 'Content-Length': '4...","{'filename': '4505.geot', 'Content-Length': '5...","{'filename': '4506.geot', 'Content-Length': '3...","{'filename': '4507.geot', 'Content-Length': '2...","{'filename': '4508.geot', 'Content-Length': '2...",...,"{'filename': '4989.geot', 'Content-Length': '5...","{'filename': '4990.geot', 'Content-Length': '4...","{'filename': '4991.geot', 'Content-Length': '3...","{'filename': '4992.geot', 'Content-Length': '8...","{'filename': '4993.geot', 'Content-Length': '1...","{'filename': '4994.geot', 'Content-Length': '1...","{'filename': '4995.geot', 'Content-Length': '1...","{'filename': '4996.geot', 'Content-Length': '2...","{'filename': '4997.geot', 'Content-Length': '3...","{'filename': '4998.geot', 'Content-Length': '3..."


In [131]:
df.iloc[0,7]

{'filename': '6.geot',
 'Content-Length': '848',
 'Content-Type': 'application/geotopic',
 'Geographic_LATITUDE': '42.43595',
 'Geographic_LONGITUDE': '-123.172',
 'Geographic_NAME': 'Rogue River',
 'X-TIKA:Parsed-By': 'org.apache.tika.parser.geo.GeoParser',
 'X-TIKA:Parsed-By-Full-Set': 'org.apache.tika.parser.geo.GeoParser',
 'resourceName': '6.geot'}

In [92]:
geo_name = [] 
geo_lat = [] 
geo_lon = [] 

for i in range(22):
    for j in range(500):
        content = df.iloc[i,j]
        
        if content is not None:
            geo_name.append(df.iloc[i,j].get('Geographic_NAME',None)) 
            geo_lat.append(df.iloc[i,j].get('Geographic_LATITUDE',None))
            geo_lon.append(df.iloc[i,j].get('Geographic_LONGITUDE',None))
            #print(content['filename'])
            if content['filename'] == '10991.geot':
                break 
        else: 
            geo_name.append(None)
            geo_lat.append(None)
            geo_lon.append(None)

geotopic_df = pd.DataFrame({'geo_name':geo_name, 'geo_lat':geo_lat, 'geo_lon':geo_lon}) 
geotopic_df

Unnamed: 0,geo_name,geo_lat,geo_lon
0,People’s Republic of China,35.0,105.0
1,,,
2,,,
3,,,
4,,,
...,...,...,...
10988,,,
10989,,,
10990,,,
10991,,,


In [None]:
# save data in csv
## drop the first record as it was from polar.geot for the test purpose 
geotopic_df.iloc[1:,:].to_csv('../data/geotopic.csv')

In [101]:
# describe
geotopic_df.describe()

Unnamed: 0,geo_name,geo_lat,geo_lon
count,1194,1194.0,1194.0
unique,810,805.0,806.0
top,Washington,47.50012,-120.50147
freq,17,17.0,17.0


In [106]:
geotopic_df.dropna()

Unnamed: 0,geo_name,geo_lat,geo_lon
0,People’s Republic of China,35.0,105.0
7,Rogue River,42.43595,-123.172
15,Parish of Saint Ann,18.35,-77.26667
17,Kanton Basel-Landschaft,47.50438,7.70444
19,The Other Palace,51.49889,-0.14216
...,...,...,...
10962,Colorado Springs,38.83388,-104.82136
10973,Denver,39.73915,-104.9847
10974,Rocky Mountain House Airport,52.42972,-114.90417
10982,Summit County,40.86815,-110.95567


In [118]:
geotopic_df.dropna()[geotopic_df.dropna().geo_name.duplicated(keep=False)]

Unnamed: 0,geo_name,geo_lat,geo_lon
0,People’s Republic of China,35.0,105.0
17,Kanton Basel-Landschaft,47.50438,7.70444
37,Birmingham,52.48142,-1.89983
44,Michigan,44.25029,-85.50033
61,Charlotte Amalie,18.3419,-64.9307
...,...,...,...
10938,Northwestern United States,46.13125,-113.23778
10960,Colorado Springs,38.83388,-104.82136
10962,Colorado Springs,38.83388,-104.82136
10973,Denver,39.73915,-104.9847


If adding more 'optional' columns(ex. 'Optional_LATITUDE1', 'Optional_LONGITUDE1', 'Optional_NAME1)

In [125]:
geo_name = [] 
geo_lat = [] 
geo_lon = [] 
geo_on1 = [] 
geo_olat1 = [] 
geo_olon1 = [] 

for i in range(22):
    for j in range(500):
        content = df.iloc[i,j]
        
        if content is not None:
            geo_name.append(df.iloc[i,j].get('Geographic_NAME',None)) 
            geo_lat.append(df.iloc[i,j].get('Geographic_LATITUDE',None))
            geo_lon.append(df.iloc[i,j].get('Geographic_LONGITUDE',None))
            geo_on1.append(df.iloc[i,j].get('Optional_NAME1',None))
            geo_olat1.append(df.iloc[i,j].get('Optional_LATITUDE1',None))
            geo_olon1.append(df.iloc[i,j].get('Optional_LONGITUDE1',None))
            #print(content['filename'])
            if content['filename'] == '10991.geot':
                break 
        else: 
            geo_name.append(None)
            geo_lat.append(None)
            geo_lon.append(None)
            geo_on1.append(None)
            geo_olat1.append(None)
            geo_olon1.append(None)

geotopic_df2 = pd.DataFrame({'geo_name':geo_name, 'geo_lat':geo_lat, 'geo_lon':geo_lon, 'geo_on1':geo_on1, 'geo_olat1':geo_olat1, 'geo_olon1':geo_olon1}) 
geotopic_df2.iloc[1:,:].dropna(subset=['geo_name','geo_on1'],how='all')

Unnamed: 0,geo_name,geo_lat,geo_lon,geo_on1,geo_olat1,geo_olon1
1,,,,Egypt Valley Country Club,43.0125,-85.50056
7,Rogue River,42.43595,-123.172,,,
8,,,,Harsens Island,42.58948,-82.58852
15,Parish of Saint Ann,18.35,-77.26667,,,
17,Kanton Basel-Landschaft,47.50438,7.70444,,,
...,...,...,...,...,...,...
10973,Denver,39.73915,-104.9847,,,
10974,Rocky Mountain House Airport,52.42972,-114.90417,Poncha Springs,38.51278,-106.07724
10975,,,,Poncha Springs,38.51278,-106.07724
10982,Summit County,40.86815,-110.95567,,,
