JSON manipulation: https://realpython.com/python-json/

In [503]:
import os, fnmatch
import json
import pprint

In [498]:
data_folder = '/Users/linkalis/GIS8990_DistributedSpatialDatabases/testdatasets/data_split_5000/'
logs_folder = '/Users/linkalis/GIS8990_DistributedSpatialDatabases/testdatasets/data_split_5000/logs/'

In [499]:
class Extractor:
    ''' Takes a folder name and a logs directory path and initializes a log file containing the name of 
    every file in the target folder. The get_next_file() method gets the next file in the folder that 
    hasn't yet been loaded into the database. It then reads in the next file and returns it as 
    list of dictionary objects for further manipulation. '''
    
    def __init__(self, data_path, logs_path):
        self.data_path = data_path
        self.logs_path = logs_path
        
        # Create a directory to store the log files, if necessary
        logs_dir = os.path.dirname(self.logs_path)
        if not os.path.exists(logs_dir):
            os.makedirs(logs_dir)
        
        # Create a 'files_to_load.txt' file, then write the name of every file in the directory to this file
        files_to_load_log  = open(logs_dir + "/files_to_load.txt", "w")
        data_files_list = os.listdir(self.data_path) 
        file_type = "*.txt"  
        for file in data_files_list:  
            if fnmatch.fnmatch(file, file_type):
                files_to_load_log.write(file)
                files_to_load_log.write("\n")
        files_to_load_log.close()     

    def get_next_file(self):
        ''' Reads from the files_to_load.txt file and gets the name of the next file in the list.
        Calls read_file() to read in the target file in as list of dictionaries. Returns a tuple 
        that includes the list of dictionaries representing the JSON data, along with the filename
        so we can keep track of this file in subsequent tasks. '''
        files_to_load_log  = open(self.logs_path + "/files_to_load.txt", "r")
        next_file_name = files_to_load_log.readline()
        print("Next file is: " + next_file_name)
        next_file_path = self.data_path + next_file_name.rstrip("\n") # strip the newline character from the end of filename
        return(self.read_file(next_file_path), next_file_name)
        
    def read_file(self, file_to_read):
        ''' Reads the JSON-formatted file line by line and returns each line as a dictionary. '''
        print("Reading file: " + file_to_read)
        reading_file = open(file_to_read, "r") # open as read-only
        list_of_jsondicts = []
        for line in reading_file.readlines():
            list_of_jsondicts.append(json.loads(line))
        print("Read " + str(len(list_of_jsondicts)) + " data rows.")
        return(list_of_jsondicts)

In [500]:
extractor = Extractor(data_folder, logs_folder)

In [501]:
next_file_data, next_file_metadata = extractor.get_next_file()

Next file is: 500M_unicode_splitan.txt

Reading file: /Users/linkalis/GIS8990_DistributedSpatialDatabases/testdatasets/data_split_5000/500M_unicode_splitan.txt
Read 5000 data rows.


In [492]:
class Cleaner: 
    ''' Takes a batch of data that's been extracted as a list of dictionaries. The clean_data() method iterates over 
    each data element in the list, running it through a series of cleaning steps. Returns the cleaned data
    back as a list. '''
    
    def __init__(self, data_list, metadata, logs_path):
        self.data_list = data_list
        self.logs_path = logs_path
        self.file_name = metadata
        
    def clean_data(self):
        step1_log = []
        step2_log = []
        
        #i = 0
        # Iterate over each data element, progressing through each cleaning step on each element. 
        # Log the ids of data elemtns that contain nulls and/or errors to the cleaning log as we go.
        for data_element in self.data_list:
            #print(i)
            data_clean_step1 = self.fix_null_places(data_element, step1_log)
            data_clean_step2 = self.fix_bounding_box(data_clean_step1, step2_log)
            data_clean_step3 = self.get_centroid(data_clean_step2)
            #i += 1
        
        # Put the cleaning log arrays into a dictionary and write the result to the cleaning log file.
        #log_dict = dict()
        #log_dict['file_name'] = self.file_name
        #log_dict['null_places_fixed'] = step1_log
        #log_dict['bounding_boxes_fixed'] = step2_log
        #cleaning_log  = open(self.cleaning_log_path + "/cleaning_log.txt", "a+") # open file in append mode
        #cleaning_log.write(log_dict)
        #cleaning_log.close()
        
        return(self.data_list)
        
    def fix_null_places(self, data_element, log_array):
        ''' Since place values are critical to our data model, substitute dummy values if we have a place value
        that equals 'None'. This will keep it from blowing up the database when we try to insert.
        '''
        if data_element['place'] is None:
            data_element['place'] = dict()
            data_element['place']['id'] = "9999999"
            data_element['place']['name'] = "No Place"
            data_element['place']['full_name'] = "No Place Available"
            data_element['place']['country'] = "No Country Available"
            data_element['place']['country_code'] = "ZZ"
            data_element['place']['place_type'] = "NA"
            data_element['place']['url'] = "NA"
            data_element['place']['bounding_box'] = dict() # initialize dictionary to hold bounding box
            data_element['place']['bounding_box']['type'] = "Polygon"
            data_element['place']['bounding_box']['coordinates'] = list() # initialize coordinate list w/in bounding box
            data_element['place']['bounding_box']['coordinates'].append([]) # append the [0] element to hold four pairs of coordinates
            data_element['place']['bounding_box']['coordinates'][0].append([0.0, 0.0]) # append 'dummy' coordinates
            data_element['place']['bounding_box']['coordinates'][0].append([0.0, 0.0])
            data_element['place']['bounding_box']['coordinates'][0].append([0.0, 0.0])
            data_element['place']['bounding_box']['coordinates'][0].append([0.0, 0.0])
            log_array.append(data_element["id_str"])
        
        return(data_element)
    
            
    def fix_bounding_box(self, data_element, log_array):
        ''' Fix a few issues that are going on with bounding boxes:
        1. Twitter Place bounding boxes only have four points. Need to close them off so they're a complete polygon.
        Take the first coordinate of the bounding box array and repeat it at the end of the bounding box array.
        2. If the bounding box is actually a point (i.e. all of the four points are the same), then "fake out" 
        a bounding box by transforming into a small rectangle with a small buffer around the point.  We can recognize
        these by looking for place.place_type == 'poi'.
        '''
        print(data_element['id_str'])
        original_bounding_box = data_element['place']['bounding_box']['coordinates'][0]
        print(original_bounding_box)
        print(data_element['place']['place_type'])
        
        if (data_element['place']['place_type'] == 'poi' or data_element['place']['place_type'] == 'NA'):
            point_bounding_box = [[None for x in range(2)] for y in range(5)]
            point_bounding_box[0][0] = original_bounding_box[0][0] - 0.0001
            point_bounding_box[0][1] = original_bounding_box[0][1] - 0.0001
            point_bounding_box[1][0] = original_bounding_box[1][0] - 0.0001
            point_bounding_box[1][1] = original_bounding_box[1][1] + 0.0001
            point_bounding_box[2][0] = original_bounding_box[2][0] + 0.0001
            point_bounding_box[2][1] = original_bounding_box[2][1] + 0.0001
            point_bounding_box[3][0] = original_bounding_box[3][0] + 0.0001
            point_bounding_box[3][1] = original_bounding_box[3][1] - 0.0001
            point_bounding_box[4][0] = original_bounding_box[0][0] - 0.0001
            point_bounding_box[4][1] = original_bounding_box[0][1] - 0.0001
            data_element['place']['better_bounding_box'] = dict()
            data_element['place']['better_bounding_box']['type'] = "Polygon"
            data_element['place']['better_bounding_box']['coordinates'] = list()
            data_element['place']['better_bounding_box']['coordinates'].append([])
            data_element['place']['better_bounding_box']['coordinates'][0] = point_bounding_box
            print(data_element['place']['better_bounding_box']['coordinates'])
            log_array.append(data_element["id_str"])
            return(data_element)
        else:
            first_coords = original_bounding_box[0]
            print(first_coords)
            original_bounding_box.append(first_coords)
            print(original_bounding_box)
            data_element['place']['better_bounding_box'] = dict()
            data_element['place']['better_bounding_box']['type'] = "Polygon"
            data_element['place']['better_bounding_box']['coordinates'] = list()
            data_element['place']['better_bounding_box']['coordinates'].append([])
            data_element['place']['better_bounding_box']['coordinates'][0] = original_bounding_box
            print(data_element['place']['better_bounding_box']['coordinates'])
            return(data_element)
                  
    def get_centroid(self, data_element):
        bounding_box = data_element['place']['better_bounding_box']['coordinates'][0];
        lower_left = bounding_box[0];
        upper_right = bounding_box[2];
        centroid_long = lower_left[0] + ((upper_right[0] - lower_left[0]) / 2);
        centroid_lat = lower_left[1] + ((upper_right[1] - lower_left[1]) / 2);
        data_element['place']['centroid'] = dict()
        data_element['place']['centroid']['type'] = "Point"
        data_element['place']['centroid']['coordinates'] = [centroid_long, centroid_lat]
        

In [493]:
cleaner = Cleaner(next_file_data, next_file_metadata, logs_folder)
cleaned_data = cleaner.clean_data()

947699650961821698
[[-118.856474, 34.117867], [-118.856474, 34.16827], [-118.787618, 34.16827], [-118.787618, 34.117867]]
city
[-118.856474, 34.117867]
[[-118.856474, 34.117867], [-118.856474, 34.16827], [-118.787618, 34.16827], [-118.787618, 34.117867], [-118.856474, 34.117867]]
[[[-118.856474, 34.117867], [-118.856474, 34.16827], [-118.787618, 34.16827], [-118.787618, 34.117867], [-118.856474, 34.117867]]]
947699650932375552
[[-75.32233, 39.990009], [-75.32233, 40.018207], [-75.273474, 40.018207], [-75.273474, 39.990009]]
city
[-75.32233, 39.990009]
[[-75.32233, 39.990009], [-75.32233, 40.018207], [-75.273474, 40.018207], [-75.273474, 39.990009], [-75.32233, 39.990009]]
[[[-75.32233, 39.990009], [-75.32233, 40.018207], [-75.273474, 40.018207], [-75.273474, 39.990009], [-75.32233, 39.990009]]]
947699650781310977
[[152.668523, -27.767441], [152.668523, -26.996845], [153.31787, -26.996845], [153.31787, -27.767441]]
city
[152.668523, -27.767441]
[[152.668523, -27.767441], [152.668523, -2

In [504]:
#next_file_data[122]['id_str'] # has coordinates and a place
pprint.pprint(cleaned_data[576])
pprint.pprint(cleaned_data[4234])
pprint.pprint(cleaned_data[575])

{'contributors': None,
 'coordinates': {'coordinates': [102.18993187, 2.20444631], 'type': 'Point'},
 'created_at': 'Mon Jan 01 05:23:10 +0000 2018',
 'entities': {'hashtags': [{'indices': [29, 46], 'text': 'HappyNewYear2018'},
                           {'indices': [47, 62], 'text': 'SerliSpeedMoto'},
                           {'indices': [63, 78], 'text': 'KasanovaGarage'},
                           {'indices': [79, 93], 'text': 'SebatangGerak'}],
              'symbols': [],
              'urls': [{'display_url': 'instagram.com/p/BdZRyGGnMx7/',
                        'expanded_url': 'https://www.instagram.com/p/BdZRyGGnMx7/',
                        'indices': [95, 118],
                        'url': 'https://t.co/LLb93mKgEz'}],
              'user_mentions': []},
 'favorite_count': 0,
 'favorited': False,
 'filter_level': 'low',
 'geo': {'coordinates': [2.20444631, 102.18993187], 'type': 'Point'},
 'id': 947699701763239942,
 'id_str': '947699701763239942',
 'in_reply_to_screen_

In [495]:
cleaned_data[0]['place']['better_bounding_box']

{'coordinates': [[[-118.856474, 34.117867],
   [-118.856474, 34.16827],
   [-118.787618, 34.16827],
   [-118.787618, 34.117867],
   [-118.856474, 34.117867]]],
 'type': 'Polygon'}

In [None]:
class Loader:
    def __init__(self, data_list, metadata, logs_path):
        self.data_list = data_list
        self.logs_path = logs_path
        self.file_name = metadata

    def log_successful_load(self):
        # If load succeeds, record the time it took to run and write filename to a log file 
        # indicating that it's been successfully loaded so we don't load it again.
        
    
    def log_failed_load(self):
        # If load failed, record the time it ran and record a failed load to the log file.

## JUNK

In [505]:
twurl '/1.1/geo/reverse_geocode.json?lat=2.204446&long=102.189931&granularity=country'
twurl '/1.1/geo/reverse_geocode.json?lat=2.255562&long=102.250785&granularity=country'

SyntaxError: invalid syntax (<ipython-input-505-7cca7c542ae1>, line 1)