In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import re
import os
import time
from shapely.geometry import Point, Polygon, LineString,MultiPolygon
from shapely.ops import unary_union

In [2]:
# Reading in the area-separating-indices created in 2.AreaBasedWarnings.ipynb

directory = 'AreaIndices/'
i = 0
areaIndices = {}
for filename in os.listdir(directory):
    with open(directory+filename,'r') as textfile:
        obs = textfile.readlines()
        obs = [int(re.sub('\n','',i)) for i in obs]
        name = re.sub('.txt','',filename)
        areaIndices[name] = obs
worldSeas = gpd.read_file('World_Seas_IHO_v3/World_Seas_IHO_v3.shp')

In [3]:
# Reading in the cleaned data
messages = pd.read_csv('CleanedData.csv',index_col=0)
geoMessages = gpd.GeoDataFrame(messages)

# Making sure that the dataframe eventually will have a 'valid' geometry column 
geometryList = []
for i,(shape,geom) in enumerate(zip(geoMessages.GeometryType,geoMessages.Coordinates)):
    
    if shape == 'Point':
        coor = []
        ele = geom.split()
        ele = [float(re.sub('[](),[]','',i)) for i in ele]

        coor.append(tuple(ele))
        geo = Point(coor)
        geometryList.append(geo)
        
    elif shape == 'GeometryCollection':
        
        coor = []
        ele = geom.split()
        ele = [float(re.sub('[](),[]','',i)) for i in ele]
        
        tupleCoor = [tuple(ele[i:i+3]) for i in np.arange(0,len(ele),3)]
        if i == 4956 or i == 5681 or i == 1037:
            # Necessary because the three observations are LineStrings, or closer to linestrings than polygons.
            geo = LineString(tupleCoor)
        else:
            geo = Polygon(tupleCoor)
        geometryList.append(geo)

In [4]:
# Storing the geometries in something we can work with.
geometryFrame = gpd.GeoDataFrame(geometry=geometryList,crs=worldSeas.crs)

# Storing the centroids
geometryFrame['Centroids']=geometryFrame.centroid

# Creating a reduced dataframe based on the unique coordinates
reducedCentroidCoordinates = geometryFrame['Centroids'].drop_duplicates()
# Creating the reduced geometry and message dataframe
reducedGeometryFrame = geometryFrame.loc[reducedCentroidCoordinates.index]
reducedGeometryFrame = reducedGeometryFrame.reset_index(drop=True)
reducedGeoMessages = geoMessages.loc[reducedCentroidCoordinates.index]
reducedGeoMessages = reducedGeoMessages.reset_index(drop=True)
# Setting the geometry of the reduced-messages-dataframe
reducedGeoMessages['geometry'] = reducedGeometryFrame['geometry']
reducedGeoMessages['Centroids'] = reducedGeometryFrame['Centroids']
reducedGeoMessages.geometry = reducedGeoMessages['geometry']
reducedGeoMessages.crs = worldSeas.crs
print('This is the areas imported',areaIndices.keys())

This is the areas imported dict_keys(['hydroArc', 'hydroLant', 'hydroPac', 'navareaIV', 'navareaXII'])


In [6]:
avaliableAreas = list(areaIndices.keys())
area0 = reducedGeoMessages.loc[areaIndices[avaliableAreas[0]]]
area1 = reducedGeoMessages.loc[areaIndices[avaliableAreas[1]]]
area2 = reducedGeoMessages.loc[areaIndices[avaliableAreas[2]]]
area3 = reducedGeoMessages.loc[areaIndices[avaliableAreas[3]]]
area4 = reducedGeoMessages.loc[areaIndices[avaliableAreas[4]]]

If one needs to work with all the data, the below is necessary to execute, otherwise just use the above.

In [7]:
# Updating the georeferenced messages with correct geometries
geoMessages['geometry'] = geometryFrame['geometry']
geoMessages['Centroids'] = geometryFrame['Centroids']
geoMessages.geometry = geoMessages['geometry']
geoMessages.crs = worldSeas.crs

# Creating a column in the full and the reduced datasset, which is mergable.
geometryFrame['CentroidCoor'] = [list(geometryFrame['Centroids'].loc[obs].coords)[0] for obs in geometryFrame.index]
reducedGeometryFrame['CentroidCoor'] = [list(reducedGeometryFrame['Centroids'].loc[obs].coords)[0] for obs in reducedGeometryFrame.index]

# Extracting all the subarea-tags
area = []
for obs in reducedGeometryFrame.index:
    area.append([i for i,subarea in enumerate(areaIndices.keys()) if obs in areaIndices[subarea]][0])
reducedGeometryFrame['Subarea'] = area

# Merging the reduced dataset, with subarea-tags, onto the fulldataset.
mergedCentroids = pd.merge(geometryFrame,reducedGeometryFrame,on=['CentroidCoor'],how='inner')

In [13]:
subareasMessages = {subarea : mergedCentroids[mergedCentroids['Subarea_y']==i].shape[0] for i,subarea in enumerate(areaIndices.keys())}

In [14]:
subareasMessages

{'hydroArc': 6270,
 'hydroLant': 3510,
 'hydroPac': 12455,
 'navareaIV': 25228,
 'navareaXII': 6569}

In [15]:
sum(subareasMessages.values())

54032

In [16]:
messages.shape

(53956, 12)

In [17]:
geoMessages.shape

(53956, 12)

In [18]:
mergedCentroids.shape


(54032, 7)