In [8]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import re
import os
import time
from shapely.geometry import Point, Polygon, LineString,MultiPolygon
from shapely.ops import unary_union

In [25]:
# Reading in the area-separating-indices created in 2.AreaBasedWarnings.ipynb

directory = 'AreaIndices/'
i = 0
areaIndices = {}
for filename in os.listdir(directory):
    with open(directory+filename,'r') as textfile:
        obs = textfile.readlines()
        obs = [int(re.sub('\n','',i)) for i in obs]
        name = re.sub('.txt','',filename)
        areaIndices[name] = obs
worldSeas = gpd.read_file('World_Seas_IHO_v3/World_Seas_IHO_v3.shp')

In [36]:
# Reading in the cleaned data
messages = pd.read_csv('CleanedData.csv',index_col=0)
geoMessages = gpd.GeoDataFrame(messages)

# Making sure that the dataframe eventually will have a 'valid' geometry column 
geometryList = []
for i,(shape,geom) in enumerate(zip(geoMessages.GeometryType,geoMessages.Coordinates)):
    
    if shape == 'Point':
        coor = []
        ele = geom.split()
        ele = [float(re.sub('[](),[]','',i)) for i in ele]

        coor.append(tuple(ele))
        geo = Point(coor)
        geometryList.append(geo)
        
    elif shape == 'GeometryCollection':
        
        coor = []
        ele = geom.split()
        ele = [float(re.sub('[](),[]','',i)) for i in ele]
        
        tupleCoor = [tuple(ele[i:i+3]) for i in np.arange(0,len(ele),3)]
        if i == 4956 or i == 5681 or i == 1037:
            # Necessary because the three observations are LineStrings, or closer to linestrings than polygons.
            geo = LineString(tupleCoor)
        else:
            geo = Polygon(tupleCoor)
        geometryList.append(geo)

In [29]:
# Storing the geometries in something we can work with.
geometryFrame = gpd.GeoDataFrame(geometry=geometryList,crs=worldSeas.crs)

# Storing the centroids
geometryFrame['Centroids']=geometryFrame.centroid

# Creating a reduced dataframe based on the unique coordinates
reducedCentroidCoordinates = geometryFrame['Centroids'].drop_duplicates()
# Creating the reduced geometry and message dataframe
reducedGeometryFrame = geometryFrame.loc[reducedCentroidCoordinates.index]
reducedGeometryFrame = reducedGeometryFrame.reset_index(drop=True)
reducedGeoMessages = geoMessages.loc[reducedCentroidCoordinates.index]
reducedGeoMessages = reducedGeoMessages.reset_index(drop=True)
# Setting the geometry of the reduced-messages-dataframe
reducedGeoMessages['geometry'] = reducedGeometryFrame['geometry']
reducedGeoMessages['Centroids'] = reducedGeometryFrame['Centroids']
reducedGeoMessages.geometry = reducedGeoMessages['geometry']
reducedGeoMessages.crs = worldSeas.crs
print('This is the areas imported',areaIndices.keys())

In [None]:
avaliableAreas = list(areaIndices.keys())
area0 = reducedGeoMessages.loc[areaIndices[avaliableAreas[0]]]
area1 = reducedGeoMessages.loc[areaIndices[avaliableAreas[1]]]
area2 = reducedGeoMessages.loc[areaIndices[avaliableAreas[2]]]
area3 = reducedGeoMessages.loc[areaIndices[avaliableAreas[3]]]
area4 = reducedGeoMessages.loc[areaIndices[avaliableAreas[4]]]

If one needs to work with all the data, the below is necessary to execute, otherwise just use the above.

In [63]:
geoMessages['geometry'] = geometryFrame['geometry']
geoMessages['Centroids'] = geometryFrame['Centroids']
geoMessages.geometry = geoMessages['geometry']
geoMessages.crs = worldSeas.crs
geometryFrame['Subarea'] = np.repeat(0,geometryFrame.shape[0])

In [60]:
start = time.time()
for obs in reducedGeometryFrame.index:
    if obs < 10:
        print(obs)
    area = [i for i,subarea in enumerate(areaIndices.keys()) if obs in areaIndices[subarea]]
        #print(area)
    relevantIndex = geoMessages[geoMessages['Centroids']==reducedGeometryFrame['Centroids'].loc[obs]].index
    relevantIndex = list(relevantIndex)
    geoMessages.loc[relevantIndex].assign(**{'Subarea':area[0]})
    #df.append(df.loc[[4] * 10].assign(**{'C': 1}), ignore_index=True)
end = time.time()

print('It took %.3f seconds to complete using "assign"'%(end-start))

0
1
2
3
4
5
6
7
8
9


IndexError: list index out of range

In [62]:
geometryFrame.crs = worldSeas.crs

In [65]:
start = time.time()
for obs in reducedGeometryFrame.index:
    if obs < 10:
        print(obs)
    area = [i for i,subarea in enumerate(areaIndices.keys()) if obs in areaIndices[subarea]]
        #print(area)
    relevantIndex = geometryFrame[geometryFrame['Centroids']==reducedGeometryFrame['Centroids'].loc[obs]].index
    relevantIndex = list(relevantIndex)
    geometryFrame['Subarea'].loc[relevantIndex]=area[0]
    #df.append(df.loc[[4] * 10].assign(**{'C': 1}), ignore_index=True)
end = time.time()

print('It took %.3f seconds to complete using "assign"'%(end-start))

0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


1
2
3
4
5
6
7
8
9


KeyboardInterrupt: 