# Libraries

In [1]:
import pandas as pd
import requests
from shapely import wkt
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed


# Helpful functions

In [2]:
""" def calculate_area(wkt_polygon):
    polygon = wkt.loads(wkt_polygon)
    return polygon.area
     
      
    This function is deprecated"""


' def calculate_area(wkt_polygon):\n    polygon = wkt.loads(wkt_polygon)\n    return polygon.area\n     \n      \n    This function is deprecated'

In [3]:
API_KEY = '00f95217d3b04a0c9e1af341b4e1608a'

def get_coordinates(county, name):
    place_name = f"{name}, {county} County, Florida"
    url = f"https://api.opencagedata.com/geocode/v1/json?q={place_name}&key={API_KEY}"
    response = requests.get(url)
    data = response.json()
    if data['results']:
        location = data['results'][0]['geometry']
        return (location['lat'], location['lng'])
    else:
        return (None, None)

This function is gonna get our coordenates using the OpenCage Geocoding API using a key provided by one of the data scientists

In [4]:
def get_coordinates(county, name, state="Florida"):
    place_name = f"{name}, {county} County, {state}"
    url = f"https://api.opencagedata.com/geocode/v1/json?q={place_name}&key={API_KEY}"
    response = requests.get(url)
    data = response.json()
    if data['results']:
        location = data['results'][0]['geometry']
        return (location['lat'], location['lng'])
    else:
        return (None, None)

# Loading the data

In [5]:
ds = pd.read_csv('florida-beach-names.csv')
ds

Unnamed: 0,WKT,COUNTY,NAME,created_user,created_date,last_edited_user,last_edited_date
0,"POLYGON Z ((-9698711.156 3546590.3287 0,-96987...",ESCAMBIA,UNSURVEYED,,,,
1,"POLYGON Z ((-9061671.5384 3555608.0978 0,-9061...",DUVAL,HANNA PARK,,,,
2,"POLYGON Z ((-9054509.0537 3514807.6314 0,-9054...",ST JOHNS,GUANA RIVER SP,,,,
3,"POLYGON Z ((-9668169.2045 3552697.6667 0,-9668...",ESCAMBIA,UNSURVEYED,,,,
4,"POLYGON Z ((-9597884.3653 3547578.4878 0,-9597...",WALTON,WALTON COUNTY BCHS,,,,
...,...,...,...,...,...,...,...
297,"POLYGON Z ((-9039717.3304 2937286.6217 0,-9039...",MONROE,ENP (HIGHLAND BEAC,,,,
298,"POLYGON Z ((-9039593.2653 2937286.6434 0,-9039...",MONROE,ENP (HIGHLAND BEAC,,,,
299,"POLYGON Z ((-9015596.0256 2844170.7527 0,-9015...",MONROE,LITTLE CRAWL KEY,,,,
300,"POLYGON Z ((-9016208.8296 2843020.5594 0,-9016...",MONROE,FAT DEER KEY,,,,


# Looking at the data

In [6]:
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 302 entries, 0 to 301
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   WKT               302 non-null    object 
 1   COUNTY            299 non-null    object 
 2   NAME              299 non-null    object 
 3   created_user      0 non-null      float64
 4   created_date      0 non-null      float64
 5   last_edited_user  0 non-null      float64
 6   last_edited_date  0 non-null      float64
dtypes: float64(4), object(3)
memory usage: 16.6+ KB


In [7]:
ds[ds['NAME'].isna()]

Unnamed: 0,WKT,COUNTY,NAME,created_user,created_date,last_edited_user,last_edited_date
27,"POLYGON Z ((-8926217.4063 2933624.3263 0,-8926...",,,,,,
28,"POLYGON Z ((-8927112.0082 2930542.2383 0,-8927...",,,,,,
153,"POLYGON Z ((-9213069.3505 3199083.0432 0,-9212...",,,,,,


# Data preprocessing

First thing thats gonna happen is we gonna get rid of the empty columns and the 3 null rows since non of that is gonna give any useful data

In [8]:
# We make a copy of the data to work on that
df = ds.copy()

In [9]:
df.drop(columns=['created_user', 'created_date', 'last_edited_user', 'last_edited_date', 'WKT'], inplace = True)

In [10]:
df

Unnamed: 0,COUNTY,NAME
0,ESCAMBIA,UNSURVEYED
1,DUVAL,HANNA PARK
2,ST JOHNS,GUANA RIVER SP
3,ESCAMBIA,UNSURVEYED
4,WALTON,WALTON COUNTY BCHS
...,...,...
297,MONROE,ENP (HIGHLAND BEAC
298,MONROE,ENP (HIGHLAND BEAC
299,MONROE,LITTLE CRAWL KEY
300,MONROE,FAT DEER KEY


In [11]:
df.dropna(inplace=True)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 299 entries, 0 to 301
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   COUNTY  299 non-null    object
 1   NAME    299 non-null    object
dtypes: object(2)
memory usage: 7.0+ KB


Ok we dont have any nan rows in our data

## Feature engineering

Now based on our info of the beaches we are gonna find their approximate latitude and longitude

In [13]:
df['latitude'] = None
df['longitude'] = None

We initialize the two empty columns holding the coordenates

In [14]:
for index, row in df.iterrows():
    county = row['COUNTY']
    name = row['NAME']
    coordinates = get_coordinates(county, name)
    df.at[index, 'latitude'] = coordinates[0]
    df.at[index, 'longitude'] = coordinates[1]


In [20]:
df

Unnamed: 0,COUNTY,NAME,latitude,longitude
0,ESCAMBIA,UNSURVEYED,30.74408,-86.562079
1,DUVAL,HANNA PARK,30.370955,-81.402843
2,ST JOHNS,GUANA RIVER SP,29.91218,-81.40989
4,WALTON,WALTON COUNTY BCHS,26.677051,-80.052446
5,BAY,PANAMA CITY BCH,30.176591,-85.805386
...,...,...,...,...
293,COLLIER,KEEWAYDIN ISL,28.822134,-81.341684
295,DADE,GOLDEN BCH,25.965092,-80.122267
297,MONROE,ENP (HIGHLAND BEAC,25.55731,-80.91705
299,MONROE,LITTLE CRAWL KEY,24.742867,-80.983267


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 167 entries, 0 to 301
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   COUNTY     167 non-null    object
 1   NAME       167 non-null    object
 2   latitude   167 non-null    object
 3   longitude  167 non-null    object
dtypes: object(4)
memory usage: 6.5+ KB


In [22]:
df.duplicated().sum()

0

I also noticed some beaches sharing the same name in the same county, so we are gonna drop those and keep just one for each county since it will create the same entries of location with our geolocator

In [23]:
df.drop_duplicates(inplace=True)


In [27]:
df

Unnamed: 0,COUNTY,NAME,latitude,longitude
0,ESCAMBIA,UNSURVEYED,30.74408,-86.562079
1,DUVAL,HANNA PARK,30.370955,-81.402843
2,ST JOHNS,GUANA RIVER SP,29.91218,-81.40989
4,WALTON,WALTON COUNTY BCHS,26.677051,-80.052446
5,BAY,PANAMA CITY BCH,30.176591,-85.805386
...,...,...,...,...
293,COLLIER,KEEWAYDIN ISL,28.822134,-81.341684
295,DADE,GOLDEN BCH,25.965092,-80.122267
297,MONROE,ENP (HIGHLAND BEAC,25.55731,-80.91705
299,MONROE,LITTLE CRAWL KEY,24.742867,-80.983267


In [28]:
filtered_df = df[df['NAME']!='UNSURVEYED']

In [29]:
filtered_df

Unnamed: 0,COUNTY,NAME,latitude,longitude
1,DUVAL,HANNA PARK,30.370955,-81.402843
2,ST JOHNS,GUANA RIVER SP,29.91218,-81.40989
4,WALTON,WALTON COUNTY BCHS,26.677051,-80.052446
5,BAY,PANAMA CITY BCH,30.176591,-85.805386
6,ST JOHNS,ST AUG BCHS,29.91218,-81.40989
...,...,...,...,...
293,COLLIER,KEEWAYDIN ISL,28.822134,-81.341684
295,DADE,GOLDEN BCH,25.965092,-80.122267
297,MONROE,ENP (HIGHLAND BEAC,25.55731,-80.91705
299,MONROE,LITTLE CRAWL KEY,24.742867,-80.983267


In [33]:
filtered_df

Unnamed: 0,COUNTY,NAME,latitude,longitude
1,DUVAL,HANNA PARK,30.370955,-81.402843
2,ST JOHNS,GUANA RIVER SP,29.91218,-81.40989
4,WALTON,WALTON COUNTY BCHS,26.677051,-80.052446
5,BAY,PANAMA CITY BCH,30.176591,-85.805386
6,ST JOHNS,ST AUG BCHS,29.91218,-81.40989
...,...,...,...,...
293,COLLIER,KEEWAYDIN ISL,28.822134,-81.341684
295,DADE,GOLDEN BCH,25.965092,-80.122267
297,MONROE,ENP (HIGHLAND BEAC,25.55731,-80.91705
299,MONROE,LITTLE CRAWL KEY,24.742867,-80.983267


In [35]:
filtered_df['COUNTY'].unique()

array(['DUVAL', 'ST JOHNS', 'WALTON', 'BAY', 'FLAGLER', 'GULF', 'VOLUSIA',
       'BREVARD', 'INDIAN RIVER', 'PINELLAS', 'SARASOTA', 'PALM BEACH',
       'LEE', 'COLLIER', 'DADE', 'MONROE', 'OKALOOSA', 'ESCAMBIA',
       'NASSAU', 'BROWARD', 'FRANKLIN', 'HILLSBOROUGH', 'ST LUCIE',
       'MANATEE', 'MARTIN', 'CHARLOTTE', 'BOWARD', 'SARASOAT'],
      dtype=object)

Can notice how maybe some counties are repeated, SARASOTA and SARASOAT

In [36]:
filtered_df[filtered_df['COUNTY'] == 'SARASOTA']

Unnamed: 0,COUNTY,NAME,latitude,longitude
16,SARASOTA,CASEY KEY,27.150053,-82.480653
155,SARASOTA,VENICE BCHS,27.283566,-82.558122
156,SARASOTA,LONGBOAT KEY S,27.36055,-82.618135
168,SARASOTA,LIDO KEY,27.316991,-82.581487
169,SARASOTA,SIESTA KEY,27.275596,-82.556163
284,SARASOTA,MANASOTA KEY,26.984782,-82.398429


In [37]:
filtered_df[filtered_df['COUNTY']=='SARASOAT']

Unnamed: 0,COUNTY,NAME,latitude,longitude
282,SARASOAT,MANASOTA KEY,27.339802,-82.547837


In [38]:
filtered_df = filtered_df[filtered_df['COUNTY']!= 'SARASOAT']

Seems to be a misspelling so gonna get rid of that line since is already in the other data

In [40]:
filtered_df.to_csv('updated_beaches.csv', index=False)


We save the data so it can be used later in the script without going thru all this changes again

# Model

In [54]:
starting_location = get_coordinates('1200 Anastasia Ave', 'Coral Gables')

Test location. Soon to be filled with the txt value.

In [62]:
starting_location_list = ['Coral Gables', 'Starting Location', starting_location[0], starting_location[1]]

In [63]:
df.loc[len(df.index)] = starting_location_list

In [64]:
df

Unnamed: 0,COUNTY,NAME,latitude,longitude
0,ESCAMBIA,UNSURVEYED,30.74408,-86.562079
1,DUVAL,HANNA PARK,30.370955,-81.402843
2,ST JOHNS,GUANA RIVER SP,29.91218,-81.40989
3,WALTON,WALTON COUNTY BCHS,26.677051,-80.052446
4,BAY,PANAMA CITY BCH,30.176591,-85.805386
...,...,...,...,...
332,DADE,Starting Location,25.765417,-80.221558
333,DADE,Starting Location,25.765417,-80.221558
334,DADE,Starting Location,25.765417,-80.221558
335,Coral Gables,Starting Location,25.73311,-80.258511


In [34]:
beaches_df = filtered_df.append(starting_location, ignore_index=True)

AttributeError: 'DataFrame' object has no attribute 'append'

In [None]:
import numpy as np

def manhattan_distance(lat1, lon1, lat2, lon2):
    return abs(lat2 - lat1) + abs(lon2 - lon1)

# Calculate the distance matrix
n = len(beaches_df)
dist_matrix = np.zeros((n, n))

for i in range(n):
    for j in range(n):
        if i != j:
            dist_matrix[i, j] = manhattan_distance(beaches_df.loc[i, 'latitude'], beaches_df.loc[i, 'longitude'],
                                                   beaches_df.loc[j, 'latitude'], beaches_df.loc[j, 'longitude'])
