In [2]:
import pandas as pd
import numpy as np

In [3]:
#data source: http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0111913
# Marcus Eriksen
# The dataset used in this model is based on expeditions from 2007–2013 (Table S1), surveying all five sub-tropical gyres (North Pacific, North Atlantic, South Pacific, South Atlantic, Indian Ocean) and extensive coastal regions and enclosed seas (Bay of Bengal, Australian coasts and the Mediterranean Sea)
# count density (pieces km−2) and weight density (g km−2)

eriksen_data = pd.read_csv('PlasticMarinePollutionGlobalDataset.csv')
eriksen_data.sample(3)

Unnamed: 0,Date,Latitude,Longitude,CD1 (/km^2),CD2 (/km^2),CD3 (/km^2),CD4 (/km^2),WD1 (g/km^2),WD2 (g/km^2),WD3 (g/km^2),WD4 (g/km^2),Sea State,Source,Info,Comments,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19
74,26/11/2010,-34.333,-15.9772,11290.95,71249.82,4672.12,,2.88,358.47,137.36,,2.0,M. Eriksen,SAG10-39,,,,,,
802,28/06/2013,32.1936,-40.6542,43224.88,22675.35,5314.53,,14.88,74.75,51.72,,1.0,M. Eriksen,East NAG13-22 Ocean Research Project,,,,,,
1359,17/09/2008,42.7135,6.242,,,,0.0,,,,0.0,2.0,F.Galgani,Macroplastic observations MED-2008-raw data-if...,,,,,,


In [4]:
#rename headers to clarify the size of the plastic
eriksen_data = eriksen_data.rename(columns={"CD1  (/km^2)": "Small Microplastics Count (/km^2)", 
                                            "CD2  (/km^2)": "Large Microplastics Count (/km^2)",
                                            "CD3  (/km^2)": "Mesoplastic Count (/km^2)", 
                                            "CD4  (/km^2)": "Macroplastic Count (/km^2)", 
                                            "WD1 (g/km^2)": "Small Microplastic Weight (g/km^2)", 
                                            "WD2 (g/km^2)": "Large Microplastics Weight (g/km^2)",
                                            "WD3 (g/km^2)": "Mesoplastic Weight (g/km^2)", 
                                            "WD4 (g/km^2)": "Macroplastic Weight (g/km^2)"})
eriksen_data.sample(3)

Unnamed: 0,Date,Latitude,Longitude,Small Microplastics Count (/km^2),Large Microplastics Count (/km^2),Mesoplastic Count (/km^2),Macroplastic Count (/km^2),Small Microplastic Weight (g/km^2),Large Microplastics Weight (g/km^2),Mesoplastic Weight (g/km^2),Macroplastic Weight (g/km^2),Sea State,Source,Info,Comments,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19
1370,18/09/2008,42.5783,5.8943,,,,0.0,,,,0.0,2.0,F.Galgani,Macroplastic observations MED-2008-raw data-if...,,,,,,
82,29/11/2010,-31.9295,-6.989,1855.89,14847.09,8042.17,,0.19,31.49,184.97,,1.0,M. Eriksen,SAG10-47,,,,,,
1571,6/14/2012,2.3564,101.6925,,,328.9,13.2,,,614.4,920.4,2.5,Peter Ryan,Straits of Malacca,64.0,"50 m transect, compensating for missed items",,,,


In [5]:
#filter dataframe to only include North Pacific Gyre data
eriksen_data_NPG = eriksen_data.loc[eriksen_data["Info"].str.contains("NPG", na=False)]
eriksen_data_NPG.count()
#481 data pieces
eriksen_data_NPG

Unnamed: 0,Date,Latitude,Longitude,Small Microplastics Count (/km^2),Large Microplastics Count (/km^2),Mesoplastic Count (/km^2),Macroplastic Count (/km^2),Small Microplastic Weight (g/km^2),Large Microplastics Weight (g/km^2),Mesoplastic Weight (g/km^2),Macroplastic Weight (g/km^2),Sea State,Source,Info,Comments,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19
140,16/09/2007,37.2045,-133.8650,15506.45,18552.36,1107.60,,2.90,36.88,61.06,,4.00,C. Moore,NPG07-M001,,,,,,
141,16/09/2007,37.1755,-133.9495,32472.93,68218.32,14348.50,,8.84,297.77,1158.28,,4.00,C. Moore,NPG07-M002,,,,,,
142,18/09/2007,38.1515,-137.4531,118621.69,97000.01,14282.21,,13.69,88.07,3654.46,,3.00,C. Moore,NPG07-M004,,,,,,
143,20/09/2007,38.6913,-141.7720,17257.50,18881.74,2436.35,,1.89,35.90,37.76,,3.00,C. Moore,NPG07-M005,,,,,,
144,20/09/2007,38.6878,-142.0264,5100.93,14353.77,1660.77,,1.87,22.04,224.67,,3.00,C. Moore,NPG07-M006,,,,,,
145,20/09/2007,38.6410,-142.2665,9619.48,12488.45,928.20,,1.96,21.26,14.43,,3.00,C. Moore,NPG07-M007,,,,,,
146,21/09/2007,37.8664,-143.8151,16235.81,255134.23,11597.01,,3.71,500.30,361.83,,4.00,C. Moore,NPG07-M008,,,,,,
147,22/09/2007,36.6822,-144.8449,82044.72,391763.53,28202.87,,31.23,788.09,12882.05,,4.00,C. Moore,NPG07-M010,,,,,,
148,22/09/2007,36.5084,-145.0356,20917.25,107991.36,7296.71,,6.32,212.72,40.38,,3.00,C. Moore,NPG07-M012,,,,,,
149,24/09/2007,32.5488,-146.9627,22362.54,60773.50,7498.03,,6.04,454.99,775.32,,4.00,C. Moore,NPG07-M013,,,,,,


In [6]:
# need to update Google Maps API. Data Science toolkit doesn't support reverse geocoding.

import requests
import json
latitude = 34.1030032
longitude = -118.4104684
url = "http://maps.googleapis.com/maps/api/geocode/json?latlng=" + str(latitude) + ", " + str(longitude)
geodata = response.json()
address = geodata['results'][0]
address
# produces 0 results because location is in ocean, this code functions with lat and long on land

NameError: name 'response' is not defined

In [9]:
eriksen_data_NPG["Date"] = pd.to_datetime(eriksen_data_NPG["Date"])
eriksen_data_NPG.sort_values(by=["Date"])
# Just changing the data to ascending order did not work accurately. Example: 14/08/2012 was before 2/8/2012. because 08 versus 8 as month
# So I used the pd.to_datetime function
eriksen_data_NPG.head()
# Do I need to sort dates?

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Date,Latitude,Longitude,Small Microplastics Count (/km^2),Large Microplastics Count (/km^2),Mesoplastic Count (/km^2),Macroplastic Count (/km^2),Small Microplastic Weight (g/km^2),Large Microplastics Weight (g/km^2),Mesoplastic Weight (g/km^2),...,Sea State,Source,Info,Comments,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Year
140,2007-09-16,37.2045,-133.865,15506.45,18552.36,1107.6,,2.9,36.88,61.06,...,4.0,C. Moore,NPG07-M001,,,,,,,2007
141,2007-09-16,37.1755,-133.9495,32472.93,68218.32,14348.5,,8.84,297.77,1158.28,...,4.0,C. Moore,NPG07-M002,,,,,,,2007
142,2007-09-18,38.1515,-137.4531,118621.69,97000.01,14282.21,,13.69,88.07,3654.46,...,3.0,C. Moore,NPG07-M004,,,,,,,2007
143,2007-09-20,38.6913,-141.772,17257.5,18881.74,2436.35,,1.89,35.9,37.76,...,3.0,C. Moore,NPG07-M005,,,,,,,2007
144,2007-09-20,38.6878,-142.0264,5100.93,14353.77,1660.77,,1.87,22.04,224.67,...,3.0,C. Moore,NPG07-M006,,,,,,,2007


In [8]:
eriksen_data_NPG["Year"]=eriksen_data_NPG["Date"].dt.year
eriksen_data_NPG.head()
# works but get pink notification?

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Date,Latitude,Longitude,Small Microplastics Count (/km^2),Large Microplastics Count (/km^2),Mesoplastic Count (/km^2),Macroplastic Count (/km^2),Small Microplastic Weight (g/km^2),Large Microplastics Weight (g/km^2),Mesoplastic Weight (g/km^2),...,Sea State,Source,Info,Comments,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Year
140,2007-09-16,37.2045,-133.865,15506.45,18552.36,1107.6,,2.9,36.88,61.06,...,4.0,C. Moore,NPG07-M001,,,,,,,2007
141,2007-09-16,37.1755,-133.9495,32472.93,68218.32,14348.5,,8.84,297.77,1158.28,...,4.0,C. Moore,NPG07-M002,,,,,,,2007
142,2007-09-18,38.1515,-137.4531,118621.69,97000.01,14282.21,,13.69,88.07,3654.46,...,3.0,C. Moore,NPG07-M004,,,,,,,2007
143,2007-09-20,38.6913,-141.772,17257.5,18881.74,2436.35,,1.89,35.9,37.76,...,3.0,C. Moore,NPG07-M005,,,,,,,2007
144,2007-09-20,38.6878,-142.0264,5100.93,14353.77,1660.77,,1.87,22.04,224.67,...,3.0,C. Moore,NPG07-M006,,,,,,,2007


In [7]:
eriksen_data_NPG["Year"].value_counts()

2012    152
2009    110
2011     81
2010     77
2008     36
2007     25
Name: Year, dtype: int64

In [9]:
# above data was for NPG, now try for entire dataset
eriksen_data.count()
#1571 instances

Date                                   1571
Latitude                               1571
Longitude                              1571
Small Microplastics Count (/km^2)       680
Large Microplastics Count (/km^2)       680
Mesoplastic Count (/km^2)               808
Macroplastic Count (/km^2)             1090
Small Microplastic Weight (g/km^2)      442
Large Microplastics Weight (g/km^2)     442
Mesoplastic Weight (g/km^2)             570
Macroplastic Weight (g/km^2)            888
Sea State                              1214
Source                                 1571
Info                                   1571
Comments                                 37
Unnamed: 15                             377
Unnamed: 16                               0
Unnamed: 17                               0
Unnamed: 18                               0
Unnamed: 19                               0
dtype: int64

In [12]:
# convert to datetime format
eriksen_data["Date"] = pd.to_datetime(eriksen_data["Date"])
# add Year column
eriksen_data["Year"]=eriksen_data["Date"].dt.year
# count of data by Year
eriksen_data["Year"].value_counts()

# why is year in decimal here but not above?

2008.0    410
2012.0    394
2011.0    242
2010.0    223
2013.0    167
2009.0    110
2007.0     25
Name: Year, dtype: int64

In [15]:
eriksen_data

Unnamed: 0,Date,Latitude,Longitude,Small Microplastics Count (/km^2),Large Microplastics Count (/km^2),Mesoplastic Count (/km^2),Macroplastic Count (/km^2),Small Microplastic Weight (g/km^2),Large Microplastics Weight (g/km^2),Mesoplastic Weight (g/km^2),...,Sea State,Source,Info,Comments,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Year
0,NaT,,,.335-.999 mm,1.00-4.75 mm,4.75-200 mm,>200 mm,.335-.999 mm,1.00-4.75 mm,4.75-200 mm,...,(Beaufort Scale),,,,,,,,,
1,2010-09-01,19.9432,-64.5649,58102.96,21259.89,2226.17,,4.45,26.83,4.23,...,2.50,M. Eriksen,NAG10-SM001,,,,,,,2010.0
2,2010-09-01,20.2173,-64.3828,6639.79,4031.30,1067.11,,1.04,28.69,40.79,...,2.00,M. Eriksen,NAG10-SM002,,,,,,,2010.0
3,2010-09-01,20.4521,-64.1968,15246.71,12147.79,991.66,,2.57,42.62,3503.27,...,2.00,M. Eriksen,NAG10-SM003,,,,,,,2010.0
4,2010-10-01,21.1293,-63.8333,5347.35,6851.29,1420.39,,1.15,12.86,4.26,...,2.00,M. Eriksen,NAG10-SM004,,,,,,,2010.0
5,2010-10-01,21.4730,-63.5899,4090.58,5317.76,409.06,,0.48,29.63,4.36,...,2.00,M. Eriksen,NAG10-SM005,,,,,,,2010.0
6,2010-10-01,21.7367,-63.4227,44914.59,46144.76,33133.71,,6.26,47.98,2051.84,...,2.00,M. Eriksen,NAG10-SM006,,,,,,,2010.0
7,2010-11-01,22.1500,-63.1474,17324.82,4519.52,502.17,,1.38,3.52,1.13,...,2.00,M. Eriksen,NAG10-SM007,,,,,,,2010.0
8,2010-11-01,22.8418,-62.7592,10467.48,10329.75,413.19,,1.24,33.90,15.84,...,2.00,M. Eriksen,NAG10-SM008,,,,,,,2010.0
9,2010-12-01,24.3576,-62.8129,7434.53,9436.14,190.63,,1.14,28.59,0.29,...,4.00,M. Eriksen,NAG10-SM009,,,,,,,2010.0


In [None]:
# ideas: use geocoding to map and then look at counts and types by location
# show change over time in visualization