# Sample from Raw Data

Data is randomly sampled from the original dataset provided by Global Fishing Watch. Prior to sampling, I drop all rows whose MMSI number contains fewer than 6 digits. This prevents a single MMSI number from returning multiple search results in the IMO database, and it removes only about 0.2% of the data.

### Import Packages and Data

In [1]:
# Standard data manipulation libraries
import numpy as np
import pandas as pd

In [2]:
# Load the full raw gaps file, then report its (rows, columns) shape
raw_path = './data/raw_gaps.csv'
raw = pd.read_csv(raw_path)
raw.shape

(6653278, 21)

### Remove Incomplete MMSI Numbers

In [3]:
# Count how many rows have an MMSI number (ssvid) with fewer than 6 digits
is_short_mmsi = raw['ssvid'] < 100_000
is_short_mmsi.sum()

12162

In [4]:
# Remove, in place, every row whose MMSI number (ssvid) has fewer than 6 digits
short_mmsi_labels = raw.index[raw['ssvid'] < 100_000]
raw.drop(index=short_mmsi_labels, inplace=True)

### Create Random Sample

In [5]:
# Draw 500,000 rows without replacement; the fixed seed makes the draw repeatable
n_rows = 500_000
seed = 42
raw_sample = raw.sample(n=n_rows, replace=False, random_state=seed)
raw_sample

Unnamed: 0,ssvid,gap_hours,gap_distance_m,gap_implied_speed_knots,positions_per_day,vessel_class,flag,off_timestamp,off_msgid,off_lat,...,off_type,off_receiver_type,off_distance_from_shore_m,on_timestamp,on_msgid,on_lat,on_lon,on_type,on_receiver_type,on_distance_from_shore_m
1165969,412422839,39.433333,535.624620,0.007334,0.034917,fishing,CHN,2018-11-02T08:32:54Z,ab04ae1f-022d-1cf8-6704-79f5c18f03fe,29.940583,...,B,terrestrial,0.0,2018-11-03T23:59:40Z,d66f5755-1e80-50b7-9c24-97affcd0d30b,29.938277,122.273638,B,terrestrial,1000.0
1683485,247143160,80.800000,1260.751756,0.008425,2.947519,trawlers,ITA,2018-05-18T18:22:40Z,a08e856c-e011-5dab-8ae8-03a4bf0c7ba1,38.099993,...,A,terrestrial,0.0,2018-05-22T03:11:32Z,97e47f62-0784-5e70-bea4-b57c23dcab63,38.088952,13.540445,A,terrestrial,0.0
2917899,224231150,15.816667,9683.318918,0.330574,0.115448,set_gillnets,ESP,2018-03-22T11:28:05Z,b4fbb421-1944-5a5f-b3ae-aabb589bb8d3,43.406192,...,B,terrestrial,3000.0,2018-03-23T03:17:34Z,7137847e-305e-5a4f-8b82-ab83b181c4ab,43.354105,-8.375345,B,terrestrial,1000.0
853267,413002111,14.433333,2965.614746,0.110945,0.416022,trawlers,CHN,2018-02-28T19:59:57Z,06e02215-e3b7-59ac-8504-f404fc25196c,26.573672,...,A,terrestrial,42000.0,2018-03-01T10:26:32Z,499bb4b5-a209-5f79-9718-ea361000bbdc,26.564128,120.902237,A,terrestrial,44000.0
5108673,247074840,16.450000,495.809447,0.016275,0.538665,trawlers,ITA,2018-04-25T07:43:16Z,248c64e1-6d7b-5553-bffe-8974fc455332,44.675090,...,A,terrestrial,0.0,2018-04-26T00:10:43Z,5659fa33-6771-584a-95a5-4d35df3f544f,44.675367,12.235438,A,terrestrial,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3905358,312366522,102.416667,9435.491271,0.049745,0.007645,fixed_gear,CHN,2018-12-12T00:54:53Z,54079cef-3918-5b42-be84-d6d3d43b6afe,34.483688,...,B,terrestrial,1000.0,2018-12-16T07:20:30Z,d831603a-0c38-5298-86c2-4c03180a8ec3,34.552417,119.855133,A,terrestrial,8000.0
4724997,421255667,42.500000,62088.011607,0.788820,0.290867,trawlers,CHN,2018-09-21T15:39:25Z,2b8b00ea-3f70-5e3a-b787-48ace0b9b1ab,24.603018,...,B,terrestrial,45000.0,2018-09-23T10:10:24Z,509fe17e-4ec0-76a1-8a8f-2f1f8a58ce5c,24.582675,119.013088,B,terrestrial,31000.0
171526,413625297,41.100000,3141.256183,0.041269,0.021604,set_gillnets,CHN,2018-10-05T07:07:15Z,016a4319-a145-5b37-899b-427ab719351a,38.792137,...,B,terrestrial,3000.0,2018-10-07T00:14:01Z,e047197e-7b1a-5275-becf-4f91b9719df3,38.820360,121.400630,B,terrestrial,0.0
5610041,412324063,45.433333,12.352876,0.000147,180.878788,fishing,CHN,2019-11-23T06:52:34Z,97bbc609-9430-762b-eca4-aaf4f7e3c809,37.694897,...,B,terrestrial,0.0,2019-11-25T04:19:00Z,e590065b-2a8f-0af0-e941-6f3779207ad6,37.694905,121.143485,B,terrestrial,0.0


In [7]:
# Check the data type pandas assigned to each column of the sample
# (e.g. to spot columns, such as the timestamps, that were read in as generic objects)
raw_sample.dtypes

ssvid                          int64
gap_hours                    float64
gap_distance_m               float64
gap_implied_speed_knots      float64
positions_per_day            float64
vessel_class                  object
flag                          object
off_timestamp                 object
off_msgid                     object
off_lat                      float64
off_lon                      float64
off_type                      object
off_receiver_type             object
off_distance_from_shore_m    float64
on_timestamp                  object
on_msgid                      object
on_lat                       float64
on_lon                       float64
on_type                       object
on_receiver_type              object
on_distance_from_shore_m     float64
dtype: object

In [8]:
# Tally the missing values in each column of the sample
raw_sample.isna().sum()

ssvid                           0
gap_hours                       0
gap_distance_m                  0
gap_implied_speed_knots         0
positions_per_day            3000
vessel_class                    0
flag                            0
off_timestamp                   0
off_msgid                       0
off_lat                         0
off_lon                         0
off_type                     3000
off_receiver_type               0
off_distance_from_shore_m      19
on_timestamp                    0
on_msgid                        0
on_lat                          0
on_lon                          0
on_type                      2531
on_receiver_type                0
on_distance_from_shore_m       12
dtype: int64

In [9]:
# Count rows that exactly repeat an earlier row across all columns
is_repeat_row = raw_sample.duplicated(keep='first')
is_repeat_row.sum()

30

In [10]:
# Check for even representation of vessels: number of sampled rows per ssvid.
# value_counts() already returns the counts in descending order (most frequent
# vessel first), so no additional sort_values() pass is needed.
raw_sample['ssvid'].value_counts()

235067061    54
412465495    49
440152920    48
227594760    47
416005071    47
             ..
412469559     1
320715065     1
412451134     1
900015997     1
338173948     1
Name: ssvid, Length: 81609, dtype: int64

### Save to CSV

In [11]:
# Write the sample to disk; index=False leaves the row index out of the file
sample_path = './data/raw_sample.csv'
raw_sample.to_csv(sample_path, index=False)