## Import pandas & numpy

In [1]:
import pandas as pd
import numpy as np

## Import csv file

In [2]:
sharks = pd.read_csv("../Data_cleaning_manipulation/GSAF5.csv", encoding = 'cp1252')

## Preview the data (first 5 rows)

In [94]:
sharks.head()

Unnamed: 0,case_number,date,year,incident_type,country,area,location,activity,name,sex,...,species,investigator_or_source,pdf,href_formula,href,case_number.1,case_number.2,original_order,year_categories,year_categories_decades
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c,5993,from 2000 onwards,from 2010 onwards
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b,5992,from 2000 onwards,from 2010 onwards
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.a,2016.09.18.a,5991,from 2000 onwards,from 2010 onwards
3,2016.09.17,17-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,...,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.17,2016.09.17,5990,from 2000 onwards,from 2010 onwards
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,...,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.16,2016.09.15,5989,from 2000 onwards,from 2010 onwards


## Make a copy of file

Before starting to work on the file, it is good practice to make a copy, so that any mistakes do not overwrite the original file

In [4]:
sharks_copy = sharks.copy()

## Start to look at the data

In [5]:
sharks_copy.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

We can see that a lot of these are objects when they shouldn't be (e.g. time or date)

In [6]:
sharks_copy.dtypes

Case Number               object
Date                      object
Year                       int64
Type                      object
Country                   object
Area                      object
Location                  object
Activity                  object
Name                      object
Sex                       object
Age                       object
Injury                    object
Fatal (Y/N)               object
Time                      object
Species                   object
Investigator or Source    object
pdf                       object
href formula              object
href                      object
Case Number.1             object
Case Number.2             object
original order             int64
Unnamed: 22               object
Unnamed: 23               object
dtype: object

In [7]:
#to find number of rows & columns. Output: (5992, 24)
sharks_copy.shape

(5992, 24)

In [8]:
sharks_copy['Sex '].value_counts()

M      4835
F       585
M         2
lli       1
.         1
N         1
Name: Sex , dtype: int64

In [9]:
sharks_copy['Country'].value_counts()

USA                       2116
AUSTRALIA                 1279
SOUTH AFRICA               565
PAPUA NEW GUINEA           133
NEW ZEALAND                125
                          ... 
WESTERN SAMOA                1
RED SEA / INDIAN OCEAN       1
KOREA                        1
EGYPT / ISRAEL               1
RED SEA                      1
Name: Country, Length: 203, dtype: int64

In [10]:
sharks_copy['Fatal (Y/N)'].value_counts()

N          4315
Y          1552
UNKNOWN      94
 N            8
N             1
n             1
#VALUE!       1
F             1
Name: Fatal (Y/N), dtype: int64

In [11]:
sharks_copy['Year'].value_counts().sort_values

<bound method Series.sort_values of 2015    139
2011    128
2014    125
0       124
2013    122
       ... 
1742      1
1758      1
1818      1
1822      1
1595      1
Name: Year, Length: 232, dtype: int64>

## Fix column headings

The column names look a bit difficult to work with, so I decided to improve them. 

First, I stripped the whitespaces in the headings.

In [12]:
sharks_copy.columns = sharks_copy.columns.str.strip()

Then I  set column names to lower case to make easier to work with

In [13]:
sharks_copy.columns = sharks_copy.columns.str.lower()

Then I replaced spaces in column names with underscores to make easier to work with

In [14]:
sharks_copy.columns = sharks_copy.columns.str.replace(' ', '_')

In [15]:
sharks_copy.columns

Index(['case_number', 'date', 'year', 'type', 'country', 'area', 'location',
       'activity', 'name', 'sex', 'age', 'injury', 'fatal_(y/n)', 'time',
       'species', 'investigator_or_source', 'pdf', 'href_formula', 'href',
       'case_number.1', 'case_number.2', 'original_order', 'unnamed:_22',
       'unnamed:_23'],
      dtype='object')

## Rename 'type' column 

This column name is unclear, so I renamed it to incident_type

In [16]:
sharks_copy = sharks_copy.rename(columns={'type': 'incident_type'})

## Check if columns can be deleted

I noticed that a lot of values in two columns were nulls, so I checked how many missing values they had.

In [17]:
sharks_copy['unnamed:_22'].value_counts()

stopped here    1
Name: unnamed:_22, dtype: int64

In [18]:
sharks_copy['unnamed:_23'].value_counts()

change filename    1
Teramo             1
Name: unnamed:_23, dtype: int64

I also checked how many missing values there were for all columns

In [19]:
missing_values = sharks_copy.isna().sum()

In [20]:
missing_values

case_number                  0
date                         0
year                         0
incident_type                0
country                     43
area                       402
location                   496
activity                   527
name                       200
sex                        567
age                       2681
injury                      27
fatal_(y/n)                 19
time                      3213
species                   2934
investigator_or_source      15
pdf                          0
href_formula                 1
href                         3
case_number.1                0
case_number.2                0
original_order               0
unnamed:_22               5991
unnamed:_23               5990
dtype: int64

And checked the percentage of missing values per column, to get a quick overview

In [21]:
missing_values_perc = sharks_copy.isna().sum()/len(sharks_copy)

In [22]:
missing_values_perc

case_number               0.000000
date                      0.000000
year                      0.000000
incident_type             0.000000
country                   0.007176
area                      0.067089
location                  0.082777
activity                  0.087951
name                      0.033378
sex                       0.094626
age                       0.447430
injury                    0.004506
fatal_(y/n)               0.003171
time                      0.536215
species                   0.489653
investigator_or_source    0.002503
pdf                       0.000000
href_formula              0.000167
href                      0.000501
case_number.1             0.000000
case_number.2             0.000000
original_order            0.000000
unnamed:_22               0.999833
unnamed:_23               0.999666
dtype: float64

I checked specifically which columns have more than 70% missing values

In [23]:
missing_values_perc.loc[missing_values_perc.gt(0.7)].index

Index(['unnamed:_22', 'unnamed:_23'], dtype='object')

So I removed these columns: 'Unnamed: 22', 'Unnamed: 23' because 99.9% missing values

In [24]:
sharks_copy = sharks_copy.drop(columns=['unnamed:_22', 'unnamed:_23'])

Then I double checked that it worked by checking the shape and comparing that to before I made the change. 
Previous output: (5992, 24). Current output: (5992, 22)

In [25]:
sharks_copy.shape

(5992, 22)

## Check for duplicate columns

A few columns looked very similar, so I checked if they were copies of each other.
I found that they were not, so they could not be deleted without further inspection. I decided to leave them as they are and rather work on cleaning the data in other ways.

In [26]:
'Case_Number.1' == 'Case_Number.2'

False

In [27]:
'Case_Number.1' == 'Case_Number'

False

In [28]:
'Case_Number.2' == 'Case_Number'

False

In [29]:
'href_formula' == 'href'

False

## Check for incorrectly named values in a column & rename them

I saw that there are a few unexpected values in the column 'sex', so I investigated further and renamed them as appropriate

In [30]:
sharks_copy['sex'].value_counts()

M      4835
F       585
M         2
lli       1
.         1
N         1
Name: sex, dtype: int64

In [31]:
print(set(sharks_copy['sex']))

{nan, 'M', 'N', '.', 'M ', 'F', 'lli'}


In [32]:
sharks_copy['sex'].unique()

array(['M', nan, 'F', 'M ', 'lli', 'N', '.'], dtype=object)

'M ' clearly meant 'M', so I replaced it, so that these values are correctly counted as male

In [33]:
sharks_copy['sex'] = sharks_copy['sex'].str.replace('M ', 'M')

In [34]:
sharks_copy['sex'].value_counts()

M      4837
F       585
lli       1
.         1
N         1
Name: sex, dtype: int64

In [35]:
print(set(sharks_copy['sex']))

{nan, 'M', 'N', '.', 'F', 'lli'}


I checked which row has 'lli' in it. Name is 'Brian Kang' --> most likely male, so I was able to change 'lli' to 'M' as well

In [36]:
sharks_copy.loc[sharks_copy['sex'] == 'lli']

Unnamed: 0,case_number,date,year,incident_type,country,area,location,activity,name,sex,...,fatal_(y/n),time,species,investigator_or_source,pdf,href_formula,href,case_number.1,case_number.2,original_order
1400,2004.11.11.b,11-Nov-04,2004,Unprovoked,USA,California,"Bunkers, Humboldt Bay, Eureka, Humboldt County",Surfing,Brian Kang,lli,...,N,13h30,5.5 m [18'] white shark,"R. Collier, GSAF",2004.11.11.b-Kang.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2004.11.11.b,2004.11.11.b,4593


In [37]:
sharks_copy['sex'] = sharks_copy['sex'].str.replace('lli', 'M')

I checked the row with the value 'N' in it. 
I expected that it might refer to a third gender, but on closer inspection it doesn't appear so.
From the names and year, this appears to be a married heterosexual couple.
Therefore, the value 'N' for 'sex' is misleading and should be renamed. 

In [38]:
sharks_copy.loc[sharks_copy['sex'] == 'N']

Unnamed: 0,case_number,date,year,incident_type,country,area,location,activity,name,sex,...,fatal_(y/n),time,species,investigator_or_source,pdf,href_formula,href,case_number.1,case_number.2,original_order
4708,1934.07.11,11-Jul-34,1934,Boating,AUSTRALIA,New South Wales,Cronulla,Fishing,"18' boat, occupants William & Leslie Newton",N,...,N,,"Blue pointer, 11'","G.P. Whitley, ref: Daily Telegraph, 7/11/1934 ...",1934.07.11-Newton-boat-Australia.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1934.07.11,1934.07.11,1285


In [39]:
sharks_copy['sex'] = sharks_copy['sex'].str.replace('N', 'Unknown')

Then I checked the last unclear value in the column, and decided that it should also be replaced with 'Unknown' as I could not discern other relevant information indicating otherwise from looking at other data in the row.

In [40]:
sharks_copy.loc[sharks_copy['sex'] == '.']

Unnamed: 0,case_number,date,year,incident_type,country,area,location,activity,name,sex,...,fatal_(y/n),time,species,investigator_or_source,pdf,href_formula,href,case_number.1,case_number.2,original_order
5201,1908.06.02.R,Reported 02-Jun-1908,1908,Sea Disaster,PAPUA NEW GUINEA,New Britain,Matupi,.,,.,...,Y,,Allegedly a 33-foot shark,"Taranaki Herald, 6/2/1908",1908.06.02.R-Matupi.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1908.06.02.R,1908.06.02.R,792


In [41]:
sharks_copy['sex'] = sharks_copy['sex'].str.replace('.', 'Unknown')

In [42]:
sharks_copy['sex'].value_counts()

M          4838
F           585
Unknown       2
Name: sex, dtype: int64

## Changing values in a column depending on a condition

I saw that in the 'name' column, there were several values with 'male' that had a null value under 'sex'.

In [43]:
sharks_copy['sex'].value_counts()

M          4838
F           585
Unknown       2
Name: sex, dtype: int64

In [44]:
sharks_copy.loc[sharks_copy['name'] == 'male']

Unnamed: 0,case_number,date,year,incident_type,country,area,location,activity,name,sex,...,fatal_(y/n),time,species,investigator_or_source,pdf,href_formula,href,case_number.1,case_number.2,original_order
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,N,13h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c,5993
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,N,10h43,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.a,2016.09.18.a,5991
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,...,N,,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.16,2016.09.15,5989
6,2016.09.11,11-Sep-16,2016,Unprovoked,USA,Florida,"Ponte Vedra, St. Johns County",Wading,male,M,...,N,15h15,3' to 4' shark,"News4Jax, 9/11/2016",2016.09.11-PonteVedra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.11,2016.09.11,5987
14,2016.08.29.a,29-Aug-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,N,14h00,,"News Channel 8, 8/30/16",2016.08.29.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.08.29.a,2016.08.29.a,5979
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5963,ND.0030,Before 1905,0,Unprovoked,BURMA,,,Bathing,male,M,...,Y,,,"Massillon Independent, 3/1905",ND-0030-Burma.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0030,ND.0030,30
5975,ND.0017,Before 1921,0,Unprovoked,SOUTH AFRICA,KwaZulu-Natal,Durban,Crew swimming alongside their anchored ship,male,M,...,Y,,,"Captain A. Anderson, Natal Mercury, 12/31/192...",ND-0017-alongside-ship.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0017,ND.0017,18
5976,ND.0016,Before 1921,0,Unprovoked,SOUTH AFRICA,KwaZulu-Natal,Durban,4 men were bathing,male,M,...,Y,,,"Captain A. Anderson, Natal Mercury, 12/31/192...",ND-0016- Durban-PostOffice.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0016,ND.0016,17
5987,ND.0005,Before 1903,0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,M,...,Y,,,"H. Taunton; N. Bartlett, p. 234",ND-0005-RoebuckBay.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0005,ND.0005,6


This can be seen clearly in the table below

In [45]:
sharks_copy.loc[(sharks_copy['name'] == 'male') & (sharks_copy['sex'].isnull())]

Unnamed: 0,case_number,date,year,incident_type,country,area,location,activity,name,sex,...,fatal_(y/n),time,species,investigator_or_source,pdf,href_formula,href,case_number.1,case_number.2,original_order
2000,1997.09.08,08-Sep-97,1997,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,,...,N,Evening,small blacktip shark,"Daytona News-Journal, 9/9/1997",1997.09.08-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1997.09.08,1997.09.08,3993
2824,1980.12.26,26-Dec-80,1980,Invalid,SOUTH AFRICA,Eastern Cape Province,Port Elizabeth,,male,,...,Y,,,"Eastern Province Herald, 12/29/1980",1980.12.26-scavenging.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1980.12.26,1980.12.26,3169
3068,1973.09.29,29-Sep-73,1973,Sea Disaster,SOUTH AFRICA,KwaZulu-Natal,Mission Rocks,Being pulled to shore from wreck of 25-ton fis...,male,,...,Y,,,"Natal Mercury, 10/5/1973",1973.09.29-AlanS.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1973.09.29,1973.09.29,2925
4112,1954.07.01.R,Reported 01-Jul-1954,1954,Invalid,CROATIA,,Pula,,male,,...,,,,"C. Moore, GSAF",1954.07.01.R-Pula.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1954.07.01.R,1954.07.01.R,1881
4240,1950.07.19,1950.07.19,1950,Provoked,ITALY,Savona,Albenga,Fishing,male,,...,N,,,"C. Moore, GSAF",1950.07.19-Albenga.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1950.07.19,1950.07.19,1753
5018,1920.11.04,04-Nov-20,1920,Sea Disaster,PHILIPPINES,Leyte,,The coastwise steamer San Basilio capsized in ...,male,,...,Y,,,"Oakland Tribune, 11/11/1920",1920.11.04-Philippines.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1920.11.04,1920.11.04,975
5779,1842.07.06,06-Jul-1842,1842,Provoked,USA,New Jersey,"Absecon, Atlantic County",Harassing a shark,male,,...,n,,,"New York Evening Post, 7/11/1842",1842.07.06-Absecon.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1842.07.06,1842.07.06,214
5825,1816.09.03.R,Reported 03-Sept-1816,1816,Unprovoked,USA,Rhode Island,Bristol Harbor,Swimming,male,,...,Y,,,"Connecticut Courant, 9/3/1816",1816.09.03.R-Rhode-Island.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1816.09.03.R,1816.09.03.R,168
5865,0500.00.00,Circa 500 A.D.,500,Unprovoked,MEXICO,,,,male,,...,N,,,J. Castro,500AD-Mexico.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,0500.00.00,0500.00.00,128


I set a condition to change the null values in the 'sex' column to 'M' when 'male' fell under 'name'

In [46]:
sharks_copy.loc[(sharks_copy['name'] == 'male') & (sharks_copy['sex'].isna()), 'sex'] = 'M'

Below shows that it worked- there are now 9 more values under 'M' in the 'sex' column and recalling the table that meets the condition comes up empty. 

In [47]:
sharks_copy['sex'].value_counts()

M          4847
F           585
Unknown       2
Name: sex, dtype: int64

In [48]:
sharks_copy.loc[(sharks_copy['name'] == 'male') & (sharks_copy['sex'].isnull())]

Unnamed: 0,case_number,date,year,incident_type,country,area,location,activity,name,sex,...,fatal_(y/n),time,species,investigator_or_source,pdf,href_formula,href,case_number.1,case_number.2,original_order


In [49]:
sharks_copy.loc[sharks_copy['name'] == 'male']

Unnamed: 0,case_number,date,year,incident_type,country,area,location,activity,name,sex,...,fatal_(y/n),time,species,investigator_or_source,pdf,href_formula,href,case_number.1,case_number.2,original_order
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,N,13h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c,5993
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,N,10h43,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.a,2016.09.18.a,5991
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,...,N,,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.16,2016.09.15,5989
6,2016.09.11,11-Sep-16,2016,Unprovoked,USA,Florida,"Ponte Vedra, St. Johns County",Wading,male,M,...,N,15h15,3' to 4' shark,"News4Jax, 9/11/2016",2016.09.11-PonteVedra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.11,2016.09.11,5987
14,2016.08.29.a,29-Aug-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,N,14h00,,"News Channel 8, 8/30/16",2016.08.29.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.08.29.a,2016.08.29.a,5979
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5963,ND.0030,Before 1905,0,Unprovoked,BURMA,,,Bathing,male,M,...,Y,,,"Massillon Independent, 3/1905",ND-0030-Burma.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0030,ND.0030,30
5975,ND.0017,Before 1921,0,Unprovoked,SOUTH AFRICA,KwaZulu-Natal,Durban,Crew swimming alongside their anchored ship,male,M,...,Y,,,"Captain A. Anderson, Natal Mercury, 12/31/192...",ND-0017-alongside-ship.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0017,ND.0017,18
5976,ND.0016,Before 1921,0,Unprovoked,SOUTH AFRICA,KwaZulu-Natal,Durban,4 men were bathing,male,M,...,Y,,,"Captain A. Anderson, Natal Mercury, 12/31/192...",ND-0016- Durban-PostOffice.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0016,ND.0016,17
5987,ND.0005,Before 1903,0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,M,...,Y,,,"H. Taunton; N. Bartlett, p. 234",ND-0005-RoebuckBay.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0005,ND.0005,6


## Changing index

I wanted to check if the another column (original_order, case_number.1 or case_number.2) could work as an index column, but I saw that their values are not all unique, so they wouldn't work. I decided that it would be clearer to leave the index numbers as they are. 

In [50]:
sharks_copy['original_order'].value_counts(dropna=False)

5661    2
569     2
3847    2
5739    2
2047    1
       ..
5428    1
1334    1
3383    1
5432    1
2049    1
Name: original_order, Length: 5988, dtype: int64

In [51]:
sharks_copy['case_number.2'].value_counts(dropna=False)

1923.00.00.a    2
1983.06.15      2
1966.12.26      2
2012.09.02.b    2
1990.05.10      2
               ..
2008.12.20      1
1995.08.15      1
1965.03.14      1
2011.08.16.b    1
2016.05.22      1
Name: case_number.2, Length: 5976, dtype: int64

In [52]:
sharks_copy['case_number.1'].value_counts(dropna=False)

1923.00.00.a    2
2009.12.18      2
2012.09.02.b    2
1952.08.04      2
1966.12.26      2
               ..
1961.01.03.a    1
1901.01.30      1
2008.12.20      1
1995.08.15      1
2016.05.22      1
Name: case_number.1, Length: 5975, dtype: int64

## Create a new column with year categories

I thought it would be interesting to see the years grouped and be able to sort and filter the data according to a broader time period, so I decided to create an extra column with specific time periods.

STEP 1: redefine 0 values in 'year' column as 'Unknown'.
- To ensure that they do not get incorrectly counted in the 'before 1900' category. 
- Ideally, I would have liked to take a closer look at the 'date' column and reassign some of the 'Unknowns' with information provided in the 'date' column. However, given that the 'date' column is extremely messy, with various combinations of strings and integers and some very vague values (e.g. 'before 2016', 'before the war', 'ca. 336 B.C'), I decided that that would be too ambitious to look in so much detail for this short assignment, given the time constraints.  
- I looked more closely at the data and the values '5', '77' and '500', which first appeared to be incorrect values, as referred to as 'A.D' in the 'date' column, indicating that they do, in fact, refer to years. For example, the value '5' is based on an Aboriginal rock painting depicting a man being attacked by a shark.

STEP 2: create new empty column: 'year_categories'

STEP 3: set categories by decade and apply to new column

In [53]:
print(set(sharks_copy['year']))

{0, 5, 1543, 1554, 1555, 1580, 1595, 77, 1617, 1637, 1638, 1642, 1700, 1703, 1721, 1733, 1738, 1742, 1748, 1749, 1755, 1758, 1764, 1767, 1771, 1776, 1779, 1785, 1787, 1788, 1791, 1800, 1803, 1804, 1805, 1807, 1811, 1812, 1816, 1817, 1818, 1819, 1822, 1825, 1826, 1827, 1828, 1829, 1830, 1831, 1832, 1834, 1835, 1836, 1837, 1839, 1840, 1841, 1842, 1844, 1845, 1846, 1847, 1848, 1849, 1850, 1851, 1852, 1853, 1854, 1855, 1856, 1858, 1859, 1860, 1861, 1862, 1863, 1864, 1865, 1866, 1867, 1868, 1869, 1870, 1871, 1872, 1873, 1874, 1875, 1876, 1877, 1878, 1879, 1880, 1881, 1882, 1883, 1884, 1885, 1886, 1887, 1888, 1889, 1890, 1891, 1892, 1893, 1894, 1895, 1896, 1897, 1898, 1899, 1900, 1901, 1902, 1903, 1904, 1905, 1906, 1907, 1908, 1909, 1910, 1911, 1912, 1913, 1914, 1915, 1916, 1917, 1918, 1919, 1920, 1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953,

In [54]:
sharks_copy['year'].dtypes

dtype('int64')

### Step 1: change null values to 'Unknown'

In [55]:
sharks_copy.year = sharks_copy['year'].replace([0],'Unknown')

Check that it worked

In [56]:
sharks_copy.tail()

Unnamed: 0,case_number,date,year,incident_type,country,area,location,activity,name,sex,...,fatal_(y/n),time,species,investigator_or_source,pdf,href_formula,href,case_number.1,case_number.2,original_order
5987,ND.0005,Before 1903,Unknown,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,M,...,Y,,,"H. Taunton; N. Bartlett, p. 234",ND-0005-RoebuckBay.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0005,ND.0005,6
5988,ND.0004,Before 1903,Unknown,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,Ahmun,M,...,Y,,,"H. Taunton; N. Bartlett, pp. 233-234",ND-0004-Ahmun.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0004,ND.0004,5
5989,ND.0003,1900-1905,Unknown,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,M,...,Y,,,"F. Schwartz, p.23; C. Creswell, GSAF",ND-0003-Ocracoke_1900-1905.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0003,ND.0003,4
5990,ND.0002,1883-1889,Unknown,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,Jules Patterson,M,...,Y,,,"The Sun, 10/20/1938",ND-0002-JulesPatterson.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0002,ND.0002,3
5991,ND.0001,1845-1853,Unknown,Unprovoked,CEYLON (SRI LANKA),Eastern Province,"Below the English fort, Trincomalee",Swimming,male,M,...,Y,,,S.W. Baker,ND-0001-Ceylon.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0001,ND.0001,2


In [57]:
print(set(sharks_copy['year']))

{5, 1543, 1554, 1555, 1580, 1595, 'Unknown', 77, 1617, 1637, 1638, 1642, 1700, 1703, 1721, 1733, 1738, 1742, 1748, 1749, 1755, 1758, 1764, 1767, 1771, 1776, 1779, 1785, 1787, 1788, 1791, 1800, 1803, 1804, 1805, 1807, 1811, 1812, 1816, 1817, 1818, 1819, 1822, 1825, 1826, 1827, 1828, 1829, 1830, 1831, 1832, 1834, 1835, 1836, 1837, 1839, 1840, 1841, 1842, 1844, 1845, 1846, 1847, 1848, 1849, 1850, 1851, 1852, 1853, 1854, 1855, 1856, 1858, 1859, 1860, 1861, 1862, 1863, 1864, 1865, 1866, 1867, 1868, 1869, 1870, 1871, 1872, 1873, 1874, 1875, 1876, 1877, 1878, 1879, 1880, 1881, 1882, 1883, 1884, 1885, 1886, 1887, 1888, 1889, 1890, 1891, 1892, 1893, 1894, 1895, 1896, 1897, 1898, 1899, 1900, 1901, 1902, 1903, 1904, 1905, 1906, 1907, 1908, 1909, 1910, 1911, 1912, 1913, 1914, 1915, 1916, 1917, 1918, 1919, 1920, 1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 195

### Step 2: create a new empty column

In [58]:
sharks_copy = sharks_copy.assign(year_categories=pd.NA)

Check that it worked

In [59]:
print(set(sharks_copy['year_categories']))

{<NA>}


In [60]:
sharks_copy['year'].dtypes

dtype('O')

I encountered an issue when I tried to run the below - I was unable to assign values to my new year_categories column. It looks like, in changing the null values to 'Unknown', I also changed the data type of my column. Therefore, I had to change the data type of the 'year' column back to int.

In [61]:
sharks_copy.year = pd.to_numeric(sharks_copy.year, errors='coerce', downcast='integer')

In [62]:
sharks_copy['year'] = sharks_copy['year'].astype('Int64')

In [63]:
sharks_copy['year'].dtypes

Int64Dtype()

### Step 3: create categories and assign values to them in the new column

After doing the above, I was able to assign values to my new column as I had planned to.

In [64]:
sharks_copy.loc[sharks_copy["year"].le(1899), "year_categories"] = "before 1900"

In [65]:
sharks_copy.loc[sharks_copy["year"].between(1900, 1949), "year_categories"] = "1900 to 1949"

In [66]:
sharks_copy.loc[sharks_copy["year"].between(1950, 1999), "year_categories"] = "1950 to 1999"

In [67]:
sharks_copy.loc[sharks_copy["year"].ge(2000), "year_categories"] = "from 2000 onwards"

Check that it worked:

In [68]:
sharks_copy.head()

Unnamed: 0,case_number,date,year,incident_type,country,area,location,activity,name,sex,...,time,species,investigator_or_source,pdf,href_formula,href,case_number.1,case_number.2,original_order,year_categories
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,13h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c,5993,from 2000 onwards
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,...,11h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b,5992,from 2000 onwards
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,10h43,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.a,2016.09.18.a,5991,from 2000 onwards
3,2016.09.17,17-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,...,,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.17,2016.09.17,5990,from 2000 onwards
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,...,,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.16,2016.09.15,5989,from 2000 onwards


In [69]:
sharks_copy.tail()

Unnamed: 0,case_number,date,year,incident_type,country,area,location,activity,name,sex,...,time,species,investigator_or_source,pdf,href_formula,href,case_number.1,case_number.2,original_order,year_categories
5987,ND.0005,Before 1903,,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,M,...,,,"H. Taunton; N. Bartlett, p. 234",ND-0005-RoebuckBay.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0005,ND.0005,6,
5988,ND.0004,Before 1903,,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,Ahmun,M,...,,,"H. Taunton; N. Bartlett, pp. 233-234",ND-0004-Ahmun.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0004,ND.0004,5,
5989,ND.0003,1900-1905,,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,M,...,,,"F. Schwartz, p.23; C. Creswell, GSAF",ND-0003-Ocracoke_1900-1905.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0003,ND.0003,4,
5990,ND.0002,1883-1889,,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,Jules Patterson,M,...,,,"The Sun, 10/20/1938",ND-0002-JulesPatterson.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0002,ND.0002,3,
5991,ND.0001,1845-1853,,Unprovoked,CEYLON (SRI LANKA),Eastern Province,"Below the English fort, Trincomalee",Swimming,male,M,...,,,S.W. Baker,ND-0001-Ceylon.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0001,ND.0001,2,


In [70]:
sharks_copy['year_categories'].value_counts()

1950 to 1999         2416
from 2000 onwards    1855
1900 to 1949         1053
before 1900           544
Name: year_categories, dtype: int64

In [71]:
print(set(sharks_copy['year_categories']))

{'1950 to 1999', 'from 2000 onwards', 'before 1900', '1900 to 1949', <NA>}


In [72]:
sharks_copy.loc[sharks_copy['year'] == 1959]

Unnamed: 0,case_number,date,year,incident_type,country,area,location,activity,name,sex,...,time,species,investigator_or_source,pdf,href_formula,href,case_number.1,case_number.2,original_order,year_categories
3808,1959.12.29,29-Dec-59,1959,Boat,AUSTRALIA,Queensland,Scotts Point Beach,Paddling,"12' ski, occupants: Bill Dyer & Cliff Burgess",,...,Afternoon,3.7 m [12'] tiger shark,"Herald, (Redcliffe), 12/31/1959",1959.12.29-Burgess-Dyer.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1959.12.29,1959.12.29,2185,1950 to 1999
3809,1959.12.28,28-Dec-59,1959,Boat,AUSTRALIA,New South Wales,"Leichardt, Sydney",Fishing,"plywood dinghy, occupants: Jack Deegan & Trevo...",,...,Late night,2.4 m [8'] shark,"Daily Mirror (Sydney), 12/29/1959",1959.12.28-NV-Deegan-dinghy.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1959.12.28,1959.12.28,2184,1950 to 1999
3810,1959.12.26,26-Dec-59,1959,Unprovoked,PAPUA NEW GUINEA,Rigo subdistrict,Kaparoka,Swimming underneath house on pilings,Manama Mari,,...,,,"A. M. Rapson, p.150; P. Gilbert, L. Schultz & ...",1959.12.26-Mari.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1959.12.26,1959.12.26,2183,1950 to 1999
3811,1959.12.19.b,19-Dec-59,1959,Unprovoked,AUSTRALIA,Queensland,"Off Wynnum in Moreton Bay, near Brisbane",Dived from dinghy to retrieve oar in heavy seas,Stanley Arthur Mullen,M,...,06h00,,"P. Gilbert, L. Schultz & S. Springer (1960); V...",1959.12.19.b-Mullens.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1959.12.19.b,1959.12.19.b,2182,1950 to 1999
3812,1959.12.19.a,19-Dec-59,1959,Sea Disaster,PHILIPPINES,Masbate,Balud,ship M.V. Rizal sank during typhoon,"Mamerto Daanong, Tomas Inog & others",M,...,12h15,,"V.M. Coppleson (1962), p.259",1959.12.19.a - MV-Rizal.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1959.12.19.a,1959.12.19.a,2181,1950 to 1999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3896,1959.00.00.e,1959,1959,Provoked,ATLANTIC OCEAN,,,Paddling & sailing from Buenos Aires to Miami,Mirco Tapavica,M,...,,"Tiger shark, 12' ?","New York Times, 1/29/1960",1959.00.00.e-Tapavica.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1959.00.00.e,1959.00.00.e,2097,1950 to 1999
3897,1959.00.00.d,1959,1959,Unprovoked,PAPUA NEW GUINEA,West New Britain Province,"Poi, Kombe Talasea",Spear fishing,male,M,...,,,"A.M. Rapson, p.150; L. Schultz & M. Malin, p.544",1959.00.00.d-Poi.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1959.00.00.d,1959.00.00.d,2096,1950 to 1999
3898,1959.00.00.c,1959,1959,Unprovoked,PAPUA NEW GUINEA,West New Britain Province,"Kalapiai, Kombe",Fishing,male,M,...,,,"A.M. Rapson, p.150",1959.00.00.c-Kalapiai.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1959.00.00.c,1959.00.00.c,2095,1950 to 1999
3899,1959.00.00.b,1959,1959,Unprovoked,PAPUA NEW GUINEA,"New Ireland Province, Bismarck Archipelago","Enuk Island, Kavieng",,Pasinganlas,M,...,,,"J. McLachlan, Medical Officer, Kavieng; A.M. R...",1959.00.00.b-Pasinganlas.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1959.00.00.b,1959.00.00.b,2094,1950 to 1999


## Create another new column with decade categories

When examining the value counts for my new column (year_categories), I noticed that the more recent time periods have a lot more values than the earlier ones. This is likely because it has become easier to collect and record data. I felt that the time periods I chose were not the most useful and I wanted to improve this, so I decided to create an additional new column separating out the years after 1950 into decades. 

Step 1: create new column: 'year_categories_decades'

Step 2: set categories by decade and apply to new column

In [74]:
sharks_copy = sharks_copy.assign(year_categories_decades=pd.NA)

In [75]:
sharks_copy.loc[sharks_copy["year"].le(1949), "year_categories_decades"] = "before 1950"
sharks_copy.loc[sharks_copy["year"].between(1950, 1959), "year_categories_decades"] = '1950 to 1959'
sharks_copy.loc[sharks_copy["year"].between(1960, 1969), "year_categories_decades"] = '1960 to 1969'
sharks_copy.loc[sharks_copy["year"].between(1970, 1979), "year_categories_decades"] = '1970 to 1979'
sharks_copy.loc[sharks_copy["year"].between(1980, 1989), "year_categories_decades"] = '1980 to 1989'
sharks_copy.loc[sharks_copy["year"].between(1990, 1999), "year_categories_decades"] = '1990 to 1999'
sharks_copy.loc[sharks_copy["year"].between(2000, 2009), "year_categories_decades"] = '2000 to 2009'
sharks_copy.loc[sharks_copy["year"].ge(2010), "year_categories_decades"] = "from 2010 onwards"

In [76]:
sharks_copy['year_categories_decades'].value_counts()


before 1950          1597
2000 to 2009         1020
from 2010 onwards     835
1960 to 1969          617
1990 to 1999          568
1950 to 1959          463
1980 to 1989          434
1970 to 1979          334
Name: year_categories_decades, dtype: int64

I compared the missing values from 'year' with my two new columns, to ensure that they have the same number of missing values (124)

In [77]:
sharks_copy.isna().sum()

case_number                   0
date                          0
year                        124
incident_type                 0
country                      43
area                        402
location                    496
activity                    527
name                        200
sex                         558
age                        2681
injury                       27
fatal_(y/n)                  19
time                       3213
species                    2934
investigator_or_source       15
pdf                           0
href_formula                  1
href                          3
case_number.1                 0
case_number.2                 0
original_order                0
year_categories             124
year_categories_decades     124
dtype: int64

In [78]:
print(set(sharks_copy['year_categories_decades']))

{'1960 to 1969', 'before 1950', '1990 to 1999', 'from 2010 onwards', '1980 to 1989', '2000 to 2009', '1950 to 1959', '1970 to 1979', <NA>}


In [79]:
sharks_copy.head()

Unnamed: 0,case_number,date,year,incident_type,country,area,location,activity,name,sex,...,species,investigator_or_source,pdf,href_formula,href,case_number.1,case_number.2,original_order,year_categories,year_categories_decades
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c,5993,from 2000 onwards,from 2010 onwards
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b,5992,from 2000 onwards,from 2010 onwards
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.a,2016.09.18.a,5991,from 2000 onwards,from 2010 onwards
3,2016.09.17,17-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,...,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.17,2016.09.17,5990,from 2000 onwards,from 2010 onwards
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,...,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.16,2016.09.15,5989,from 2000 onwards,from 2010 onwards


In [80]:
sharks_copy.loc[sharks_copy['year_categories_decades'] == 'before 1950']

Unnamed: 0,case_number,date,year,incident_type,country,area,location,activity,name,sex,...,species,investigator_or_source,pdf,href_formula,href,case_number.1,case_number.2,original_order,year_categories,year_categories_decades
4271,1949.12.00.b,Dec-49,1949,Sea Disaster,,Caribbean Sea,Between Cuba & Costa Rica,"Sea Disaster, sinking of the motorship Wingate","Albert Battles, James Dean & 4 crew",M,...,Shark involvement not confirmed,"Canberra Times, 1/6/1950",1949.12.00.b-Wingate.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1949.12.00.b,1949.12.00.b,1722,1900 to 1949,before 1950
4272,1949.12.00.a,Dec-49,1949,Unprovoked,AUSTRALIA,Victoria,Seaholme,Lying on the bottom of a 16' dinghy,Doug Miller,M,...,"Grey nurse shark, 2.6 m [8.5']","V.M. Coppleson (1958), pp. 181-182",1949.12.00.a-Miller.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1949.12.00.a,1949.12.00.a,1721,1900 to 1949,before 1950
4273,1949.11.20,20-Nov-49,1949,Unprovoked,AUSTRALIA,New South Wales,"Kurnell, Botany Bay",Free diving or wading back to shore,William Edward Brown,M,...,6' shark,"Sydney Morning Herald, 11/20/1949",1949.11.20-Brown.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1949.11.20,1949.11.20,1720,1900 to 1949,before 1950
4274,1949.11.12,12-Nov-49,1949,Invalid,AUSTRALIA,Victoria,Port Phillip Bay,No details,John W. Smith,M,...,,"G.P. Whitley (1951), p.194, cites Sunday Heral...",1949.11.12-Smith.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1949.11.12,1949.11.12,1719,1900 to 1949,before 1950
4275,1949.08.28.b,28-Aug-49,1949,Unprovoked,AUSTRALIA,Queensland,Yorkey’s Knob Beach near Cairns,Swimming,Brian Ware (rescuer),M,...,,"V.M. Coppleson (1958), p.89",1949.08.28.b-Ware.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1949.08.28.b,1949.08.28.b,1718,1900 to 1949,before 1950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5863,1554.00.00,Ca. 1554,1554,Unprovoked,FRANCE,Nice & Marseilles,,,males (wearing armor),M,...,Possibly white sharks,G. Rondelet,1554.00.00-Rondelet.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1554.00.00,1554.00.00,130,before 1900,before 1950
5864,1543.00.00,Ca. 1543,1543,Unprovoked,VENEZUELA,Magarita or Cubagua Islands,,Pearl diving,Indian slave,M,...,,J. Castro,1543.00.00.R-LasCasas.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1543.00.00,1543.00.00,129,before 1900,before 1950
5865,0500.00.00,Circa 500 A.D.,500,Unprovoked,MEXICO,,,,male,M,...,,J. Castro,500AD-Mexico.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,0500.00.00,0500.00.00,128,before 1900,before 1950
5866,0077.00.00,77 A.D.,77,Unprovoked,,Ionian Sea,,Sponge diving,males,M,...,,Perils mentioned by Pliny the Elder (23 A.D. t...,77AD-Pliny.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,0077.00.00,0077.00.00,127,before 1900,before 1950


In [90]:
sharks = sharks_copy

In [91]:
sharks.head()

Unnamed: 0,case_number,date,year,incident_type,country,area,location,activity,name,sex,...,species,investigator_or_source,pdf,href_formula,href,case_number.1,case_number.2,original_order,year_categories,year_categories_decades
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c,5993,from 2000 onwards,from 2010 onwards
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b,5992,from 2000 onwards,from 2010 onwards
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.a,2016.09.18.a,5991,from 2000 onwards,from 2010 onwards
3,2016.09.17,17-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,...,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.17,2016.09.17,5990,from 2000 onwards,from 2010 onwards
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,...,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.16,2016.09.15,5989,from 2000 onwards,from 2010 onwards


## Export as new CSV file

In [95]:
sharks.to_csv('sharks_clean_IR.csv', sep=',', index=False)