In [14]:
import pandas as pd
import numpy as np
import re

pd.set_option("display.max_rows", 1000)

In [4]:
df = pd.read_csv("GSAF5.csv", encoding="ISO-8859-1")

After loading the dataset, I try to get a feeling for what's in there by checking the columns and the first few rows

In [9]:
df.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c,5993,,
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b,5992,,
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.a,2016.09.18.a,5991,,
3,2016.09.17,17-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,...,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.17,2016.09.17,5990,,
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,...,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.16,2016.09.15,5989,,


In [8]:
df.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

To further examine the date columns ("Case Number", "Date", "Year"), I might print the unique values

In [17]:
df["Date"].value_counts().sort_index()

    10-Jan-2009    1
    15-Jun-1937    1
    16-Jan-1970    1
    22-Jul-2013    1
   21-Sep-1908     1
                  ..
Summer-2008        1
Winter 1942        1
Winter 1969        1
Woirld War II      1
World War II       2
Name: Date, Length: 5128, dtype: int64

We see that there are a bunch of weird values in the column

Test whether the `pd.to_datetime()` function can convert a sample value from the column to a datetime.

In [22]:
pd.to_datetime(df.loc[0, "Date"], infer_datetime_format=True)

Timestamp('2016-09-18 00:00:00')

In [25]:
def are_you_inferable(date):
    """Return whether ``pd.to_datetime`` produces a truthy result for ``date``.

    Parameters
    ----------
    date : object
        A single value to pass to ``pd.to_datetime``.

    Returns
    -------
    bool
        ``True`` if the converted value is truthy, ``False`` otherwise.

    Notes
    -----
    Exceptions raised by ``pd.to_datetime`` (for example ``ValueError`` for
    an unknown string format) are deliberately not caught here; they
    propagate to the caller.
    """
    # The previous version's ``else`` branch evaluated ``False`` without
    # returning it, so the function fell through and returned ``None``.
    # ``infer_datetime_format`` is no longer passed: it is deprecated and has
    # no effect since pandas 2.0.
    return bool(pd.to_datetime(date))

In [26]:
are_you_inferable("10-Jan-2009")

True

In [29]:
df["Date"].apply(are_you_inferable)

ValueError: ('Unknown string format:', 'Reported  14-Jul-2016')

This is obviously not working: a single unparsable value raises an error and aborts the whole `apply`. We have to handle the error. If only there were a way to catch it...

In [30]:
def are_you_inferable2(date):
    """Return whether ``pd.to_datetime`` can convert ``date`` without error.

    Parameters
    ----------
    date : object
        A single value to pass to ``pd.to_datetime``.

    Returns
    -------
    bool
        ``True`` if the conversion completes without raising, ``False`` if
        it raises ``ValueError``, ``TypeError`` or ``OverflowError``.
    """
    try:
        # ``infer_datetime_format`` is no longer passed: it is deprecated and
        # has no effect since pandas 2.0.
        pd.to_datetime(date)
    except (ValueError, TypeError, OverflowError):
        # pandas' parsing and out-of-bounds errors derive from ValueError.
        # Catching only conversion failures (instead of a bare ``except``)
        # lets KeyboardInterrupt and unrelated bugs surface.
        return False
    return True

In [31]:
are_you_inferable2("Reported  14-Jul-2016")

False

In [32]:
df["Date"].apply(are_you_inferable2)

0        True
1        True
2        True
3        True
4        True
        ...  
5987    False
5988    False
5989    False
5990    False
5991    False
Name: Date, Length: 5992, dtype: bool

Now that we can ask every single row if it is inferable, we can easily quantify the two groups

In [33]:
df["Date"].apply(are_you_inferable2).value_counts()

True     5135
False     857
Name: Date, dtype: int64

Copy the 857 rows that could not be inferred into a separate DataFrame for further inspection.

In [38]:
# Keep an independent copy of the rows whose "Date" value is not inferable
# (the boolean result of are_you_inferable2 is negated with ~).
df_ninfer = df.loc[~df["Date"].apply(are_you_inferable2)].copy()
df_ninfer.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
33,2016.07.14.4,Reported 14-Jul-2016,2016,Unprovoked,BAHAMAS,,Tiger Beach,Scuba Diving,Michael Dornellas,M,...,"Lemon shark, 9'","GrindTV, 7/14/2016",2016.07.14.R-TigerBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.07.14.R,2016.07.14.4,5960,,
34,2016.07.08.R,Reported 08-Jul-2016,2016,Unprovoked,SPAIN,Canary Islands,"Las Teresitas, Tenerife",Wading,female,F,...,Angel shark,La Opinion de Tenerife,2016.07.08.R-Spain.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.07.08.R,2016.07.08.R,5959,,
85,2016.03.03.R,Reported 03-Mar-2016,2016,Unprovoked,AUSTRALIA,South Australia,Wrights Bay,Fishing,Lee Taplin,M,...,Bronze whaler,"9 News, 3/1/2016",2016.03.03.R-Taplin.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.03.03.R,2016.03.03.R,5908,,
90,2016.02.10.R,Reported 10-Feb-2016,2016,Invalid,CAYMAN ISLANDS,Grand Cayman,Stingray City Bar,Feeding stingrays?,Richard Branson,M,...,No shark involvement,R. Branson,2016.02.10.R-Branson-stingray.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.02.10.R,2016.02.10.R,5903,,
100,2016.01.11.R,Reported 11-Jan-2016,2016,Unprovoked,AUSTRALIA,Queensland,"Happy Valley Beach, Caloundra",Surfing,Shane Hilder,M,...,Wobbegong shark,"ABC Sunshine Coast, 1/11/2016",2016.01.11.R-Hilder.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.01.11.R,2016.01.11.R,5893,,


In [39]:
df_ninfer.shape

(857, 24)

In [40]:
df_ninfer["Date"].value_counts()

No date                                                             6
1970s                                                               5
No date, Before 1963                                                5
1960s                                                               4
Before 1958                                                         4
Before 1906                                                         4
Reported 10-Oct-1906                                                4
Before 1962                                                         3
May-Jun-1965                                                        3
1700s                                                               3
1941-1945                                                           3
Before 1911                                                         3
1940 - 1950                                                         3
Before 1921                                                         3
Jul-06              

In [41]:
df_ninfer[["Date", "Case Number"]]

Unnamed: 0,Date,Case Number
33,Reported 14-Jul-2016,2016.07.14.4
34,Reported 08-Jul-2016,2016.07.08.R
85,Reported 03-Mar-2016,2016.03.03.R
90,Reported 10-Feb-2016,2016.02.10.R
100,Reported 11-Jan-2016,2016.01.11.R
151,Sep-15,2015.09.00
184,Reported 25-Jun-2015,2015.06.25.R
247,Reported 23-Dec-2014,2014.12.23.R
249,Reported 03-Dec-2014,2014.12.03.R
254,Reported 17-Nov-2014,2014.11.17


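The "Case Number" column appears to encode the incident date as `YYYY.MM.DD`, sometimes followed by a suffix such as `.R`, `.a` or `.4`. For rows where "Date" cannot be parsed, we can try to recover the date from there by extracting the `YYYY.MM.DD` part with a regular expression.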

In [46]:
# One capture group: four digits, a non-word character, two digits, a
# non-word character, two digits (e.g. "2016.07.14"). Anything following the
# match, such as a ".R" or ".a" suffix, is left out of the group.
date_pattern = r"(\d{4}\W\d{2}\W\d{2})"
# With a single capture group and expand=False, str.extract returns a Series;
# rows where the pattern does not match yield NaN.
df_ninfer["Case Number"].str.extract(date_pattern, expand=False)

33      2016.07.14
34      2016.07.08
85      2016.03.03
90      2016.02.10
100     2016.01.11
151     2015.09.00
184     2015.06.25
247     2014.12.23
249     2014.12.03
254     2014.11.17
276     2014.09.13
282     2014.09.00
290     2014.08.25
303     2014.08.00
316     2014.06.27
320     2014.06.17
337     2014.05.10
343     2014.04.12
359     2014.02.17
419     2013.08.08
430     2013.07.17
445     2013.06.14
465     2013.04.02
469     2013.03.21
485     2013.01.21
496     2012.12.00
510     2012.10.11
548     2012.06.28
579     2012.04.00
597     2012.01.22
607     2011.12.26
621     2011.11.20
625     2011.10.28
662     2011.08.16
665     2011.08.11
688     2011.06.14
690     2011.06.06
703     2011.05.07
711     2011.03.29
717     2011.03.10
719     2011.02.28
725     2011.02.04
732     2011.01.12
738     2010.12.03
743     2010.11.27
748     2010.11.12
750     2010.10.28
766     2010.09.06
801     2010.06.00
813     2010.02.19
818     2010.02.06
855     2009.10.29
861     2009

In [53]:
df_ninfer["Case Number"].str.extract(date_pattern, expand=False).apply(are_you_inferable2).value_counts()

True     640
False    217
Name: Case Number, dtype: int64

We're getting closer.
Create another df with the remaining rows

In [54]:
# Rows of df_ninfer whose extracted "Case Number" date is still not
# inferable, kept as an independent copy.
df_ninfer_ncase = df_ninfer.loc[
    ~(df_ninfer["Case Number"]
      .str.extract(date_pattern, expand=False)
      .apply(are_you_inferable2))
].copy()

In [59]:
df_ninfer_ncase.sample(frac=1).head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
5769,1847.00.00.a,Ca. 1847,1847,Unprovoked,USA,South Carolina,,Boating,adult,M,...,,"W.H. Greg, p.21",1847.00.00.a-SouthCarolina.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1847.00.00.a,1847.00.00.a,224,,
4868,1928.09.00,Sep-28,1928,Unprovoked,USA,Texas,Corpus Christi,Fishing,W.R. Loesberg,M,...,,"Galveston Daily News, 12/7/1929",1928.09.00-Loesberg.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1928.09.00,1928.09.00,1125,,
2289,1993.00.00.c,Between May & Nov-1993,1993,Unprovoked,SOMALIA,Banaadir Region,Mogadishu,,several Somali children,,...,,"Orlando Sentinel, 112/1/1993, p.A9",1993.00.00.c - Somali children.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1993.00.00.c,1993.00.00.c,3705,,
4266,1950.00.00.e,Summer 1950,1950,Unprovoked,GREECE,,"Piraeus, Athens",Swimming,,,...,,"V.M. Coppleson (1958), p.259; L. Schultz & M. ...",1950.00.00.e-Piraeus.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1950.00.00.e,1950.00.00.e,1727,,
3188,1970.00.00.c,1970s,1970,Unprovoked,IRAQ,Basrah,Shatt-al-Arab River near Abu al Khasib,Washing clothes on stairs,female,F,...,,B.W. Coad & L.A.J. Al-Hassan,1970.00.00.c-Abu-a-Khasib.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1970.00.00.c,1970.00.00.c,2805,,


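Many of the remaining case numbers use "00" for the month or day (presumably a placeholder for an unknown value), which is not a valid date. Replace ".00" with ".01" in the "Case Number" column so that these values can be parsed.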

In [63]:
df_ninfer_ncase["Case Number"]

151         2015.09.00
282         2014.09.00
303         2014.08.00
496         2012.12.00
579         2012.04.00
801         2010.06.00
954         2009.01.00
964         2008.12.00
987         2008.09.00
1016        2008.07.00
1074      2008.00.00.b
1075      2008.00.00.a
1090        2007.11.00
1142        2007.07.00
1161        2007.05.00
1179        2007.02.00
1201      2006.10.00.b
1202      2006.10.00.a
1223      2006.08.00.b
1224      2006.08.00.a
1402        2004.11.00
1424        2004.08.00
1500        2003.11.00
1524        2003.09.00
1536      2003.07.00.c
1537      2003.07.00.b
1538      2003.07.00.a
1547        2003.06.00
1553        2003.05.00
1590      2002.11.00.a
1714        2001.07.00
1727        2001.05.00
1749        2001.03.00
1763        2000.12.00
1793      2000.09.00.b
1803        2000.08.00
1820        2000.07.00
1829        2000.06.00
1846        2000.03.00
2150        1995.07.00
2234      1994.00.00.b
2272      1993.05.00.b
2288      1993.00.00.d
2289      1

In [68]:
# Replace ".00" as a literal substring. The previous pattern "\.00" was an
# invalid escape sequence in a non-raw string and only worked while
# Series.str.replace treated patterns as regular expressions by default
# (that default changed in pandas 2.0). regex=False makes the intent explicit
# and gives the same result on every pandas version.
(df_ninfer_ncase["Case Number"]
 .str.replace(".00", ".01", regex=False)
 .str.extract(date_pattern, expand=False)
 .apply(are_you_inferable2)).value_counts()

True     204
False     13
Name: Case Number, dtype: int64

In [69]:
# Rows whose "Case Number" date is still not inferable after replacing the
# ".00" placeholders with ".01", kept as an independent copy.
# ".00" is replaced as a literal substring: the previous pattern "\.00" was an
# invalid escape sequence in a non-raw string and depended on
# Series.str.replace defaulting to regex=True, which is no longer the case
# since pandas 2.0.
df_ninfer_ncase_nrepl = df_ninfer_ncase[
    ~(df_ninfer_ncase["Case Number"]
      .str.replace(".00", ".01", regex=False)
      .str.extract(date_pattern, expand=False)
      .apply(are_you_inferable2))
].copy()

In [72]:
# NOTE(review): DataFrame.drop returns a new object unless inplace=True; the
# result is not assigned here, so `df` itself is left unchanged. The recorded
# output below shows an index rather than a DataFrame, which suggests the cell
# was edited after it was last run. Confirm whether an assignment was intended.
df.drop(df_ninfer_ncase_nrepl.index)

Int64Index([5855, 5856, 5857, 5858, 5859, 5860, 5861, 5862, 5863, 5864, 5865,
            5866, 5867],
           dtype='int64')

In [71]:
pd.to_datetime("1642.01.01", infer_datetime_format=True)

OutOfBoundsDatetime: Out of bounds nanosecond timestamp: 1642-01-01 00:00:00

Apply all the transformations. 

In [74]:
df["Age"].value_counts()

17                         148
18                         145
19                         138
20                         136
15                         135
16                         134
21                         115
22                         113
24                         103
25                         101
14                          97
13                          91
26                          80
23                          80
27                          78
28                          77
29                          75
30                          74
12                          70
35                          64
32                          64
10                          51
31                          50
40                          50
38                          47
34                          45
43                          43
36                          41
33                          40
37                          37
39                          37
42                          36
9       