## Importing libraries and file

In [3]:
import pandas as pd
import numpy as np
import re
import math

In [4]:
shark = pd.read_csv('attacks.csv', encoding = "ISO-8859-1")

## First explorations on the data

In [5]:
shark.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25723 entries, 0 to 25722
Data columns (total 24 columns):
Case Number               8702 non-null object
Date                      6302 non-null object
Year                      6300 non-null float64
Type                      6298 non-null object
Country                   6252 non-null object
Area                      5847 non-null object
Location                  5762 non-null object
Activity                  5758 non-null object
Name                      6092 non-null object
Sex                       5737 non-null object
Age                       3471 non-null object
Injury                    6274 non-null object
Fatal (Y/N)               5763 non-null object
Time                      2948 non-null object
Species                   3464 non-null object
Investigator or Source    6285 non-null object
pdf                       6302 non-null object
href formula              6301 non-null object
href                      6302 non-null obje

In [6]:
shark.shape

(25723, 24)

In [7]:
shark.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

## I stablish which are my questions that drive my investigation over the data so I can know what data plays an important role :

### What are the areas with a high concentration of shark attacks?
### What percentage of the attacks are lethal?
### Has de severity of the attacks decreased with time?

Therefore, I drop the columns that doesn't have any relation with all this questions

In [8]:
shark.drop(axis = 1, columns = ['Unnamed: 22',
 'Unnamed: 23',
 'href',
 'Sex ',
 'href formula',
 'pdf',
 'Species ',
 'original order',
 'Investigator or Source',
 'Case Number.1',
 'Case Number.2',
'Case Number'],inplace=True)

def process_columns(columns):
    res = []
    for col in columns:
        coli = col.replace(' ','').lower().replace('.',' ').capitalize()
        if('Fatal' in col):
            coli = 'Fatal'
        res.append(coli)
    return res

shark.columns = process_columns(shark.columns)

In [9]:
shark.columns

Index(['Date', 'Year', 'Type', 'Country', 'Area', 'Location', 'Activity',
       'Name', 'Age', 'Injury', 'Fatal', 'Time'],
      dtype='object')

## Checking how many null values there are on the fields

In [10]:
shark.isna().sum()

Date        19421
Year        19423
Type        19425
Country     19471
Area        19876
Location    19961
Activity    19965
Name        19631
Age         22252
Injury      19449
Fatal       19960
Time        22775
dtype: int64

In [11]:
shark.drop(axis=1,columns=['Time'],inplace=True)
shark.drop(axis=1,columns=['Age'],inplace=True)
shark.dropna(how='any', inplace = True)

In [12]:
#Drop all duplicates considering all columns
shark.drop_duplicates(subset=list(shark.columns))

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Name,Injury,Fatal
0,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,"No injury to occupant, outrigger canoe and pad...",N
1,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,Minor injury to left thigh,N
2,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,Injury to left lower leg from surfboard skeg,N
3,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,Minor injury to lower leg,N
4,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,Lacerations to leg & hand shark PROVOKED INCIDENT,N
...,...,...,...,...,...,...,...,...,...,...
6290,Before 19-Jul-1913,0.0,Unprovoked,SOUTH AFRICA,KwaZulu-Natal,Durban,Wading,a young Scotsman,"FATAL, leg stripped of flesh",Y
6296,Before 1906,0.0,Unprovoked,AUSTRALIA,New South Wales,,Swimming,Arab boy,FATAL,Y
6297,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,FATAL,Y
6299,1900-1905,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,FATAL,Y


In [13]:
shark['Fatal'].unique()

array(['N', 'Y', 'M', '2017', 'UNKNOWN', ' N'], dtype=object)

In [14]:
def fix_fatal(x):
    if 'y' in x.lower():
        return 'Y'
    elif 'n' in x.lower():
        return 'N'
    else:
        return 'UNKNOWN'
#We observe some anomalies that need to be normalized 
shark['Fatal'] = shark['Fatal'].apply(fix_fatal)

In [15]:
shark['Fatal'].unique()

array(['N', 'Y', 'UNKNOWN'], dtype=object)

In [16]:
for i in shark.index:
    print(shark['Year'][i])
    if shark['Year'][i]:
        continue
    else:
        res = re.search('\d{4}?',shark['Date'][i])
        if res and res.group(0):
            shark.loc[i,'Year'] = res.group(0)

2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2018.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0
2017.0

1989.0
1989.0
1989.0
1989.0
1989.0
1989.0
1989.0
1989.0
1989.0
1989.0
1989.0
1989.0
1989.0
1989.0
1989.0
1989.0
1989.0
1989.0
1989.0
1989.0
1989.0
1989.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1988.0
1987.0
1987.0
1987.0
1987.0
1987.0
1987.0
1987.0
1987.0
1987.0
1987.0
1987.0
1987.0
1987.0
1987.0
1987.0
1987.0
1987.0
1987.0
1987.0
1987.0
1987.0
1987.0
1987.0
1986.0
1986.0
1986.0
1986.0
1986.0
1986.0
1986.0
1986.0
1986.0
1986.0
1986.0
1986.0
1986.0
1986.0
1986.0
1986.0
1986.0
1986.0
1986.0
1986.0
1986.0
1986.0
1986.0
1986.0
1986.0
1986.0
1986.0
1986.0
1986.0
1986.0
1985.0
1985.0
1985.0
1985.0
1985.0
1985.0
1985.0
1985.0
1985.0
1985.0
1985.0
1985.0
1985.0
1985.0
1985.0
1985.0
1985.0
1985.0
1985.0
1985.0
1985.0
1985.0
1985.0
1985.0
1985.0
1985.0
1985.0
1985.0
1985.0

1908.0
1908.0
1908.0
1908.0
1908.0
1907.0
1907.0
1907.0
1907.0
1907.0
1907.0
1907.0
1907.0
1907.0
1907.0
1907.0
1907.0
1907.0
1907.0
1907.0
1907.0
1907.0
1907.0
1907.0
1907.0
1906.0
1906.0
1906.0
1906.0
1906.0
1906.0
1906.0
1906.0
1906.0
1906.0
1906.0
1905.0
1905.0
1905.0
1905.0
1905.0
1905.0
1905.0
1905.0
1905.0
1905.0
1905.0
1905.0
1904.0
1904.0
1904.0
1904.0
1904.0
1904.0
1904.0
1904.0
1904.0
1903.0
1903.0
1903.0
1903.0
1903.0
1903.0
1903.0
1902.0
1902.0
1902.0
1902.0
1902.0
1902.0
1902.0
1902.0
1902.0
1902.0
1902.0
1902.0
1901.0
1901.0
1901.0
1901.0
1901.0
1901.0
1901.0
1900.0
1900.0
1900.0
1900.0
1900.0
1900.0
1900.0
1900.0
1900.0
1900.0
1899.0
1899.0
1899.0
1899.0
1899.0
1899.0
1899.0
1899.0
1899.0
1899.0
1899.0
1899.0
1899.0
1899.0
1899.0
1899.0
1898.0
1898.0
1898.0
1898.0
1898.0
1898.0
1898.0
1898.0
1898.0
1898.0
1898.0
1898.0
1897.0
1897.0
1897.0
1897.0
1897.0
1897.0
1896.0
1896.0
1896.0
1896.0
1896.0
1896.0
1896.0
1895.0
1895.0
1895.0
1895.0
1895.0
1895.0
1895.0
1895.0
1895.0

In [17]:
shark.drop(shark[shark.Year==0].index)

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Name,Injury,Fatal
0,25-Jun-2018,2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,"No injury to occupant, outrigger canoe and pad...",N
1,18-Jun-2018,2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,Minor injury to left thigh,N
2,09-Jun-2018,2018,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,Injury to left lower leg from surfboard skeg,N
3,08-Jun-2018,2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,Minor injury to lower leg,N
4,04-Jun-2018,2018,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,Lacerations to leg & hand shark PROVOKED INCIDENT,N
...,...,...,...,...,...,...,...,...,...,...
6290,Before 19-Jul-1913,1913,Unprovoked,SOUTH AFRICA,KwaZulu-Natal,Durban,Wading,a young Scotsman,"FATAL, leg stripped of flesh",Y
6296,Before 1906,1906,Unprovoked,AUSTRALIA,New South Wales,,Swimming,Arab boy,FATAL,Y
6297,Before 1903,1903,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,FATAL,Y
6299,1900-1905,1900,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,FATAL,Y


In [18]:
shark.loc[6271,'Year'] = 1940
shark.loc[6272,'Year'] = 1940

In [19]:
def convert_year(year):
    return int(year)

shark['Year'] = shark['Year'].apply(convert_year)

In [20]:
def classify_year(y):
    if y <= 1900:
        return 1900
    for x in range(1920,2020,20):
        if y<=x:
            return x

In [21]:
shark['year_by_20'] = shark['Year'].apply(classify_year)
shark.groupby('year_by_20')['Fatal'].value_counts()

year_by_20  Fatal
1900.0      Y        188
            N        165
1920.0      Y        113
            N        108
1940.0      N        238
            Y        133
1960.0      N        406
            Y        166
1980.0      N        510
            Y        127
2000.0      N        739
            Y        120
Name: Fatal, dtype: int64

In [22]:
shark['year_by_20'] = shark['year_by_20'].fillna(shark['Year']).apply(convert_year)
shark

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Name,Injury,Fatal,year_by_20
0,25-Jun-2018,2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,"No injury to occupant, outrigger canoe and pad...",N,2018
1,18-Jun-2018,2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,Minor injury to left thigh,N,2018
2,09-Jun-2018,2018,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,Injury to left lower leg from surfboard skeg,N,2018
3,08-Jun-2018,2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,Minor injury to lower leg,N,2018
4,04-Jun-2018,2018,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,Lacerations to leg & hand shark PROVOKED INCIDENT,N,2018
...,...,...,...,...,...,...,...,...,...,...,...
6290,Before 19-Jul-1913,1913,Unprovoked,SOUTH AFRICA,KwaZulu-Natal,Durban,Wading,a young Scotsman,"FATAL, leg stripped of flesh",Y,1920
6296,Before 1906,1906,Unprovoked,AUSTRALIA,New South Wales,,Swimming,Arab boy,FATAL,Y,1920
6297,Before 1903,1903,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,FATAL,Y,1920
6299,1900-1905,1900,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,FATAL,Y,1900


# After cleaning the Dataframe now we search in the data for some question and search answers for them in the data.

## 1-What are the areas with a high concentration of shark attacks?

In [23]:
shark['Area'].value_counts().nlargest(10)

Florida                  906
New South Wales          393
California               250
Queensland               250
Hawaii                   243
Western Cape Province    170
KwaZulu-Natal            165
Western Australia        156
Eastern Cape Province    145
South Carolina           120
Name: Area, dtype: int64

## 2- What percentage of the attacks are lethal?

In [24]:
lethal_victims = int(shark[shark.Fatal == 'Y']['Fatal'].value_counts())
non_lethal = int(shark[shark.Fatal == 'N']['Fatal'].value_counts())
print(f'The total of letal victims is {lethal_victims}')
print(f'The total of nonlethal victims is {non_lethal}')
print(f'The percentage of attacks that are lethal is {round(lethal_victims/(lethal_victims+non_lethal)*100,2)}%')

The total of letal victims is 978
The total of nonlethal victims is 3684
The percentage of attacks that are lethal is 20.98%


##  3- Has de severity of the attacks decreased with time?

( I couldn't solve the issue with some dates getting parsed as Nan for some reason so I finally managed to hardpaste the year, which provoques this ugly check to exclude the values that are not %20.)

In [41]:
print(list(shark['year_by_20'].unique()))

[2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008, 2007, 2006, 2005, 2004, 2003, 2002, 2001, 2000, 1980, 1960, 1900, 1940, 1920]


In [44]:
for val in shark['year_by_20'].unique():
    if val % 20 == 0:
        lethal_victims = int(shark[(shark.Fatal == 'Y') & (shark['year_by_20']== val)]['Fatal'].value_counts())
        non_lethal = int(shark[(shark.Fatal == 'N') & (shark['year_by_20']== val)]['Fatal'].value_counts())
        print(f'The percentage of attacks in {val} that are lethal is {round(lethal_victims/(lethal_victims+non_lethal)*100,2)}%')

The percentage of attacks in 2000 that are lethal is 13.97%
The percentage of attacks in 1980 that are lethal is 19.94%
The percentage of attacks in 1960 that are lethal is 29.02%
The percentage of attacks in 1900 that are lethal is 53.26%
The percentage of attacks in 1940 that are lethal is 35.85%
The percentage of attacks in 1920 that are lethal is 51.13%
