## Importing libraries and file

In [85]:
import pandas as pd
import numpy as np
import re
import math

In [2]:
shark = pd.read_csv('attacks.csv', encoding = "ISO-8859-1")

## First explorations on the data

In [3]:
shark.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25723 entries, 0 to 25722
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Case Number             8702 non-null   object 
 1   Date                    6302 non-null   object 
 2   Year                    6300 non-null   float64
 3   Type                    6298 non-null   object 
 4   Country                 6252 non-null   object 
 5   Area                    5847 non-null   object 
 6   Location                5762 non-null   object 
 7   Activity                5758 non-null   object 
 8   Name                    6092 non-null   object 
 9   Sex                     5737 non-null   object 
 10  Age                     3471 non-null   object 
 11  Injury                  6274 non-null   object 
 12  Fatal (Y/N)             5763 non-null   object 
 13  Time                    2948 non-null   object 
 14  Species                 3464 non-null 

In [4]:
shark.shape

(25723, 24)

In [5]:
shark.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

In [8]:
shark.drop(axis = 1, columns = ['Unnamed: 22',
 'Unnamed: 23',
 'href',
 'Sex ',
 'href formula',
 'pdf',
                                'Species ',
 'original order',
 'Investigator or Source',
 'Case Number.1',
 'Case Number.2',
'Case Number'],inplace=True)

def process_columns(columns):
    res = []
    for col in columns:
        coli = col.replace(' ','').lower().replace('.',' ').capitalize()
        if('Fatal' in col):
            coli = 'Fatal'
        res.append(coli)
    return res

shark.columns = process_columns(shark.columns)

In [9]:
shark.columns

Index(['Date', 'Year', 'Type', 'Country', 'Area', 'Location', 'Activity',
       'Name', 'Age', 'Injury', 'Fatal', 'Time'],
      dtype='object')

## Checking how many null values there are on the fields

In [10]:
shark.dropna(how='any',thresh=7, inplace = True)

In [11]:
shark

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Name,Age,Injury,Fatal,Time
0,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,57,"No injury to occupant, outrigger canoe and pad...",N,18h00
1,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,11,Minor injury to left thigh,N,14h00 -15h00
2,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,48,Injury to left lower leg from surfboard skeg,N,07h45
3,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,,Minor injury to lower leg,N,
4,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,
...,...,...,...,...,...,...,...,...,...,...,...,...
6297,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,,FATAL,Y,
6298,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,Ahmun,,FATAL,Y,
6299,1900-1905,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,,FATAL,Y,
6300,1883-1889,0.0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,Jules Patterson,,FATAL,Y,


In [12]:
shark.isna().sum()

Date           0
Year           2
Type           4
Country       44
Area         434
Location     518
Activity     525
Name         198
Age         2806
Injury        23
Fatal        522
Time        3329
dtype: int64

In [14]:
shark.drop(axis=1,columns=['Time'],inplace=True)

shark.drop(axis=1,columns=['Age'],inplace=True)

KeyError: "['Time'] not found in axis"

In [66]:
shark.groupby('Date').count()

Unnamed: 0_level_0,Year,Type,Country,Area,Location,Activity,Name,Age,Injury,Fatal,Species,Investigatororsource
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
\n1951.12.15.R,1,1,1,1,1,1,1,0,1,0,1,1
10-Jan-2009,1,1,1,1,1,1,1,0,1,1,1,1
15-Jun-1937,1,1,1,1,1,1,1,1,1,1,0,1
16-Jan-1970,1,1,1,1,1,1,1,0,1,1,0,1
22-Jul-2013,1,1,1,1,1,1,1,1,1,1,0,1
21-Sep-1908,1,1,1,1,1,1,1,0,1,1,1,1
02-Jun-1899,1,1,1,1,1,1,1,0,1,0,1,1
03-Feb-1914,1,1,1,1,1,1,1,0,1,1,0,1
05-Oct-1985,1,1,1,1,1,1,1,0,1,1,1,1
10-Jan-1903,1,1,1,1,1,0,1,0,1,1,1,1


In [15]:
#Drop all duplicates considering all columns
shark.drop_duplicates(subset=list(shark.columns))

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Name,Age,Injury,Fatal
0,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,57,"No injury to occupant, outrigger canoe and pad...",N
1,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,11,Minor injury to left thigh,N
2,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,48,Injury to left lower leg from surfboard skeg,N
3,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,,Minor injury to lower leg,N
4,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,,Lacerations to leg & hand shark PROVOKED INCIDENT,N
...,...,...,...,...,...,...,...,...,...,...,...
6297,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,,FATAL,Y
6298,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,Ahmun,,FATAL,Y
6299,1900-1905,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,,FATAL,Y
6300,1883-1889,0.0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,Jules Patterson,,FATAL,Y


In [58]:
def classify_year(y):
    if y <= 1900:
        return 1900
    for x in range(1920,2020,20):
        if y<=x:
            return x
shark['year_by_20'] = shark['Year'].apply(classify_year)
shark.groupby('year_by_20')['Fatal'].value_counts()

year_by_20  Fatal
1900.0      Y        364
            N        306
1920.0      Y        136
            N        130
1940.0      N        284
            Y        162
1960.0      N        506
            Y        244
1980.0      N        637
            Y        172
2000.0      N        835
            Y        152
Name: Fatal, dtype: int64

In [59]:
shark['Fatal'].unique()
shark['Fatal'].dropna(how='any', inplace = True)
shark['Fatal']

0       N
1       N
2       N
3       N
4       N
       ..
6297    Y
6298    Y
6299    Y
6300    Y
6301    Y
Name: Fatal, Length: 5755, dtype: object

In [60]:
def fix_fatal(x):
    if 'y' in x.lower():
        return 'Y'
    elif 'n' in x.lower():
        return 'N'
    else:
        return 'UNKNOWN'
#We observe some anomalies that need to be normalized 
shark['Fatal'] = shark['Fatal'].apply(fix_fatal)

In [61]:
shark['Fatal'].unique()

array(['N', 'Y', nan], dtype=object)

# After cleaning the Dataframe now we search in the data for some question and search answers for them in the data.

## 1-What are the areas with a high concentration of shark attacks?

In [29]:
shark['Area'].value_counts().nlargest(10)

Florida                  1037
New South Wales           486
Queensland                311
Hawaii                    298
California                290
KwaZulu-Natal             213
Western Cape Province     195
Western Australia         189
South Carolina            160
Eastern Cape Province     160
Name: Area, dtype: int64

### 2- What percentage of the attacks are lethal?

In [52]:
lethal_victims = int(shark[shark.Fatal == 'Y']['Fatal'].value_counts())
non_lethal = int(shark[shark.Fatal == 'N']['Fatal'].value_counts())
print(f'The percentage of attacks that are lethal is {round(lethal_victims/(lethal_victims+non_lethal)*100,2)}%')

The percentage of attacks that are lethal is 24.44%


##  3- Has de severity of the attacks decreased with time?

In [86]:
shark.replace([np.inf, -np.inf], np.nan)
shark['Year'].dropna(inplace=True)

In [94]:
shark.astype({'Year': 'int64'})

ValueError: Cannot convert non-finite values (NA or inf) to integer

In [92]:
def convert_year(year):
    return int(year)

shark['Year'] = shark['Year'].apply(convert_year)
    
    
        

In [93]:
shark

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Name,Age,Injury,Fatal,year_by_20
0,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,57,"No injury to occupant, outrigger canoe and pad...",N,
1,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,11,Minor injury to left thigh,N,
2,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,48,Injury to left lower leg from surfboard skeg,N,
3,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,,Minor injury to lower leg,N,
4,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,
...,...,...,...,...,...,...,...,...,...,...,...,...
6297,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,,FATAL,Y,1900.0
6298,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,Ahmun,,FATAL,Y,1900.0
6299,1900-1905,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,,FATAL,Y,1900.0
6300,1883-1889,0.0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,Jules Patterson,,FATAL,Y,1900.0
