# Importing Libraries

In [1]:
import pandas as pd
import re
import numpy as np
from IPython.display import display

# Reading and Cleaning CSV

In [2]:
# Load the raw attack records. 'latin' is Python's codec alias for Latin-1 (ISO-8859-1).
shark_attack = pd.read_csv('data/attacks.csv', encoding='latin')

### Drop columns: 'Unnamed: 22', 'Unnamed: 23', 'href formula', 'pdf' and duplicated rows

In [3]:
# Columns that carry no usable information for this analysis.
unused_columns = ['Unnamed: 22', 'Unnamed: 23', 'href formula', 'pdf']
shark_attack = shark_attack.drop(columns=unused_columns)
# Remove rows that are exact duplicates once those columns are gone.
shark_attack = shark_attack.drop_duplicates()

### Removing rows that have 18 or more null values:

In [4]:
# Keep only rows with fewer than 18 missing cells.
null_count_per_row = shark_attack.isnull().sum(axis=1)
rows_with_data = null_count_per_row < 18
shark_attack = shark_attack.loc[rows_with_data, :]

In [5]:
# Show every column when a DataFrame is displayed (no "..." truncation).
pd.set_option('display.max_columns', None)
# Print NumPy arrays in full instead of summarising long ones.
np.set_printoptions(threshold=np.inf)

### Removing 'Case Number.1' and 'Case Number.2' columns, because they are duplicated

In [6]:
# Rebind to the reduced frame instead of mutating in place.
shark_attack = shark_attack.drop(columns=['Case Number.1', 'Case Number.2'])

### Removing 'Investigator or Source' column, because it won't be used for the analysis of the dataset.

In [7]:
# Rebind to the reduced frame instead of mutating in place.
shark_attack = shark_attack.drop(columns=['Investigator or Source'])

### Removing 'original order' column, because it won't be useful for the analysis.

In [8]:
# Rebind to the reduced frame instead of mutating in place.
shark_attack = shark_attack.drop(columns=['original order'])

In [9]:
shark_attack.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,href
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,http://sharkattackfile.net/spreadsheets/pdf_di...
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,http://sharkattackfile.net/spreadsheets/pdf_di...
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,,http://sharkattackfile.net/spreadsheets/pdf_di...
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,http://sharkattackfile.net/spreadsheets/pdf_di...
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",http://sharkattackfile.net/spreadsheets/pdf_di...


# Categorizing body parts

In [10]:
def list_to_pattern(lst : list) -> str:
    '''
    Build a regex alternation from a list of keywords.

    Each item is used verbatim, so items may themselves be regex fragments
    (for example 'l?eg'). No escaping or word-boundary anchoring is added.

    Parameters:
    lst (list): Keywords or regex fragments, e.g. body part words

    Returns:
    string: The items joined with '|' (an empty string for an empty list)
    '''
    # str.join inserts the separator only between items, so no trailing
    # '|' has to be trimmed afterwards.
    return '|'.join(lst)

In [11]:
# Keyword lists, one per category. Every entry is treated as a regex
# fragment and later matched case-insensitively anywhere in the injury text.
#
# NOTE(review): matching is by substring, not whole word. The rendered tables
# in this notebook show the effect: "speared" sets Head (via 'ear') and
# "shipwrecked" sets Torso (via 'hip'). Confirm whether word boundaries are
# wanted before relying on the percentages.
arm = [
    'hand', 'arm', 'forearm', 'wrist',
    'shoulder', 'elbow', 'finger', 'thumb',
]
leg = [
    'ankle', 'knee', 'foot', 'feet', 'thigh',
    'l?eg',  # optional 'l': matches "leg" and also a bare "eg"
    'calf', 'buttock', 'pelvis', 'shin', 'heel', 'toe',
]
head = [
    'head', 'neck', 'face', 'ear',
    'nose', 'mouth', 'scalp',
]
torso = [
    'torso', 'chest', 'back',
    'abdomen', 'hip', 'flank',
]
fatal = ['fatal', 'death', 'kill']
# Tolerates several spellings/plurals of "no injury".
no_injury = ['no in?j?ur[iy]e?s?']
hoax = ['hoax', 'Erroneously']

### Transforming the body parts lists to regular expressions:

In [12]:
# Turn every keyword list into its '|'-joined regex, in one pass.
(
    arm_pattern,
    leg_pattern,
    head_pattern,
    torso_pattern,
    fatal_pattern,
    no_injury_pattern,
    hoax_pattern,
) = [
    list_to_pattern(keywords)
    for keywords in (arm, leg, head, torso, fatal, no_injury, hoax)
]

### Creating new columns and setting the values to 0:

In [13]:
# Add one 0/1 indicator column per category, all starting at 0.
for new_column in ('Arm', 'Leg', 'Head', 'Torso', 'Fatal', 'No_Injury', 'Hoax'):
    shark_attack[new_column] = 0

### Counting how many occurrences for each body part

In [14]:
# Flag each row with 1 when its injury description mentions a keyword of the
# category, 0 otherwise.
#
# The flags are computed per column and assigned back to the DataFrame
# directly. This avoids two problems of updating row by row through
# `shark_attack[col].update(pd.Series([1], index=[position]))`:
#   * the update targets a temporary column Series, which does not write
#     back to the DataFrame when pandas Copy-on-Write is active;
#   * a running position counter is only a valid index label while the
#     index is exactly 0..n-1, which earlier row filtering does not guarantee.
injury_patterns = {
    'Arm': arm_pattern,
    'Leg': leg_pattern,
    'Head': head_pattern,
    'Torso': torso_pattern,
    'Fatal': fatal_pattern,
    'No_Injury': no_injury_pattern,
    'Hoax': hoax_pattern,
}

# Work on text only; missing descriptions simply match nothing (na=False).
injury_text = shark_attack['Injury'].astype(str)

for column, pattern in injury_patterns.items():
    mentioned = injury_text.str.contains(pattern, flags=re.I, regex=True, na=False)
    shark_attack[column] = mentioned.astype(int)

### Unknown body part:

In [15]:
# Rows where none of the category indicators was set.
indicator_columns = ['Arm', 'Leg', 'Head', 'Torso', 'No_Injury', 'Fatal', 'Hoax']
no_category_set = (shark_attack[indicator_columns] == 0).all(axis=1)
unknown_injury = shark_attack[no_category_set]

In [16]:
unknown_injuries = unknown_injury.shape[0]

### Attacks in each body part in percentage:

In [17]:
# Share of all rows flagged for each category, as a percentage.
total_rows = shark_attack.shape[0]
labelled_columns = [
    ('Leg attacks', 'Leg'),
    ('Arm attacks', 'Arm'),
    ('Head attacks', 'Head'),
    ('Torso attacks', 'Torso'),
    ('No injuries', 'No_Injury'),
    ('Hoax', 'Hoax'),
]
for label, column in labelled_columns:
    share = round(shark_attack[column].sum() / total_rows * 100, 2)
    print(f"{label}: {share}%")

unknown_share = round(unknown_injuries / total_rows * 100, 2)
print(f'Unknown injuries: {unknown_share}%')

Leg attacks: 41.7%
Arm attacks: 18.55%
Head attacks: 6.6%
Torso attacks: 4.76%
No injuries: 12.87%
Hoax: 0.08%
Unknown injuries: 10.33%


# Cleaning 'Sex' Column

In [18]:
# The raw header has a trailing space ('Sex '); normalise it to 'Sex'.
column_fixes = {'Sex ': 'Sex'}
shark_attack = shark_attack.rename(columns=column_fixes)

### Unique values for 'Sex':

In [19]:
shark_attack['Sex'].unique()

array(['F', 'M', nan, 'M ', 'lli', 'N', '.'], dtype=object)

In [20]:
shark_attack.query('Sex == "N"')

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,href,Arm,Leg,Head,Torso,Fatal,No_Injury,Hoax
4938,1934.07.11,11-Jul-1934,1934.0,Boating,AUSTRALIA,New South Wales,Cronulla,Fishing,"18' boat, occupants William & Leslie Newton",N,,No injury to occupants Sharks continually foll...,N,,"Blue pointer, 11'",http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0,0,1,0
6131,1801.12.18.R,Reported 18-Dec-1801,1801.0,Provoked,,,,Standing on landed shark's tail,Stephen Pettigew,N,,"FATAL, PROVOKED INCIDENT",Y,,12' shark,http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0,1,0,0


### According to the PDF with the information about each incident, both people who had 'Sex' = 'N' were male

In [21]:
# Both rows that carried 'N' are set to male in a single assignment.
rows_marked_n = [4938, 6131]
shark_attack.loc[rows_marked_n, 'Sex'] = 'M'

In [22]:
shark_attack.query('Sex == "lli"')

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,href,Arm,Leg,Head,Torso,Fatal,No_Injury,Hoax
1624,2004.11.11.b,11-Nov-2004,2004.0,Unprovoked,USA,California,"Bunkers, Humboldt Bay, Eureka, Humboldt County",Surfing,Brian Kang,lli,38,"Lacerations to hand, knee & thigh",N,13h30,5.5 m [18'] white shark,http://sharkattackfile.net/spreadsheets/pdf_di...,1,1,0,0,0,0,0


### According to the PDF with the information about the incident, the person who had 'Sex' = 'lli' was male

In [23]:
# Row label 1624 is the single record whose 'Sex' value is the garbled 'lli'.
shark_attack.loc[1624, 'Sex'] = 'M'

### Removing spaces before and after 'M' or 'F'

In [24]:
# .str.strip() leaves missing values missing, so no null mask is needed.
shark_attack['Sex'] = shark_attack['Sex'].str.strip()

In [25]:
shark_attack['Sex'].unique()

array(['F', 'M', nan, '.'], dtype=object)

In [26]:
shark_attack.query('Sex == "."')

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,href,Arm,Leg,Head,Torso,Fatal,No_Injury,Hoax
5437,1908.06.02.R,Reported 02-Jun-1908,1908.0,Sea Disaster,PAPUA NEW GUINEA,New Britain,Matupi,.,,.,,"Remains of 3 humans recovered from shark, but ...",Y,,Allegedly a 33-foot shark,http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0,1,0,0


### No information about the person, so the value is changed to Unknown

In [27]:
# '.' is a placeholder rather than a real value; mark it as missing so it is
# handled together with the other missing entries.
shark_attack.loc[5437, 'Sex'] = np.nan

In [28]:
# Label missing values as 'Unknown'. The result is assigned back to the
# column: an in-place fillna on `shark_attack['Sex']` acts on a temporary
# Series and does not reach the DataFrame under pandas Copy-on-Write.
shark_attack['Sex'] = shark_attack['Sex'].fillna('Unknown')

In [29]:
shark_attack['Sex'].value_counts()

M          5099
F           637
Unknown     566
Name: Sex, dtype: int64

# Cleaning 'Country' column

In [30]:
shark_attack['Country'].isna().sum()

50

In [31]:
# Label missing countries as 'Unknown'. The result is assigned back to the
# column: an in-place fillna on `shark_attack['Country']` acts on a temporary
# Series and does not reach the DataFrame under pandas Copy-on-Write.
shark_attack['Country'] = shark_attack['Country'].fillna('Unknown')

In [32]:
def _display_country(name):
    """Spell out 'USA' as 'United States'; title-case every other name."""
    if name == 'USA':
        return 'United States'
    return name.title()


shark_attack['Country'] = shark_attack['Country'].map(_display_country)

In [33]:
def clean_country(x):
    """
    Normalise a country label by removing uncertainty markers and padding.

    Trailing question marks (as in 'Sudan?') and surrounding whitespace are
    removed; everything else is kept. The earlier regex kept at most two
    words, so names such as 'Papua New Guinea' were cut to 'Papua New'.

    Parameters:
    x (str): Raw country label

    Returns:
    string: The label without trailing '?' and without outer whitespace
    """
    # Strip padding first, then the '?' suffix, then any space that stood
    # between the name and the '?'.
    return x.strip().rstrip('?').rstrip()

### Top 3 countries with most attacks:

In [34]:
shark_attack['Country'].apply(clean_country).value_counts().head(3)

United States    2229
Australia        1338
South Africa      579
Name: Country, dtype: int64

# Cleaning 'Fatal (Y/N)' column

## Next, we start cleaning the column that says whether the injury was fatal or not

In [35]:
shark_attack['Fatal (Y/N)'].value_counts()

N          4293
Y          1388
UNKNOWN      71
 N            7
2017          1
M             1
N             1
y             1
Name: Fatal (Y/N), dtype: int64

In [36]:
# Normalise the spelling of the fatal flag. The three substitutions run in
# the same order as before, but the result is assigned back to the column:
# an in-place replace on `shark_attack['Fatal (Y/N)']` acts on a temporary
# Series and does not reach the DataFrame under pandas Copy-on-Write.
shark_attack['Fatal (Y/N)'] = (
    shark_attack['Fatal (Y/N)']
    .replace(to_replace=' ?N ?', value='N', regex=True)    # Remove spaces around 'N'.
    .replace(to_replace=' ?[NnF]', value='N', regex=True)  # Map 'n' and 'F' to 'N'.
    .replace(to_replace='y', value='Y', regex=True)        # Turn 'y' into capital 'Y'.
)

In [37]:
shark_attack['Fatal (Y/N)'].value_counts()

N          4301
Y          1389
UNKNOWN      71
2017          1
M             1
Name: Fatal (Y/N), dtype: int64

## Later, we look for the values that differ from Y, N or UNKNOWN

In [38]:
shark_attack.loc[shark_attack['Fatal (Y/N)'] == '2017']

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,href,Arm,Leg,Head,Torso,Fatal,No_Injury,Hoax
786,2012.06.10,10-Jun-2012,2012.0,Provoked,Italy,Sardinia,Muravera,Attempting to rescue an injured & beached shark,Giorgio Zara,M,57,Lower left leg injured PROVOKED ACCIDENT,2017,Morning,"Blue shark, 2.5m",http://sharkattackfile.net/spreadsheets/pdf_di...,0,1,0,0,0,0,0


## Lower left leg injured PROVOKED ACCIDENT, It was not fatal. Therefore we switch it to N	

In [39]:
# The stray '2017' entry is a non-fatal case. Only cells equal to '2017' are
# replaced (exact match, no regex), and the result is assigned back because
# an in-place replace on a selected column acts on a temporary Series under
# pandas Copy-on-Write.
shark_attack['Fatal (Y/N)'] = shark_attack['Fatal (Y/N)'].replace('2017', 'N')

In [40]:
shark_attack.loc[shark_attack['Fatal (Y/N)'] == 'M']

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,href,Arm,Leg,Head,Torso,Fatal,No_Injury,Hoax
285,2016.04.18.b,18-Apr-2016,2016.0,Provoked,French Polynesia,Tuamotos,Makemo Atoll,Spearfishing,Hoata Iotua,M,22,Laceration to knee by speared shark PROVOKED I...,M,Morning,"Grey reef shark, 2 m",http://sharkattackfile.net/spreadsheets/pdf_di...,0,1,1,0,0,0,0


## Laceration to knee by speared shark PROVOKED INCIDENT. It was not fatal. Therefore we switch it to N

In [41]:
# The stray 'M' entry is a non-fatal case. Only cells equal to 'M' are
# replaced (exact match, no regex, so no other value containing an 'M' can be
# altered), and the result is assigned back because an in-place replace on a
# selected column acts on a temporary Series under pandas Copy-on-Write.
shark_attack['Fatal (Y/N)'] = shark_attack['Fatal (Y/N)'].replace('M', 'N')

In [42]:
shark_attack['Fatal (Y/N)'].value_counts()

N          4303
Y          1389
UNKNOWN      71
Name: Fatal (Y/N), dtype: int64

# Cleaning 'Year' column

## Next, we start cleaning the Year column.

In [43]:
shark_attack['Year'].value_counts().sum(axis=0)

6300

In [44]:
shark_attack.loc[shark_attack['Year'] == 0].head(60)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,href,Arm,Leg,Head,Torso,Fatal,No_Injury,Hoax
6177,0000.0214,Ca. 214 B.C.,0.0,Unprovoked,Unknown,Ionian Sea,,Ascending from a dive,"Tharsys, a sponge diver",M,,"FATAL, shark/s bit him in two",Y,,,http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0,1,0,0
6178,0000.0336,Ca. 336.B.C..,0.0,Unprovoked,Greece,Piraeus,In the haven of Cantharus,Washing his pig in preparation for a religious...,A candidate for initiation,M,,"FATAL, shark ""bit off all lower parts of him u...",Y,,,http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0,1,0,0
6179,0000.0493,493 B.C.,0.0,Sea Disaster,Greece,Off Thessaly,,Shipwrecked Persian Fleet,males,M,,Herodotus tells of sharks attacking men in the...,Y,,,http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0,0,0,0
6180,0000.0725,Ca. 725 B.C.,0.0,Sea Disaster,Italy,Tyrrhenian Sea,Krater found during excavations at Lacco Ameno...,Shipwreck,males,M,,Depicts shipwrecked sailors attacked by a sha...,Y,,,http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,1,0,0,0
6181,ND-0154,Before 1939,0.0,Unprovoked,Canada,,Grand Banks,Fishing,Joe Folsom,M,,Arm bitten,N,,,http://sharkattackfile.net/spreadsheets/pdf_di...,1,0,0,0,0,0,0
6182,ND-0153,1990 or 1991,0.0,Unprovoked,Kenya,Mombasa,Kilindini,Diving,Conway Plough & Dr. Jonathan Higgs,M,,Conway's leg was bitten Higgs injury was FATAL,N,,,http://sharkattackfile.net/spreadsheets/pdf_di...,0,1,0,0,1,0,0
6183,ND-0152,Before 2016,0.0,Unprovoked,Kenya,Mombasa,Kilindini,Diving,Hamisi Njenga,M,,FATAL,Y,,,http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0,1,0,0
6184,ND-0151,Before Oct-2009,0.0,Unprovoked,Panama,Bocas del Toro Province,Red Frog Beach,Swimming/,male,M,20.0,FATAL,Y,,,http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0,1,0,0
6185,ND-0150,Before 1934,0.0,Unprovoked,Uruguay,Rocha,"Isla Chica, La Paloma",Swimming,,Unknown,,Foot bitten,N,,,http://sharkattackfile.net/spreadsheets/pdf_di...,0,1,0,0,0,0,0
6186,ND-0149,Before 1934,0.0,Unprovoked,Uruguay,Rocha,"Playa del Barco, La Pedrera",Swimming,Maciello,M,,FATAL,Y,,,http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0,1,0,0


In [45]:
shark_attack['Year'].quantile(np.arange(0,1.1, 0.1))

0.0       0.0
0.1    1893.0
0.2    1930.0
0.3    1952.0
0.4    1962.0
0.5    1977.0
0.6    1992.0
0.7    2001.0
0.8    2008.0
0.9    2013.0
1.0    2018.0
Name: Year, dtype: float64

In [46]:
shark_attack['Year'].loc[shark_attack['Year'].isna()]

187    NaN
6079   NaN
Name: Year, dtype: float64

In [47]:
shark_attack.loc[187, :]

Case Number                                         2017.01.08.R
Date                                        Reported 08-Jan-2017
Year                                                         NaN
Type                                                     Invalid
Country                                                Australia
Area                                                  Queensland
Location                                                     NaN
Activity                                            Spearfishing
Name                                                Kerry Daniel
Sex                                                            M
Age                                                           35
Injury                    No attack, shark made a threat display
Fatal (Y/N)                                                  NaN
Time                                                         NaN
Species                                               Bull shark
href           http://sha

In [48]:
# Row 187 (case 2017.01.08.R) happened in 2017; according to the report, the
# shark threatened the victim.
# NOTE: this fills every missing year, so row 6079 (an 1836 case) also gets
# 2017 here and has to be corrected in a later cell.
# The result is assigned back to the column because an in-place fillna on
# `shark_attack['Year']` acts on a temporary Series under Copy-on-Write.
shark_attack['Year'] = shark_attack['Year'].fillna(2017)

In [49]:
shark_attack.loc[6079, :]

Case Number                                         1836.08.19.R
Date                                        Reported 19-Aug-1836
Year                                                        2017
Type                                                  Unprovoked
Country                                                  England
Area                                                  Cumberland
Location                                              Whitehaven
Activity                                                Swimming
Name                                                       a boy
Sex                                                            M
Age                                                          NaN
Injury                                                     FATAL
Fatal (Y/N)                                                    Y
Time                                                         NaN
Species                                                      NaN
href           http://sha

In [50]:
# According to the report, case 1836.08.19.R (row 6079) happened in 1836.
# Only that row is corrected: replacing the value 2017 across the whole
# column would also rewrite every genuine 2017 record (row 187 included).
shark_attack.loc[6079, 'Year'] = 1836

In [51]:
shark_attack.loc[6079, :]

Case Number                                         1836.08.19.R
Date                                        Reported 19-Aug-1836
Year                                                        1836
Type                                                  Unprovoked
Country                                                  England
Area                                                  Cumberland
Location                                              Whitehaven
Activity                                                Swimming
Name                                                       a boy
Sex                                                            M
Age                                                          NaN
Injury                                                     FATAL
Fatal (Y/N)                                                    Y
Time                                                         NaN
Species                                                      NaN
href           http://sha

In [52]:
shark_attack['Year'].value_counts().sum()

6302

In [53]:
shark_attack['Year'].loc[shark_attack['Year']==0]

6177    0.0
6178    0.0
6179    0.0
6180    0.0
6181    0.0
       ... 
6297    0.0
6298    0.0
6299    0.0
6300    0.0
6301    0.0
Name: Year, Length: 125, dtype: float64

In [54]:
shark_attack.loc[6177, :]

Case Number                                            0000.0214
Date                                                Ca. 214 B.C.
Year                                                           0
Type                                                  Unprovoked
Country                                                  Unknown
Area                                                  Ionian Sea
Location                                                     NaN
Activity                                   Ascending from a dive
Name                                     Tharsys, a sponge diver
Sex                                                            M
Age                                                          NaN
Injury                             FATAL, shark/s bit him in two
Fatal (Y/N)                                                    Y
Time                                                         NaN
Species                                                      NaN
href           http://sha

## Transform the values in column 'Year' into integers

In [55]:
# Years are whole numbers; store them as 64-bit integers instead of floats.
year_as_int = shark_attack['Year'].astype('int64')
shark_attack['Year'] = year_as_int

In [56]:
shark_attack.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,href,Arm,Leg,Head,Torso,Fatal,No_Injury,Hoax
0,2018.06.25,25-Jun-2018,2018,Boating,United States,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0,0,1,0
1,2018.06.18,18-Jun-2018,2018,Unprovoked,United States,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,http://sharkattackfile.net/spreadsheets/pdf_di...,0,1,0,0,0,0,0
2,2018.06.09,09-Jun-2018,2018,Invalid,United States,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,,http://sharkattackfile.net/spreadsheets/pdf_di...,0,1,0,0,0,0,0
3,2018.06.08,08-Jun-2018,2018,Unprovoked,Australia,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,http://sharkattackfile.net/spreadsheets/pdf_di...,0,1,0,0,0,0,0
4,2018.06.04,04-Jun-2018,2018,Provoked,Mexico,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",http://sharkattackfile.net/spreadsheets/pdf_di...,1,1,0,0,0,0,0


# How many deaths occurred when injured in the leg:

## The most common body part for the shark to attack is the leg (that includes: ankle, knee, foot, thigh, leg, calf, buttock, pelvis, shin, heel and toe):

In [57]:
# Largest number of flagged rows among the four body-part categories.
body_part_totals = shark_attack[['Leg', 'Arm', 'Head', 'Torso']].sum()
max_attack = body_part_totals.max()

In [58]:
print(f'{max_attack} attacks in the leg')

2628 attacks in the leg


In [59]:
# Per fatal-flag totals of the numeric columns (the 0/1 indicators and Year).
# numeric_only=True keeps text columns such as 'Name' or 'Injury' out of the
# sum; adding those per group either concatenates strings or raises on
# recent pandas versions.
deaths_by_body_part = shark_attack.groupby(by='Fatal (Y/N)').sum(numeric_only=True)

In [60]:
leg_injuries = shark_attack['Leg'].sum()

In [61]:
# Leg-flagged rows among the fatal ('Y') group, as a share of all leg-flagged rows.
deaths_leg = deaths_by_body_part.loc['Y', 'Leg']
leg_fatality_ratio = deaths_leg / leg_injuries
perc = round(leg_fatality_ratio * 100, 2)

In [62]:
print(f'According to the results, {perc}% of people who were injured in the leg died.')

According to the results, 10.16% of people who were injured in the leg died.
