# Importing Libraries

In [514]:
import pandas as pd
import re
import numpy as np
from IPython.display import display

# Reading and Cleaning CSV

In [515]:
# Load the raw attack records. encoding='latin' decodes the file as Latin-1
# (presumably because the CSV is not valid UTF-8 -- TODO confirm).
shark_attack = pd.read_csv(
    'data/attacks.csv',
    encoding='latin',
)

Drop columns: 'Unnamed: 22', 'Unnamed: 23', 'href formula', 'pdf' and duplicated rows

In [516]:
# Columns that are dropped up front, followed by removal of fully duplicated rows.
columns_to_discard = ['Unnamed: 22', 'Unnamed: 23', 'href formula', 'pdf']
shark_attack = shark_attack.drop(columns_to_discard, axis='columns')
shark_attack = shark_attack.drop_duplicates()

Removing rows that have 18 or more null values:

In [517]:
# Keep only the rows that have fewer than 18 missing values.
shark_attack = shark_attack[shark_attack.isna().sum(axis=1) < 18]

In [518]:
# Never summarise printed NumPy arrays, and show every DataFrame column.
np.set_printoptions(threshold=np.inf)
pd.options.display.max_columns = None

Removing 'Case Number.1' and 'Case Number.2' columns, because they are duplicated

In [519]:
# Rebind to a frame without the two extra case-number columns.
shark_attack = shark_attack.drop(columns=['Case Number.1', 'Case Number.2'])


Removing 'Investigator or Source' column, because it won't be used for the analysis of the dataset.

In [520]:
# Rebind to a frame without the investigator/source column.
shark_attack = shark_attack.drop(columns=['Investigator or Source'])

Removing 'original order' column because it won't be useful for the analysis.

In [521]:
# Rebind to a frame without the 'original order' column.
shark_attack = shark_attack.drop(columns=['original order'])

In [522]:
# Preview the first five rows after the column clean-up.
shark_attack.head(5)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,href
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,http://sharkattackfile.net/spreadsheets/pdf_di...
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,http://sharkattackfile.net/spreadsheets/pdf_di...
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,,http://sharkattackfile.net/spreadsheets/pdf_di...
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,http://sharkattackfile.net/spreadsheets/pdf_di...
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",http://sharkattackfile.net/spreadsheets/pdf_di...


# Categorizing body parts

In [523]:
def list_to_pattern(lst : list):
    '''
    Build a regex alternation from a list of body part words.

    Every item is inserted verbatim (nothing is escaped), so an item may
    itself be a regex fragment such as 'l?eg'.

    Parameters:
    lst (list): List of body part words (or regex fragments)

    Returns:
    string: The items joined with '|'; an empty string for an empty list
    '''

    # str.join replaces the manual "append '|' then trim the last one" loop.
    return '|'.join(lst)

In [524]:
# Keyword groups used to tag each injury description. Items are regex
# fragments, not plain words (see 'l?eg' and the no-injury entry).
# NOTE(review): nothing here is anchored to word boundaries, so a plain regex
# search also hits substrings (e.g. 'ear' inside 'forearm') -- confirm intended.
arm = [
    'hand', 'arm', 'forearm', 'wrist',
    'shoulder', 'elbow', 'finger', 'thumb',
]
leg = [
    'ankle', 'knee', 'foot', 'feet', 'thigh', 'l?eg',
    'calf', 'buttock', 'pelvis', 'shin', 'heel', 'toe',
]
head = [
    'head', 'neck', 'face', 'ear',
    'nose', 'mouth', 'scalp',
]
torso = [
    'torso', 'chest', 'back',
    'abdomen', 'hip', 'flank',
]
fatal = [
    'fatal', 'death', 'kill',
]
no_injury = [
    'no in?j?ur[iy]e?s?',
]
hoax = [
    'hoax', 'Erroneously',
]

Transforming the body parts lists to regular expressions:

In [525]:
# Convert every keyword group into its regex alternation in one pass.
(
    arm_pattern,
    leg_pattern,
    head_pattern,
    torso_pattern,
    fatal_pattern,
    no_injury_pattern,
    hoax_pattern,
) = map(list_to_pattern, (arm, leg, head, torso, fatal, no_injury, hoax))

Creating new columns and setting the values to 0:

In [526]:
# Add one 0-initialised indicator column per injury category.
for flag_column in ('Arm', 'Leg', 'Head', 'Torso', 'Fatal', 'No_Injury', 'Hoax'):
    shark_attack[flag_column] = 0

In [527]:
# Flag every row whose 'Injury' text matches a category pattern (1) or not (0).
#
# This is done column-wise instead of row by row because the previous loop
# used a positional counter as an index *label* in Series.update, which is
# only correct while the frame's index is exactly 0..n-1, and because a
# chained in-place update on shark_attack[col] is not written back to the
# frame when pandas Copy-on-Write is active.
injury_patterns = {
    'Arm': arm_pattern,
    'Leg': leg_pattern,
    'Head': head_pattern,
    'Torso': torso_pattern,
    'Fatal': fatal_pattern,
    'No_Injury': no_injury_pattern,
    'Hoax': hoax_pattern,
}

# Cast to text so non-string cells are searched too; na=False keeps missing
# descriptions unflagged in every category.
injury_text = shark_attack['Injury'].astype(str)

for flag_column, pattern in injury_patterns.items():
    matches = injury_text.str.contains(pattern, flags=re.I, regex=True, na=False)
    shark_attack[flag_column] = matches.astype(int)

Unknown body part:

In [528]:
# Rows where none of the seven indicator columns was set.
indicator_columns = ['Arm', 'Leg', 'Head', 'Torso', 'No_Injury', 'Fatal', 'Hoax']
unknown_injury = shark_attack[shark_attack[indicator_columns].eq(0).all(axis=1)]

In [529]:
# Number of rows that matched no category at all.
unknown_injuries = len(unknown_injury)

What percentage of attacks involved each body part?

In [530]:
# Share of all rows (in percent) flagged for each category, then the share
# of rows that matched no category.
total_rows = shark_attack.shape[0]
labelled_columns = [
    ('Leg attacks', 'Leg'),
    ('Arm attacks', 'Arm'),
    ('Head attacks', 'Head'),
    ('Torso attacks', 'Torso'),
    ('No injuries', 'No_Injury'),
    ('Fatal attacks', 'Fatal'),
    ('Hoax', 'Hoax'),
]
for label, column in labelled_columns:
    print(f"{label}: {shark_attack[column].sum() / total_rows * 100}%")
print(f'Unknown injuries: {unknown_injuries / total_rows * 100}%')

Leg attacks: 41.70104728657569%
Arm attacks: 18.54966677245319%
Head attacks: 6.60107902253253%
Torso attacks: 4.760393525864805%
No injuries: 12.868930498254521%
Fatal attacks: 22.92922881624881%
Hoax: 0.07933989209774674%
Unknown injuries: 10.330053951126626%


## Which body part was attacked most often?

# Cleaning 'Sex' Column

In [531]:
# The raw header carries a trailing space ('Sex '); normalise it to 'Sex'.
shark_attack = shark_attack.rename(columns={'Sex ': 'Sex'})

Unique values for 'Sex':

In [532]:
# Distinct raw values currently present in 'Sex'.
pd.unique(shark_attack['Sex'])

array(['F', 'M', nan, 'M ', 'lli', 'N', '.'], dtype=object)

In [533]:
# Show the rows whose 'Sex' value is 'N'.
shark_attack[shark_attack['Sex'] == 'N']

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,href,Arm,Leg,Head,Torso,Fatal,No_Injury,Hoax
4938,1934.07.11,11-Jul-1934,1934.0,Boating,AUSTRALIA,New South Wales,Cronulla,Fishing,"18' boat, occupants William & Leslie Newton",N,,No injury to occupants Sharks continually foll...,N,,"Blue pointer, 11'",http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0,0,1,0
6131,1801.12.18.R,Reported 18-Dec-1801,1801.0,Provoked,,,,Standing on landed shark's tail,Stephen Pettigew,N,,"FATAL, PROVOKED INCIDENT",Y,,12' shark,http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0,1,0,0


According to the PDF reports for these incidents, both people recorded with Sex = 'N' were male.

In [534]:
# Both rows that were recorded as 'N' are corrected to 'M'.
for row_label in (4938, 6131):
    shark_attack.at[row_label, 'Sex'] = 'M'

In [535]:
# Show the rows whose 'Sex' value is 'lli'.
shark_attack[shark_attack['Sex'] == 'lli']

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,href,Arm,Leg,Head,Torso,Fatal,No_Injury,Hoax
1624,2004.11.11.b,11-Nov-2004,2004.0,Unprovoked,USA,California,"Bunkers, Humboldt Bay, Eureka, Humboldt County",Surfing,Brian Kang,lli,38,"Lacerations to hand, knee & thigh",N,13h30,5.5 m [18'] white shark,http://sharkattackfile.net/spreadsheets/pdf_di...,1,1,0,0,0,0,0


According to the PDF report for this incident, the person recorded with 'Sex' = 'lli' was male.

In [536]:
# The row that was recorded as 'lli' is corrected to 'M'.
shark_attack.at[1624, 'Sex'] = 'M'

Removing spaces before and after 'M' or 'F'

In [537]:
# Trim surrounding whitespace; .str.strip() leaves missing values as NaN,
# so no explicit non-null mask is needed.
shark_attack['Sex'] = shark_attack['Sex'].str.strip()

In [538]:
# Distinct 'Sex' values after whitespace has been stripped.
pd.unique(shark_attack['Sex'])

array(['F', 'M', nan, '.'], dtype=object)

In [539]:
# Show the rows whose 'Sex' value is '.'.
shark_attack[shark_attack['Sex'] == '.']

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,href,Arm,Leg,Head,Torso,Fatal,No_Injury,Hoax
5437,1908.06.02.R,Reported 02-Jun-1908,1908.0,Sea Disaster,PAPUA NEW GUINEA,New Britain,Matupi,.,,.,,"Remains of 3 humans recovered from shark, but ...",Y,,Allegedly a 33-foot shark,http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0,1,0,0


There is no information about the person, so the value is changed to NaN.

In [540]:
# Replace the '.' placeholder in this row with a missing value.
shark_attack.at[5437, 'Sex'] = np.nan

In [541]:
# Label every missing 'Sex' value as 'Unknown'.
# The result is assigned back to the column: an in-place fillna on the
# shark_attack['Sex'] selection is a chained assignment, which pandas
# deprecates and which does not modify the frame under Copy-on-Write.
shark_attack['Sex'] = shark_attack['Sex'].fillna(value='Unknown')

In [542]:
# Number of rows per 'Sex' value after cleaning.
shark_attack['Sex'].value_counts()

M          5099
F           637
Unknown     566
Name: Sex, dtype: int64