# Importing Libraries

In [298]:
import pandas as pd
import re
import numpy as np
from IPython.display import display

# Reading CSV

In [299]:
# Load the raw attacks table. The file is decoded as Latin-1 ('latin' is an
# alias) instead of pandas' default UTF-8.
shark_attack = pd.read_csv('data/attacks.csv', encoding='latin')

Drop the unused columns ('Unnamed: 22', 'Unnamed: 23', 'href formula', 'pdf') and any duplicated rows

In [300]:
# Columns that are not used in the analysis; removed before de-duplicating rows.
unused_columns = ['Unnamed: 22', 'Unnamed: 23', 'href formula', 'pdf']
shark_attack = (
    shark_attack
    .drop(columns=unused_columns)
    .drop_duplicates()
)

In [301]:
# Count the missing fields in every row and keep only the rows that have
# fewer than 18 of them.
missing_per_row = shark_attack.isnull().sum(axis=1)
shark_attack = shark_attack.loc[missing_per_row < 18, :]

In [302]:
# Display settings (global for the session): show every DataFrame column and
# never summarise printed NumPy arrays with '...'.
pd.set_option('display.max_columns', None)
np.set_printoptions(threshold=np.inf)
# Preview the first rows of the table.
shark_attack.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,href,Case Number.1,Case Number.2,original order
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0


In [303]:
# (rows, columns) of the table at this point in the notebook.
shark_attack.shape

(6302, 20)

# Categorizing body parts

In [304]:
def list_to_pattern(lst : list) -> str:
    '''
    Build a regex alternation pattern from a list of body part words.

    The words are joined with '|' exactly as given (they are not escaped), so
    ['hand', 'arm'] becomes 'hand|arm'.

    Parameters:
    lst (list): List of body part words (strings)

    Returns:
    string: The words joined by '|'. An empty list yields '', which as a
            regex matches every string.
    '''

    # str.join places the separator only between items, so there is no
    # trailing '|' to trim and no repeated string concatenation.
    return '|'.join(lst)

In [305]:
# Keyword groups, one list per body region, plus words that signal a fatality.
arm = [
    'hand', 'arm', 'forearm', 'wrist',
    'shoulder', 'elbow', 'finger', 'thumb',
]
leg = [
    'ankle', 'knee', 'foot', 'feet', 'thigh', 'leg',
    'calf', 'buttock', 'pelvis', 'shin', 'heel', 'toe',
]
head = [
    'head', 'neck', 'face',
    'ear', 'nose', 'mouth',
]
torso = [
    'torso', 'chest', 'back', 'abdomen', 'hip',
]
fatal = [
    'fatal', 'death', 'kill',
]

In [306]:
# Turn every keyword list into its regex pattern in one pass; the order of the
# targets on the left matches the order of the lists on the right.
arm_pattern, leg_pattern, head_pattern, torso_pattern, fatal_pattern = map(
    list_to_pattern,
    (arm, leg, head, torso, fatal),
)

In [307]:
# Create one 0/1 flag column per body region, initialised to 0 for every row.
for body_part_column in ('Arm', 'Leg', 'Head', 'Torso'):
    shark_attack[body_part_column] = 0

In [308]:
# Flag, for every incident, whether the free-text 'Injury' description mentions
# any word of each body-part group (case-insensitive search).
#
# Each flag column is computed for the whole column at once and therefore stays
# aligned with the DataFrame's own index. The previous row-by-row version
# counted positions 0..n-1 and used them as index labels, which marks the wrong
# rows as soon as the labels are not exactly 0..n-1 (rows were dropped earlier
# without resetting the index). It also wrote through
# shark_attack[col].update(...), a chained assignment that does not modify the
# DataFrame when pandas Copy-on-Write is active.
#
# NOTE(review): matching is by substring, so e.g. 'ear' also matches inside
# 'forearm' and 'hip' inside 'ship' - confirm whether that is intended.
body_part_patterns = {
    'Arm': arm_pattern,
    'Leg': leg_pattern,
    'Head': head_pattern,
    'Torso': torso_pattern,
}

for flag_column, pattern in body_part_patterns.items():
    shark_attack[flag_column] = (
        shark_attack['Injury']
        # na=False: a missing description counts as "not mentioned" (flag 0).
        .str.contains(pattern, flags=re.I, regex=True, na=False)
        .astype(int)
    )

In [309]:
# Number of rows whose 'Leg' flag is set.
shark_attack['Leg'].sum()

2620

In [310]:
# Number of rows whose 'Arm' flag is set.
shark_attack['Arm'].sum()

1169

In [311]:
# Number of rows whose 'Head' flag is set.
shark_attack['Head'].sum()

414

In [312]:
# Number of rows whose 'Torso' flag is set.
shark_attack['Torso'].sum()

299

# Cleaning 'Sex' Column

In [313]:
# The raw header is 'Sex ' (with a trailing space); normalise it to 'Sex'.
# Reassigning the result instead of using inplace=True keeps the cell in the
# same "new frame from old frame" style as the earlier cleaning steps.
shark_attack = shark_attack.rename(columns={'Sex ' : 'Sex'})

In [314]:
# List the distinct raw values of 'Sex' to see what needs cleaning.
shark_attack['Sex'].unique()

array(['F', 'M', nan, 'M ', 'lli', 'N', '.'], dtype=object)

In [315]:
# Show the rows whose 'Sex' value is exactly 'N'.
shark_attack[shark_attack['Sex'] == 'N']

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,href,Case Number.1,Case Number.2,original order,Arm,Leg,Head,Torso
4938,1934.07.11,11-Jul-1934,1934.0,Boating,AUSTRALIA,New South Wales,Cronulla,Fishing,"18' boat, occupants William & Leslie Newton",N,,No injury to occupants Sharks continually foll...,N,,"Blue pointer, 11'","G.P. Whitley, ref: Daily Telegraph, 7/11/1934 ...",http://sharkattackfile.net/spreadsheets/pdf_di...,1934.07.11,1934.07.11,1365.0,0,0,0,0
6131,1801.12.18.R,Reported 18-Dec-1801,1801.0,Provoked,,,,Standing on landed shark's tail,Stephen Pettigew,N,,"FATAL, PROVOKED INCIDENT",Y,,12' shark,"The Evening Post, 12/18/1801",http://sharkattackfile.net/spreadsheets/pdf_di...,1801.12.18.R,1801.12.18.R,172.0,0,0,0,0


According to the PDF report for each incident, both people recorded with Sex = 'N' were male

In [316]:
# Index labels of the two records listed with Sex == 'N'; both are set to 'M'.
rows_recorded_as_n = [4938, 6131]
shark_attack.loc[rows_recorded_as_n, 'Sex'] = 'M'

In [317]:
# Re-check the distinct 'Sex' values after correcting the 'N' entries.
shark_attack['Sex'].unique()

array(['F', 'M', nan, 'M ', 'lli', '.'], dtype=object)

In [318]:
# Show the rows whose 'Sex' value is exactly 'lli'.
shark_attack[shark_attack['Sex'] == 'lli']

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,href,Case Number.1,Case Number.2,original order,Arm,Leg,Head,Torso
1624,2004.11.11.b,11-Nov-2004,2004.0,Unprovoked,USA,California,"Bunkers, Humboldt Bay, Eureka, Humboldt County",Surfing,Brian Kang,lli,38,"Lacerations to hand, knee & thigh",N,13h30,5.5 m [18'] white shark,"R. Collier, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...,2004.11.11.b,2004.11.11.b,4679.0,1,1,0,0


According to the PDF report for the incident, the person recorded with 'Sex' = 'lli' was male

In [319]:
# Index label 1624 is the record listed with Sex == 'lli'; set it to 'M'.
shark_attack.loc[1624, 'Sex'] = 'M'

In [320]:
# Re-check the distinct 'Sex' values after correcting the 'lli' entry.
shark_attack['Sex'].unique()

array(['F', 'M', nan, 'M ', '.'], dtype=object)

Removing spaces before and after 'M' or 'F'

In [321]:
# Trim surrounding whitespace from every non-missing 'Sex' value; missing
# entries are left untouched.
sex_present = shark_attack['Sex'].notna()
stripped_sex = shark_attack['Sex'].str.strip()
shark_attack.loc[sex_present, 'Sex'] = stripped_sex

In [322]:
# Re-check the distinct 'Sex' values after stripping whitespace.
shark_attack['Sex'].unique()

array(['F', 'M', nan, '.'], dtype=object)

In [323]:
# Show the rows whose 'Sex' value is exactly '.'.
shark_attack[shark_attack['Sex'] == '.']

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,href,Case Number.1,Case Number.2,original order,Arm,Leg,Head,Torso
5437,1908.06.02.R,Reported 02-Jun-1908,1908.0,Sea Disaster,PAPUA NEW GUINEA,New Britain,Matupi,.,,.,,"Remains of 3 humans recovered from shark, but ...",Y,,Allegedly a 33-foot shark,"Taranaki Herald, 6/2/1908",http://sharkattackfile.net/spreadsheets/pdf_di...,1908.06.02.R,1908.06.02.R,866.0,0,0,0,0


In [324]:
# Index label 5437 is the record listed with Sex == '.'; mark it as missing.
shark_attack.loc[5437, 'Sex'] = np.nan

In [325]:
# Final check of the distinct 'Sex' values after replacing '.' with NaN.
shark_attack['Sex'].unique()

array(['F', 'M', nan], dtype=object)

In [326]:
# Show every row in which 'Sex' is missing.
sex_missing = shark_attack['Sex'].isna()
shark_attack[sex_missing]

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,href,Case Number.1,Case Number.2,original order,Arm,Leg,Head,Torso
32,2018.04.09,09-Apr-2018,2018.0,Unprovoked,NEW CALEDONIA,,"Magenta Beach, Noumea",Windsurfing,,,,"No injury, shark bit board",N,17h00,2 m shark,"Les Nouvelles Caledoniennes, 4/10/2018",http://sharkattackfile.net/spreadsheets/pdf_di...,2018.04.09,2018.04.09,6271.0,0,0,0,0
59,2017.11.25.R,Reported 25-Nov-2017,2017.0,Sea Disaster,LIBYA,,Gars Garabulli,2 boats capsized,31 migrants,,,FATAL,Y,,Some drowned but other may have been killed by...,"TG Com 24, 11/25/2017",http://sharkattackfile.net/spreadsheets/pdf_di...,2017.11.25.R,2017.11.25.R,6244.0,0,0,0,0
86,2017.09.14,Sep-2017,2017.0,Boating,AUSTRALIA,Westerm Australia,Esperance,Fishing,,,,"sharks rammed boats, no injury to occupants",N,,"White shark, 3.5m","B. Myatt, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...,2017.09.14,2017.09.14,6217.0,0,0,0,0
124,2017.07.02,02-Jul-2017,2017.0,Invalid,COMOROS,Anjouan,Moya,Fishing,,,,"Skull found in shark, a probable drowning & sc...",,,Shark involvement prior to death not confirmed,"Linfo, 7/3/2017",http://sharkattackfile.net/spreadsheets/pdf_di...,2017.07.02,2017.07.02,6179.0,0,0,0,0
154,2017.04.17.b,17-Apr-2017,2017.0,Unprovoked,USA,Florida,"Daytona Beach, Volusia County",,,,,Minor bite to the foot,N,Afternoon,,"Daytona Beach News-Journal, 4/17/2017",http://sharkattackfile.net/spreadsheets/pdf_di...,2017.04.17.b,2017.04.17.b,6149.0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6212,ND-0107,Before 2004,0.0,Boat,MOZAMBIQUE,Inhambane Province,Off Inhambane,Fishing,"4.8-metre skiboat, Occupants: Rod Salm & 4 fri...",,,"No injury to occupants, shark bumped boat",N,,Whale shark,South African Shark Attack File,http://sharkattackfile.net/spreadsheets/pdf_di...,ND-0107,ND-0107,91.0,0,0,0,0
6237,ND.0073,"No date, Before 1963",0.0,Unprovoked,SINGAPORE,,"Keppel Harbor, 2 miles from Singapore city ce...",Swimming,,,,Recovered,N,,,"V.M. Coppleson (1958), p.266",http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0073,ND.0073,66.0,0,0,0,0
6259,ND.0044,1941-1945,0.0,Sea Disaster,,,,A group of survivors on a raft for 17-days,C.,,,"FATAL, shark leapt into raft and bit the man w...",Y,Late afternoon,1.2 m [4'] shark,"G.A. Llano in Airmen Against the Sea, p.69",http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0044,ND.0044,44.0,0,0,0,0
6278,ND.0024,Between 1918 & 1939,0.0,Unprovoked,REUNION,Saint-Denis,Barachois,Swimming,,,,FATAL,Y,,,G. Van Grevelynghe,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0024,ND.0024,25.0,0,0,0,0
