In [91]:
import pandas as pd
import numpy as np
import re 

import warnings
warnings.filterwarnings("ignore")

I am using shark attack incidents dataset from Kaggle: 
https://www.kaggle.com/teajay/global-shark-attacks. This data was compiled by the Global Shark Attack File http://www.sharkattackfile.net/.

# Data cleaning

## Dropping columns

In [2]:
sharks = pd.read_csv('../attacks.csv', encoding = "ISO-8859-1", engine='python')

In [3]:
sharks.shape

(25723, 24)

In [4]:
sharks['Unnamed: 23'].unique()

array([nan, 'Teramo', 'change filename'], dtype=object)

In [5]:
sharks = sharks.drop(['Case Number', 'Investigator or Source', 'Name','pdf', 'href formula', 'href',
                      'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22', 'Unnamed: 23'], axis=1)

In [6]:
sharks.head(2)

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
0,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark
1,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,F,11,Minor injury to left thigh,N,14h00 -15h00,


## Dealing with NA and null values

In [7]:
sharks = sharks.dropna(how='all')

In [8]:
sharks[sharks.Year.isna()]

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
187,Reported 08-Jan-2017,,Invalid,AUSTRALIA,Queensland,,Spearfishing,M,35.0,"No attack, shark made a threat display",,,Bull shark
6079,Reported 19-Aug-1836,,Unprovoked,ENGLAND,Cumberland,Whitehaven,Swimming,M,,FATAL,Y,,


Since it's only two values and we see the Year in the Date column, I'm going to impute them with the year manually

In [9]:
sharks.at[187, 'Year'] = 2017
sharks.at[6079, 'Year'] = 1836

Seems like there is 125 rows with zero year. But the year information is partially available in Date.
I am using regex to extract year information from Date.

In [10]:
sharks[sharks.Year==0.0].count()

Date           125
Year           125
Type           125
Country        121
Area            99
Location        91
Activity       107
Sex            117
Age             13
Injury         124
Fatal (Y/N)    124
Time             7
Species         35
dtype: int64

In [11]:
sharks[sharks.Year==0.0].head(2)

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
6177,Ca. 214 B.C.,0.0,Unprovoked,,Ionian Sea,,Ascending from a dive,M,,"FATAL, shark/s bit him in two",Y,,
6178,Ca. 336.B.C..,0.0,Unprovoked,GREECE,Piraeus,In the haven of Cantharus,Washing his pig in preparation for a religious...,M,,"FATAL, shark ""bit off all lower parts of him u...",Y,,


In [12]:
# cast year to integer instead of float
sharks['Year'] = sharks['Year'].apply(np.int64)

In [13]:
def extract_year(col): 
    '''Extracting the year from Date information'''
    x = re.findall("\d{4}", col)
    if len(x) == 0:
        return 0
    return int(x[-1])

In [14]:
year_0 = sharks.Year == 0
col = 'Year'
sharks.loc[year_0, col] = [extract_year(x) for x in sharks.loc[year_0, 'Date'].values]

In [15]:
sharks[sharks['Year'] == 0].head(2)

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
6177,Ca. 214 B.C.,0,Unprovoked,,Ionian Sea,,Ascending from a dive,M,,"FATAL, shark/s bit him in two",Y,,
6178,Ca. 336.B.C..,0,Unprovoked,GREECE,Piraeus,In the haven of Cantharus,Washing his pig in preparation for a religious...,M,,"FATAL, shark ""bit off all lower parts of him u...",Y,,


In [16]:
# Dropping the rest of year = 0
sharks.drop(sharks[sharks.Year == 0].index, inplace=True)

In [17]:
#sharks[sharks.Type.isna()]

In [18]:
sharks.Type.unique()

array(['Boating', 'Unprovoked', 'Invalid', 'Provoked', 'Questionable',
       'Sea Disaster', nan, 'Boat', 'Boatomg'], dtype=object)

In [19]:
# change Nan to Questionable and Boatomg to Boat

In [20]:
sharks[sharks.Type=='Boat'].head(2)

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
538,Reported 27-Jun-2014,2014,Boat,ST. MARTIN,,20 miles from shore,Transatlantic Rowing,M,48.0,His boat was holed by a shark,N,,Oceanic whitetip shark'
565,Reported 12-Apr-2014,2014,Boat,SOUTH AFRICA,,,Shark watching,,,"No injury to occupants, shark bit pontoon",N,,White shark


In [21]:
sharks.loc[sharks.Type.isna(), 'Type'] = 'Questionable'

In [22]:
sharks.loc[sharks.Type=='Boatomg', 'Type'] = 'Boat'
sharks.loc[sharks.Type=='Boating', 'Type'] = 'Boat'

In [23]:
# there is 49 NA in country, I will ignore it for now
sharks[sharks.Country.isna()].count()

Date           49
Year           49
Type           49
Country         0
Area           14
Location       11
Activity       42
Sex            45
Age             9
Injury         47
Fatal (Y/N)    44
Time            7
Species        12
dtype: int64

In [24]:
#sharks.Country.unique()

In [25]:
sharks.loc[sharks.Country==' PHILIPPINES', 'Country'] = 'PHILIPPINES'
sharks.loc[sharks.Country=='RED SEA?', 'Country'] = 'RED SEA'

## Textural data cleaning

In [26]:
act = sharks.groupby('Activity').count()

In [27]:
#act.sort_values('Date', ascending=False)

In [28]:
# Replacing text if there is word fishing with 'Fishing'
def fishing(col): 
    try:
        x = re.findall("Fishing", col)
        if len(x) == 0:
            return col
        return 'Fishing'
    except:
        return col

col = 'Activity'
sharks.loc[:, col] = [fishing(x) for x in sharks.loc[:, col].values]

In [29]:
sharks.loc[sharks.Activity=='Freediving', 'Activity'] = 'Free diving' 

In [30]:
spec = sharks.groupby('Activity').size()
spec.sort_values(ascending=False).head(5)

Activity
Surfing         971
Swimming        869
Fishing         633
Spearfishing    333
Bathing         162
dtype: int64

In [31]:
sharks[sharks['Activity']=='Scuba diving (but on surface)']

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
2613,03-Nov-1990,1990,Unprovoked,USA,California,"Monastery Beach, Carmel Bay, Monterey County",Scuba diving (but on surface),F,,Leg bitten,N,15h00,4 m to 5 m [13' to 16.5'] white shark
2631,24-Jun-1990,1990,Unprovoked,SOUTH AFRICA,Western Cape Province,Mossel Bay,Scuba diving (but on surface),F,21.0,"FATAL, thigh bitten",Y,15h45,"4.5 m [14'9""] white shark"
3252,02-Sep-1974,1974,Unprovoked,USA,California,"Franklin Point, San Mateo County",Scuba diving (but on surface),M,41.0,Minor injuries to hand,N,17h30,"White shark, 5 m to 6 m [16.5 to 20']"
3253,02-Sep-1974,1974,Unprovoked,USA,California,"Franklin Point, San Mateo County",Scuba diving (but on surface),M,48.0,Minor bite on foot & swimfin,N,17h30,"White shark, 5 m to 6 m [16.5 to 20']"


In [32]:
sharks[sharks['Activity']=='Scuba diving (submerged)']

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
2524,11-Nov-1992,1992,Unprovoked,USA,California,"San Nicholas Island, Santa Barbara County",Scuba diving (submerged),M,40.0,Foot punctured,N,14h00,Unidentified shark
2993,07-Feb-1982,1982,Unprovoked,USA,California,"Stillwater Cove, Sonoma County",Scuba diving (submerged),M,,Calf & ankle bitten,N,11h00,5 m [16.5'] white shark
3103,11-Mar-1979,1979,Unprovoked,USA,California,"Ano Nuevo Island, San Mateo, County",Scuba diving (submerged),M,,"No injury, swim fin bitten",N,10h00,"White shark, 4 m to 5 m [13' to 16.5']"
3309,1973,1973,Unprovoked,PALAU,Aulong Island,Aulong Channel,Scuba diving (submerged),M,,"No injury, shark grabbed scuba tank and descen...",N,,Tiger shark


In [33]:
def activity(col, word, to_replace): 
    try:
        x = re.findall(f'({word})|({word.lower()})', col)
        if len(x) == 0:
            return col

        return to_replace
    except:
        return col

col = 'Activity'
word = "Swimming"
sharks.loc[:, col] = [activity(x, word, word) for x in sharks.loc[:, col].values]

In [34]:
word = 'Boogie Boarding'
sharks.loc[:, col] = [activity(x, word, word) for x in sharks.loc[:, col].values]

In [35]:
word = 'Kite surfing'
to_replace = 'Kite Surfing'
sharks.loc[:, col] = [activity(x, word, to_replace) for x in sharks.loc[:, col].values]

In [36]:
word = 'Sea disaster'
to_replace = 'Sea Disaster'
sharks.loc[:, col] = [activity(x, word, to_replace) for x in sharks.loc[:, col].values]

In [37]:
word = 'Surfing'
to_replace = 'Surfing'
sharks.loc[:, col] = [activity(x, word, to_replace) for x in sharks.loc[:, col].values]

In [38]:
word = 'Surf-skiing'
to_replace = 'Surf skiing'
sharks.loc[:, col] = [activity(x, word, to_replace) for x in sharks.loc[:, col].values]

In [39]:
word = 'Scuba Diving'
to_replace = 'Scuba'
sharks.loc[:, col] = [activity(x, word, to_replace) for x in sharks.loc[:, col].values]

In [40]:
word = 'Skin diving'
to_replace = 'Skin Diving'
sharks.loc[:, col] = [activity(x, word, to_replace) for x in sharks.loc[:, col].values]

In [41]:
word = 'Sitting on surfboard'
to_replace = 'Surfing'
sharks.loc[:, col] = [activity(x, word, to_replace) for x in sharks.loc[:, col].values]

In [42]:
word = 'Body-boarding'
to_replace = 'Surfing'
sharks.loc[:, col] = [activity(x, word, to_replace) for x in sharks.loc[:, col].values]

In [43]:
word = 'Floating on his back'
to_replace = 'Bathing'
sharks.loc[:, col] = [activity(x, word, to_replace) for x in sharks.loc[:, col].values]

In [44]:
word = 'Playing'
to_replace = 'Bathing'
sharks.loc[:, col] = [activity(x, word, to_replace) for x in sharks.loc[:, col].values]

In [45]:
sharks.rename(index=str, columns={"Sex ": "Sex", "Species ": "Species"}, inplace=True)

In [46]:
sharks.columns

Index(['Date', 'Year', 'Type', 'Country', 'Area', 'Location', 'Activity',
       'Sex', 'Age', 'Injury', 'Fatal (Y/N)', 'Time', 'Species'],
      dtype='object')

In [47]:
sharks.Sex.unique()

array(['F', 'M', nan, 'M ', 'lli', 'N', '.'], dtype=object)

In [48]:
sharks.loc[sharks.Sex=='M ', 'Sex'] = 'M' 
sharks.loc[sharks.Sex=='lli', 'Sex'] = None
sharks.loc[sharks.Sex=='.', 'Sex'] = None

In [49]:
#sharks[sharks.Sex.isna()].count()

In [50]:
#sharks.Age.unique()

In [51]:
sharks.loc[sharks.Age=='60s', 'Age'] = '60'
sharks.loc[sharks.Age=="60's", 'Age'] = '60'
sharks.loc[sharks.Age=='50s', 'Age'] = '50' 
sharks.loc[sharks.Age=='40s', 'Age'] = '40'
sharks.loc[sharks.Age=='30s', 'Age'] = '30' 
sharks.loc[sharks.Age=='20s', 'Age'] = '20' 
sharks.loc[sharks.Age=='Teen', 'Age'] = '15'
sharks.loc[sharks.Age=='teen', 'Age'] = '15'
sharks.loc[sharks.Age=='Teens', 'Age'] = '15'
sharks.loc[sharks.Age=='18 months', 'Age'] = '1'
sharks.loc[sharks.Age=='\xa0 ', 'Age'] = None
sharks.loc[sharks.Age=='MAKE LINE GREEN', 'Age'] = None
sharks.loc[sharks.Age=='A.M.', 'Age'] = None
sharks.loc[sharks.Age=='X', 'Age'] = None
sharks.loc[sharks.Age=='F', 'Age'] = None
sharks.loc[sharks.Age=='mid-30s', 'Age'] = '35'
sharks.loc[sharks.Age=='28 & 26', 'Age'] = '27'
sharks.loc[sharks.Age=='18 or 20', 'Age'] = '19'
sharks.loc[sharks.Age=='12 or 13', 'Age'] = '13'
sharks.loc[sharks.Age=='46 & 34', 'Age'] = '40'
sharks.loc[sharks.Age=='28, 23 & 30', 'Age'] = '27'
sharks.loc[sharks.Age=='30 or 36', 'Age'] = '33'
sharks.loc[sharks.Age=='6½', 'Age'] = '6'
sharks.loc[sharks.Age=='23 & 20', 'Age'] = '21'
sharks.loc[sharks.Age=='8 or 10', 'Age'] = '9'
sharks.loc[sharks.Age=='7      &    31', 'Age'] = '31'
sharks.loc[sharks.Age=='20?', 'Age'] = '20'
sharks.loc[sharks.Age=='21 & ?', 'Age'] = '21'
sharks.loc[sharks.Age=='36 & 26', 'Age'] = '31'
sharks.loc[sharks.Age=='32 & 30', 'Age'] = '31'
sharks.loc[sharks.Age=='33 or 37', 'Age'] = '35'
sharks.loc[sharks.Age=='16 to 18', 'Age'] = '17'
sharks.loc[sharks.Age=='13 or 18', 'Age'] = '15'
sharks.loc[sharks.Age==' ', 'Age'] = None
sharks.loc[sharks.Age==' 30', 'Age'] = '30'
sharks.loc[sharks.Age=='mid-20s', 'Age'] = '25'
sharks.loc[sharks.Age=='18 to 22', 'Age'] = '20'
sharks.loc[sharks.Age=='Ca. 33', 'Age'] = '33'
sharks.loc[sharks.Age=='74 ', 'Age'] = '74'
sharks.loc[sharks.Age=='45 ', 'Age'] = '45'
sharks.loc[sharks.Age=='21 or 26', 'Age'] = '24'
sharks.loc[sharks.Age=='20 ', 'Age'] = '20'
sharks.loc[sharks.Age=='>50', 'Age'] = '51'
sharks.loc[sharks.Age=='>50', 'Age'] = '51'
sharks.loc[sharks.Age=='9 & 12', 'Age'] = '11'
sharks.loc[sharks.Age=='? & 19', 'Age'] = '19'
sharks.loc[sharks.Age=='9 months', 'Age'] = '1'
sharks.loc[sharks.Age=='25 to 35', 'Age'] = '30'
sharks.loc[sharks.Age=='23 & 26', 'Age'] = '24'
sharks.loc[sharks.Age=='33 & 37', 'Age'] = '35'
sharks.loc[sharks.Age=='25 or 28', 'Age'] = '27'
sharks.loc[sharks.Age=='37, 67, 35, 27,  ? & 27', 'Age'] = '39'
sharks.loc[sharks.Age=='21, 34,24 & 35', 'Age'] = '30'
sharks.loc[sharks.Age=='30 & 32', 'Age'] = '31'
sharks.loc[sharks.Age=='50 & 30', 'Age'] = '40'
sharks.loc[sharks.Age=='17 & 35', 'Age'] = '26'
sharks.loc[sharks.Age=='34 & 19', 'Age'] = '26'
sharks.loc[sharks.Age=='2 to 3 months', 'Age'] = '0'
sharks.loc[sharks.Age=='7 or 8', 'Age'] = '7'
sharks.loc[sharks.Age=='17 & 16', 'Age'] = '16'
sharks.loc[sharks.Age=='Both 11', 'Age'] = '11'
sharks.loc[sharks.Age=='13 or 14', 'Age'] = '13'
sharks.loc[sharks.Age=='2½', 'Age'] = '2'
sharks.loc[sharks.Age==' 43', 'Age'] = '43'
sharks.loc[sharks.Age=='9 or 10', 'Age'] = '10'
sharks.loc[sharks.Age=='36 & 23', 'Age'] = '30'
sharks.loc[sharks.Age=='  ', 'Age'] = None
sharks.loc[sharks.Age=='10 or 12', 'Age'] = '10'
sharks.loc[sharks.Age=='?    &   14', 'Age'] = '14'
sharks.loc[sharks.Age=='31 or 33', 'Age'] = '32'
sharks.loc[sharks.Age=='Elderly', 'Age'] = '70'
sharks.loc[sharks.Age=='(adult)', 'Age'] = '40'
sharks.loc[sharks.Age=='adult', 'Age'] = '40'
sharks.loc[sharks.Age=='"middle-age"', 'Age'] = '50'
sharks.loc[sharks.Age=='"young"', 'Age'] = '20'
sharks.loc[sharks.Age=='young', 'Age'] = '20'

In [52]:
sharks.Age.unique()

array(['57', '11', '48', nan, '18', '52', '15', '12', '32', '10', '21',
       '34', '30', '60', '33', '29', '54', '41', '37', '56', '19', '25',
       '69', '38', '55', '35', '46', '45', '14', '40', '28', '20', '24',
       '26', '49', '22', '7', '31', '17', '13', '42', '3', '8', '50',
       '16', '82', '73', '68', '51', '39', '58', '47', '61', '65', '36',
       '66', '43', '9', '72', '59', '6', '27', '64', '23', '71', '44',
       '62', '63', '70', '1', '53', '77', '74', '5', '86', '84', None,
       '75', ' 28', '87', '67', '33 & 26', '0', '81', '78', '2'],
      dtype=object)

In [53]:
sharks['Fatal (Y/N)'].unique()

array(['N', 'Y', nan, 'M', 'UNKNOWN', '2017', ' N', 'N ', 'y'],
      dtype=object)

In [54]:
sharks[sharks['Fatal (Y/N)']=='M']

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
285,18-Apr-2016,2016,Provoked,FRENCH POLYNESIA,Tuamotos,Makemo Atoll,Spearfishing,M,22,Laceration to knee by speared shark PROVOKED I...,M,Morning,"Grey reef shark, 2 m"


In [55]:
sharks.loc[sharks['Fatal (Y/N)']=='M', 'Fatal (Y/N)'] = 'UNKNOWN'
sharks.loc[sharks['Fatal (Y/N)']=='2017', 'Fatal (Y/N)'] = 'N'
sharks.loc[sharks['Fatal (Y/N)']=='N ', 'Fatal (Y/N)'] = 'N'
sharks.loc[sharks['Fatal (Y/N)']==' N', 'Fatal (Y/N)'] = 'N'
sharks.loc[sharks['Fatal (Y/N)']=='y', 'Fatal (Y/N)'] = 'Y'
sharks.loc[sharks['Fatal (Y/N)'].isna(), 'Fatal (Y/N)'] = 'UNKNOWN'

In [56]:
spec = sharks.groupby('Species').count()
spec.sort_values('Date', ascending=False).head(5)

Unnamed: 0_level_0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
White shark,163,163,163,163,158,157,158,140,95,162,163,98
Shark involvement prior to death was not confirmed,105,105,105,103,94,97,69,86,47,105,105,20
Invalid,102,102,102,102,90,93,86,88,45,102,102,28
Shark involvement not confirmed,88,88,88,86,83,82,77,79,51,87,88,44
Tiger shark,73,73,73,73,68,64,71,69,45,73,73,40


In [57]:
def shark(col, word, to_replace): 
    """word to be found and replace by"""
    try:
        x = re.findall(f'({word})|({word.lower()})', col)

        if len(x) == 0:
            return col

        return to_replace
    except:
        return col

col = 'Species'
word = "Bull shark"
sharks.loc[:, col] = [shark(x, word, word) for x in sharks.loc[:, col].values]

In [58]:
col = 'Species'
word = "Grey nurse shark"
sharks.loc[:, col] = [shark(x, word, word) for x in sharks.loc[:, col].values]

In [59]:
word = "Tiger shark"
to_replace = 'Tiger shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [60]:
word = "Hammerhead"
to_replace = 'Hammerhead'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [61]:
word = "Mako shark"
to_replace = 'Mako shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [62]:
word = "White shark"
to_replace = 'White shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [63]:
word = "Blue shark"
to_replace = 'Blue shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [64]:
word = "Blacktip shark"
to_replace = 'Blacktip shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [65]:
word = "Blacktip"
to_replace = 'Blacktip shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [66]:
word = "Bronze whaler shark"
to_replace = 'Bronze whaler shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [67]:
word = 'Caribbean reef shark'
to_replace = 'Caribbean reef shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [68]:
word = 'Caribbean reef shark'
to_replace = 'Caribbean reef shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [69]:
invol = []
col = 'Species'
word = 'Shark involvement'
for i in sharks.Species.iteritems():
    try:
        x = re.findall(f'({word})|({word.lower()})', i[1])
        if len(x)>0:
            invol.append(i[1])
    except:
        pass
set(invol)

{'No shark involvement',
 'Reported by media as shark attack, but shark involvement prior to death was not confirmed',
 'Shark involvement  not confirmed',
 'Shark involvement  questionable',
 'Shark involvement doubtful',
 'Shark involvement highly doubtful',
 'Shark involvement not cofirmed',
 'Shark involvement not confirmed',
 'Shark involvement not confirmed & highly unlikely',
 'Shark involvement not confirmed, injury may be due to a stingray',
 'Shark involvement not confirmed, injury may have been caused by a bluefish',
 'Shark involvement not confirmed; officials considered barracua',
 'Shark involvement not confirmed; thought to be a barracuda bite',
 'Shark involvement prior to death could not be determined',
 'Shark involvement prior to death not confirmed',
 'Shark involvement prior to death remains unconfirmed',
 'Shark involvement prior to death still to be determined',
 'Shark involvement prior to death suspected but not confirmed',
 'Shark involvement prior to death un

In [70]:
word = 'Shark involvement'
to_replace = 'Shark involvement unconfirmed'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [71]:
word = 'Wobbegong shark'
to_replace = 'Wobbegong shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [72]:
word = 'Nurse shark'
to_replace = 'Nurse shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [73]:
word = 'Grey reef shark'
to_replace = 'Grey reef shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [74]:
spec = sharks.groupby('Species').size()
spec.sort_values(ascending=False).head(5)

Species
White shark                      635
Shark involvement unconfirmed    330
Tiger shark                      281
Bull shark                       180
Invalid                          102
dtype: int64

In [75]:
invol = []
col = 'Species'
word = 'Leopard shark'
for i in sharks.Species.iteritems():
    try:
        x = re.findall(f'({word})|({word.lower()})', i[1])
        if len(x)>0:
            invol.append(i[1])
    except:
        pass
set(invol)

{'Leopard shark',
 "Leopard shark, 3' Triakis semifasciata, identified by J.W. DeWitt (1955)"}

In [76]:
sharks.loc[sharks.Age=='Questionable incident - shark bite may have precipitated drowning', 'Species'] = 'Shark involvement unconfirmed'
sharks.loc[sharks.Age=='Questionable incident; reported as shark attack but thought to involve a pinniped instead ', 'Species'] = 'Shark involvement unconfirmed'

In [77]:
word = 'Questionable'
to_replace = 'Questionable'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [78]:
word = 'Reef shark'
to_replace = 'Reef shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [79]:
word = "Zambesi"
to_replace = 'Bull shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [80]:
word = "Zambezi"
to_replace = 'Bull shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [81]:
word = 'Sevengill'
to_replace = 'Sevengill'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [82]:
word = 'Porbeagle'
to_replace = 'Porbeagle'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [83]:
word = 'Raggedtooth'
to_replace = 'Raggedtooth'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [84]:
#sharks[sharks.Species=='Invalid']

## Adding additional columns

 Adding Hour column for time of the attack

In [85]:
sharks['Hour'] = sharks['Time'].str.extract("([0-9]+)", expand=False).dropna().astype(int)
sharks['Hour']= sharks[sharks['Hour']<25]['Hour']

Assigning countries into Southern and Northern hemispheres and adding Month column for Month of the attack

In [86]:
# Northen - 0, Southern - 1
countries = {'CROATIA': 0, 'NORWAY': 0, 'FRANCE': 0, 'MARTINIQUE': 0, 'ICELAND': 0, 
            'JAVA': 1, 'Sierra Leone': 0, 'CYPRUS': 0, 'LIBERIA': 0, 'NEW BRITAIN': 1, 
            'URUGUAY': 1, 'NORTH ATLANTIC OCEAN ': 0, 'ADMIRALTY ISLANDS': 1, 
            'PAPUA NEW GUINEA': 1, 'DJIBOUTI': 0, 'TAIWAN': 1, 'EL SALVADOR': 0, 
            'ST. MAARTIN': 0, 'ASIA?': 0, 'NAMIBIA': 1, 'OCEAN': 1, 'CAPE VERDE': 0, 
            'MID ATLANTIC OCEAN': 0, 'MAURITIUS': 1, 'ANTIGUA': 0, 'FRENCH POLYNESIA': 1, 
            'JOHNSTON ISLAND': 0, 'SUDAN': 0, 'SOUTH KOREA': 0, 'TUVALU': 1, 
            'SOUTH ATLANTIC OCEAN': 1, 'UNITED ARAB EMIRATES (UAE)': 0, 'DOMINICAN REPUBLIC': 0, 
            ' PHILIPPINES': 0, 'MALAYSIA': 0, 'BRITISH VIRGIN ISLANDS': 0, 'CHINA': 0, 
            'ATLANTIC OCEAN': 0, 'ITALY': 0, 'VENEZUELA': 0, 'SOLOMON ISLANDS / VANUATU': 1, 
            'SOUTH CHINA SEA': 0, 'Between PORTUGAL & INDIA': 2, 'DIEGO GARCIA': 1, 
            'MEDITERRANEAN SEA?': 0, 'INDIAN OCEAN?': 1, 'INDIA': 0, 'SOUTH AFRICA': 1, 
            'St Helena': 1, 'WESTERN SAMOA': 1, 'TASMAN SEA': 1, 'HONG KONG': 0, 'TONGA': 1, 
            'YEMEN': 0, 'COLUMBIA': 0, 'NORTHERN MARIANA ISLANDS': 0, 'GUAM': 0, 'GUINEA': 0, 
            'CENTRAL PACIFIC': 2, 'GUATEMALA': 0, 'FIJI': 1, 'GULF OF ADEN': 0, 'JAPAN': 0, 
            'MID-PACIFC OCEAN': 0, 'ST. MARTIN': 1, 'USA': 0, 'CRETE': 0, 'BRAZIL': 1, 
            'TURKS & CAICOS': 0, 'SOUTHWEST PACIFIC OCEAN': 1, 'GREENLAND': 0, 
            'BAY OF BENGAL': 1, 'PACIFIC OCEAN': 0, 'LEBANON': 0, 'MALTA': 0, 'NIGERIA': 0, 
            'GREECE': 0, 'MEXICO': 0, 'BERMUDA': 0, 'UNITED KINGDOM': 0, 'SINGAPORE': 0, 
            'BRITISH ISLES': 0, 'TURKEY': 0, 'NEVIS': 1, 'AUSTRALIA': 1, 'ENGLAND': 0, 
            'SIERRA LEONE': 0, 'VANUATU': 1, 'NORTH SEA': 0, 'RUSSIA': 0, 'MICRONESIA': 0, 
            'PORTUGAL': 0, 'RED SEA': 0, 'MONTENEGRO': 0, 'IRAQ': 0, 'SWEDEN': 0, 
            'PERSIAN GULF': 0, 'NORTH ATLANTIC OCEAN': 0, 'Fiji': 1, 'SLOVENIA': 0, 
            'PHILIPPINES': 0, 'IRAN / IRAQ': 0, 'TUNISIA': 0, 'SAN DOMINGO': 1, 'AZORES': 0, 
            'GEORGIA': 0, 'BURMA': 0, 'NEW GUINEA': 1, 'SUDAN?': 0, 'NETHERLANDS ANTILLES': 0, 
            'ALGERIA': 0, 'NICARAGUA': 0, 'SEYCHELLES': 1, 'RED SEA?': 0, 'BRITISH NEW GUINEA': 1, 
            'THAILAND': 0, 'PALESTINIAN TERRITORIES': 0, 'FALKLAND ISLANDS': 1, 'IRELAND': 0, 
            'MONACO': 0, 'PARAGUAY': 1, 'SYRIA': 0, 'EGYPT ': 0, 'MADAGASCAR': 1, 
            'NORTH PACIFIC OCEAN': 0, 'EGYPT / ISRAEL': 0, 'COOK ISLANDS': 1, 
            'TRINIDAD & TOBAGO': 0, 'PACIFIC OCEAN ': 0, 'EQUATORIAL GUINEA / CAMEROON': 0, 
            'ISRAEL': 0, 'SAMOA': 1, 'ECUADOR': 1, 'CARIBBEAN SEA': 0, 'NEW CALEDONIA': 1, 
            'MARSHALL ISLANDS': 0, 'PANAMA': 0, 'UNITED ARAB EMIRATES': 0, 'ITALY / CROATIA': 0, 
            'NEW ZEALAND': 1, 'MALDIVE ISLANDS': 0, 'GHANA': 0, 'MOZAMBIQUE': 0, 'SRI LANKA': 0, 
            'SOLOMON ISLANDS': 1, 'Coast of AFRICA': 1, 'BARBADOS': 0, 'BANGLADESH': 0, 
            'CHILE': 1, 'CANADA': 0, 'HONDURAS': 0, 'PALAU': 0, 'AMERICAN SAMOA': 1, 
            'SAUDI ARABIA': 0, ' TONGA': 1, 'SPAIN': 0, 'ARGENTINA': 1, 'CURACAO': 0, 
            'ANDAMAN / NICOBAR ISLANDAS': 0, 'KENYA': 1, 'EGYPT': 0, 'THE BALKANS': 0, 
            'PUERTO RICO': 0, 'KIRIBATI': 0, 'OKINAWA': 0, 'REUNION': 1, 
            'BRITISH WEST INDIES': 0, 'NICARAGUA ': 0, 'FEDERATED STATES OF MICRONESIA': 0, 
            'IRAN': 0, 'CAYMAN ISLANDS': 0, 'SOMALIA': 0, 'INDONESIA': 1, 'KUWAIT': 0, 
            'Seychelles': 1, 'COSTA RICA': 0, 'INDIAN OCEAN': 1, 'CEYLON (SRI LANKA)': 0, 
            'YEMEN ': 0, 'HAITI': 0, 'SCOTLAND': 0, 'CUBA': 0, 'GUYANA': 0, 'LIBYA': 0, 
            'MEXICO ': 0, 'SENEGAL': 0, 'GRAND CAYMAN': 0, 'GABON': 1, 'GRENADA': 0, 
            'RED SEA / INDIAN OCEAN': 0, 'VIETNAM': 0, 'BAHAMAS': 0, 'BAHREIN': 0, 
            'NORTHERN ARABIAN SEA': 0, 'BELIZE': 0, 'MEDITERRANEAN SEA': 0, 'ANGOLA': 1, 
            'SOUTH PACIFIC OCEAN': 1, 'TANZANIA': 1, 'KOREA': 0, 'JAMAICA': 0, 'ARUBA': 0, 
            'MAYOTTE':1}

In [87]:
sharks['Month'] = None

In [88]:
months_dict = {'Jan': 0, 'Feb': 1, 'Mar': 2, 
               'Apr': 3, 'Ap-': 3, 'May': 4, 
               'Jun': 5, 'Jul': 6, 'Aug': 7, 
               'Sep': 8, 'Oct': 9, 'Nov': 10, 
               'Dec': 11}

def extract_month(d):  
    date = d.replace(' ', '')
    date = date.replace('July', 'Jul')
    date = date.replace('Sept', 'Sep')
    date = date.replace('--', '-')
    date = date.replace('y2', 'y-2')
    date = date.replace('v2', 'v-2')
    month = 0
    if len(date) >= 11 and len(date) <= 12 and date[2] == '-':
        month = date[3:6]
    elif len(date) == 10 and date[1] == '-':
        month = date[2:5]
    elif len(date) == 19 and date[10] == '-':
        month = date[11:14]
    try:
        return months_dict[month]
    except:
        return None
    
sharks["Month"] = sharks["Date"].apply(lambda x: extract_month(x));

def hemisphere(x): 
    try:  
        hem = countries[x]
        return hem
    except:
        return None

sharks["Hemisphere"] = sharks["Country"].apply(lambda x: hemisphere(x));

I have decided to focus on data between 1900 and 2018, mostly because the population of sharks is on the decrease and as well as people are going more often into the water to enjoy a variety of water activities. Moreover, the data set included some of the attacks described in the ancient or medieval literature, so I wanted to exclude it.

In [92]:
sharks = sharks[sharks.Year>=1900]

Saving cleaned DataFrame

In [93]:
sharks.to_csv('shark_attack_cleaned.csv', index=False)