In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import re 

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

from IPython.display import Image
from IPython.core.display import HTML 

import warnings
warnings.filterwarnings("ignore")

init_notebook_mode(connected=True)

I am using shark attack incidents dataset from Kaggle: 
https://www.kaggle.com/teajay/global-shark-attacks. This data was compiled by the Global Shark Attack File http://www.sharkattackfile.net/.

# Data cleaning

## Dropping columns

In [2]:
sharks = pd.read_csv('../attacks.csv', encoding = "ISO-8859-1", engine='python')

In [3]:
sharks.shape

(25723, 24)

In [4]:
sharks['Unnamed: 23'].unique()

array([nan, 'Teramo', 'change filename'], dtype=object)

In [5]:
sharks = sharks.drop(['Case Number', 'Investigator or Source', 'Name','pdf', 'href formula', 'href',
                      'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22', 'Unnamed: 23'], axis=1)

In [6]:
sharks.head(2)

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
0,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark
1,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,F,11,Minor injury to left thigh,N,14h00 -15h00,


## Dealing with NA and null values

In [7]:
sharks = sharks.dropna(how='all')

In [8]:
# in column Year
sharks.Year.unique()

array([2018., 2017.,   nan, 2016., 2015., 2014., 2013., 2012., 2011.,
       2010., 2009., 2008., 2007., 2006., 2005., 2004., 2003., 2002.,
       2001., 2000., 1999., 1998., 1997., 1996., 1995., 1984., 1994.,
       1993., 1992., 1991., 1990., 1989., 1969., 1988., 1987., 1986.,
       1985., 1983., 1982., 1981., 1980., 1979., 1978., 1977., 1976.,
       1975., 1974., 1973., 1972., 1971., 1970., 1968., 1967., 1966.,
       1965., 1964., 1963., 1962., 1961., 1960., 1959., 1958., 1957.,
       1956., 1955., 1954., 1953., 1952., 1951., 1950., 1949., 1948.,
       1848., 1947., 1946., 1945., 1944., 1943., 1942., 1941., 1940.,
       1939., 1938., 1937., 1936., 1935., 1934., 1933., 1932., 1931.,
       1930., 1929., 1928., 1927., 1926., 1925., 1924., 1923., 1922.,
       1921., 1920., 1919., 1918., 1917., 1916., 1915., 1914., 1913.,
       1912., 1911., 1910., 1909., 1908., 1907., 1906., 1905., 1904.,
       1903., 1902., 1901., 1900., 1899., 1898., 1897., 1896., 1895.,
       1894., 1893.,

In [9]:
sharks[sharks.Year.isna()]

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
187,Reported 08-Jan-2017,,Invalid,AUSTRALIA,Queensland,,Spearfishing,M,35.0,"No attack, shark made a threat display",,,Bull shark
6079,Reported 19-Aug-1836,,Unprovoked,ENGLAND,Cumberland,Whitehaven,Swimming,M,,FATAL,Y,,


In [10]:
# since it's only two values and we see the Year in the Date column, I'm going to impute with the year
sharks.at[187, 'Year'] = 2017
sharks.at[6079, 'Year'] = 1836

In [11]:
# seems like there is 125 rows with zero year. But the year information is partially available in Date.
# with regex I will impute the year from the date information. For the date B.C. I will leave 0 as year
sharks[sharks.Year==0.0].count()

Date           125
Year           125
Type           125
Country        121
Area            99
Location        91
Activity       107
Sex            117
Age             13
Injury         124
Fatal (Y/N)    124
Time             7
Species         35
dtype: int64

In [12]:
sharks[sharks.Year==0.0].head(2)

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
6177,Ca. 214 B.C.,0.0,Unprovoked,,Ionian Sea,,Ascending from a dive,M,,"FATAL, shark/s bit him in two",Y,,
6178,Ca. 336.B.C..,0.0,Unprovoked,GREECE,Piraeus,In the haven of Cantharus,Washing his pig in preparation for a religious...,M,,"FATAL, shark ""bit off all lower parts of him u...",Y,,


In [13]:
# cast year to integer instead of float
sharks['Year'] = sharks['Year'].apply(np.int64)

In [14]:
# Extracting the year from Date information
def extract_year(col): 
    x = re.findall("\d{4}", col)
    if len(x) == 0:
        return 0
    return int(x[-1])

In [15]:
year_0 = sharks.Year == 0
col = 'Year'
sharks.loc[year_0, col] = [extract_year(x) for x in sharks.loc[year_0, 'Date'].values]

In [16]:
sharks[sharks['Year'] == 0].head(2)

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
6177,Ca. 214 B.C.,0,Unprovoked,,Ionian Sea,,Ascending from a dive,M,,"FATAL, shark/s bit him in two",Y,,
6178,Ca. 336.B.C..,0,Unprovoked,GREECE,Piraeus,In the haven of Cantharus,Washing his pig in preparation for a religious...,M,,"FATAL, shark ""bit off all lower parts of him u...",Y,,


In [17]:
# Dropping the rest of year = 0
sharks.drop(sharks[sharks.Year == 0].index, inplace=True)

In [18]:
sharks[sharks.Type.isna()]

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
85,15-Sep-2017,2017,,SAMOA,Upolu Island,Nofoalii,Fishing,M,,Injuries to hands and legs,N,Night,
382,27-Jul-2015,2015,,AUSTRALIA,Victoria,Tyrendarra Beach,Surfing,M,40.0,Injury to hand,,,
4867,Reported 11-Sep-1936,1936,,VIETNAM,,Saigon,Wreck of a sampam,M,,FATAL,Y,,
5705,Reported 03-Mar-1890,1890,,CEYLON,,,Diving,M,,FATAL,Y,,


In [19]:
sharks.Type.unique()

array(['Boating', 'Unprovoked', 'Invalid', 'Provoked', 'Questionable',
       'Sea Disaster', nan, 'Boat', 'Boatomg'], dtype=object)

In [20]:
# change Nan to Questionable and Boatomg to Boat

In [21]:
sharks[sharks.Type=='Boat'].head(2)

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
538,Reported 27-Jun-2014,2014,Boat,ST. MARTIN,,20 miles from shore,Transatlantic Rowing,M,48.0,His boat was holed by a shark,N,,Oceanic whitetip shark'
565,Reported 12-Apr-2014,2014,Boat,SOUTH AFRICA,,,Shark watching,,,"No injury to occupants, shark bit pontoon",N,,White shark


In [22]:
sharks.loc[sharks.Type.isna(), 'Type'] = 'Questionable'

In [23]:
sharks.loc[sharks.Type=='Boatomg', 'Type'] = 'Boat'
sharks.loc[sharks.Type=='Boating', 'Type'] = 'Boat'

In [24]:
# there is 49 NA in country, I will ignore it for now
sharks[sharks.Country.isna()].count()

Date           49
Year           49
Type           49
Country         0
Area           14
Location       11
Activity       42
Sex            45
Age             9
Injury         47
Fatal (Y/N)    44
Time            7
Species        12
dtype: int64

In [25]:
sharks.Country.unique()

array(['USA', 'AUSTRALIA', 'MEXICO', 'BRAZIL', 'ENGLAND', 'SOUTH AFRICA',
       'THAILAND', 'COSTA RICA', 'MALDIVES', 'BAHAMAS', 'NEW CALEDONIA',
       'ECUADOR', 'MALAYSIA', 'LIBYA', nan, 'CUBA', 'MAURITIUS',
       'NEW ZEALAND', 'SPAIN', 'SAMOA', 'SOLOMON ISLANDS', 'JAPAN',
       'EGYPT', 'ST HELENA, British overseas territory', 'COMOROS',
       'REUNION', 'FRENCH POLYNESIA', 'UNITED KINGDOM',
       'UNITED ARAB EMIRATES', 'PHILIPPINES', 'INDONESIA', 'CHINA',
       'COLUMBIA', 'CAPE VERDE', 'Fiji', 'DOMINICAN REPUBLIC',
       'CAYMAN ISLANDS', 'ARUBA', 'MOZAMBIQUE', 'FIJI', 'PUERTO RICO',
       'ITALY', 'ATLANTIC OCEAN', 'GREECE', 'ST. MARTIN', 'FRANCE',
       'PAPUA NEW GUINEA', 'TRINIDAD & TOBAGO', 'KIRIBATI', 'ISRAEL',
       'DIEGO GARCIA', 'TAIWAN', 'JAMAICA', 'PALESTINIAN TERRITORIES',
       'GUAM', 'SEYCHELLES', 'BELIZE', 'NIGERIA', 'TONGA', 'SCOTLAND',
       'CANADA', 'CROATIA', 'SAUDI ARABIA', 'CHILE', 'ANTIGUA', 'KENYA',
       'RUSSIA', 'TURKS & CAICOS', 'UNITE

In [26]:
sharks.loc[sharks.Country==' PHILIPPINES', 'Country'] = 'PHILIPPINES'
sharks.loc[sharks.Country=='RED SEA?', 'Country'] = 'RED SEA'

## Textural data cleaning

In [28]:
act = sharks.groupby('Activity').count()

In [30]:
act.sort_values('Date', ascending=False)

Unnamed: 0_level_0,Date,Year,Type,Country,Area,Location,Sex,Age,Injury,Fatal (Y/N),Time,Species
Activity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Surfing,971,971,971,970,966,959,946,778,965,933,749,591
Swimming,869,869,869,862,824,821,846,564,867,780,448,406
Fishing,431,431,431,431,397,398,338,132,430,412,114,264
Spearfishing,333,333,333,332,304,309,318,221,333,309,162,255
Bathing,162,162,162,160,149,156,156,53,161,149,45,33
Wading,148,148,148,147,144,143,146,123,147,139,107,82
Diving,127,127,127,123,112,100,112,44,126,114,36,68
Standing,99,99,99,99,97,95,99,79,99,98,69,47
Snorkeling,89,89,89,89,84,81,87,68,89,87,61,66
Scuba diving,76,76,76,76,75,73,76,50,76,63,36,63


In [31]:
# Replacing text if there is word fishing with 'Fishing'
def fishing(col): 
    try:
        x = re.findall("Fishing", col)
        if len(x) == 0:
            return col
        return 'Fishing'
    except:
        return col

col = 'Activity'
sharks.loc[:, col] = [fishing(x) for x in sharks.loc[:, col].values]

In [32]:
sharks.loc[sharks.Activity=='Freediving', 'Activity'] = 'Free diving' 

In [34]:
spec = sharks.groupby('Activity').size()
spec.sort_values(ascending=False).head(50)

Activity
Surfing                           971
Swimming                          869
Fishing                           633
Spearfishing                      333
Bathing                           162
Wading                            148
Diving                            127
Standing                           99
Snorkeling                         89
Scuba diving                       76
Body boarding                      61
Body surfing                       49
Swimming                           47
Free diving                        38
Kayaking                           33
Fell overboard                     32
Treading water                     32
Pearl diving                       31
Boogie boarding                    29
Windsurfing                        19
Walking                            17
Boogie Boarding                    16
Shark fishing                      15
Floating                           14
Canoeing                           13
Surf skiing                        12
Row

In [35]:
sharks[sharks['Activity']=='Scuba diving (but on surface)']

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
2613,03-Nov-1990,1990,Unprovoked,USA,California,"Monastery Beach, Carmel Bay, Monterey County",Scuba diving (but on surface),F,,Leg bitten,N,15h00,4 m to 5 m [13' to 16.5'] white shark
2631,24-Jun-1990,1990,Unprovoked,SOUTH AFRICA,Western Cape Province,Mossel Bay,Scuba diving (but on surface),F,21.0,"FATAL, thigh bitten",Y,15h45,"4.5 m [14'9""] white shark"
3252,02-Sep-1974,1974,Unprovoked,USA,California,"Franklin Point, San Mateo County",Scuba diving (but on surface),M,41.0,Minor injuries to hand,N,17h30,"White shark, 5 m to 6 m [16.5 to 20']"
3253,02-Sep-1974,1974,Unprovoked,USA,California,"Franklin Point, San Mateo County",Scuba diving (but on surface),M,48.0,Minor bite on foot & swimfin,N,17h30,"White shark, 5 m to 6 m [16.5 to 20']"


In [36]:
sharks[sharks['Activity']=='Scuba diving (submerged)']

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
2524,11-Nov-1992,1992,Unprovoked,USA,California,"San Nicholas Island, Santa Barbara County",Scuba diving (submerged),M,40.0,Foot punctured,N,14h00,Unidentified shark
2993,07-Feb-1982,1982,Unprovoked,USA,California,"Stillwater Cove, Sonoma County",Scuba diving (submerged),M,,Calf & ankle bitten,N,11h00,5 m [16.5'] white shark
3103,11-Mar-1979,1979,Unprovoked,USA,California,"Ano Nuevo Island, San Mateo, County",Scuba diving (submerged),M,,"No injury, swim fin bitten",N,10h00,"White shark, 4 m to 5 m [13' to 16.5']"
3309,1973,1973,Unprovoked,PALAU,Aulong Island,Aulong Channel,Scuba diving (submerged),M,,"No injury, shark grabbed scuba tank and descen...",N,,Tiger shark


In [38]:
def activity(col, word, to_replace): 
    try:
        x = re.findall(f'({word})|({word.lower()})', col)
        if len(x) == 0:
            return col

        return to_replace
    except:
        return col

col = 'Activity'
word = "Swimming"
sharks.loc[:, col] = [activity(x, word, word) for x in sharks.loc[:, col].values]

In [39]:
word = 'Boogie Boarding'
sharks.loc[:, col] = [activity(x, word, word) for x in sharks.loc[:, col].values]

In [40]:
word = 'Kite surfing'
to_replace = 'Kite Surfing'
sharks.loc[:, col] = [activity(x, word, to_replace) for x in sharks.loc[:, col].values]

In [41]:
word = 'Sea disaster'
to_replace = 'Sea Disaster'
sharks.loc[:, col] = [activity(x, word, to_replace) for x in sharks.loc[:, col].values]

In [42]:
word = 'Surfing'
to_replace = 'Surfing'
sharks.loc[:, col] = [activity(x, word, to_replace) for x in sharks.loc[:, col].values]

In [43]:
word = 'Surf-skiing'
to_replace = 'Surf skiing'
sharks.loc[:, col] = [activity(x, word, to_replace) for x in sharks.loc[:, col].values]

In [44]:
word = 'Scuba Diving'
to_replace = 'Scuba'
sharks.loc[:, col] = [activity(x, word, to_replace) for x in sharks.loc[:, col].values]

In [45]:
word = 'Skin diving'
to_replace = 'Skin Diving'
sharks.loc[:, col] = [activity(x, word, to_replace) for x in sharks.loc[:, col].values]

In [46]:
word = 'Sitting on surfboard'
to_replace = 'Surfing'
sharks.loc[:, col] = [activity(x, word, to_replace) for x in sharks.loc[:, col].values]

In [47]:
word = 'Body-boarding'
to_replace = 'Surfing'
sharks.loc[:, col] = [activity(x, word, to_replace) for x in sharks.loc[:, col].values]

In [48]:
word = 'Floating on his back'
to_replace = 'Bathing'
sharks.loc[:, col] = [activity(x, word, to_replace) for x in sharks.loc[:, col].values]

In [49]:
word = 'Playing'
to_replace = 'Bathing'
sharks.loc[:, col] = [activity(x, word, to_replace) for x in sharks.loc[:, col].values]

In [50]:
sharks.rename(index=str, columns={"Sex ": "Sex", "Species ": "Species"}, inplace=True)

In [51]:
sharks.columns

Index(['Date', 'Year', 'Type', 'Country', 'Area', 'Location', 'Activity',
       'Sex', 'Age', 'Injury', 'Fatal (Y/N)', 'Time', 'Species'],
      dtype='object')

In [52]:
sharks.Sex.unique()

array(['F', 'M', nan, 'M ', 'lli', 'N', '.'], dtype=object)

In [53]:
sharks.loc[sharks.Sex=='M ', 'Sex'] = 'M' 
sharks.loc[sharks.Sex=='lli', 'Sex'] = None
sharks.loc[sharks.Sex=='.', 'Sex'] = None

In [54]:
sharks[sharks.Sex.isna()].count()

Date           567
Year           567
Type           567
Country        563
Area           505
Location       500
Activity       416
Sex              0
Age             39
Injury         553
Fatal (Y/N)    489
Time           106
Species        321
dtype: int64

In [55]:
sharks.Age.unique()

array(['57', '11', '48', nan, '18', '52', '15', '12', '32', '10', '21',
       '34', '30', '60', '33', '29', '54', '41', '37', '56', '19', '25',
       '69', '38', '55', '35', '46', '45', '14', '40s', '28', '20', '24',
       '26', '49', '22', '7', '31', '17', '40', '13', '42', '3', '8',
       '50', '16', '82', '73', '20s', '68', '51', '39', '58', 'Teen',
       '47', '61', '65', '36', '66', '43', '60s', '9', '72', '59', '6',
       '27', '64', '23', '71', '44', '62', '63', '70', '18 months', '53',
       '30s', '50s', 'teen', '77', '74', '28 & 26', '5', '86', '18 or 20',
       '12 or 13', '46 & 34', '28, 23 & 30', 'Teens', '36 & 26',
       '8 or 10', '84', '\xa0 ', ' ', '30 or 36', '6½', '21 & ?', '75',
       '33 or 37', 'mid-30s', '23 & 20', ' 30', '7      &    31', ' 28',
       '20?', "60's", '32 & 30', '16 to 18', '87', '67', 'Elderly',
       'mid-20s', 'Ca. 33', '74 ', '45 ', '21 or 26', '20 ', '>50',
       '18 to 22', 'adult', '9 & 12', '? & 19', '9 months', '25 to 35',
  

In [56]:
sharks.loc[sharks.Age=='60s', 'Age'] = '60'
sharks.loc[sharks.Age=="60's", 'Age'] = '60'
sharks.loc[sharks.Age=='50s', 'Age'] = '50' 
sharks.loc[sharks.Age=='40s', 'Age'] = '40'
sharks.loc[sharks.Age=='30s', 'Age'] = '30' 
sharks.loc[sharks.Age=='20s', 'Age'] = '20' 
sharks.loc[sharks.Age=='Teen', 'Age'] = '15'
sharks.loc[sharks.Age=='teen', 'Age'] = '15'
sharks.loc[sharks.Age=='Teens', 'Age'] = '15'
sharks.loc[sharks.Age=='18 months', 'Age'] = '1'
sharks.loc[sharks.Age=='\xa0 ', 'Age'] = None
sharks.loc[sharks.Age=='MAKE LINE GREEN', 'Age'] = None
sharks.loc[sharks.Age=='A.M.', 'Age'] = None
sharks.loc[sharks.Age=='X', 'Age'] = None
sharks.loc[sharks.Age=='F', 'Age'] = None
sharks.loc[sharks.Age=='mid-30s', 'Age'] = '35'
sharks.loc[sharks.Age=='28 & 26', 'Age'] = '27'
sharks.loc[sharks.Age=='18 or 20', 'Age'] = '19'
sharks.loc[sharks.Age=='12 or 13', 'Age'] = '13'
sharks.loc[sharks.Age=='46 & 34', 'Age'] = '40'
sharks.loc[sharks.Age=='28, 23 & 30', 'Age'] = '27'
sharks.loc[sharks.Age=='30 or 36', 'Age'] = '33'
sharks.loc[sharks.Age=='6½', 'Age'] = '6'
sharks.loc[sharks.Age=='23 & 20', 'Age'] = '21'
sharks.loc[sharks.Age=='8 or 10', 'Age'] = '9'
sharks.loc[sharks.Age=='7      &    31', 'Age'] = '31'
sharks.loc[sharks.Age=='20?', 'Age'] = '20'
sharks.loc[sharks.Age=='21 & ?', 'Age'] = '21'
sharks.loc[sharks.Age=='36 & 26', 'Age'] = '31'
sharks.loc[sharks.Age=='32 & 30', 'Age'] = '31'
sharks.loc[sharks.Age=='33 or 37', 'Age'] = '35'
sharks.loc[sharks.Age=='16 to 18', 'Age'] = '17'
sharks.loc[sharks.Age=='13 or 18', 'Age'] = '15'
sharks.loc[sharks.Age==' ', 'Age'] = None
sharks.loc[sharks.Age==' 30', 'Age'] = '30'
sharks.loc[sharks.Age=='mid-20s', 'Age'] = '25'
sharks.loc[sharks.Age=='18 to 22', 'Age'] = '20'
sharks.loc[sharks.Age=='Ca. 33', 'Age'] = '33'
sharks.loc[sharks.Age=='74 ', 'Age'] = '74'
sharks.loc[sharks.Age=='45 ', 'Age'] = '45'
sharks.loc[sharks.Age=='21 or 26', 'Age'] = '24'
sharks.loc[sharks.Age=='20 ', 'Age'] = '20'
sharks.loc[sharks.Age=='>50', 'Age'] = '51'
sharks.loc[sharks.Age=='>50', 'Age'] = '51'
sharks.loc[sharks.Age=='9 & 12', 'Age'] = '11'
sharks.loc[sharks.Age=='? & 19', 'Age'] = '19'
sharks.loc[sharks.Age=='9 months', 'Age'] = '1'
sharks.loc[sharks.Age=='25 to 35', 'Age'] = '30'
sharks.loc[sharks.Age=='23 & 26', 'Age'] = '24'
sharks.loc[sharks.Age=='33 & 37', 'Age'] = '35'
sharks.loc[sharks.Age=='25 or 28', 'Age'] = '27'
sharks.loc[sharks.Age=='37, 67, 35, 27,  ? & 27', 'Age'] = '39'
sharks.loc[sharks.Age=='21, 34,24 & 35', 'Age'] = '30'
sharks.loc[sharks.Age=='30 & 32', 'Age'] = '31'
sharks.loc[sharks.Age=='50 & 30', 'Age'] = '40'
sharks.loc[sharks.Age=='17 & 35', 'Age'] = '26'
sharks.loc[sharks.Age=='34 & 19', 'Age'] = '26'
sharks.loc[sharks.Age=='2 to 3 months', 'Age'] = '0'
sharks.loc[sharks.Age=='7 or 8', 'Age'] = '7'
sharks.loc[sharks.Age=='17 & 16', 'Age'] = '16'
sharks.loc[sharks.Age=='Both 11', 'Age'] = '11'
sharks.loc[sharks.Age=='13 or 14', 'Age'] = '13'
sharks.loc[sharks.Age=='2½', 'Age'] = '2'
sharks.loc[sharks.Age==' 43', 'Age'] = '43'
sharks.loc[sharks.Age=='9 or 10', 'Age'] = '10'
sharks.loc[sharks.Age=='36 & 23', 'Age'] = '30'
sharks.loc[sharks.Age=='  ', 'Age'] = None
sharks.loc[sharks.Age=='10 or 12', 'Age'] = '10'
sharks.loc[sharks.Age=='?    &   14', 'Age'] = '14'
sharks.loc[sharks.Age=='31 or 33', 'Age'] = '32'
sharks.loc[sharks.Age=='Elderly', 'Age'] = '70'
sharks.loc[sharks.Age=='(adult)', 'Age'] = '40'
sharks.loc[sharks.Age=='adult', 'Age'] = '40'
sharks.loc[sharks.Age=='"middle-age"', 'Age'] = '50'
sharks.loc[sharks.Age=='"young"', 'Age'] = '20'
sharks.loc[sharks.Age=='young', 'Age'] = '20'

In [57]:
sharks.Age.unique()

array(['57', '11', '48', nan, '18', '52', '15', '12', '32', '10', '21',
       '34', '30', '60', '33', '29', '54', '41', '37', '56', '19', '25',
       '69', '38', '55', '35', '46', '45', '14', '40', '28', '20', '24',
       '26', '49', '22', '7', '31', '17', '13', '42', '3', '8', '50',
       '16', '82', '73', '68', '51', '39', '58', '47', '61', '65', '36',
       '66', '43', '9', '72', '59', '6', '27', '64', '23', '71', '44',
       '62', '63', '70', '1', '53', '77', '74', '5', '86', '84', None,
       '75', ' 28', '87', '67', '33 & 26', '0', '81', '78', '2'],
      dtype=object)

In [58]:
sharks['Fatal (Y/N)'].unique()

array(['N', 'Y', nan, 'M', 'UNKNOWN', '2017', ' N', 'N ', 'y'],
      dtype=object)

In [59]:
sharks[sharks['Fatal (Y/N)']=='M']

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
285,18-Apr-2016,2016,Provoked,FRENCH POLYNESIA,Tuamotos,Makemo Atoll,Spearfishing,M,22,Laceration to knee by speared shark PROVOKED I...,M,Morning,"Grey reef shark, 2 m"


In [60]:
sharks.loc[sharks['Fatal (Y/N)']=='M', 'Fatal (Y/N)'] = 'UNKNOWN'
sharks.loc[sharks['Fatal (Y/N)']=='2017', 'Fatal (Y/N)'] = 'N'
sharks.loc[sharks['Fatal (Y/N)']=='N ', 'Fatal (Y/N)'] = 'N'
sharks.loc[sharks['Fatal (Y/N)']==' N', 'Fatal (Y/N)'] = 'N'
sharks.loc[sharks['Fatal (Y/N)']=='y', 'Fatal (Y/N)'] = 'Y'
sharks.loc[sharks['Fatal (Y/N)'].isna(), 'Fatal (Y/N)'] = 'UNKNOWN'

In [61]:
spec = sharks.groupby('Species').count()
spec.sort_values('Date', ascending=False)

Unnamed: 0_level_0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
White shark,163,163,163,163,158,157,158,140,95,162,163,98
Shark involvement prior to death was not confirmed,105,105,105,103,94,97,69,86,47,105,105,20
Invalid,102,102,102,102,90,93,86,88,45,102,102,28
Shark involvement not confirmed,88,88,88,86,83,82,77,79,51,87,88,44
Tiger shark,73,73,73,73,68,64,71,69,45,73,73,40
Shark involvement prior to death unconfirmed,68,68,68,67,59,58,41,57,5,67,68,7
Bull shark,52,52,52,52,47,50,46,50,41,52,52,35
6' shark,40,40,40,40,38,38,38,39,29,40,40,19
4' shark,40,40,40,40,40,40,40,39,36,40,40,35
1.8 m [6'] shark,35,35,35,35,34,33,35,34,28,35,35,24


In [62]:
def shark(col, word, to_replace): 
    """word to be found and replace by"""
    try:
        x = re.findall(f'({word})|({word.lower()})', col)

        if len(x) == 0:
            return col

        return to_replace
    except:
        return col

col = 'Species'
word = "Bull shark"
sharks.loc[:, col] = [shark(x, word, word) for x in sharks.loc[:, col].values]

In [63]:
col = 'Species'
word = "Grey nurse shark"
sharks.loc[:, col] = [shark(x, word, word) for x in sharks.loc[:, col].values]

In [64]:
word = "Tiger shark"
to_replace = 'Tiger shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [65]:
word = "Hammerhead"
to_replace = 'Hammerhead'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [66]:
word = "Mako shark"
to_replace = 'Mako shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [67]:
word = "White shark"
to_replace = 'White shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [68]:
word = "Blue shark"
to_replace = 'Blue shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [69]:
word = "Blacktip shark"
to_replace = 'Blacktip shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [70]:
word = "Blacktip"
to_replace = 'Blacktip shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [71]:
word = "Bronze whaler shark"
to_replace = 'Bronze whaler shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [72]:
word = 'Caribbean reef shark'
to_replace = 'Caribbean reef shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [73]:
word = 'Caribbean reef shark'
to_replace = 'Caribbean reef shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [74]:
invol = []
col = 'Species'
word = 'Shark involvement'
for i in sharks.Species.iteritems():
    try:
        x = re.findall(f'({word})|({word.lower()})', i[1])
        if len(x)>0:
            invol.append(i[1])
    except:
        pass
set(invol)

{'No shark involvement',
 'Reported by media as shark attack, but shark involvement prior to death was not confirmed',
 'Shark involvement  not confirmed',
 'Shark involvement  questionable',
 'Shark involvement doubtful',
 'Shark involvement highly doubtful',
 'Shark involvement not cofirmed',
 'Shark involvement not confirmed',
 'Shark involvement not confirmed & highly unlikely',
 'Shark involvement not confirmed, injury may be due to a stingray',
 'Shark involvement not confirmed, injury may have been caused by a bluefish',
 'Shark involvement not confirmed; officials considered barracua',
 'Shark involvement not confirmed; thought to be a barracuda bite',
 'Shark involvement prior to death could not be determined',
 'Shark involvement prior to death not confirmed',
 'Shark involvement prior to death remains unconfirmed',
 'Shark involvement prior to death still to be determined',
 'Shark involvement prior to death suspected but not confirmed',
 'Shark involvement prior to death un

In [75]:
word = 'Shark involvement'
to_replace = 'Shark involvement unconfirmed'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [76]:
word = 'Wobbegong shark'
to_replace = 'Wobbegong shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [77]:
word = 'Nurse shark'
to_replace = 'Nurse shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [78]:
word = 'Grey reef shark'
to_replace = 'Grey reef shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [79]:
spec = sharks.groupby('Species').size()
spec.sort_values(ascending=False).head(50)

Species
White shark                        635
Shark involvement unconfirmed      330
Tiger shark                        281
Bull shark                         180
Invalid                            102
Blacktip shark                     101
Nurse shark                         97
Bronze whaler shark                 60
Mako shark                          54
Hammerhead                          48
Wobbegong shark                     46
6' shark                            40
4' shark                            40
Blue shark                          39
Questionable incident               35
1.8 m [6'] shark                    35
Questionable                        34
1.5 m [5'] shark                    32
1.2 m [4'] shark                    27
5' shark                            26
3' shark                            26
2 m shark                           25
4' to 5' shark                      24
3 m [10'] shark                     22
3' to 4' shark                      18
Grey reef shark  

In [80]:
invol = []
col = 'Species'
word = 'Leopard shark'
for i in sharks.Species.iteritems():
    try:
        x = re.findall(f'({word})|({word.lower()})', i[1])
        if len(x)>0:
            invol.append(i[1])
    except:
        pass
set(invol)

{'Leopard shark',
 "Leopard shark, 3' Triakis semifasciata, identified by J.W. DeWitt (1955)"}

In [81]:
sharks.loc[sharks.Age=='Questionable incident - shark bite may have precipitated drowning', 'Species'] = 'Shark involvement unconfirmed'
sharks.loc[sharks.Age=='Questionable incident; reported as shark attack but thought to involve a pinniped instead ', 'Species'] = 'Shark involvement unconfirmed'

In [82]:
word = 'Questionable'
to_replace = 'Questionable'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [83]:
word = 'Reef shark'
to_replace = 'Reef shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [84]:
word = "Zambesi"
to_replace = 'Bull shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [85]:
word = "Zambezi"
to_replace = 'Bull shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [86]:
word = 'Sevengill'
to_replace = 'Sevengill'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [87]:
word = 'Porbeagle'
to_replace = 'Porbeagle'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [88]:
word = 'Raggedtooth'
to_replace = 'Raggedtooth'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [89]:
sharks[sharks.Species=='Invalid']

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
226,Sep-2016,2016,Invalid,MEXICO,,Guadalupe Island,Cage Diving,,,"No injury to divers, white shark breached cage",UNKNOWN,,Invalid
578,13-Mar-2014,2014,Invalid,CAYMAN ISLANDS,,,Scuba diving / culling lionfish,M,,"Caribbean reef shark buzzed him. No injury, no...",UNKNOWN,,Invalid
676,08-May-2013,2013,Invalid,USA,California,"Tourmaline Surf Park, San Diego County",Surfing,M,42,Shark bites were post-mortem,UNKNOWN,,Invalid
926,Reported 07-May-2011,2011,Invalid,UNITED ARAB EMIRATES (UAE),Umm al Qaywayan Province,Khor Fakkan,Fishing,M,43,Erroneously reported on several internet sites...,UNKNOWN,,Invalid
1044,04-Feb-2010,2010,Invalid,GUAM,Merizo,Achang Reef,Spearfishing (free diving),M,31,Shark bites were post-mortem,UNKNOWN,11h00,Invalid
1093,02-Sep-2009,2009,Invalid,NEVIS,,Castle Beach,Swimming,M,,Death was due to drowning. Two days later his ...,UNKNOWN,,Invalid
1167,23-Jan-2009,2009,Invalid,BRAZIL,Maranhão,Olho d'Água,Swimming,M,17,"Drowned, body scavenged by shark",UNKNOWN,,Invalid
1301,19-Dec-2007,2007,Invalid,BRITISH VIRGIN ISLANDS,Green Bay,,Scuba diving,M,53,Shark bites were post-mortem,UNKNOWN,,Invalid
1314,November 2011,2007,Invalid,MEXICO,Baja California,Guadalupe Island,Shark diving,M,,White shark breached cage. No injury to occupants,UNKNOWN,,Invalid
1427,30-Sep-2006,2006,Invalid,SOUTH AFRICA,Western Cape Province,Miller's Point,Spearfishing,M,36,No injury; 4m white shark made a threat display,UNKNOWN,,Invalid


# Data Visualization

## Scatterplot: Shark Attacks worldwide 1900-2017

Let’s take a look at the general development of shark attacks from 1900 to 2017.

In [100]:
# color pallet I'm gonna use for all my plots:
light_blue = 'rgb(142, 212, 229)'
dark_blue = 'rgb(19, 77, 102)'
green = 'rgb(199, 204, 118)'
pink = 'rgb(254, 207, 173)'
orange = 'rgb(253, 174, 97)'
red = 'rgb(253, 107, 97)'

In [101]:
fatal_attack = sharks[sharks['Fatal (Y/N)'] == 'Y'].groupby(['Year']).count().iloc[:, :1]
fatal_attack.rename(index=str, columns={"Date": "Fatal"}, inplace=True)

nonfatal_attack = sharks[sharks['Fatal (Y/N)'] == 'N'].groupby(['Year']).count().iloc[:, :1]
nonfatal_attack.rename(index=str, columns={"Date": "Non-Fatal"}, inplace=True)

fatality_unknown = sharks[sharks['Fatal (Y/N)'] == 'UNKNOWN'].groupby(['Year']).count().iloc[:, :1]
fatality_unknown.rename(index=str, columns={"Date": "Unknown"}, inplace=True)

scatter = pd.concat([fatal_attack,nonfatal_attack, fatality_unknown], axis=1).iloc[126:-4, :]

In [102]:
scatter.head(2)

Unnamed: 0,Fatal,Non-Fatal,Unknown
1900,3.0,8.0,3.0
1901,3.0,5.0,2.0


In [103]:
scatter.Fatal.max()

24.0

In [104]:
init_notebook_mode(connected=True)

In [105]:
fatal = go.Scatter(
    x = scatter.index,
    y = scatter['Fatal'],
    name = 'Fatal',
    mode = 'markers',
    marker = dict(
        size = 10,
        line = dict(
            width = 1,
            color = 'rgb(0, 0, 0)'
        ), color = pink
    )
)

nonfatal = go.Scatter(
    x = scatter.index,
    y = scatter['Non-Fatal'],
    name = 'Non-fatal',
    mode = 'markers',
    marker = dict(
        size = 10,
        line = dict(
            width = 1), color = green
        )
    )
fatality_na = go.Scatter(
    x = scatter.index,
    y = scatter['Unknown'],
    name = 'Unknown',
    mode = 'markers',
    marker = dict(
        size = 10,
        line = dict(
            width = 1), color = light_blue
        )
    )

data = [nonfatal, fatal, fatality_na]

layout = dict(title = 'Shark attacks worldwide 1900-2017',
              yaxis = dict(zeroline = False, title='Count of total shark attachs'),
              xaxis = dict(zeroline = False, title='Year')
             )

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "Shark attacks scatter plot")

#plotly.offline.plot(data, filename='test.html') # creates html
# plotly.offline.plot(data, include_plotlyjs=False, output_type='div') # for jinja2 embeding

There is a peak in the attacks in around 1960, let's see where the pick is coming from

In [111]:
sharks[sharks['Year'] == 1960].groupby('Country').size().sort_values(ascending=False)[:5]

Country
USA                 25
AUSTRALIA           21
PAPUA NEW GUINEA    10
SOUTH AFRICA         9
IRAQ                 3
dtype: int64

In [112]:
sharks[sharks['Year'] == 1958].groupby('Country').size().sort_values(ascending=False)[:5]

Country
USA                 16
PAPUA NEW GUINEA     9
SOUTH AFRICA         8
AUSTRALIA            5
BAHAMAS              3
dtype: int64

We can see that shark attacks are increasing, though the fatal attacks are staying at the same level throughout the century, at about average of 9.2 number of fatal attacks globally per year. This number doesn’t change much by decade either.
<br>If you look at the shark attacks development, especially non-fatal ones, you can see a peak around 1960. The peak happens in the USA and Australia, driven by surfing starting to get popular in the 60s.


<i>Disclaimer: all following plots use the data from 1900 to 2018 unless otherwise commented.

In [113]:
sharks = sharks[sharks.Year>=1900]

In [116]:
# saving cleaned data
sharks.to_csv('sharks_cleaned.csv', index=False)
sharks = pd.read_csv('sharks_cleaned.csv')
sharks.head(2)

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
0,25-Jun-2018,2018,Boat,USA,California,"Oceanside, San Diego County",Paddling,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark
1,18-Jun-2018,2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,F,11,Minor injury to left thigh,N,14h00 -15h00,


## Historgram: Shark attacks by time of the day

Sharks hunt at dawn and dusk to benefit from darkness by using not only their highly evolved sense of smell but also detecting electricity and vibrations in the water. The assumption is that if sharks prey on humans, we would see a peak of attacks around dusk and dawn.

In [117]:
sharks['Hour'] = sharks['Time'].str.extract("([0-9]+)", expand=False).dropna().astype(int)
sharks['Hour']= sharks[sharks['Hour']<25]['Hour']

In [118]:
time_nonfatal = sharks[sharks['Fatal (Y/N)'] == 'N']
time_fatal = sharks[sharks['Fatal (Y/N)'] == 'Y']
time_na_fatality = sharks[sharks['Fatal (Y/N)'] == 'UNKNOWN']

In [119]:
hour = list(range(0,25,1))
hourtext = ['midnight', '1 am', '2 am', '3 am', '4 am', '5 am', '6 am', '7 am', '8 am', '9 am', '10 am', '11 am',
           'noon', '1 pm', '2 pm', '3 pm', '4 pm', '5 pm', '6 pm', '7 pm', '8 pm', '9 pm', '10 pm', '11 pm']
trace1 = go.Histogram(
    x=time_nonfatal['Hour'],
    opacity=1, name = "Non-fatal", marker=dict(color=green)
)
trace2 = go.Histogram(
    x=time_fatal['Hour'],
    opacity=1, name = "Fatal", marker=dict(color=pink)
)

trace3 = go.Histogram(
    x=time_na_fatality['Hour'],
    opacity=1, name = "Unknown", marker=dict(color=light_blue)
)

data = [trace1, trace2, trace3]

layout = go.Layout(barmode='overlay',
        xaxis=dict(title='Time of the day',
            tickvals=list(range(0,25,1)),
            ticktext = hourtext, tickangle=-45),
                   
    title='Number of shark attacks by hour',
    yaxis=dict(
        title='Count of total shark attacks'
    ),
    bargap=0.1
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "Sharks histogram")

As you can see that is not true. Obviously, humans are not part of sharks’ diet and they are not targetting human, so the attacks happen when more people are in the water, between around 10 am and 5 pm.

## Barplot: Number of shark attacks by type

The myth of the shark attack hunting people is scary. Let’s look at the data to find out how sharks attack.
From this plot, you can see that most of the time, the attack is unprovoked. Though, I believe the attacks happen when a shark mistakes people for their prey, as we will see in later charts showing the number of attacks per activity. For example, a surfer sitting on their surfboard waiting for a wave looks and behave like a hurt Seal or Sea Lion from the bottom.

In [120]:
sharks.Type.unique()

array(['Boat', 'Unprovoked', 'Invalid', 'Provoked', 'Questionable',
       'Sea Disaster'], dtype=object)

In [121]:
fatal_attack = sharks[sharks['Fatal (Y/N)'] == 'Y'].groupby(['Type']).count().iloc[:, :1]
fatal_attack.rename(index=str, columns={"Date": "Fatal"}, inplace=True)

nonfatal_attack = sharks[sharks['Fatal (Y/N)'] == 'N'].groupby(['Type']).count().iloc[:, :1]
nonfatal_attack.rename(index=str, columns={"Date": "Non-Fatal"}, inplace=True)

fatality_unknown = sharks[sharks['Fatal (Y/N)'] == 'UNKNOWN'].groupby(['Type']).count().iloc[:, :1]
fatality_unknown.rename(index=str, columns={"Date": "Unknown"}, inplace=True)

In [122]:
attacks_type = pd.concat([fatal_attack,nonfatal_attack, fatality_unknown], axis=1).sort_values('Non-Fatal', ascending=True)
attacks_type.index

Index(['Questionable', 'Invalid', 'Sea Disaster', 'Boat', 'Provoked',
       'Unprovoked'],
      dtype='object', name='Type')

In [124]:
trace1 = go.Bar(
    y=attacks_type.index,
    x=attacks_type['Unknown'],
    name='Unknown',
    orientation = 'h', marker=dict(color = light_blue))

trace2 = go.Bar(
    y=attacks_type.index,
    x=attacks_type['Non-Fatal'],
    name='Non-Fatal',
    orientation = 'h',
    marker = dict(color = green))

trace3 = go.Bar(
    y=attacks_type.index,
    x=attacks_type['Fatal'],
    name='Fatal',
    orientation = 'h',
    marker = dict(color = pink))

data = [trace1, trace2, trace3]

layout = go.Layout(barmode='stack',
        xaxis=dict(title='Number of attacks'),
                   
    title='Number of shark attacks by type',
    yaxis=dict(
        title='Type'
    )
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "Sharks barplot")

We can see that about ⅓ of all attacks are either provoked or happening while people are on the Boat or at Sea Disaster or when the type is Invalid/Unknown. The boat attacks in the data set are usually when the shark bumps the boat. Sea Disaster is usually suspected that the shark was feasting on cadaver after people were drawn. This shows us that only in ⅔ of the cases there is an unprovoked attack. 

## Boxplot: Age distribution of shark attack victims

I wanted to know the age distribution of shark attack victims. 

In [125]:
male = sharks[sharks.Sex=='M'].Age.values
female = sharks[sharks.Sex=='F'].Age.values

In [126]:
trace0 = go.Box(x=male, name='Male', marker = dict(
        color = dark_blue
    ))
trace1 = go.Box(x=female, name='Female', marker = dict(
        color = pink
    ))
data = [trace0, trace1]

layout = go.Layout(title = 'Age distribution of victim at shark attack',
    yaxis=dict(
        title='Gender',
        zeroline=False
    ), xaxis=dict(title='Age')
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "Sharks barplot")

Seems that female victim age distribution is wider than the male one and the medium age is a bit lower. Overall, I think this chart reflects the age distribution of people who are active in water sports earlier in their life.

## Heatmap: Number of shark attacks by activity and country

We have seen a peak in the shark attacks during the ’60s and I assumed that it strongly correlates with the increasing popularity of surfing. Let’s analyze if it is true and what other activities people are engaged in when they are attacked by a shark. I also wanted to see a breakdown by country, thus I analyzed the top 15 countries by the count of shark attacks.

In [127]:
top_activities = list(sharks.groupby('Activity').count().sort_values('Date', ascending=False)[:10].index)

In [128]:
top_countries = list(sharks.groupby('Country').count().sort_values('Date', ascending=False)[:15].index)

In [129]:
heatmap = sharks[sharks.Activity.isin(top_activities) & 
           sharks.Country.isin(top_countries)]
heatmap['Count'] =1
heatmap = heatmap[['Country', 'Activity', 'Count']]
a = heatmap.groupby(['Country', 'Activity'], group_keys=False).sum()
heatmap_dict = {}
for i in a.itertuples():
    heatmap_dict[i[0]] =i[1]

In [132]:
heatmap_dict[('USA', 'Surfing')]

655

In [133]:
z = []

for act in top_activities:
    new_row = []
    for country in top_countries:
        if (str(country), str(act)) in heatmap_dict:
            new_row.append(heatmap_dict[(str(country), str(act))])
        else:
            new_row.append(0)
    z.append(list(new_row))

In [134]:
trace = go.Heatmap(z=z,
                   x=top_countries,
                   y=top_activities, xgap =5, ygap=5,
                  colorscale=[[0.0, 'rgb(199,204,118)'], 
                              [0.035,'rgb(69,117,180)' ],
                              [0.06, 'rgb(116,173,209)'],
                              [0.085, 'rgb(171,217,233)'],
                              [0.11, 'rgb(224,243,248)'],
                              [0.135, 'rgb(254,224,144)'],
                              [0.16, 'rgb(254,207,173)'],
                              [0.185, 'rgb(244,109,67)'],
                              [0.2, 'rgb(215,48,39)'],
                              [1.0, 'rgb(165,0,38)' ]])
data=[trace]


layout = go.Layout(
    title='Number of Shark attacks by Activity and Country',
    xaxis = dict(ticks='', nticks=20, title = 'Country', tickmode = 'linear'),
    yaxis = dict(ticks='', tickprefix = "", side='left',
        position=0.0, title = 'Activity', tickangle=0, tickfont=dict(
            size=10,
        ))
)

fig = go.Figure(data=data, layout=layout)
fig['layout']['yaxis']['autorange'] = "reversed"
iplot(fig, filename = "Sharks heatmap")

It seems that surfing in the USA and Australia is by far the most dangerous activity. 
Sharks have been known to attack humans when they are confused or curious. Sharks often mistake surfers or a human splashing in the water human for prey. They get curious and may try to investigate.
Here is the picture of how surfer on the board looks similar to shark’s prey:

In [140]:
Image(url= "plots/sealion_vs_surfer.png", width=500)

## Overlaid area chart

I am going to plot attacks by month, for that I need to assign country to the southern or northen hemisphear since the distribution by hemisphere is different

### Assigning countries into Southern and Northern hemispheres

In [142]:
# Northen - 0, Southern - 1
countries = {'CROATIA': 0, 'NORWAY': 0, 'FRANCE': 0, 'MARTINIQUE': 0, 'ICELAND': 0, 
            'JAVA': 1, 'Sierra Leone': 0, 'CYPRUS': 0, 'LIBERIA': 0, 'NEW BRITAIN': 1, 
            'URUGUAY': 1, 'NORTH ATLANTIC OCEAN ': 0, 'ADMIRALTY ISLANDS': 1, 
            'PAPUA NEW GUINEA': 1, 'DJIBOUTI': 0, 'TAIWAN': 1, 'EL SALVADOR': 0, 
            'ST. MAARTIN': 0, 'ASIA?': 0, 'NAMIBIA': 1, 'OCEAN': 1, 'CAPE VERDE': 0, 
            'MID ATLANTIC OCEAN': 0, 'MAURITIUS': 1, 'ANTIGUA': 0, 'FRENCH POLYNESIA': 1, 
            'JOHNSTON ISLAND': 0, 'SUDAN': 0, 'SOUTH KOREA': 0, 'TUVALU': 1, 
            'SOUTH ATLANTIC OCEAN': 1, 'UNITED ARAB EMIRATES (UAE)': 0, 'DOMINICAN REPUBLIC': 0, 
            ' PHILIPPINES': 0, 'MALAYSIA': 0, 'BRITISH VIRGIN ISLANDS': 0, 'CHINA': 0, 
            'ATLANTIC OCEAN': 0, 'ITALY': 0, 'VENEZUELA': 0, 'SOLOMON ISLANDS / VANUATU': 1, 
            'SOUTH CHINA SEA': 0, 'Between PORTUGAL & INDIA': 2, 'DIEGO GARCIA': 1, 
            'MEDITERRANEAN SEA?': 0, 'INDIAN OCEAN?': 1, 'INDIA': 0, 'SOUTH AFRICA': 1, 
            'St Helena': 1, 'WESTERN SAMOA': 1, 'TASMAN SEA': 1, 'HONG KONG': 0, 'TONGA': 1, 
            'YEMEN': 0, 'COLUMBIA': 0, 'NORTHERN MARIANA ISLANDS': 0, 'GUAM': 0, 'GUINEA': 0, 
            'CENTRAL PACIFIC': 2, 'GUATEMALA': 0, 'FIJI': 1, 'GULF OF ADEN': 0, 'JAPAN': 0, 
            'MID-PACIFC OCEAN': 0, 'ST. MARTIN': 1, 'USA': 0, 'CRETE': 0, 'BRAZIL': 1, 
            'TURKS & CAICOS': 0, 'SOUTHWEST PACIFIC OCEAN': 1, 'GREENLAND': 0, 
            'BAY OF BENGAL': 1, 'PACIFIC OCEAN': 0, 'LEBANON': 0, 'MALTA': 0, 'NIGERIA': 0, 
            'GREECE': 0, 'MEXICO': 0, 'BERMUDA': 0, 'UNITED KINGDOM': 0, 'SINGAPORE': 0, 
            'BRITISH ISLES': 0, 'TURKEY': 0, 'NEVIS': 1, 'AUSTRALIA': 1, 'ENGLAND': 0, 
            'SIERRA LEONE': 0, 'VANUATU': 1, 'NORTH SEA': 0, 'RUSSIA': 0, 'MICRONESIA': 0, 
            'PORTUGAL': 0, 'RED SEA': 0, 'MONTENEGRO': 0, 'IRAQ': 0, 'SWEDEN': 0, 
            'PERSIAN GULF': 0, 'NORTH ATLANTIC OCEAN': 0, 'Fiji': 1, 'SLOVENIA': 0, 
            'PHILIPPINES': 0, 'IRAN / IRAQ': 0, 'TUNISIA': 0, 'SAN DOMINGO': 1, 'AZORES': 0, 
            'GEORGIA': 0, 'BURMA': 0, 'NEW GUINEA': 1, 'SUDAN?': 0, 'NETHERLANDS ANTILLES': 0, 
            'ALGERIA': 0, 'NICARAGUA': 0, 'SEYCHELLES': 1, 'RED SEA?': 0, 'BRITISH NEW GUINEA': 1, 
            'THAILAND': 0, 'PALESTINIAN TERRITORIES': 0, 'FALKLAND ISLANDS': 1, 'IRELAND': 0, 
            'MONACO': 0, 'PARAGUAY': 1, 'SYRIA': 0, 'EGYPT ': 0, 'MADAGASCAR': 1, 
            'NORTH PACIFIC OCEAN': 0, 'EGYPT / ISRAEL': 0, 'COOK ISLANDS': 1, 
            'TRINIDAD & TOBAGO': 0, 'PACIFIC OCEAN ': 0, 'EQUATORIAL GUINEA / CAMEROON': 0, 
            'ISRAEL': 0, 'SAMOA': 1, 'ECUADOR': 1, 'CARIBBEAN SEA': 0, 'NEW CALEDONIA': 1, 
            'MARSHALL ISLANDS': 0, 'PANAMA': 0, 'UNITED ARAB EMIRATES': 0, 'ITALY / CROATIA': 0, 
            'NEW ZEALAND': 1, 'MALDIVE ISLANDS': 0, 'GHANA': 0, 'MOZAMBIQUE': 0, 'SRI LANKA': 0, 
            'SOLOMON ISLANDS': 1, 'Coast of AFRICA': 1, 'BARBADOS': 0, 'BANGLADESH': 0, 
            'CHILE': 1, 'CANADA': 0, 'HONDURAS': 0, 'PALAU': 0, 'AMERICAN SAMOA': 1, 
            'SAUDI ARABIA': 0, ' TONGA': 1, 'SPAIN': 0, 'ARGENTINA': 1, 'CURACAO': 0, 
            'ANDAMAN / NICOBAR ISLANDAS': 0, 'KENYA': 1, 'EGYPT': 0, 'THE BALKANS': 0, 
            'PUERTO RICO': 0, 'KIRIBATI': 0, 'OKINAWA': 0, 'REUNION': 1, 
            'BRITISH WEST INDIES': 0, 'NICARAGUA ': 0, 'FEDERATED STATES OF MICRONESIA': 0, 
            'IRAN': 0, 'CAYMAN ISLANDS': 0, 'SOMALIA': 0, 'INDONESIA': 1, 'KUWAIT': 0, 
            'Seychelles': 1, 'COSTA RICA': 0, 'INDIAN OCEAN': 1, 'CEYLON (SRI LANKA)': 0, 
            'YEMEN ': 0, 'HAITI': 0, 'SCOTLAND': 0, 'CUBA': 0, 'GUYANA': 0, 'LIBYA': 0, 
            'MEXICO ': 0, 'SENEGAL': 0, 'GRAND CAYMAN': 0, 'GABON': 1, 'GRENADA': 0, 
            'RED SEA / INDIAN OCEAN': 0, 'VIETNAM': 0, 'BAHAMAS': 0, 'BAHREIN': 0, 
            'NORTHERN ARABIAN SEA': 0, 'BELIZE': 0, 'MEDITERRANEAN SEA': 0, 'ANGOLA': 1, 
            'SOUTH PACIFIC OCEAN': 1, 'TANZANIA': 1, 'KOREA': 0, 'JAMAICA': 0, 'ARUBA': 0, 
            'MAYOTTE':1}

In [143]:
sharks['Month'] = None

In [144]:
months_dict = {'Jan': 0, 'Feb': 1, 'Mar': 2, 
               'Apr': 3, 'Ap-': 3, 'May': 4, 
               'Jun': 5, 'Jul': 6, 'Aug': 7, 
               'Sep': 8, 'Oct': 9, 'Nov': 10, 
               'Dec': 11}

def extract_month(d):  
    date = d.replace(' ', '')
    date = date.replace('July', 'Jul')
    date = date.replace('Sept', 'Sep')
    date = date.replace('--', '-')
    date = date.replace('y2', 'y-2')
    date = date.replace('v2', 'v-2')
    month = 0
    if len(date) >= 11 and len(date) <= 12 and date[2] == '-':
        month = date[3:6]
    elif len(date) == 10 and date[1] == '-':
        month = date[2:5]
    elif len(date) == 19 and date[10] == '-':
        month = date[11:14]
    try:
        return months_dict[month]
    except:
        return None
    
sharks["Month"] = sharks["Date"].apply(lambda x: extract_month(x));

def hemisphere(x): 
    try:  
        hem = countries[x]
        return hem
    except:
        return None

sharks["Hemisphere"] = sharks["Country"].apply(lambda x: hemisphere(x));

In [187]:
sharks.head(2)

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species,Hour,Month,Hemisphere
0,25-Jun-2018,2018,Boat,USA,California,"Oceanside, San Diego County",Paddling,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,18.0,5.0,0.0
1,18-Jun-2018,2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,F,11,Minor injury to left thigh,N,14h00 -15h00,,14.0,5.0,0.0


In [146]:
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

In [147]:
south = sharks[sharks.Hemisphere==1].groupby('Month').count().Date.values
north = sharks[sharks.Hemisphere==0].groupby('Month').count().Date.values

In [154]:
trace1 = go.Scatter(
    x=months,
    y=south,
    fill='tonexty',
    mode= 'none', name='Southern Hemisphere'
)
trace2 = go.Scatter(
    x=months,
    y=north,
    fill='tozeroy',
    mode= 'none', name='Northern Hemisphere'
)

data = [trace1, trace2]

layout = go.Layout(barmode='overlay',
        xaxis=dict(title='Month'),
                   
    title='Number of shark attacks by months and hemispheres',
    yaxis=dict(
        title='Count of total shark attacks'
    ),
    bargap=0.1
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "Sharks stacked")

In [149]:
sharks.to_csv('sharks_hem.csv', index=False)
sharks = pd.read_csv('sharks_hem.csv')

## Stacked area plot: Number of sharks by months, type and hemisphere

Another common misconception is that shark attacks increase when sharks are migrating or pupping. For example, shark pupping season along SoCal coast is known to be around April and May and shark migration from SoCal to Central and South America around winter time. 
<br>I wanted to know if it is true that the attacks are increasing due to migration and pupping, so I grouped attacks by months and separated countries into Northern and Southern Hemisphere as the distribution for these two groups would be different by months.

In [155]:
types = [i.lower() for i in sharks.Type.unique()]
types = [i.replace(' ', '_') for i in types]

for i, t in enumerate(types):
    vars()[types[i]+'_s'] = sharks[(sharks.Hemisphere==1) & (sharks.Type == sharks.Type.unique()[i]) & \
                                   (sharks.Year < 2018)]\
    .groupby('Month').count().Date.values

In [156]:
for i, t in enumerate(types):
    vars()[types[i]+'_n'] = sharks[(sharks.Hemisphere==0) & (sharks.Type == sharks.Type.unique()[i]) & \
                                   (sharks.Year < 2018)]\
    .groupby('Month').count().Date.values

In [157]:
types

['boat', 'unprovoked', 'invalid', 'provoked', 'questionable', 'sea_disaster']

In [158]:
trace0 = dict(
    x=months,
    y=invalid_s,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5), marker = dict(color = pink),
    stackgroup='one', name = 'Invalid'
)
trace1 = dict(
    x=months,
    y=questionable_s,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5),marker = dict(color = light_blue),
    stackgroup='one', name="Questionable"
)
trace2 = dict(
    x=months,
    y=sea_disaster_s,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5),marker = dict(color = green),
              
    stackgroup='one', name='Sea Disaster'
)
trace3 = dict(
    x=months,
    y=boat_s,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5),marker = dict(color = dark_blue),
              
    stackgroup='one', name='Boat'
)
trace4 = dict(
    x=months,
    y=provoked_s,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5), marker = dict(color = red),
             
    stackgroup='one', name='Provoked'
)
trace5 = dict(
    x=months,
    y=unprovoked_s,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5), marker = dict(color = orange),
              
    stackgroup='one', name='Unprovoked'
)
data = [trace0, trace1, trace2, trace3, trace4, trace5]

layout = go.Layout(barmode='overlay',
        xaxis=dict(title='Month'),
                   
    title='Number of shark attacks by months in Southern Hemisphere 1900-2017',
    yaxis=dict(
        title='Count of total shark attacks'
    ),
    bargap=0.1
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "Sharks stacked")

In [159]:
trace0 = dict(
    x=months,
    y=invalid_n,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5), marker = dict(color = pink),
    stackgroup='one', name = 'Invalid'
)
trace1 = dict(
    x=months,
    y=questionable_n,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5),marker = dict(color = light_blue),
    stackgroup='one', name="Questionable"
)
trace2 = dict(
    x=months,
    y=sea_disaster_n,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5),marker = dict(color = green),
              
    stackgroup='one', name='Sea Disaster'
)
trace3 = dict(
    x=months,
    y=boat_n,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5),marker = dict(color = dark_blue),
              
    stackgroup='one', name='Boat'
)
trace4 = dict(
    x=months,
    y=provoked_n,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5), marker = dict(color = red),
             
    stackgroup='one', name='Provoked'
)
trace5 = dict(
    x=months,
    y=unprovoked_n,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5), marker = dict(color = orange),
              
    stackgroup='one', name='Unprovoked'
)
data = [trace0, trace1, trace2, trace3, trace4, trace5]

layout = go.Layout(barmode='overlay',
        xaxis=dict(title='Month'),
                   
    title='Number of shark attacks by months in Northern Hemisphere 1900-2017',
    yaxis=dict(
        title='Count of total shark attacks'
    ),
    bargap=0.1
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "Sharks stacked")

You can see that most attacks happen when people are on vacation and enjoying summer months in both Northern and Southern hemispheres and has very less to do with shark migration and pupping seasons. 

<i>Following graphs were generated in Tableau:

## Bubble Map: Fatal shark attacks in 2008-2018

We know that the shark's population is decreasing, thus the patterns in the attacks might be different now than a couple of decades ago. Therefore, in a couple of following graphs, I focused on the shark attacks during the last decade. 
<br>This bubble map shows the number of fatal shark attacks. From this graph, we can clearly see that the attacks are very rare. For example, in the US there were only 9 fatal shark attacks during the last 10 years.

In [161]:
Image(url= "plots/bubble_map.png")

## Chloropleth map: Shark attacks in the US 2008-2018

To dig a bit deeper on the shark attacks in the US during 2008-2018, I plotted state location and the number of attacks on the choropleth map. Here all attacks are included: fatal, non-fatal, and unknown fatality. The number of attacks is pretty low. The attacks happen in places where people tend to enjoy the sea the most: Florida and California.

In [163]:
Image(url= "plots/chloropleth.png")

## Tree map

Let’s take a look, which species of sharks have a higher likelihood to attack humans. Most of the attacks don’t have identified species; thus they are excluded from this analysis. 

In [164]:
Image(url= "plots/tree_map.png")

You can see that from global shark attacks during the last decade, the white shark is leading followed by a bull shark. Interestingly, that third place is when shark involvement is unconfirmed. This is mostly the cases where the people drown and sharks ate the cadaver or there was some other animal attacking such as pinniped.
<br>Just to give you a perspective, about 30-50 dies in the US yearly as a consequence of dog attack. Which is about 300-500 deaths in the US only for 2008-2018. This is more than double than the white shark attack worldwide, which includes non-fatal and unknown outcome.

## Connection map: Migration routes of sharks

Since my data doesn't really have any basis for a connection map, I decided to plot the major migration routes for sharks. These routes are based on my knowledge and internet research and do not represent any scientific point of view. There is a Pacific migration, happening from central California coast to other feeding grounds far away in the Pacific Ocean. Another major migration route is along the East coast, where sharks are migrating from the north to the warmer waters of Florida.

In [165]:
pacific_migration = [go.Scattergeo(
    lat = [28.822418, 38.170194],
    lon = [-158.859361, -123.720130],
    mode = 'lines',
    line = go.scattergeo.Line(
        width = 2,
        color = 'red',
    ),
)]

atlantic_migration = [go.Scattergeo(
    lat = [25.869109, 44.873876],
    lon = [-78.021723, -54.650979],
    mode = 'lines',
    line = go.scattergeo.Line(
        width = 2,
        color = 'red',
    ),
)]
layout = go.Layout(
    title = go.layout.Title(
        text = 'Approximate shark migration routes in Pacific and Atlantic side of the US'
    ),
    showlegend = False,
    geo = go.layout.Geo(
        resolution = 50,
        showland = True,
        showlakes = True,
        landcolor = 'rgb(102, 153, 204)',
        countrycolor = 'rgb(102, 153, 204)',
        lakecolor = 'rgb(255, 255, 255)',
        projection = go.layout.geo.Projection(
            type = "equirectangular"
        ),
        coastlinewidth = 2,
        lataxis = go.layout.geo.Lataxis(
            range = [20, 60],
            showgrid = True,
            dtick = 10
        ),
        lonaxis = go.layout.geo.Lonaxis(
            range = [-100, 20],
            showgrid = True,
            dtick = 20
        ),
    )
)
fig = go.Figure(data = pacific_migration+atlantic_migration, layout = layout)
iplot(fig, filename = "Sharks stacked")

## Core Story: How dangerous are sharks really?

While I had various interesting plots, I would like to focus on a stacked area graph, which supports my insight the most.  I am showing the shark attack count by month and fatality. Additionally, the fatality numbers are broken down by activity: surfing and other activity. To account for the difference in seasons, I grouped countries into Northern and Southern hemispheres. For the time period, I decided to explore a period of 10 years. But first, <br>I would like to start with supporting chart, bar plot by hemisphere, which shows that there are far more shark attacks in Northern hemisphere than in Southern, although deadly attacks are twice as likely in the Southern hemisphere than in the Northern.


In [166]:
sharks['Fatal (Y/N)'].unique()

array(['N', 'Y', 'UNKNOWN'], dtype=object)

In [167]:
south_u = sharks[(sharks.Hemisphere==1) & (sharks.Year.isin(list(range(2007, 2017, 1)))) 
               & (sharks['Fatal (Y/N)']=='UNKNOWN')].count()[0]
south_n = sharks[(sharks.Hemisphere==1) & (sharks.Year.isin(list(range(2007, 2017, 1)))) 
               & (sharks['Fatal (Y/N)']=='N')].count()[0]
south_y = sharks[(sharks.Hemisphere==1) & (sharks.Year.isin(list(range(2007, 2017, 1)))) 
               & (sharks['Fatal (Y/N)']=='Y')].count()[0]
north_u = sharks[(sharks.Hemisphere==0) & (sharks.Year.isin(list(range(2007, 2017, 1)))) 
               & (sharks['Fatal (Y/N)']=='UNKNOWN')].count()[0]
north_n = sharks[(sharks.Hemisphere==0) & (sharks.Year.isin(list(range(2007, 2017, 1)))) 
               & (sharks['Fatal (Y/N)']=='N')].count()[0]
north_y = sharks[(sharks.Hemisphere==0) & (sharks.Year.isin(list(range(2007, 2017, 1)))) 
               & (sharks['Fatal (Y/N)']=='Y')].count()[0]

In [168]:
percentage_nonfatal = (south_n+north_n)/(south_n +south_u+ south_y+north_n+north_u+north_y)*100
f'Percentage of Non-Fatal shark attacks {round(percentage_nonfatal, 2)}%'

'Percentage of Non-Fatal shark attacks 84.36%'

In [169]:
trace1 = go.Bar(
    x=['Southern', 'Northern'],
    y=[south_u, north_u], marker = dict(color=light_blue),
    name='Unknown'
)
trace2 = go.Bar(
    x=['Southern', 'Northern'],
    y=[south_y, north_y],
    name='Fatal', marker = dict(color=pink)
)

trace3 = go.Bar(
    x=['Southern', 'Northern'],
    y=[south_n, north_n],
    name='Non-Fatal', marker = dict(color=green)
)
data = [trace1, trace2, trace3]
layout = go.Layout(
    barmode='stack', title = 'Shark attacks by hepisphere 2007-2017', xaxis=dict(title='Hemisphere'),
    yaxis=dict(
        title='Count of total shark attacks'
    ),
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "Sharks stacked")


From the two stacked plots below, you can see that generally, fatal shark attacks are very rare, especially in the Northern Hemisphere, although the overall there are more attacks in the Northern Hemisphere.
<br>The fatality rate is higher in the Southern Hemisphere, the number of attacks is higher in total for Northern Hemisphere.  Nearly half of the attacks happen while people are surfing, the situation where the shark mistake surfers for their favorite prey - seals.

In [170]:
fatality = [i.lower() for i in sharks['Fatal (Y/N)'].unique()]
fatality 

['n', 'y', 'unknown']

In [171]:
fatal_surfing = sharks[(sharks.Hemisphere==1) & (sharks['Fatal (Y/N)'] == 'Y') & (sharks.Activity == 'Surfing') & \
                                   (sharks.Year.isin(list(range(2007, 2017, 1))))].groupby('Month').count().Date.values
fatal_surfing


array([1, 1, 2, 1, 1, 2, 3, 1, 1])

In [174]:
fatal_surfing = sharks[(sharks.Hemisphere==1) & (sharks['Fatal (Y/N)'] == 'Y') & (sharks.Activity == 'Surfing') & \
                                   (sharks.Year.isin(list(range(2007, 2017, 1))))].groupby('Month').count().Date.values

fatal_not_surfing = sharks[(sharks.Hemisphere==1) & (sharks['Fatal (Y/N)'] == 'Y') &(sharks.Activity != 'Surfing') & \
                                   (sharks.Year.isin(list(range(2007, 2017, 1))))]\
    .groupby('Month').count().Date.values
    
nonfatal_surfing = sharks[(sharks.Hemisphere==1) & (sharks['Fatal (Y/N)'] == 'N') &(sharks.Activity == 'Surfing') & \
                                   (sharks.Year.isin(list(range(2007, 2017, 1))))]\
    .groupby('Month').count().Date.values
nonfatal_not_surfing = sharks[(sharks.Hemisphere==1) & (sharks['Fatal (Y/N)'] == 'N') &(sharks.Activity == 'Surfing') & \
                                   (sharks.Year.isin(list(range(2007, 2017, 1))))]\
    .groupby('Month').count().Date.values

In [180]:
fatality

['n', 'y', 'unknown']

In [183]:
for i, t in enumerate(fatality):
    vars()[fatality[i]+'_s'] = sharks[(sharks.Hemisphere==1) & (sharks['Fatal (Y/N)'] == sharks['Fatal (Y/N)'].unique()[i]) & \
                                   (sharks.Year.isin(list(range(2007, 2017, 1))))]\
    .groupby('Month').count().Date.values

In [182]:
fatal_surfing = sharks[(sharks.Hemisphere==1) & (sharks['Fatal (Y/N)'] == 'Y') & (sharks.Activity == 'Surfing') & \
                                   (sharks.Year.isin(list(range(2007, 2017, 1))))].groupby('Month').count().Date.values

fatal_not_surfing = sharks[(sharks.Hemisphere==1) & (sharks['Fatal (Y/N)'] == 'Y') &(sharks.Activity != 'Surfing') & \
                                   (sharks.Year.isin(list(range(2007, 2017, 1))))]\
    .groupby('Month').count().Date.values
    
nonfatal_surfing = sharks[(sharks.Hemisphere==1) & (sharks['Fatal (Y/N)'] == 'N') &(sharks.Activity == 'Surfing') & \
                                   (sharks.Year.isin(list(range(2007, 2017, 1))))]\
    .groupby('Month').count().Date.values
nonfatal_not_surfing = sharks[(sharks.Hemisphere==1) & (sharks['Fatal (Y/N)'] == 'N') &(sharks.Activity == 'Surfing') & \
                                   (sharks.Year.isin(list(range(2007, 2017, 1))))]\
    .groupby('Month').count().Date.values

trace0 = dict(
    x=months,
    y=unknown_s,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5), marker = dict(color = dark_blue),
              
    stackgroup='one', name = 'Unknown'
)
trace1 = dict(
    x=months,
    y=nonfatal_surfing,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5),  marker = dict(color = light_blue),
    stackgroup='one', name="Non-Fatal Surfing"
)
trace2 = dict(
    x=months,
    y=nonfatal_not_surfing,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5),  marker = dict(color = green),
              
    stackgroup='one', name='Non-Fatal Other'
)

trace3 = dict(
    x=months,
    y=fatal_surfing,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5),  marker = dict(color = red),
    stackgroup='one', name="Fatal Surfing"
)
trace4 = dict(
    x=months,
    y=fatal_not_surfing,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5),  marker = dict(color = orange),
              
    stackgroup='one', name='Fatal Other'
)

data = [trace0, trace1, trace2, trace3, trace4]

layout = go.Layout(barmode='overlay',
        xaxis=dict(title='Month'),
                   
    title='Number of shark attacks by months in Southern Hemisphere 2007-2017',
    yaxis=dict(
        title='Count of total shark attacks'
    ),
    bargap=0.1
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "Sharks stacked")

In [184]:
for i, t in enumerate(fatality):
    vars()[fatality[i]+'_n'] = sharks[(sharks.Hemisphere==0) & (sharks['Fatal (Y/N)'] == sharks['Fatal (Y/N)'].unique()[i]) & \
                                   (sharks.Year.isin(list(range(2007, 2017, 1))))]\
    .groupby('Month').count().Date.values
    
fatal_surfing = sharks[(sharks.Hemisphere==0) & (sharks['Fatal (Y/N)'] == 'Y') & (sharks.Activity == 'Surfing') & \
                                   (sharks.Year.isin(list(range(2007, 2017, 1))))].groupby('Month').count().Date.values

fatal_not_surfing = sharks[(sharks.Hemisphere==0) & (sharks['Fatal (Y/N)'] == 'Y') &(sharks.Activity != 'Surfing') & \
                                   (sharks.Year.isin(list(range(2007, 2017, 1))))]\
    .groupby('Month').count().Date.values
    
nonfatal_surfing = sharks[(sharks.Hemisphere==0) & (sharks['Fatal (Y/N)'] == 'N') &(sharks.Activity == 'Surfing') & \
                                   (sharks.Year.isin(list(range(2007, 2017, 1))))]\
    .groupby('Month').count().Date.values
nonfatal_not_surfing = sharks[(sharks.Hemisphere==0) & (sharks['Fatal (Y/N)'] == 'N') &(sharks.Activity == 'Surfing') & \
                                   (sharks.Year.isin(list(range(2007, 2017, 1))))]\
    .groupby('Month').count().Date.values

trace0 = dict(
    x=months,
    y=unknown_n,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5), marker = dict(color = dark_blue),
              
    stackgroup='one', name = 'Unknown'
)
trace1 = dict(
    x=months,
    y=nonfatal_surfing,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5),  marker = dict(color = light_blue),
    stackgroup='one', name="Non-Fatal Surfing"
)
trace2 = dict(
    x=months,
    y=nonfatal_not_surfing,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5),  marker = dict(color = green),
              
    stackgroup='one', name='Non-Fatal Other'
)

trace3 = dict(
    x=months,
    y=fatal_surfing,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5),  marker = dict(color = red),
    stackgroup='one', name="Fatal Surfing"
)
trace4 = dict(
    x=months,
    y=fatal_not_surfing,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5),  marker = dict(color = orange),
              
    stackgroup='one', name='Fatal Other'
)

data = [trace0, trace1, trace2, trace3, trace4]

layout = go.Layout(barmode='overlay',
        xaxis=dict(title='Month'),
                   
    title='Number of shark attacks by months in Northern Hemisphere 2007-2017',
    yaxis=dict(
        title='Count of total shark attacks'
    ),
    bargap=0.1
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "Sharks stacked")

# Summary

While a shark attack is a frightening event, we need to understand that it is very rare and happens by mistake. Most of the shark accidents occur when the shark feels threatened or confused by either the presence of many dead fishes, murky water or mistakes people for food. 
<br>Surfers, swimmers, and fishers in the USA and Australia are the most common case for the shark attacks. Though in 84.4 % the attack is not fatal.
<br>Many of the attacks are very minor cases recorded such as when shark bumps the boat. There are also many cases where shark attack is not confirmed, either the attack is by other animal or the shark attack happened post-mortem in case of drowning.
<br>I hope this data and my visualizations could convince you that sharks are not the killer machines waiting for you to enter the waters. But instead, curious animals which also needs to be treated with respect.

**Sources**
<br>https://www.projectaware.org/sharks
<br>https://www.sharkwater.com/
<br>https://en.m.wikipedia.org/wiki/Fatal_dog_attacks_in_the_United_States
<br>https://www.dailymail.co.uk/news/Researchers-examine-sharks-mistake-surfers-intended-prey

Further exploration:
There are many quationable incidents recorded. They have comments like "Questionable incident - shark bite may have precipitated drowning', or 'Questionable incident; reported as shark attack but thought to involve a pinniped instead". Also some of the reported accidents involves basking shark, which doesn't even have teeth. Another category, I noticed has comment 'no injury'. I would like to investigate and approximate such cases for the last 10 year and isolate the real dangerous numbers.
<br> Also I would like to check scuba attacks in California attacks