In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import re 

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

from IPython.display import Image
from IPython.core.display import HTML
from IPython.display import HTML

import warnings
# Suppress every Python warning so that library notices do not clutter the rendered notebook.
warnings.filterwarnings("ignore")

# Prepare plotly for offline use inside the notebook so that iplot() can render figures inline.
init_notebook_mode(connected=True)

In [294]:
# Render a "toggle code" button: the embedded script hides or shows every `div.input`
# element and is also run once when the document is ready (so the code starts out hidden).
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" 
value="Click here to toggle on/off the raw code."></form>''')

# How dangerous are sharks? Is all the fear people have justified?


<font size='4'>Most people are afraid of sharks, and there's a misconception that if you are in the water with
them you are in deep trouble. The idea that sharks are killing machines preying on humans is a
myth that I learned to overcome after I became an avid scuba diver, but I've only had two
encounters with sharks so far. I’m often diving in the murky waters of Northern California, which
is famous for its large population of white sharks, so I wanted to learn more about the attacks
and real risks.
</font>

<img src="plots/great_white.png" width="700" height="200" align="center"/>

### <font color=grey>Analyzing shark attacks data from <a href="http://www.sharkattackfile.net/">Global Shark Attack File</a></font>

#### Dataset

I am using the shark attack incidents dataset from Kaggle: 
https://www.kaggle.com/teajay/global-shark-attacks. This data was compiled by the Global Shark Attack File http://www.sharkattackfile.net/. The data was cleaned and preprocessed in a [separate Jupyter Notebook](https://github.com/katjawittfoth/Data_Viz/blob/master/Notebooks/Data%20Cleaning%20Shark%20Attacks.ipynb).

In [126]:
# Load the cleaned incident table; the path is relative to the notebook's working directory.
sharks = pd.read_csv('shark_attack_cleaned.csv')

In [4]:
# Preview the first five rows to see which columns the rest of the notebook works with.
sharks.head(5)

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species,Hour,Month,Hemisphere
0,25-Jun-2018,2018,Boat,USA,California,"Oceanside, San Diego County",Paddling,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,18.0,5.0,0.0
1,18-Jun-2018,2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,14.0,5.0,0.0
2,09-Jun-2018,2018,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,,7.0,5.0,0.0
3,08-Jun-2018,2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,M,,Minor injury to lower leg,N,,2 m shark,,5.0,1.0
4,04-Jun-2018,2018,Provoked,MEXICO,Colima,La Ticla,Free diving,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,Tiger shark,,5.0,0.0


In [5]:
# Shared colour palette for every plot in this notebook (CSS-style rgb strings).
light_blue, dark_blue = 'rgb(142, 212, 229)', 'rgb(19, 77, 102)'
green, pink = 'rgb(199, 204, 118)', 'rgb(254, 207, 173)'
orange, red = 'rgb(253, 174, 97)', 'rgb(253, 107, 97)'

## What is the trend in shark attacks? Is the number of shark attacks decreasing or increasing?

Let’s take a look at the development of shark attacks from 1900 to 2017.

In [6]:
def _yearly_counts(fatal_flag, label):
    """Count incidents per year for one fatality code.

    Parameters
    ----------
    fatal_flag : str
        Value of the 'Fatal (Y/N)' column to keep ('Y', 'N' or 'UNKNOWN').
    label : str
        Name given to the resulting count column.

    Returns
    -------
    pandas.DataFrame
        A single column named `label`, indexed by year (labels converted to str),
        holding the number of non-null 'Date' entries recorded for that year.
    """
    subset = sharks[sharks['Fatal (Y/N)'] == fatal_flag]
    counts = subset.groupby('Year')['Date'].count().to_frame(name=label)
    return counts.rename(index=str)


fatal_attack = _yearly_counts('Y', 'Fatal')
nonfatal_attack = _yearly_counts('N', 'Non-Fatal')
fatality_unknown = _yearly_counts('UNKNOWN', 'Unknown')

# The three frames do not necessarily cover the same years, and pd.concat does not promise
# a sorted row order for non-aligned indexes. Sort explicitly so that the row dropped by
# iloc[:-1] really is the most recent year (presumably left out because it is incomplete).
scatter = (
    pd.concat([fatal_attack, nonfatal_attack, fatality_unknown], axis=1)
    .sort_index()
    .iloc[:-1, :]
)

In [7]:
#scatter.tail(2)

In [8]:
#scatter.Fatal.max()

In [9]:
def _marker_trace(column, legend_name, fill_color, outline_color=None):
    """Build a marker-only scatter trace of yearly counts taken from `scatter`.

    Parameters
    ----------
    column : str
        Column of `scatter` to plot on the y axis.
    legend_name : str
        Label shown in the legend.
    fill_color : str
        Marker fill colour.
    outline_color : str, optional
        Marker outline colour; when omitted plotly's default outline colour is used.
    """
    outline = dict(width=1)
    if outline_color is not None:
        outline['color'] = outline_color
    return go.Scatter(
        x=scatter.index,
        y=scatter[column],
        name=legend_name,
        mode='markers',
        marker=dict(size=10, line=outline, color=fill_color))


fatal = _marker_trace('Fatal', 'Fatal', pink, outline_color='rgb(0, 0, 0)')
nonfatal = _marker_trace('Non-Fatal', 'Non-fatal', green)
fatality_na = _marker_trace('Unknown', 'Unknown', light_blue)

data = [nonfatal, fatal, fatality_na]

# Axis title typo ("attachs") corrected.
layout = dict(title = 'Shark attacks worldwide 1900-2017',
              yaxis = dict(zeroline = False, title='Count of total shark attacks'),
              xaxis = dict(zeroline = False, title='Year'))

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "Shark attacks scatter plot")

#plotly.offline.plot(data, filename='test.html') # creates html
# plotly.offline.plot(data, include_plotlyjs=False, output_type='div') # for jinja2 embeding

Generally, shark attacks are increasing, though fatal attacks have stayed at about the same level
throughout the century, averaging about 9.2 fatal attacks per year. This
number doesn’t change much by decade either.
<br>
<br>If you look at the shark attacks development, especially non-fatal ones, you can see a peak
around 1960. This peak happens in the USA and Australia, driven by surfing starting to get
popular in the 60s.

In [10]:
print('Shark attacks in 1958, top 5 countries:')
# Incidents per country for that year, largest first; show the five biggest.
attacks_1958 = sharks[sharks['Year'] == 1958]
attacks_1958.groupby('Country').size().sort_values(ascending=False).head(5)

Shark attacks in 1958, top 5 countries:


Country
USA                 16
PAPUA NEW GUINEA     9
SOUTH AFRICA         8
AUSTRALIA            5
BAHAMAS              3
dtype: int64

In [11]:
print('Shark attacks in 1960, top 5 countries:')
# Incidents per country for that year, largest first; show the five biggest.
attacks_1960 = sharks[sharks['Year'] == 1960]
attacks_1960.groupby('Country').size().sort_values(ascending=False).head(5)

Shark attacks in 1960, top 5 countries:


Country
USA                 25
AUSTRALIA           21
PAPUA NEW GUINEA    10
SOUTH AFRICA         9
IRAQ                 3
dtype: int64

<i>Disclaimer: all following plots use the data from 1900 to 2018 unless otherwise commented.</i>

## Do sharks attack at dawn and dusk?

Sharks hunt at dawn and dusk to benefit from darkness by using not only their highly evolved sense of smell but also detecting electricity and vibrations in the water. If sharks prey on humans, we should see a peak of attacks around dusk and dawn.

In [12]:
# Split the incidents into three subsets by their 'Fatal (Y/N)' code.
_fatality_code = sharks['Fatal (Y/N)']
time_nonfatal = sharks.loc[_fatality_code == 'N']
time_fatal = sharks.loc[_fatality_code == 'Y']
time_na_fatality = sharks.loc[_fatality_code == 'UNKNOWN']

In [13]:
# NOTE(review): `hour` is not used by this chart; kept in case another cell reads it.
hour = list(range(0,25,1))
hourtext = ['midnight', '1 am', '2 am', '3 am', '4 am', '5 am', '6 am', '7 am', '8 am', '9 am', '10 am', '11 am',
           'noon', '1 pm', '2 pm', '3 pm', '4 pm', '5 pm', '6 pm', '7 pm', '8 pm', '9 pm', '10 pm', '11 pm']

# One histogram of the 'Hour' column per fatality subset, drawn on top of each other.
trace1 = go.Histogram(
    x=time_nonfatal['Hour'],
    opacity=1, name = "Non-fatal", marker=dict(color=green)
)
trace2 = go.Histogram(
    x=time_fatal['Hour'],
    opacity=1, name = "Fatal", marker=dict(color=pink)
)
trace3 = go.Histogram(
    x=time_na_fatality['Hour'],
    opacity=1, name = "Unknown", marker=dict(color=light_blue)
)

data = [trace1, trace2, trace3]

layout = go.Layout(
    barmode='overlay',
    xaxis=dict(
        title='Time of the day',
        # One tick per label (0..23). The previous range(0, 25) produced a 25th tick
        # for which `hourtext` has no label.
        tickvals=list(range(len(hourtext))),
        ticktext=hourtext,
        tickangle=-45),
    title='Number of shark attacks by hour',
    yaxis=dict(
        title='Count of total shark attacks'
    ),
    bargap=0.1
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "Sharks histogram")

Not true! Obviously, humans are not part of sharks’ diet and sharks are not targeting humans, so the attacks happen when more people are in the water, between around 10 am and 5 pm.

## Number of shark attacks by type

The myth of the shark attack hunting people is scary. Let’s look at the data to find out how sharks attack.
<br>
<br>Most of the time, the attacks are unprovoked. However, I believe the attacks happen when a shark mistakes people for its prey, as we will see in later charts. Seen from below, a surfer sitting on a surfboard waiting for a wave looks and behaves like an injured seal or sea lion.

In [14]:
#sharks.Type.unique()

In [15]:
def _type_counts(fatal_flag, column_name):
    """Return a one-column frame counting incidents per attack type for one fatality code.

    The count is the number of non-null 'Date' entries per 'Type'; index labels are
    converted to str and the single column is named `column_name`.
    """
    matching = sharks.loc[sharks['Fatal (Y/N)'] == fatal_flag]
    counts = matching.groupby('Type')['Date'].count().to_frame(name=column_name)
    return counts.rename(index=str)


fatal_attack = _type_counts('Y', 'Fatal')
nonfatal_attack = _type_counts('N', 'Non-Fatal')
fatality_unknown = _type_counts('UNKNOWN', 'Unknown')

In [16]:
# Put the three per-type counts side by side, then order rows by the non-fatal count (ascending).
_type_table = pd.concat([fatal_attack, nonfatal_attack, fatality_unknown], axis=1)
attacks_type = _type_table.sort_values(by='Non-Fatal')

In [17]:
# Settings shared by all three horizontal bar traces: one bar per attack type.
_horizontal = dict(y=attacks_type.index, orientation='h')

trace1 = go.Bar(x=attacks_type['Unknown'], name='Unknown',
                marker=dict(color=light_blue), **_horizontal)
trace2 = go.Bar(x=attacks_type['Non-Fatal'], name='Non-Fatal',
                marker=dict(color=green), **_horizontal)
trace3 = go.Bar(x=attacks_type['Fatal'], name='Fatal',
                marker=dict(color=pink), **_horizontal)

data = [trace1, trace2, trace3]

# Stack the three outcome counts within each type's bar.
layout = go.Layout(
    barmode='stack',
    title='Number of shark attacks by type',
    xaxis=dict(title='Number of attacks'),
    yaxis=dict(title='Type'),
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename="Sharks barplot")

We can see that about ⅓ of all attacks are either provoked, involve a boat or a sea disaster, or are of Invalid/Unknown type. Boat attacks are usually cases where the shark bumps the boat. In Sea Disaster cases it is usually suspected that the shark was feeding on the bodies of people who had already drowned. Thus, we can sum up that only about ⅔ of the cases are unprovoked attacks.

## Age distribution of shark attack victims

Let’s take a look at the age distribution of shark attack victims.

In [18]:
# Victim ages as plain arrays, one per recorded sex.
male = sharks.loc[sharks['Sex'] == 'M', 'Age'].values
female = sharks.loc[sharks['Sex'] == 'F', 'Age'].values

In [19]:
# One horizontal box per sex: (ages, legend label, colour).
_box_specs = [(male, 'Male', dark_blue), (female, 'Female', pink)]
trace0, trace1 = [
    go.Box(x=ages, name=label, marker=dict(color=shade))
    for ages, label, shade in _box_specs
]
data = [trace0, trace1]

layout = go.Layout(
    title='Age distribution of victim at shark attack',
    xaxis=dict(title='Age'),
    yaxis=dict(title='Gender', zeroline=False),
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename="Sharks barplot")

It seems that the age distribution of female victims is wider than that of male victims, and the median age is a bit lower. Overall, I think this chart reflects that people tend to be active in water sports earlier in their lives.

## Which water activity is the most dangerous and where?

We have seen a peak in the shark attacks during the ’60s and I assumed that it strongly correlates with the increasing popularity of surfing. Let’s analyze if it is true and what other activities people are engaged in when they are attacked by a shark.

In [20]:
# The ten activities with the most recorded incidents (ranked by non-null 'Date' count).
_activity_counts = sharks.groupby('Activity').count()
_activity_ranked = _activity_counts.sort_values('Date', ascending=False)
top_activities = list(_activity_ranked.index[:10])

In [21]:
# The fifteen countries with the most recorded incidents (ranked by non-null 'Date' count).
_country_counts = sharks.groupby('Country').count()
_country_ranked = _country_counts.sort_values('Date', ascending=False)
top_countries = list(_country_ranked.index[:15])

In [22]:
# Keep only incidents whose activity AND country are both in the top lists.
_in_top = sharks.Activity.isin(top_activities) & sharks.Country.isin(top_countries)

# Build the helper frame in one step. `.assign` returns a new frame, so no column is
# written into a filtered slice of `sharks` (the chained assignment the original relied on).
heatmap = sharks.loc[_in_top, ['Country', 'Activity']].assign(Count=1)

# Number of incidents per (country, activity) pair.
a = heatmap.groupby(['Country', 'Activity'], group_keys=False).sum()

# Lookup table keyed by (country, activity) tuples, used to fill the heatmap matrix.
heatmap_dict = a['Count'].to_dict()

In [23]:
#heatmap_dict[('USA', 'Surfing')]

In [24]:
# Heatmap matrix: one row per activity, one column per country; pairs that never
# occur in heatmap_dict get a count of 0.
z = [
    [heatmap_dict.get((str(country), str(act)), 0) for country in top_countries]
    for act in top_activities
]

In [25]:
# Colour stops are packed into the lowest 20% of the range so that small counts are
# still distinguishable next to the largest cells.
_heat_colors = [
    [0.0, 'rgb(199,204,118)'],
    [0.035, 'rgb(69,117,180)'],
    [0.06, 'rgb(116,173,209)'],
    [0.085, 'rgb(171,217,233)'],
    [0.11, 'rgb(224,243,248)'],
    [0.135, 'rgb(254,224,144)'],
    [0.16, 'rgb(254,207,173)'],
    [0.185, 'rgb(244,109,67)'],
    [0.2, 'rgb(215,48,39)'],
    [1.0, 'rgb(165,0,38)'],
]

trace = go.Heatmap(
    z=z,
    x=top_countries,
    y=top_activities,
    xgap=5,
    ygap=5,
    colorscale=_heat_colors,
)
data = [trace]

_x_axis = dict(ticks='', nticks=20, title='Country', tickmode='linear')
_y_axis = dict(ticks='', tickprefix="", side='left', position=0.0,
               title='Activity', tickangle=0, tickfont=dict(size=10))
layout = go.Layout(
    title='Number of Shark attacks by Activity and Country',
    xaxis=_x_axis,
    yaxis=_y_axis,
)

fig = go.Figure(data=data, layout=layout)
# Reverse the y axis so the first activity in the list is drawn at the top.
fig.layout.yaxis.autorange = "reversed"
iplot(fig, filename="Sharks heatmap")

It seems that surfing in the USA and Australia is by far the most dangerous activity. 
Sharks have been known to attack humans when they are confused or curious. Sharks often mistake surfers or a human splashing in the water for prey. They get curious and may try to investigate.
<br>
<br>
Here is the picture of how surfer on the board looks similar to shark’s prey:

<img src="plots/sealion_vs_surfer2.png" align="center"/>

## When do sharks attack? Which time of the year?

Another common misconception is that shark attacks increase when sharks are migrating or
pupping. For example, shark pupping season along SoCal coast is known to be around April
and May and shark migration from SoCal to Central and South America around winter time.
<br>
<br>I wanted to know if it is true that the attacks are increasing due to migration and pupping, so I
grouped attacks by months and separated countries into Northern and Southern Hemisphere as
the distribution for these two groups would be different by months.

In [27]:
# Month labels for the x axis, January first.
months = 'Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec'.split()

In [28]:
# Every month value present in the data, ascending. Reindexing each count series onto this
# axis guarantees one entry per month (0 where nothing was recorded), so the arrays stay
# aligned with the month labels even if a month is missing for one hemisphere.
_month_axis = sorted(sharks['Month'].dropna().unique())


def _monthly_counts(hemisphere_code):
    """Return incidents per month (non-null 'Date' count) for one Hemisphere code."""
    subset = sharks[sharks.Hemisphere == hemisphere_code]
    per_month = subset.groupby('Month')['Date'].count()
    return per_month.reindex(_month_axis, fill_value=0).values


south = _monthly_counts(1)
north = _monthly_counts(0)

In [29]:
# Both series are drawn as filled areas without lines or markers.
_area_style = dict(x=months, mode='none')

trace1 = go.Scatter(y=south, fill='tonexty', name='Southern Hemisphere', **_area_style)
trace2 = go.Scatter(y=north, fill='tozeroy', name='Northern Hemisphere', **_area_style)

data = [trace1, trace2]

layout = go.Layout(
    barmode='overlay',
    bargap=0.1,
    title='Number of shark attacks by month and hemisphere',
    xaxis=dict(title='Month'),
    yaxis=dict(title='Count of total shark attacks'),
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename="Sharks stacked")

## Let's add type of the attack to the chart above

In [30]:
# Distinct attack types; each one becomes a variable named after the type in snake_case
# plus a hemisphere suffix (e.g. 'Sea Disaster' -> sea_disaster_s).
_type_labels = sharks.Type.unique()
types = [label.lower().replace(' ', '_') for label in _type_labels]

# Every month value present in the data, ascending. Reindexing onto this axis keeps each
# array aligned with `months`: without it, a type with no incidents in some month yields
# a shorter array and its counts are plotted against the wrong months.
_month_axis = sorted(sharks['Month'].dropna().unique())

for _var_name, _label in zip(types, _type_labels):
    _southern = sharks[(sharks.Hemisphere == 1) & (sharks.Type == _label) & (sharks.Year < 2018)]
    _per_month = _southern.groupby('Month')['Date'].count().reindex(_month_axis, fill_value=0)
    # At cell (module) level vars() is the global namespace, so this defines <type>_s.
    vars()[_var_name + '_s'] = _per_month.values

In [31]:
# `types` holds the snake_case names in the same order as sharks.Type.unique().
_type_labels = sharks.Type.unique()

# Every month value present in the data, ascending. Reindexing onto this axis keeps each
# array aligned with `months`: without it, a type with no incidents in some month yields
# a shorter array and its counts are plotted against the wrong months.
_month_axis = sorted(sharks['Month'].dropna().unique())

for i, _var_name in enumerate(types):
    _northern = sharks[(sharks.Hemisphere == 0) & (sharks.Type == _type_labels[i]) & (sharks.Year < 2018)]
    _per_month = _northern.groupby('Month')['Date'].count().reindex(_month_axis, fill_value=0)
    # At cell (module) level vars() is the global namespace, so this defines <type>_n.
    vars()[_var_name + '_n'] = _per_month.values

In [32]:
#types

In [201]:
# (monthly counts, colour, legend label) for each attack type, in stacking order.
_north_layers = [
    (invalid_n, pink, 'Invalid'),
    (questionable_n, light_blue, "Questionable"),
    (sea_disaster_n, green, 'Sea Disaster'),
    (boat_n, dark_blue, 'Boat'),
    (provoked_n, red, 'Provoked'),
    (unprovoked_n, orange, 'Unprovoked'),
]

# All traces share one stackgroup, so each is stacked on top of the previous ones.
data = [
    dict(
        x=months,
        y=counts,
        hoverinfo='x+y',
        mode='lines',
        line=dict(width=0.5),
        marker=dict(color=shade),
        stackgroup='one',
        name=label,
    )
    for counts, shade, label in _north_layers
]
trace0, trace1, trace2, trace3, trace4, trace5 = data

layout = go.Layout(
    barmode='overlay',
    bargap=0.1,
    title='Number of shark attacks by months in Northern Hemisphere 1900-2017',
    xaxis=dict(title='Month'),
    yaxis=dict(title='Count of total shark attacks'),
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename="Sharks stacked")

In [202]:
# (monthly counts, colour, legend label) for each attack type, in stacking order.
_south_layers = [
    (invalid_s, pink, 'Invalid'),
    (questionable_s, light_blue, "Questionable"),
    (sea_disaster_s, green, 'Sea Disaster'),
    (boat_s, dark_blue, 'Boat'),
    (provoked_s, red, 'Provoked'),
    (unprovoked_s, orange, 'Unprovoked'),
]

# All traces share one stackgroup, so each is stacked on top of the previous ones.
data = [
    dict(
        x=months,
        y=counts,
        hoverinfo='x+y',
        mode='lines',
        line=dict(width=0.5),
        marker=dict(color=shade),
        stackgroup='one',
        name=label,
    )
    for counts, shade, label in _south_layers
]
trace0, trace1, trace2, trace3, trace4, trace5 = data

layout = go.Layout(
    barmode='overlay',
    bargap=0.1,
    title='Number of shark attacks by months in Southern Hemisphere 1900-2017',
    xaxis=dict(title='Month'),
    yaxis=dict(title='Count of total shark attacks'),
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename="Sharks stacked")

You can see that most attacks happen when people are on vacation and enjoying the summer months in both the Northern and Southern Hemispheres, and have very little to do with shark migration and pupping seasons.

## Global fatal shark attacks in 2008-2018

We know that the shark's population is decreasing, thus the patterns in the attacks might be different now than a couple of decades ago. Therefore, in a couple of following graphs, I focused on the shark attacks during the last decade. 
<br>This choropleth map shows the number of fatal shark attacks. We can clearly see that the attacks are very rare. For example, in the US there were only 9 fatal shark attacks during the last 10 years.

In [244]:
 # Colour stops for the world map, from the lowest to the highest count.
colorscale = [
    [0.0, 'rgb(199,204,118)'],
    [0.05, 'rgb(69,117,180)'],
    [0.1, 'rgb(116,173,209)'],
    [0.2, 'rgb(171,217,233)'],
    [0.3, 'rgb(224,243,248)'],
    [0.4, 'rgb(254,224,144)'],
    [0.5, 'rgb(254,207,173)'],
    [0.6, 'rgb(244,109,67)'],
    [0.7, 'rgb(215,48,39)'],
    [1.0, 'rgb(165,0,38)'],
]

# Fatal incidents per country, computed once and reused for both locations and values.
# NOTE(review): range(2008, 2018) stops at 2017, while the title says 2008-2018 - confirm
# which period is intended.
_recent_fatal = sharks[(sharks.Year.isin(list(range(2008, 2018, 1)))) & (sharks['Fatal (Y/N)'] == 'Y')]
# size() counts rows; the previous count().Area only counted rows with a non-null Area,
# so fatal attacks without a recorded area were left out of the totals.
_fatal_by_country = _recent_fatal.groupby('Country').size()

data = [dict(
    type = 'choropleth',
    locations = _fatal_by_country.index,
    locationmode = 'country names',
    colorscale = colorscale,
    z = _fatal_by_country)]
# Title typo ("wordwide") corrected.
layout = go.Layout(
    title='Fatal shark attacks worldwide in 2008-2018')

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "Sharks chloropleth world")

## All shark attacks in the US 2008-2018

To dig a bit deeper on the shark attacks in the US during 2008-2018, I plotted state location and the number of attacks on the choropleth map. Here all attacks are included: fatal, non-fatal, and unknown fatality. The number of attacks is pretty low. The attacks happen in places where people tend to enjoy the sea the most: Florida, Hawaii, and California.

In [38]:
# Count incidents at Palmyra Atoll under Hawaii. Only the Area cell is rewritten:
# assigning through sharks[mask] = 'Hawaii' overwrote EVERY column of the matching rows
# (Year, Country, ...), corrupting them for all later cells.
sharks.loc[sharks.Area == 'Palmyra Atoll', 'Area'] = 'Hawaii'

# US incidents from 2008 through 2017 (range() excludes 2018), keeping only the area and
# the outcome. Columns are selected by name rather than by position.
_us_recent = sharks.Year.isin(list(range(2008, 2018, 1))) & (sharks.Country == 'USA')
df = sharks.loc[_us_recent, ['Area', 'Fatal (Y/N)']]

In [39]:
# Number of incidents for each (area, outcome) pair; reset_index leaves the counts in a column named 0.
_pair_sizes = df.groupby(['Area', 'Fatal (Y/N)']).size()
df = _pair_sizes.reset_index()

In [40]:
df = df.rename(columns={0:'count'})

# One row per area, one column per outcome code.
_outcome_counts = df.pivot(index='Area', columns='Fatal (Y/N)', values='count')

# Later cells index the 'N', 'UNKNOWN' and 'Y' columns directly; create any that is
# missing from this subset so those lookups cannot fail with a KeyError.
for _outcome in ('N', 'UNKNOWN', 'Y'):
    if _outcome not in _outcome_counts.columns:
        _outcome_counts[_outcome] = 0

# Missing pairs mean "no incidents". Cast back to int because the NaNs introduced by
# pivot turn the columns into floats, which would otherwise show up as e.g. "3.0".
df = _outcome_counts.fillna(0).astype(int).reset_index()

In [41]:
# Total incidents per area across the three outcome codes.
df['total'] = df['N'].add(df['UNKNOWN']).add(df['Y'])

In [42]:
# Full state name -> two-letter code for the 50 US states, in alphabetical order,
# one line per initial letter.
us_state_abbrev = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR',
    'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT',
    'Delaware': 'DE',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Hawaii': 'HI',
    'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA',
    'Kansas': 'KS', 'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA', 'Michigan': 'MI',
    'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT',
    'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ',
    'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND',
    'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Rhode Island': 'RI',
    'South Carolina': 'SC', 'South Dakota': 'SD',
    'Tennessee': 'TN', 'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT', 'Virginia': 'VA',
    'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY',
}

In [43]:
def convert_state(x):
    """Translate a full US state name into its two-letter code.

    Parameters
    ----------
    x : object
        Value to look up in `us_state_abbrev` (normally a state name string).

    Returns
    -------
    str or None
        The matching code, or None when `x` is not a key of `us_state_abbrev`
        or cannot be used as a dictionary key at all.
    """
    try:
        return us_state_abbrev[x]
    except (KeyError, TypeError):
        # KeyError: not a known state name; TypeError: unhashable value.
        # Anything else (e.g. a NameError because the lookup table was never defined)
        # now surfaces instead of silently turning every area into None.
        return None

In [44]:
# Two-letter state code for each area; None where convert_state finds no match.
df['code'] = df['Area'].apply(convert_state)

In [45]:
# Discard rows containing missing values (e.g. areas for which no state code was found).
df = df.dropna()

In [276]:
 # Colour stops are packed into the lowest 20% of the range so that small counts differ visibly.
colorscale = [
    [0.0, 'rgb(199,204,118)'],
    [0.035, 'rgb(69,117,180)'],
    [0.06, 'rgb(116,173,209)'],
    [0.085, 'rgb(171,217,233)'],
    [0.11, 'rgb(224,243,248)'],
    [0.135, 'rgb(254,224,144)'],
    [0.16, 'rgb(254,207,173)'],
    [0.185, 'rgb(244,109,67)'],
    [0.2, 'rgb(215,48,39)'],
    [1.0, 'rgb(165,0,38)'],
]

# Turn every column into text so the counts can be concatenated into the hover label.
df = df.astype(str)

# Hover label: area name followed by one "<caption><count>" line per outcome.
_hover_text = df['Area']
for _caption, _column in (('Non-Fatal: ', 'N'), ('Unknown: ', 'UNKNOWN'), ('Fatal: ', 'Y')):
    _hover_text = _hover_text + '<br>' + _caption + df[_column]
df['text'] = _hover_text

data = [go.Choropleth(
    colorscale=colorscale,
    autocolorscale=False,
    locations=df.code,
    # `total` was converted to text above; the map needs it as a number again.
    z=df.total.astype(float),
    locationmode='USA-states',
    text=df['text'],
    marker=dict(line=dict(color='rgb(255,255,255)', width=2)),
    colorbar=dict(title="Number of attacks"),
)]

layout = go.Layout(
    title=dict(text='Shark attacks in the US 2008-2018<br>(Hover for breakdown)'),
    geo=dict(
        scope='usa',
        projection=dict(type='albers usa'),
        showlakes=True,
        lakecolor='rgb(255, 255, 255)',
    ),
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename="Sharks chloropleth US")

## Tree map: Number of shark attacks by species 2008-2018

Let’s take a look, which species of sharks have a higher likelihood to attack humans.

In [192]:
# Entries that only give a size are relabelled as 'Invalid', one label at a time.
for _size_only in ("4' shark", "4' to 5' shark", "3' shark"):
    sharks.loc[sharks.Species == _size_only, 'Species'] = 'Invalid'

# Ten most frequent species labels among incidents from 2008 through 2017.
_recent = sharks[sharks.Year.isin(list(range(2008, 2018, 1)))]
_species_sizes = _recent.groupby('Species').size().reset_index()
spec = _species_sizes.sort_values(by=0, ascending=False)[:10]

In [193]:
# Give the size column (named 0 after reset_index) a readable name.
spec = spec.rename(columns={0:'count'})

In [197]:
# Array of the species labels; the following cells overwrite entries of it in place.
# NOTE(review): the treemap reads spec.Species afterwards, so this presumably relies on
# `.values` being a writable view of the column - confirm for the pandas version in use.
s = spec.Species.values

In [198]:
# Insert a line break into the label at position 8 so that it fits its treemap tile.
# NOTE(review): assumes row 8 is the wobbegong entry - position-based, verify if the data changes.
s[8] = 'Wobbegong <br> shark'

In [199]:
# Insert line breaks into the label at position 9 so that it fits its treemap tile.
# NOTE(review): assumes row 9 is the bronze whaler entry - position-based, verify if the data changes.
s[9] = 'Bronze <br> whaler <br> shark'

In [200]:
import plotly.plotly as py
import plotly.graph_objs as go

import squarify

# Canvas on which the treemap rectangles are laid out: origin and size.
x, y = 0., 0.
width, height = 100., 100.

values = spec['count'].values

# Scale the counts to the canvas area, then compute one rectangle per species.
normed = squarify.normalize_sizes(values, width, height)
rects = squarify.squarify(normed, x, y, width, height)

color_brewer = [
    'rgb(165,0,38)',
    'rgb(215,48,39)',
    'rgb(244,109,67)',
    'rgb(254,207,173)',
    'rgb(254,224,144)',
    'rgb(224,243,248)',
    'rgb(171,217,233)',
    'rgb(116,173,209)',
    'rgb(69,117,180)',
    'rgb(199,204,118)',
]

# One filled rectangle per species; colours are reused cyclically if there are more
# rectangles than colours.
shapes = [
    dict(
        type='rect',
        x0=rect['x'],
        y0=rect['y'],
        x1=rect['x'] + rect['dx'],
        y1=rect['y'] + rect['dy'],
        fillcolor=color_brewer[position % len(color_brewer)],
    )
    for position, rect in enumerate(rects)
]

# Species name centred on its rectangle.
annotations = [
    dict(
        x=rect['x'] + (rect['dx'] / 2),
        y=rect['y'] + (rect['dy'] / 2),
        text=spec.Species.values[position],
        showarrow=False,
    )
    for position, rect in enumerate(rects)
]

# For hover text: a text-mode trace at the rectangle centres carrying the counts.
trace0 = go.Scatter(
    x=[rect['x'] + (rect['dx'] / 2) for rect in rects],
    y=[rect['y'] + (rect['dy'] / 2) for rect in rects],
    text=spec['count'].values,
    mode='text',
)

layout = dict(
    height=590,
    width=1000,
    xaxis=dict(showgrid=False, zeroline=False),
    yaxis=dict(showgrid=False, zeroline=False),
    shapes=shapes,
    annotations=annotations,
    hovermode='closest',
    title=go.layout.Title(
        text='Shark attacks by species in 2008-2018<br>(Hover for count)'
    ),
)

# With hovertext
figure = dict(data=[trace0], layout=layout)

iplot(figure, filename='squarify-treemap')

A white shark is leading, followed by a bull shark. Interestingly, third place goes to cases where shark involvement is unconfirmed. These are mostly cases where people drowned and sharks scavenged the body, or where some other animal, such as a pinniped, attacked. Invalid mostly covers cases where the shark species was not identified.
<br>It is also interesting to know that in the US about 30-50 people die each year as a consequence of a dog attack, which adds up to about 300-500 deaths in the US for 2008-2018. That is more than double the number of white shark attacks worldwide (147) over the last 10 years.

In [None]:
## Connection map: Migration routes of sharks

In [292]:
# I decided to plot the major migration routes for sharks. These routes are based on my knowledge and internet research and do not represent any scientific point of view. There is a Pacific migration, happening from central California coast to other feeding grounds far away in the Pacific Ocean. Another major migration route is along the East coast, where sharks are migrating from the north to the warmer waters of Florida.

In [293]:
# pacific_migration = [go.Scattergeo(
#     lat = [28.822418, 38.170194],
#     lon = [-158.859361, -123.720130],
#     mode = 'lines',
#     line = go.scattergeo.Line(
#         width = 2,
#         color = 'red',),)]

# atlantic_migration = [go.Scattergeo(
#     lat = [25.869109, 44.873876],
#     lon = [-78.021723, -54.650979],
#     mode = 'lines',
#     line = go.scattergeo.Line(
#         width = 2,
#         color = 'red',),)]
# layout = go.Layout(
#     title = go.layout.Title(
#         text = 'Approximate shark migration routes in Pacific and Atlantic side of the US'),
#     showlegend = False,
#     geo = go.layout.Geo(
#         resolution = 50,
#         showland = True,
#         showlakes = True,
#         landcolor = 'rgb(102, 153, 204)',
#         countrycolor = 'rgb(102, 153, 204)',
#         lakecolor = 'rgb(255, 255, 255)',
#         projection = go.layout.geo.Projection(
#             type = "equirectangular"),
#         coastlinewidth = 2,
#         lataxis = go.layout.geo.Lataxis(
#             range = [20, 60],
#             showgrid = True,
#             dtick = 10),
#         lonaxis = go.layout.geo.Lonaxis(
#             range = [-100, 20],
#             showgrid = True,
#             dtick = 20
#         ),
#     ), 
# )

# fig = go.Figure(data = pacific_migration+atlantic_migration, layout = layout)
# iplot(fig, filename = "Sharks stacked")

# How dangerous are sharks really?

This bar plot by hemisphere shows that there are far more shark attacks in Northern hemisphere than in Southern, although deadly attacks are twice as likely in the Southern hemisphere than in the Northern.

In [265]:
#sharks['Fatal (Y/N)'].unique()

In [47]:
# Tally attacks per hemisphere and fatality outcome for the years produced by
# range(2007, 2017), i.e. 2007 through 2016 inclusive.
# NOTE(review): the chart built from these counts is titled "2007-2017" - confirm
# whether 2017 was meant to be included in the window.
recent_attacks = sharks[sharks.Year.isin(list(range(2007, 2017, 1)))]
southern = recent_attacks[recent_attacks.Hemisphere == 1]
northern = recent_attacks[recent_attacks.Hemisphere == 0]

# DataFrame.count() returns the non-null count of every column; .iloc[0] reads the
# first column's count explicitly by position. (Plain [0] on that label-indexed
# Series depends on a positional fallback that pandas has deprecated.)
south_u = southern[southern['Fatal (Y/N)'] == 'UNKNOWN'].count().iloc[0]
south_n = southern[southern['Fatal (Y/N)'] == 'N'].count().iloc[0]
south_y = southern[southern['Fatal (Y/N)'] == 'Y'].count().iloc[0]
north_u = northern[northern['Fatal (Y/N)'] == 'UNKNOWN'].count().iloc[0]
north_n = northern[northern['Fatal (Y/N)'] == 'N'].count().iloc[0]
north_y = northern[northern['Fatal (Y/N)'] == 'Y'].count().iloc[0]

In [48]:
# Share of attacks recorded as non-fatal among all tallied attacks; attacks with
# an unknown outcome stay in the denominator.
nonfatal_total = south_n + north_n
all_tallied = south_n + south_u + south_y + north_n + north_u + north_y
percentage_nonfatal = nonfatal_total / all_tallied * 100
f'Percentage of Non-Fatal shark attacks {round(percentage_nonfatal, 2)}%'

'Percentage of Non-Fatal shark attacks 84.36%'

In [49]:
# Stacked bar chart: one bar per hemisphere, split by fatality outcome.
hemisphere_labels = ['Southern', 'Northern']

trace1 = go.Bar(
    x=hemisphere_labels,
    y=[south_u, north_u],
    name='Unknown',
    marker=dict(color=light_blue),
)
trace2 = go.Bar(
    x=hemisphere_labels,
    y=[south_y, north_y],
    name='Fatal',
    marker=dict(color=pink),
)
trace3 = go.Bar(
    x=hemisphere_labels,
    y=[south_n, north_n],
    name='Non-Fatal',
    marker=dict(color=green),
)
data = [trace1, trace2, trace3]

layout = go.Layout(
    barmode='stack',
    # Title spelling corrected ("hepisphere" -> "hemisphere").
    title='Shark attacks by hemisphere 2007-2017',
    xaxis=dict(title='Hemisphere'),
    yaxis=dict(title='Count of total shark attacks'),
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename="Sharks stacked")

Let's break it down and extend the stacked area graphs from before, to get even more insights. I am going to show the shark attack count by month and fatality. 

From the two stacked plots below, you can see that fatal shark attacks are generally very rare, especially in the Northern Hemisphere; however, there are more attacks in the Northern Hemisphere.
<br>The fatality rate is higher in the Southern Hemisphere, while the total number of attacks is higher in the Northern Hemisphere.  Nearly half of the attacks happen while people are surfing.

In [280]:
# Lower-cased fatality labels, in the order unique() returns them. Later cells
# use them as prefixes for dynamically created variable names.
fatality = []
for outcome_label in sharks['Fatal (Y/N)'].unique():
    fatality.append(outcome_label.lower())

In [281]:
# Per-month attack counts in the Southern Hemisphere (Hemisphere == 1) for each
# fatality value, stored in module-level names built as '<lower-cased value>_s'
# (for example unknown_s, which the Southern Hemisphere area chart plots).
south_decade = sharks[(sharks.Hemisphere == 1) & sharks.Year.isin(list(range(2007, 2017, 1)))]

# groupby() omits months that have no matching rows, which would leave an array
# shorter than `months` and pair the remaining counts with the wrong month labels.
# Reindexing on every Month value found in the dataset (in the same sorted order
# groupby emits) keeps one slot per month, filled with 0 where nothing happened.
# NOTE(review): assumes `months` lists the labels in that same sorted Month order.
all_months = sorted(sharks.Month.dropna().unique())

for prefix, fatal_value in zip(fatality, sharks['Fatal (Y/N)'].unique()):
    vars()[prefix + '_s'] = (
        south_decade[south_decade['Fatal (Y/N)'] == fatal_value]
        .groupby('Month')['Date'].count()
        .reindex(all_months, fill_value=0)
        .values
    )

In [282]:
# Stacked area chart of monthly attack counts in the Northern Hemisphere
# (Hemisphere == 0), restricted to the years in range(2007, 2017).
north_decade = sharks[(sharks.Hemisphere == 0) & sharks.Year.isin(list(range(2007, 2017, 1)))]

# Every Month value present in the dataset, in the sorted order groupby() emits.
# NOTE(review): assumes `months` lists the labels in that same sorted Month order.
all_months = sorted(sharks.Month.dropna().unique())


def count_by_month(rows):
    """Return the count of non-null ``Date`` values per month in ``rows``.

    groupby() omits months with no rows, which would make the array shorter than
    the month axis and pair counts with the wrong labels. The result is therefore
    reindexed to one entry per value in ``all_months``, with 0 for empty months.
    """
    return rows.groupby('Month')['Date'].count().reindex(all_months, fill_value=0).values


# One module-level array per fatality value, named '<lower-cased value>_n'
# (for example unknown_n, used for the first trace below).
for prefix, fatal_value in zip(fatality, sharks['Fatal (Y/N)'].unique()):
    vars()[prefix + '_n'] = count_by_month(north_decade[north_decade['Fatal (Y/N)'] == fatal_value])

is_fatal = north_decade['Fatal (Y/N)'] == 'Y'
is_nonfatal = north_decade['Fatal (Y/N)'] == 'N'
is_surfing = north_decade.Activity == 'Surfing'
not_surfing = north_decade.Activity != 'Surfing'

fatal_surfing = count_by_month(north_decade[is_fatal & is_surfing])
fatal_not_surfing = count_by_month(north_decade[is_fatal & not_surfing])
nonfatal_surfing = count_by_month(north_decade[is_nonfatal & is_surfing])
# Uses Activity != 'Surfing'; filtering on == 'Surfing' here would simply
# duplicate nonfatal_surfing and plot the surfing counts twice.
nonfatal_not_surfing = count_by_month(north_decade[is_nonfatal & not_surfing])

# (legend name, monthly counts, colour) for each stacked layer, bottom to top.
area_specs = [
    ('Unknown', unknown_n, dark_blue),
    ('Non-Fatal Surfing', nonfatal_surfing, light_blue),
    ('Non-Fatal Other', nonfatal_not_surfing, green),
    ('Fatal Surfing', fatal_surfing, red),
    ('Fatal Other', fatal_not_surfing, orange),
]
data = [
    dict(
        x=months,
        y=counts,
        hoverinfo='x+y',
        mode='lines',
        line=dict(width=0.5),
        marker=dict(color=colour),
        stackgroup='one',
        name=label,
    )
    for label, counts, colour in area_specs
]
trace0, trace1, trace2, trace3, trace4 = data

layout = go.Layout(
    title='Number of shark attacks by months in Northern Hemisphere 2007-2017',
    xaxis=dict(title='Month'),
    yaxis=dict(title='Count of total shark attacks'),
    barmode='overlay',
    bargap=0.1,
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename="Sharks stacked")

In [283]:
# Stacked area chart of monthly attack counts in the Southern Hemisphere
# (Hemisphere == 1), restricted to the years in range(2007, 2017).
south_decade = sharks[(sharks.Hemisphere == 1) & sharks.Year.isin(list(range(2007, 2017, 1)))]

# Every Month value present in the dataset, in the sorted order groupby() emits.
# NOTE(review): assumes `months` lists the labels in that same sorted Month order.
all_months = sorted(sharks.Month.dropna().unique())


def count_by_month(rows):
    """Return the count of non-null ``Date`` values per month in ``rows``.

    groupby() omits months with no rows, which would make the array shorter than
    the month axis and pair counts with the wrong labels. The result is therefore
    reindexed to one entry per value in ``all_months``, with 0 for empty months.
    """
    return rows.groupby('Month')['Date'].count().reindex(all_months, fill_value=0).values


is_fatal = south_decade['Fatal (Y/N)'] == 'Y'
is_nonfatal = south_decade['Fatal (Y/N)'] == 'N'
is_surfing = south_decade.Activity == 'Surfing'
not_surfing = south_decade.Activity != 'Surfing'

fatal_surfing = count_by_month(south_decade[is_fatal & is_surfing])
fatal_not_surfing = count_by_month(south_decade[is_fatal & not_surfing])
nonfatal_surfing = count_by_month(south_decade[is_nonfatal & is_surfing])
# Uses Activity != 'Surfing'; filtering on == 'Surfing' here would simply
# duplicate nonfatal_surfing and plot the surfing counts twice.
nonfatal_not_surfing = count_by_month(south_decade[is_nonfatal & not_surfing])

# (legend name, monthly counts, colour) for each stacked layer, bottom to top.
# unknown_s is created by an earlier cell.
area_specs = [
    ('Unknown', unknown_s, dark_blue),
    ('Non-Fatal Surfing', nonfatal_surfing, light_blue),
    ('Non-Fatal Other', nonfatal_not_surfing, green),
    ('Fatal Surfing', fatal_surfing, red),
    ('Fatal Other', fatal_not_surfing, orange),
]
data = [
    dict(
        x=months,
        y=counts,
        hoverinfo='x+y',
        mode='lines',
        line=dict(width=0.5),
        marker=dict(color=colour),
        stackgroup='one',
        name=label,
    )
    for label, counts, colour in area_specs
]
trace0, trace1, trace2, trace3, trace4 = data

layout = go.Layout(
    title='Number of shark attacks by months in Southern Hemisphere 2007-2017',
    xaxis=dict(title='Month'),
    yaxis=dict(title='Count of total shark attacks'),
    barmode='overlay',
    bargap=0.1,
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename="Sharks stacked")

# Summary

While a shark attack is a frightening event, we need to understand that it is very rare and happens by mistake. Most shark incidents occur when the shark feels threatened, is confused by the presence of many dead fish or by murky water, or mistakes people for food.
<br>
<br>Surfers, swimmers, and fishers in the USA and Australia are the most common victims of shark attacks, though in 84.4% of cases the attack is not fatal.
<br>
<br>Many of the recorded attacks are very minor cases, such as when a shark bumps a boat. There are also many cases where the shark attack is not confirmed: either the attack was by another animal, or the shark attack happened post-mortem.
<br>
<br>I hope this data and my visualizations can convince you that sharks are not killing machines waiting for you to enter the water, but just curious animals that need to be treated with respect.

# Bonus: California shark attacks 2007 - 2017

As a scuba diver in California, I wanted to know the statistics of shark attacks in the state for the last 10 years.

In [291]:
# California incidents for the years produced by range(2007, 2018), i.e. 2007
# through 2017 inclusive.
# NOTE(review): that window spans 11 calendar years while the message says
# "last 10 years" - confirm which was intended.
california_recent = sharks[(sharks.Area == 'California') & sharks.Year.isin(list(range(2007, 2018, 1)))]

# count() gives the non-null count per column; .iloc[0] reads the first column's
# count by position instead of relying on the deprecated positional [0] lookup.
num_attacks = california_recent.count().iloc[0]
f'There were {num_attacks} recorded shark attacks in California in the last 10 years'

'There were 86 recorded shark attacks in California in the last 10 yeas'

Let's break it down by fatality, activities and type.

In [271]:
# Subset used by the rest of the California section: Area equal to 'California'
# and Year within range(2007, 2018).
in_california = sharks.Area == 'California'
in_year_window = sharks.Year.isin(list(range(2007, 2018, 1)))
sharks_ca = sharks[in_california & in_year_window]

In [248]:
#sharks_ca.Activity.unique()

In [249]:
# Up to 20 activity labels from the California subset, ordered by their non-null
# Date count from highest to lowest.
activity_counts = sharks_ca.groupby('Activity').count()
ranked_activities = activity_counts.sort_values('Date', ascending=False)
top_activities_ca = list(ranked_activities.iloc[:20].index)

In [250]:
# One row per California incident in a top activity, carrying a constant Count
# of 1 so that a grouped sum yields the number of incidents. assign() adds the
# column on a new frame instead of writing into a filtered slice, which avoids
# pandas' SettingWithCopy chained-assignment pattern.
heatmap_ca = (
    sharks_ca[sharks_ca.Activity.isin(top_activities_ca)]
    .assign(Count=1)
    [['Month', 'Activity', 'Count']]
)
a = heatmap_ca.groupby(['Month', 'Activity'], group_keys=False).sum()

# Lookup of incident counts keyed by the (Month, Activity) index tuples.
heatmap_ca_dict = a['Count'].to_dict()

In [251]:
# Maps each 0-based position in `months` to the entry at that position.
month_dict = dict(enumerate(months))

In [252]:
# Heatmap matrix: one row per top activity, one column per key of month_dict,
# holding the incident count for that (month key, activity) pair, or 0 when the
# pair is absent from heatmap_ca_dict.
# NOTE(review): the month keys are 0-based positions from enumerate(months);
# presumably the Month column uses the same coding - confirm.
z_ca = [
    [heatmap_ca_dict.get((month_key, str(activity)), 0) for month_key in month_dict.keys()]
    for activity in top_activities_ca
]

In [253]:
# Colour stops for the heatmap; most stops sit in the lowest fifth of the scale,
# so the colour changes quickly across low counts.
heatmap_colorscale = [
    [0.0, 'rgb(199,204,118)'],
    [0.035, 'rgb(69,117,180)'],
    [0.06, 'rgb(116,173,209)'],
    [0.085, 'rgb(171,217,233)'],
    [0.11, 'rgb(224,243,248)'],
    [0.135, 'rgb(254,224,144)'],
    [0.16, 'rgb(254,207,173)'],
    [0.185, 'rgb(244,109,67)'],
    [0.2, 'rgb(215,48,39)'],
    [1.0, 'rgb(165,0,38)'],
]

trace = go.Heatmap(
    z=z_ca,
    x=months,
    y=top_activities_ca,
    xgap=5,
    ygap=5,
    colorscale=heatmap_colorscale,
)
data = [trace]

layout = go.Layout(
    # The California subset is built from range(2007, 2018), i.e. years through
    # 2017, so the title names 2007-2017 like the other California chart.
    title='Shark Attacks by Activity and Month in California 2007-2017',
    xaxis=dict(ticks='', nticks=20, title='Month', tickmode='linear'),
    yaxis=dict(
        ticks='',
        tickprefix="",
        side='left',
        position=0.0,
        tickangle=0,
        tickfont=dict(size=10),
        # Reversed so the first activity in top_activities_ca is drawn at the top.
        autorange="reversed",
    ),
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename="Sharks heatmap")

Out of a total of 86 attacks in California in the past 10 years, none were on scuba divers. Most attacks are on surfers and fishers.

In [254]:
#sharks_ca['Fatal (Y/N)'].unique()

Let's investigate fatality and injury.

In [255]:
# For each fatality outcome, count California incidents in each of the six most
# frequent activities. The three outcomes are listed explicitly so UNKNOWN, Y and
# N are always defined (as lists of six counts), even when an outcome never
# occurs in the California subset; creating them via vars() from unique() would
# leave the missing one undefined for the chart cell that follows.
# The empty `fatal`, `non_fatal` and `unknown` lists that used to be initialised
# here were never filled or read, so they are no longer created.
ca_outcome = sharks_ca['Fatal (Y/N)']
outcome_counts = {
    outcome: [
        # count() gives the non-null count per column; .iloc[0] reads the first
        # column's count by position (plain [0] is a deprecated positional lookup).
        sharks_ca[(ca_outcome == outcome) & (sharks_ca.Activity == act)].count().iloc[0]
        for act in top_activities_ca[:6]
    ]
    for outcome in ('UNKNOWN', 'Y', 'N')
}
UNKNOWN = outcome_counts['UNKNOWN']
Y = outcome_counts['Y']
N = outcome_counts['N']

In [256]:
# Stacked bar chart: one bar per top-6 California activity, split by outcome.
# Each spec is (legend name, counts per activity, bar colour).
bar_specs = [
    ('Unknown', UNKNOWN, light_blue),
    ('Fatal', Y, pink),
    ('Non-Fatal', N, green),
]
data = [
    go.Bar(
        x=top_activities_ca[:6],
        y=counts,
        name=label,
        marker=dict(color=colour),
    )
    for label, counts, colour in bar_specs
]
trace1, trace2, trace3 = data

layout = go.Layout(
    title='Shark attacks in California 2007-2017 by Activity',
    barmode='stack',
    xaxis=dict(title='Activity'),
    yaxis=dict(title='Count of shark attacks'),
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename="Sharks stacked")

You can see that there are only 3 fatal cases in California for the last 10 years.

In [257]:
#sharks_ca[sharks_ca['Fatal (Y/N)']=='Y'].count()[0]

Many attacks have no injury, for example:

In [286]:
# Collect the Injury descriptions of California incidents that mention
# 'No injury' either as written or fully lower-cased.
col = 'Injury'
word = 'No injury'
no_injury_pattern = re.compile(f'{re.escape(word)}|{re.escape(word.lower())}')

# Iterate the column values directly: Series.iteritems() no longer exists in
# pandas 2.x, and the old bare `except: pass` would have hidden that failure and
# silently left `invol` empty. Non-string entries (e.g. missing values) are
# skipped explicitly, which is what the swallowed exception used to do for them.
invol = [
    description
    for description in sharks_ca[col]
    if isinstance(description, str) and no_injury_pattern.search(description)
]
set(invol[:20])

{'Board reportedly bumped by shark. No injury',
 'No injury',
 'No injury to occupant. Kayak bitten by gaffed shark. PROVOKED INCIDENT',
 'No injury, board bitten',
 'No injury, board damaged',
 'No injury, bow of kayak bitten',
 'No injury, kayak bitten',
 'No injury, kayak damaged',
 'No injury, shark bit paddleboard',
 'No injury, shark bumped & damaged board',
 'No injury, shark rammed kayak repeatedly',
 'No injury, shark struck board, tossing her into the sea',
 "No injury, shark struk sufer's leg and his board",
 'No injury, surfboard bitten',
 'No injury. Hull bitten, tooth fragment recovered',
 'Shark bumped boat, no injury to occupants'}

In [289]:
# Number of California incidents whose Injury text matched the 'No injury' search.
no_inj = len(invol)

In [262]:
# Fraction of California incidents not counted in no_inj (1 minus the no-injury share).
inj = 1-no_inj/num_attacks

In [263]:
# Fraction of California incidents recorded as fatal ('Y'). count() gives the
# non-null count per column and .iloc[0] reads the first column's count by
# position, replacing the deprecated positional [0] lookup on a labelled Series.
fatal_ca = sharks_ca[sharks_ca['Fatal (Y/N)'] == 'Y']
deadly = fatal_ca.count().iloc[0] / num_attacks

You can see from above that many attacks result in no injury at all, for example when a shark bumps a kayak or a surfboard. If I exclude those ‘no injury’ attacks, we can see that only 44.2% of all 86 shark attacks in CA in the past 10 years led to injury and only 3.4% of them were deadly.

**Sources**
<br>https://www.projectaware.org/sharks
<br>https://www.sharkwater.com/
<br>https://en.m.wikipedia.org/wiki/Fatal_dog_attacks_in_the_United_States
<br>https://www.dailymail.co.uk/news/Researchers-examine-sharks-mistake-surfers-intended-prey