In [1275]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import re 

# Plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

from IPython.display import Image
from IPython.core.display import HTML 

import warnings
warnings.filterwarnings("ignore")

init_notebook_mode(connected=True)

In [9]:
import plotly
plotly.__version__

'3.9.0'

# Data cleaning

In [59]:
sharks = pd.read_csv('attacks.csv', encoding = "ISO-8859-1", engine='python')

In [60]:
sharks.shape

(25723, 24)

In [61]:
sharks['Unnamed: 23'].unique()

array([nan, 'Teramo', 'change filename'], dtype=object)

In [62]:
sharks = sharks.drop(['Case Number', 'Investigator or Source', 'Name','pdf', 'href formula', 'href',
                      'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22', 'Unnamed: 23'], axis=1)

In [63]:
sharks.head()

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
0,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark
1,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,F,11.0,Minor injury to left thigh,N,14h00 -15h00,
2,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,
3,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,M,,Minor injury to lower leg,N,,2 m shark
4,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m"


In [64]:
sharks = sharks.dropna(how='all') #drops where all rows are NA

In [65]:
sharks.shape

(6302, 13)

### Let's check for NA and null values

In [66]:
# in column Year
sharks.Year.unique()

array([2018., 2017.,   nan, 2016., 2015., 2014., 2013., 2012., 2011.,
       2010., 2009., 2008., 2007., 2006., 2005., 2004., 2003., 2002.,
       2001., 2000., 1999., 1998., 1997., 1996., 1995., 1984., 1994.,
       1993., 1992., 1991., 1990., 1989., 1969., 1988., 1987., 1986.,
       1985., 1983., 1982., 1981., 1980., 1979., 1978., 1977., 1976.,
       1975., 1974., 1973., 1972., 1971., 1970., 1968., 1967., 1966.,
       1965., 1964., 1963., 1962., 1961., 1960., 1959., 1958., 1957.,
       1956., 1955., 1954., 1953., 1952., 1951., 1950., 1949., 1948.,
       1848., 1947., 1946., 1945., 1944., 1943., 1942., 1941., 1940.,
       1939., 1938., 1937., 1936., 1935., 1934., 1933., 1932., 1931.,
       1930., 1929., 1928., 1927., 1926., 1925., 1924., 1923., 1922.,
       1921., 1920., 1919., 1918., 1917., 1916., 1915., 1914., 1913.,
       1912., 1911., 1910., 1909., 1908., 1907., 1906., 1905., 1904.,
       1903., 1902., 1901., 1900., 1899., 1898., 1897., 1896., 1895.,
       1894., 1893.,

In [67]:
sharks[sharks.Year.isna()]

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
187,Reported 08-Jan-2017,,Invalid,AUSTRALIA,Queensland,,Spearfishing,M,35.0,"No attack, shark made a threat display",,,Bull shark
6079,Reported 19-Aug-1836,,Unprovoked,ENGLAND,Cumberland,Whitehaven,Swimming,M,,FATAL,Y,,


In [68]:
# since it's only two values and we see the Year in the Date column, I'm going to impute with the year
sharks.at[187, 'Year'] = 2017
sharks.at[6079, 'Year'] = 1836

In [75]:
# seems like there is 125 rows with zero year. But the year information is partially available in Date.
# with regex I will impute the year from the date information. For the date B.C. I will leave 0 as year
sharks[sharks.Year==0.0].count()

Date           125
Year           125
Type           125
Country        121
Area            99
Location        91
Activity       107
Sex            117
Age             13
Injury         124
Fatal (Y/N)    124
Time             7
Species         35
dtype: int64

In [78]:
sharks[sharks.Year==0.0].head()

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
6177,Ca. 214 B.C.,0.0,Unprovoked,,Ionian Sea,,Ascending from a dive,M,,"FATAL, shark/s bit him in two",Y,,
6178,Ca. 336.B.C..,0.0,Unprovoked,GREECE,Piraeus,In the haven of Cantharus,Washing his pig in preparation for a religious...,M,,"FATAL, shark ""bit off all lower parts of him u...",Y,,
6179,493 B.C.,0.0,Sea Disaster,GREECE,Off Thessaly,,Shipwrecked Persian Fleet,M,,Herodotus tells of sharks attacking men in the...,Y,,
6180,Ca. 725 B.C.,0.0,Sea Disaster,ITALY,Tyrrhenian Sea,Krater found during excavations at Lacco Ameno...,Shipwreck,M,,Depicts shipwrecked sailors attacked by a sha...,Y,,
6181,Before 1939,0.0,Unprovoked,CANADA,,Grand Banks,Fishing,M,,Arm bitten,N,,


In [80]:
# cast year to integer instead of float
sharks['Year'] = sharks['Year'].apply(np.int64)

In [82]:
# add to year the one column before with regex

In [83]:
sharks[sharks['Year'] == 0]

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
6177,Ca. 214 B.C.,0,Unprovoked,,Ionian Sea,,Ascending from a dive,M,,"FATAL, shark/s bit him in two",Y,,
6178,Ca. 336.B.C..,0,Unprovoked,GREECE,Piraeus,In the haven of Cantharus,Washing his pig in preparation for a religious...,M,,"FATAL, shark ""bit off all lower parts of him u...",Y,,
6179,493 B.C.,0,Sea Disaster,GREECE,Off Thessaly,,Shipwrecked Persian Fleet,M,,Herodotus tells of sharks attacking men in the...,Y,,
6180,Ca. 725 B.C.,0,Sea Disaster,ITALY,Tyrrhenian Sea,Krater found during excavations at Lacco Ameno...,Shipwreck,M,,Depicts shipwrecked sailors attacked by a sha...,Y,,
6181,Before 1939,0,Unprovoked,CANADA,,Grand Banks,Fishing,M,,Arm bitten,N,,
6182,1990 or 1991,0,Unprovoked,KENYA,Mombasa,Kilindini,Diving,M,,Conway's leg was bitten Higgs injury was FATAL,N,,
6183,Before 2016,0,Unprovoked,KENYA,Mombasa,Kilindini,Diving,M,,FATAL,Y,,
6184,Before Oct-2009,0,Unprovoked,PANAMA,Bocas del Toro Province,Red Frog Beach,Swimming/,M,20,FATAL,Y,,
6185,Before 1934,0,Unprovoked,URUGUAY,Rocha,"Isla Chica, La Paloma",Swimming,,,Foot bitten,N,,
6186,Before 1934,0,Unprovoked,URUGUAY,Rocha,"Playa del Barco, La Pedrera",Swimming,M,,FATAL,Y,,


In [114]:
# Extracting the year from Date information
def extract_year(col): 

    x = re.findall("\d{4}", col)

    if len(x) == 0:
        return 0

    return int(x[-1])

2009

In [136]:
year_0 = sharks.Year ==  0
col = 'Year'
sharks.loc[year_0, col] = [extract_year(x) for x in sharks.loc[year_0, 'Date'].values]

In [140]:
sharks[sharks['Year'] == 0].head(2)

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
6177,Ca. 214 B.C.,0,Unprovoked,,Ionian Sea,,Ascending from a dive,M,,"FATAL, shark/s bit him in two",Y,,
6178,Ca. 336.B.C..,0,Unprovoked,GREECE,Piraeus,In the haven of Cantharus,Washing his pig in preparation for a religious...,M,,"FATAL, shark ""bit off all lower parts of him u...",Y,,


In [144]:
# Dropping the rest of year = 0
sharks.drop(sharks[sharks.Year == 0].index, inplace=True)

In [150]:
sharks.head()

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
0,25-Jun-2018,2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark
1,18-Jun-2018,2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,F,11.0,Minor injury to left thigh,N,14h00 -15h00,
2,09-Jun-2018,2018,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,
3,08-Jun-2018,2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,M,,Minor injury to lower leg,N,,2 m shark
4,04-Jun-2018,2018,Provoked,MEXICO,Colima,La Ticla,Free diving,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m"


In [151]:
sharks[sharks.Type.isna()]

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
85,15-Sep-2017,2017,,SAMOA,Upolu Island,Nofoalii,Fishing,M,,Injuries to hands and legs,N,Night,
382,27-Jul-2015,2015,,AUSTRALIA,Victoria,Tyrendarra Beach,Surfing,M,40.0,Injury to hand,,,
4867,Reported 11-Sep-1936,1936,,VIETNAM,,Saigon,Wreck of a sampam,M,,FATAL,Y,,
5705,Reported 03-Mar-1890,1890,,CEYLON,,,Diving,M,,FATAL,Y,,


In [156]:
sharks.Type.unique()

array(['Boating', 'Unprovoked', 'Invalid', 'Provoked', 'Questionable',
       'Sea Disaster', nan, 'Boat', 'Boatomg'], dtype=object)

In [None]:
# change Nan to Questionable and Boatomg tp Boat

In [1149]:
sharks[sharks.Type=='Boat']

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species,Hour,Month,Hemisphere
538,Reported 27-Jun-2014,2014,Boat,ST. MARTIN,,20 miles from shore,Transatlantic Rowing,M,48.0,His boat was holed by a shark,N,,Oceanic whitetip shark',,5,1.0
565,Reported 12-Apr-2014,2014,Boat,SOUTH AFRICA,,,Shark watching,,,"No injury to occupants, shark bit pontoon",N,,White shark,,3,1.0
706,26-Jan-2013,2013,Boat,AUSTRALIA,Victoria,Cape Nelson,Fishing,M,,"No injury to occupants, shark bit propeller",N,,White shark,,0,1.0
818,20-Feb-2012,2012,Boat,SOUTH AFRICA,Western Cape Province,Strandfontein,Fishing,,,"No injury to occupants, boat damaged",N,,White shark,,1,1.0
864,20-Sep-2011,2011,Boat,USA,Hawaii,Kauai,Canoeing,M,,"No injury, canoe bitten by shark",N,15h00,,15.0,8,0.0
956,03-Jan-2011,2011,Boat,AUSTRALIA,Western Australia,Busselton,Fishing,,,"No injury, shark nudged boat and bit propeller",N,12h00,White shark,12.0,0,1.0
971,Reported 12-Nov-2010,2010,Boat,AUSTRALIA,Western Australia,Between Carnac and Garden Islands,Fishing,M,,No injury to occupant. Shark rammed bottom of ...,N,Night,White shark,,10,1.0
1152,01-Mar-2009,2009,Boat,NEW ZEALAND,North Island,Taranaki,Fishing,M,,"No injury to occupants, shark bit propeller",N,,,,2,1.0
1164,24-Jan-2009,2009,Boat,NEW ZEALAND,North Island,Alderman Islands,Fishing,,,"No injury to occupant, shark removed small aux...",N,19h00,,19.0,0,1.0
1168,18-Jan-2009,2009,Boat,AUSTRALIA,Victoria,Off Tower Hill,Fishing,M,,"No injury to occupants, shark bit propeller",N,09h20,White shark,9.0,0,1.0


In [158]:
sharks.loc[sharks.Type.isna(), 'Type'] = 'Questionable'

In [1150]:
sharks.loc[sharks.Type=='Boatomg', 'Type'] = 'Boat'
sharks.loc[sharks.Type=='Boating', 'Type'] = 'Boat'

In [169]:
# there is 49 NA in country, I will ignore it for now
sharks[sharks.Country.isna()].count()

Date           49
Year           49
Type           49
Country         0
Area           14
Location       11
Activity       42
Sex            45
Age             9
Injury         47
Fatal (Y/N)    44
Time            7
Species        12
dtype: int64

In [171]:
sharks.Country.unique()

array(['USA', 'AUSTRALIA', 'MEXICO', 'BRAZIL', 'ENGLAND', 'SOUTH AFRICA',
       'THAILAND', 'COSTA RICA', 'MALDIVES', 'BAHAMAS', 'NEW CALEDONIA',
       'ECUADOR', 'MALAYSIA', 'LIBYA', nan, 'CUBA', 'MAURITIUS',
       'NEW ZEALAND', 'SPAIN', 'SAMOA', 'SOLOMON ISLANDS', 'JAPAN',
       'EGYPT', 'ST HELENA, British overseas territory', 'COMOROS',
       'REUNION', 'FRENCH POLYNESIA', 'UNITED KINGDOM',
       'UNITED ARAB EMIRATES', 'PHILIPPINES', 'INDONESIA', 'CHINA',
       'COLUMBIA', 'CAPE VERDE', 'Fiji', 'DOMINICAN REPUBLIC',
       'CAYMAN ISLANDS', 'ARUBA', 'MOZAMBIQUE', 'FIJI', 'PUERTO RICO',
       'ITALY', 'ATLANTIC OCEAN', 'GREECE', 'ST. MARTIN', 'FRANCE',
       'PAPUA NEW GUINEA', 'TRINIDAD & TOBAGO', 'KIRIBATI', 'ISRAEL',
       'DIEGO GARCIA', 'TAIWAN', 'JAMAICA', 'PALESTINIAN TERRITORIES',
       'GUAM', 'SEYCHELLES', 'BELIZE', 'NIGERIA', 'TONGA', 'SCOTLAND',
       'CANADA', 'CROATIA', 'SAUDI ARABIA', 'CHILE', 'ANTIGUA', 'KENYA',
       'RUSSIA', 'TURKS & CAICOS', 'UNITE

In [173]:
sharks.loc[sharks.Country==' PHILIPPINES', 'Country'] = 'PHILIPPINES'
sharks.loc[sharks.Country=='RED SEA?', 'Country'] = 'RED SEA'

In [182]:
act = sharks.groupby('Activity').count()
act.sort_values('Date', ascending=False)

Unnamed: 0_level_0,Date,Year,Type,Country,Area,Location,Sex,Age,Injury,Fatal (Y/N),Time,Species
Activity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Surfing,971,971,971,970,966,959,946,778,965,933,749,591
Swimming,869,869,869,862,824,821,846,564,867,780,448,406
Fishing,431,431,431,431,397,398,338,132,430,412,114,264
Spearfishing,333,333,333,332,304,309,318,221,333,309,162,255
Bathing,162,162,162,160,149,156,156,53,161,149,45,33
Wading,148,148,148,147,144,143,146,123,147,139,107,82
Diving,127,127,127,123,112,100,112,44,126,114,36,68
Standing,99,99,99,99,97,95,99,79,99,98,69,47
Snorkeling,89,89,89,89,84,81,87,68,89,87,61,66
Scuba diving,76,76,76,76,75,73,76,50,76,63,36,63


In [203]:
# Replacing text if there is word fishing with 'Fishing'
def fishing(col): 
    try:
    
        x = re.findall("Fishing", col)

        if len(x) == 0:
            return col

        return 'Fishing'
    except:
        return col

col = 'Activity'
sharks.loc[:, col] = [fishing(x) for x in sharks.loc[:, col].values]

In [204]:
sharks.loc[sharks.Activity=='Freediving', 'Activity'] = 'Free diving' 

In [205]:
act = sharks.groupby('Activity').count()
act.sort_values('Date', ascending=False)

Unnamed: 0_level_0,Date,Year,Type,Country,Area,Location,Sex,Age,Injury,Fatal (Y/N),Time,Species
Activity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Surfing,971,971,971,970,966,959,946,778,965,933,749,591
Swimming,869,869,869,862,824,821,846,564,867,780,448,406
Fishing,633,633,633,632,586,595,496,205,632,604,175,392
Spearfishing,333,333,333,332,304,309,318,221,333,309,162,255
Bathing,162,162,162,160,149,156,156,53,161,149,45,33
Wading,148,148,148,147,144,143,146,123,147,139,107,82
Diving,127,127,127,123,112,100,112,44,126,114,36,68
Standing,99,99,99,99,97,95,99,79,99,98,69,47
Snorkeling,89,89,89,89,84,81,87,68,89,87,61,66
Scuba diving,76,76,76,76,75,73,76,50,76,63,36,63


In [808]:
spec = sharks.groupby('Activity').size()
spec.sort_values(ascending=False).head(50)

Activity
Surfing                       1130
Swimming                       987
Fishing                        577
Spearfishing                   333
Wading                         144
Scuba Diving                   125
Diving                         116
Bathing                        109
Standing                        94
Snorkeling                      89
Body boarding                   61
Free diving                     38
Kayaking                        33
Treading water                  32
Boogie boarding                 29
Pearl diving                    25
Surf skiing                     24
Sea Disaster                    22
Skin Diving                     19
Walking                         17
Boogie Boarding                 16
Floating                        14
Shark fishing                   14
Surf fishing                    12
Canoeing                        11
Rowing                          10
Fell overboard                   9
Diving for trochus               9
Paddle boar

In [794]:
sharks[sharks['Activity']=='Scuba diving (but on surface)']

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species,Hour
2613,03-Nov-1990,1990,Unprovoked,USA,California,"Monastery Beach, Carmel Bay, Monterey County",Scuba diving (but on surface),F,,Leg bitten,N,15h00,White shark,15.0
2631,24-Jun-1990,1990,Unprovoked,SOUTH AFRICA,Western Cape Province,Mossel Bay,Scuba diving (but on surface),F,21.0,"FATAL, thigh bitten",Y,15h45,White shark,15.0
3252,02-Sep-1974,1974,Unprovoked,USA,California,"Franklin Point, San Mateo County",Scuba diving (but on surface),M,41.0,Minor injuries to hand,N,17h30,White shark,17.0
3253,02-Sep-1974,1974,Unprovoked,USA,California,"Franklin Point, San Mateo County",Scuba diving (but on surface),M,48.0,Minor bite on foot & swimfin,N,17h30,White shark,17.0


In [795]:
sharks[sharks['Activity']=='Scuba diving (submerged)']

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species,Hour
2524,11-Nov-1992,1992,Unprovoked,USA,California,"San Nicholas Island, Santa Barbara County",Scuba diving (submerged),M,40.0,Foot punctured,N,14h00,Unidentified shark,14.0
2993,07-Feb-1982,1982,Unprovoked,USA,California,"Stillwater Cove, Sonoma County",Scuba diving (submerged),M,,Calf & ankle bitten,N,11h00,White shark,11.0
3103,11-Mar-1979,1979,Unprovoked,USA,California,"Ano Nuevo Island, San Mateo, County",Scuba diving (submerged),M,,"No injury, swim fin bitten",N,10h00,White shark,10.0
3309,1973,1973,Unprovoked,PALAU,Aulong Island,Aulong Channel,Scuba diving (submerged),M,,"No injury, shark grabbed scuba tank and descen...",N,,Tiger shark,


In [769]:
def activity(col, word, to_replace): 
    """word to be found and replace by"""
    try:
        x = re.findall(f'({word})|({word.lower()})', col)

        if len(x) == 0:
            return col

        return to_replace
    except:
        return col

col = 'Activity'
word = "Swimming"
sharks.loc[:, col] = [shark(x, word, word) for x in sharks.loc[:, col].values]

In [770]:
word = 'Boogie Boarding'
sharks.loc[:, col] = [shark(x, word, word) for x in sharks.loc[:, col].values]

In [780]:
word = 'Kite surfing'
to_replace = 'Kite Surfing'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [784]:
word = 'Sea disaster'
to_replace = 'Sea Disaster'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [787]:
word = 'Surfing'
to_replace = 'Surfing'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [790]:
word = 'Surf-skiing'
to_replace = 'Surf skiing'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [1046]:
word = 'Scuba Diving'
to_replace = 'Scuba'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [802]:
word = 'Skin diving'
to_replace = 'Skin Diving'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [803]:
word = 'Sitting on surfboard'
to_replace = 'Surfing'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [805]:
word = 'Body-boarding'
to_replace = 'Surfing'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [806]:
word = 'Floating on his back'
to_replace = 'Bathing'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [807]:
word = 'Playing'
to_replace = 'Bathing'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [335]:
sharks.rename(index=str, columns={"Sex ": "Sex", "Species ": "Species"}, inplace=True)

In [336]:
sharks.columns

Index(['Date', 'Year', 'Type', 'Country', 'Area', 'Location', 'Activity',
       'Sex', 'Age', 'Injury', 'Fatal (Y/N)', 'Time', 'Species'],
      dtype='object')

In [220]:
sharks.Sex.unique()

array(['F', 'M', nan, 'M ', 'lli', 'N', '.'], dtype=object)

In [222]:
sharks.loc[sharks.Sex=='M ', 'Sex'] = 'M' 
sharks.loc[sharks.Sex=='lli', 'Sex'] = None
sharks.loc[sharks.Sex=='.', 'Sex'] = None

In [224]:
sharks[sharks.Sex.isna()].count()

Date           567
Year           567
Type           567
Country        563
Area           505
Location       500
Activity       416
Sex              0
Age             39
Injury         553
Fatal (Y/N)    489
Time           106
Species        321
dtype: int64

In [228]:
sharks.Age.unique()

array(['57', '11', '48', nan, '18', '52', '15', '12', '32', '10', '21',
       '34', '30', '60', '33', '29', '54', '41', '37', '56', '19', '25',
       '69', '38', '55', '35', '46', '45', '14', '40s', '28', '20', '24',
       '26', '49', '22', '7', '31', '17', '40', '13', '42', '3', '8',
       '50', '16', '82', '73', '20s', '68', '51', '39', '58', 'Teen',
       '47', '61', '65', '36', '66', '43', '60s', '9', '72', '59', '6',
       '27', '64', '23', '71', '44', '62', '63', '70', '18 months', '53',
       '30s', '50s', 'teen', '77', '74', '28 & 26', '5', '86', '18 or 20',
       '12 or 13', '46 & 34', '28, 23 & 30', 'Teens', '36 & 26',
       '8 or 10', '84', '\xa0 ', ' ', '30 or 36', '6½', '21 & ?', '75',
       '33 or 37', 'mid-30s', '23 & 20', ' 30', '7      &    31', ' 28',
       '20?', "60's", '32 & 30', '16 to 18', '87', '67', 'Elderly',
       'mid-20s', 'Ca. 33', '74 ', '45 ', '21 or 26', '20 ', '>50',
       '18 to 22', 'adult', '9 & 12', '? & 19', '9 months', '25 to 35',
  

In [317]:
sharks.loc[sharks.Age=='60s', 'Age'] = '60'
sharks.loc[sharks.Age=="60's", 'Age'] = '60'
sharks.loc[sharks.Age=='50s', 'Age'] = '50' 
sharks.loc[sharks.Age=='40s', 'Age'] = '40'
sharks.loc[sharks.Age=='30s', 'Age'] = '30' 
sharks.loc[sharks.Age=='20s', 'Age'] = '20' 
sharks.loc[sharks.Age=='Teen', 'Age'] = '15'
sharks.loc[sharks.Age=='teen', 'Age'] = '15'
sharks.loc[sharks.Age=='Teens', 'Age'] = '15'
sharks.loc[sharks.Age=='18 months', 'Age'] = '1'
sharks.loc[sharks.Age=='\xa0 ', 'Age'] = None
sharks.loc[sharks.Age=='MAKE LINE GREEN', 'Age'] = None
sharks.loc[sharks.Age=='A.M.', 'Age'] = None
sharks.loc[sharks.Age=='X', 'Age'] = None
sharks.loc[sharks.Age=='F', 'Age'] = None
sharks.loc[sharks.Age=='mid-30s', 'Age'] = '35'
sharks.loc[sharks.Age=='28 & 26', 'Age'] = '27'
sharks.loc[sharks.Age=='18 or 20', 'Age'] = '19'
sharks.loc[sharks.Age=='12 or 13', 'Age'] = '13'
sharks.loc[sharks.Age=='46 & 34', 'Age'] = '40'
sharks.loc[sharks.Age=='28, 23 & 30', 'Age'] = '27'
sharks.loc[sharks.Age=='30 or 36', 'Age'] = '33'
sharks.loc[sharks.Age=='6½', 'Age'] = '6'
sharks.loc[sharks.Age=='23 & 20', 'Age'] = '21'
sharks.loc[sharks.Age=='8 or 10', 'Age'] = '9'
sharks.loc[sharks.Age=='7      &    31', 'Age'] = '31'
sharks.loc[sharks.Age=='20?', 'Age'] = '20'
sharks.loc[sharks.Age=='21 & ?', 'Age'] = '21'
sharks.loc[sharks.Age=='36 & 26', 'Age'] = '31'
sharks.loc[sharks.Age=='32 & 30', 'Age'] = '31'
sharks.loc[sharks.Age=='33 or 37', 'Age'] = '35'
sharks.loc[sharks.Age=='16 to 18', 'Age'] = '17'
sharks.loc[sharks.Age=='13 or 18', 'Age'] = '15'
sharks.loc[sharks.Age==' ', 'Age'] = None
sharks.loc[sharks.Age==' 30', 'Age'] = '30'
sharks.loc[sharks.Age=='mid-20s', 'Age'] = '25'
sharks.loc[sharks.Age=='18 to 22', 'Age'] = '20'
sharks.loc[sharks.Age=='Ca. 33', 'Age'] = '33'
sharks.loc[sharks.Age=='74 ', 'Age'] = '74'
sharks.loc[sharks.Age=='45 ', 'Age'] = '45'
sharks.loc[sharks.Age=='21 or 26', 'Age'] = '24'
sharks.loc[sharks.Age=='20 ', 'Age'] = '20'
sharks.loc[sharks.Age=='>50', 'Age'] = '51'
sharks.loc[sharks.Age=='>50', 'Age'] = '51'
sharks.loc[sharks.Age=='9 & 12', 'Age'] = '11'
sharks.loc[sharks.Age=='? & 19', 'Age'] = '19'
sharks.loc[sharks.Age=='9 months', 'Age'] = '1'
sharks.loc[sharks.Age=='25 to 35', 'Age'] = '30'
sharks.loc[sharks.Age=='23 & 26', 'Age'] = '24'
sharks.loc[sharks.Age=='33 & 37', 'Age'] = '35'
sharks.loc[sharks.Age=='25 or 28', 'Age'] = '27'
sharks.loc[sharks.Age=='37, 67, 35, 27,  ? & 27', 'Age'] = '39'
sharks.loc[sharks.Age=='21, 34,24 & 35', 'Age'] = '30'
sharks.loc[sharks.Age=='30 & 32', 'Age'] = '31'
sharks.loc[sharks.Age=='50 & 30', 'Age'] = '40'
sharks.loc[sharks.Age=='17 & 35', 'Age'] = '26'
sharks.loc[sharks.Age=='34 & 19', 'Age'] = '26'
sharks.loc[sharks.Age=='2 to 3 months', 'Age'] = '0'
sharks.loc[sharks.Age=='7 or 8', 'Age'] = '7'
sharks.loc[sharks.Age=='17 & 16', 'Age'] = '16'
sharks.loc[sharks.Age=='Both 11', 'Age'] = '11'
sharks.loc[sharks.Age=='13 or 14', 'Age'] = '13'
sharks.loc[sharks.Age=='2½', 'Age'] = '2'
sharks.loc[sharks.Age==' 43', 'Age'] = '43'
sharks.loc[sharks.Age=='9 or 10', 'Age'] = '10'
sharks.loc[sharks.Age=='36 & 23', 'Age'] = '30'
sharks.loc[sharks.Age=='  ', 'Age'] = None
sharks.loc[sharks.Age=='10 or 12', 'Age'] = '10'
sharks.loc[sharks.Age=='?    &   14', 'Age'] = '14'
sharks.loc[sharks.Age=='31 or 33', 'Age'] = '32'
sharks.loc[sharks.Age=='Elderly', 'Age'] = '70'
sharks.loc[sharks.Age=='(adult)', 'Age'] = '40'
sharks.loc[sharks.Age=='adult', 'Age'] = '40'
sharks.loc[sharks.Age=='"middle-age"', 'Age'] = '50'
sharks.loc[sharks.Age=='"young"', 'Age'] = '20'
sharks.loc[sharks.Age=='young', 'Age'] = '20'

In [318]:
sharks.Age.unique()

array(['57', '11', '48', nan, '18', '52', '15', '12', '32', '10', '21',
       '34', '30', '60', '33', '29', '54', '41', '37', '56', '19', '25',
       '69', '38', '55', '35', '46', '45', '14', '40', '28', '20', '24',
       '26', '49', '22', '7', '31', '17', '13', '42', '3', '8', '50',
       '16', '82', '73', '68', '51', '39', '58', '47', '61', '65', '36',
       '66', '43', '9', '72', '59', '6', '27', '64', '23', '71', '44',
       '62', '63', '70', '1', '53', '77', '74', '5', '86', '84', None,
       '75', ' 28', '87', '67', '0', '81', '78', '2'], dtype=object)

In [324]:
sharks['Fatal (Y/N)'].unique()

array(['N', 'Y', nan, 'M', 'UNKNOWN', '2017', ' N', 'N ', 'y'],
      dtype=object)

In [466]:
sharks[sharks['Fatal (Y/N)']=='M']

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species


In [467]:
sharks.loc[sharks['Fatal (Y/N)']=='M', 'Fatal (Y/N)'] = 'UNKNOWN'
sharks.loc[sharks['Fatal (Y/N)']=='2017', 'Fatal (Y/N)'] = 'N'
sharks.loc[sharks['Fatal (Y/N)']=='N ', 'Fatal (Y/N)'] = 'N'
sharks.loc[sharks['Fatal (Y/N)']==' N', 'Fatal (Y/N)'] = 'N'
sharks.loc[sharks['Fatal (Y/N)']=='y', 'Fatal (Y/N)'] = 'Y'
sharks.loc[sharks['Fatal (Y/N)'].isna(), 'Fatal (Y/N)'] = 'UNKNOWN'

In [340]:
spec = sharks.groupby('Species').count()
spec.sort_values('Date', ascending=False)

Unnamed: 0_level_0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
White shark,163,163,163,163,158,157,158,140,95,162,161,98
Shark involvement prior to death was not confirmed,105,105,105,103,94,97,69,86,47,105,1,20
Invalid,102,102,102,102,90,93,86,88,45,102,0,28
Shark involvement not confirmed,88,88,88,86,83,82,77,79,51,87,12,44
Tiger shark,73,73,73,73,68,64,71,69,45,73,73,40
Shark involvement prior to death unconfirmed,68,68,68,67,59,58,41,57,5,67,1,7
Bull shark,52,52,52,52,47,50,46,50,41,52,51,35
6' shark,40,40,40,40,38,38,38,39,29,40,40,19
4' shark,40,40,40,40,40,40,40,39,36,40,40,35
1.8 m [6'] shark,35,35,35,35,34,33,35,34,28,35,35,24


In [419]:
def shark(col, word, to_replace): 
    """word to be found and replace by"""
    try:
        x = re.findall(f'({word})|({word.lower()})', col)

        if len(x) == 0:
            return col

        return to_replace
    except:
        return col

col = 'Species'
word = "Bull shark"
sharks.loc[:, col] = [shark(x, word, word) for x in sharks.loc[:, col].values]

In [420]:
col = 'Species'
word = "Grey nurse shark"
sharks.loc[:, col] = [shark(x, word, word) for x in sharks.loc[:, col].values]

In [720]:
word = "Tiger shark"
to_replace = 'Tiger shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [422]:
word = "Hammerhead"
to_replace = 'Hammerhead'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [423]:
word = "Mako shark"
to_replace = 'Mako shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [424]:
word = "White shark"
to_replace = 'White shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [425]:
word = "Blue shark"
to_replace = 'Blue shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [426]:
word = "Blacktip shark"
to_replace = 'Blacktip shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [430]:
word = "Blacktip"
to_replace = 'Blacktip shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [678]:
word = "Bronze whaler shark"
to_replace = 'Bronze whaler shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [682]:
word = 'Caribbean reef shark'
to_replace = 'Caribbean reef shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [None]:
word = 'Caribbean reef shark'
to_replace = 'Caribbean reef shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [691]:
invol = []
col = 'Species'
word = 'Shark involvement'
for i in sharks.Species.iteritems():
    try:
        x = re.findall(f'({word})|({word.lower()})', i[1])
        if len(x)>0:
            invol.append(i[1])
    except:
        pass
set(invol)

{'No shark involvement',
 'Reported by media as shark attack, but shark involvement prior to death was not confirmed',
 'Shark involvement  not confirmed',
 'Shark involvement  questionable',
 'Shark involvement doubtful',
 'Shark involvement highly doubtful',
 'Shark involvement not cofirmed',
 'Shark involvement not confirmed',
 'Shark involvement not confirmed & highly unlikely',
 'Shark involvement not confirmed, injury may be due to a stingray',
 'Shark involvement not confirmed, injury may have been caused by a bluefish',
 'Shark involvement not confirmed; officials considered barracua',
 'Shark involvement not confirmed; thought to be a barracuda bite',
 'Shark involvement prior to death could not be determined',
 'Shark involvement prior to death not confirmed',
 'Shark involvement prior to death remains unconfirmed',
 'Shark involvement prior to death still to be determined',
 'Shark involvement prior to death suspected but not confirmed',
 'Shark involvement prior to death un

In [696]:
word = 'Shark involvement'
to_replace = 'Shark involvement unconfirmed'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [702]:
word = 'Wobbegong shark'
to_replace = 'Wobbegong shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [706]:
word = 'Nurse shark'
to_replace = 'Nurse shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [707]:
word = 'Grey reef shark'
to_replace = 'Grey reef shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [743]:
spec = sharks.groupby('Species').size()
spec.sort_values(ascending=False).head(50)

Species
White shark                        624
Shark involvement unconfirmed      286
Tiger shark                        281
Bull shark                         207
Blacktip shark                     101
Nurse shark                         97
Invalid                             92
Questionable                        68
Bronze whaler shark                 60
Reef shark                          55
Mako shark                          53
Hammerhead                          46
Wobbegong shark                     46
Raggedtooth                         43
4' shark                            40
Blue shark                          39
6' shark                            39
1.8 m [6'] shark                    35
1.5 m [5'] shark                    31
1.2 m [4'] shark                    27
3' shark                            25
5' shark                            25
2 m shark                           25
4' to 5' shark                      23
3 m [10'] shark                     20
3' to 4' shark   

In [741]:
invol = []
col = 'Species'
word = 'Leopard shark'
for i in sharks.Species.iteritems():
    try:
        x = re.findall(f'({word})|({word.lower()})', i[1])
        if len(x)>0:
            invol.append(i[1])
    except:
        pass
set(invol)

{'Leopard shark',
 "Leopard shark, 3' Triakis semifasciata, identified by J.W. DeWitt (1955)"}

In [710]:
sharks.loc[sharks.Age=='Questionable incident - shark bite may have precipitated drowning', 'Species'] = 'Shark involvement unconfirmed'
sharks.loc[sharks.Age=='Questionable incident; reported as shark attack but thought to involve a pinniped instead ', 'Species'] = 'Shark involvement unconfirmed'

In [711]:
word = 'Questionable'
to_replace = 'Questionable'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [721]:
word = 'Reef shark'
to_replace = 'Reef shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [726]:
word = "Zambesi"
to_replace = 'Bull shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [730]:
word = "Zambezi"
to_replace = 'Bull shark'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [734]:
word = 'Sevengill'
to_replace = 'Sevengill'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [737]:
word = 'Porbeagle'
to_replace = 'Porbeagle'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [744]:
word = 'Raggedtooth'
to_replace = 'Raggedtooth'
sharks.loc[:, col] = [shark(x, word, to_replace) for x in sharks.loc[:, col].values]

In [746]:
sharks[sharks.Species=='Invalid']

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species,Hour
226,Sep-2016,2016,Invalid,MEXICO,,Guadalupe Island,Cage Diving,,,"No injury to divers, white shark breached cage",UNKNOWN,,Invalid,
578,13-Mar-2014,2014,Invalid,CAYMAN ISLANDS,,,Scuba diving / culling lionfish,M,,"Caribbean reef shark buzzed him. No injury, no...",UNKNOWN,,Invalid,
676,08-May-2013,2013,Invalid,USA,California,"Tourmaline Surf Park, San Diego County",Surfing,M,42,Shark bites were post-mortem,UNKNOWN,,Invalid,
926,Reported 07-May-2011,2011,Invalid,UNITED ARAB EMIRATES (UAE),Umm al Qaywayan Province,Khor Fakkan,Fishing,M,43,Erroneously reported on several internet sites...,UNKNOWN,,Invalid,
1044,04-Feb-2010,2010,Invalid,GUAM,Merizo,Achang Reef,Spearfishing (free diving),M,31,Shark bites were post-mortem,UNKNOWN,11h00,Invalid,11.0
1093,02-Sep-2009,2009,Invalid,NEVIS,,Castle Beach,Swimming,M,,Death was due to drowning. Two days later his ...,UNKNOWN,,Invalid,
1167,23-Jan-2009,2009,Invalid,BRAZIL,Maranhão,Olho d'Água,Swimming,M,17,"Drowned, body scavenged by shark",UNKNOWN,,Invalid,
1301,19-Dec-2007,2007,Invalid,BRITISH VIRGIN ISLANDS,Green Bay,,Scuba diving,M,53,Shark bites were post-mortem,UNKNOWN,,Invalid,
1314,November 2011,2007,Invalid,MEXICO,Baja California,Guadalupe Island,Shark diving,M,,White shark breached cage. No injury to occupants,UNKNOWN,,Invalid,
1427,30-Sep-2006,2006,Invalid,SOUTH AFRICA,Western Cape Province,Miller's Point,Spearfishing,M,36,No injury; 4m white shark made a threat display,UNKNOWN,,Invalid,


### Scatterplot for all recorded attacks from the start of the recording until 2017

In [450]:
sharks.Year.unique()

array([2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008,
       2007, 2006, 2005, 2004, 2003, 2002, 2001, 2000, 1999, 1998, 1997,
       1996, 1995, 1984, 1994, 1993, 1992, 1991, 1990, 1989, 1969, 1988,
       1987, 1986, 1985, 1983, 1982, 1981, 1980, 1979, 1978, 1977, 1976,
       1975, 1974, 1973, 1972, 1971, 1970, 1968, 1967, 1966, 1965, 1964,
       1963, 1962, 1961, 1960, 1959, 1958, 1957, 1956, 1955, 1954, 1953,
       1952, 1951, 1950, 1949, 1948, 1848, 1947, 1946, 1945, 1944, 1943,
       1942, 1941, 1940, 1939, 1938, 1937, 1936, 1935, 1934, 1933, 1932,
       1931, 1930, 1929, 1928, 1927, 1926, 1925, 1924, 1923, 1922, 1921,
       1920, 1919, 1918, 1917, 1916, 1915, 1914, 1913, 1912, 1911, 1910,
       1909, 1908, 1907, 1906, 1905, 1904, 1903, 1902, 1901, 1900, 1899,
       1898, 1897, 1896, 1895, 1894, 1893, 1892, 1891, 1890, 1889, 1888,
       1887, 1886, 1885, 1884, 1883, 1882, 1881, 1880, 1879, 1878, 1877,
       1876, 1875, 1874, 1873, 1872, 1871, 1870, 18

In [468]:
sharks['Fatal (Y/N)'].unique()

array(['N', 'Y', 'UNKNOWN'], dtype=object)

In [535]:
fatal_attack = sharks[sharks['Fatal (Y/N)'] == 'Y'].groupby(['Year']).count().iloc[:, :1]
fatal_attack.rename(index=str, columns={"Date": "Fatal"}, inplace=True)

nonfatal_attack = sharks[sharks['Fatal (Y/N)'] == 'N'].groupby(['Year']).count().iloc[:, :1]
nonfatal_attack.rename(index=str, columns={"Date": "Non-Fatal"}, inplace=True)

fatality_unknown = sharks[sharks['Fatal (Y/N)'] == 'UNKNOWN'].groupby(['Year']).count().iloc[:, :1]
fatality_unknown.rename(index=str, columns={"Date": "Unknown"}, inplace=True)

scatter = pd.concat([fatal_attack,nonfatal_attack, fatality_unknown], axis=1).iloc[126:-4, :]

In [573]:
scatter.head()

Unnamed: 0,Fatal,Non-Fatal,Unknown
1900,3.0,8.0,3.0
1901,3.0,5.0,2.0
1902,7.0,8.0,2.0
1903,10.0,2.0,
1904,10.0,3.0,1.0


In [574]:
init_notebook_mode(connected=True)

In [663]:
fatal = go.Scatter(
    x = scatter.index,
    y = scatter['Fatal'],
    name = 'Fatal',
    mode = 'markers',
    marker = dict(
        size = 12,
        line = dict(
            width = 1,
            color = 'rgb(0, 0, 0)'
        )
    )
)

nonfatal = go.Scatter(
    x = scatter.index,
    y = scatter['Non-Fatal'],
    name = 'Non-fatal',
    mode = 'markers',
    marker = dict(
        size = 12,
        line = dict(
            width = 1,
        )
    )
)
fatality_na = go.Scatter(
    x = scatter.index,
    y = scatter['Unknown'],
    name = 'Unknown',
    mode = 'markers',
    marker = dict(
        size = 12,
        line = dict(
            width = 1,
        )
    )
)

data = [fatal, nonfatal, fatality_na]

layout = dict(title = 'Shark attacks worldwide 1900-2017',
              yaxis = dict(zeroline = False, title='Count of total shark attachs'),
              xaxis = dict(zeroline = False, title='Year')
             )

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "Shark attacks scatter plot")

# plotly.offline.plot(data, filename='file.html') # creates html
# plotly.offline.plot(data, include_plotlyjs=False, output_type='div') # for jinja2 embeding

There is a pick in the attacks in around 1960, let's see where the pick is coming from

In [584]:
sharks[sharks['Year'] == 1960].groupby('Country').size()

Country
AUSTRALIA                 21
BAHAMAS                    1
BERMUDA                    2
CARIBBEAN SEA              1
ENGLAND                    1
FIJI                       2
GRENADA                    1
GUAM                       1
IRAQ                       3
JOHNSTON ISLAND            1
MARSHALL ISLANDS           2
MOZAMBIQUE                 2
NEW GUINEA                 1
NEW ZEALAND                2
NICARAGUA                  1
NORTH SEA                  1
PACIFIC OCEAN              1
PANAMA                     1
PAPUA NEW GUINEA          10
PHILIPPINES                2
RED SEA / INDIAN OCEAN     1
SENEGAL                    2
SOUTH AFRICA               9
SRI LANKA                  2
USA                       25
VENEZUELA                  1
dtype: int64

In [583]:
sharks[sharks['Year'] == 1958].groupby('Country').size()

Country
AMERICAN SAMOA                 1
ANDAMAN / NICOBAR ISLANDAS     1
AUSTRALIA                      5
BAHAMAS                        3
CROATIA                        1
FIJI                           1
INDIA                          1
INDONESIA                      1
JAPAN                          1
KENYA                          1
MADAGASCAR                     1
MEXICO                         3
PACIFIC OCEAN                  2
PANAMA                         1
PAPUA NEW GUINEA               9
SAMOA                          1
SOUTH AFRICA                   8
TAIWAN                         1
TONGA                          1
TURKEY                         1
USA                           16
dtype: int64

Looks like the pick is coming from the US and Australia. The 60's is where surfing gain its popularity

In the following analysis I will use only data from 1900 to 2018.

In [809]:
sharks = sharks[sharks.Year>=1900]

In [1047]:
sharks.to_csv('sharks_cleaned.csv', index=False)

sharks = pd.read_csv('sharks_cleaned.csv')
sharks.head()

In [597]:
sharks.Year.unique()

array([2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008,
       2007, 2006, 2005, 2004, 2003, 2002, 2001, 2000, 1999, 1998, 1997,
       1996, 1995, 1984, 1994, 1993, 1992, 1991, 1990, 1989, 1969, 1988,
       1987, 1986, 1985, 1983, 1982, 1981, 1980, 1979, 1978, 1977, 1976,
       1975, 1974, 1973, 1972, 1971, 1970, 1968, 1967, 1966, 1965, 1964,
       1963, 1962, 1961, 1960, 1959, 1958, 1957, 1956, 1955, 1954, 1953,
       1952, 1951, 1950, 1949, 1948, 1947, 1946, 1945, 1944, 1943, 1942,
       1941, 1940, 1939, 1938, 1937, 1936, 1935, 1934, 1933, 1932, 1931,
       1930, 1929, 1928, 1927, 1926, 1925, 1924, 1923, 1922, 1921, 1920,
       1919, 1918, 1917, 1916, 1915, 1914, 1913, 1912, 1911, 1910, 1909,
       1908, 1907, 1906, 1905, 1904, 1903, 1902, 1901, 1900])

### Historgram Time of shark attachs

In [598]:
sharks['Hour'] = sharks['Time'].str.extract("([0-9]+)", expand=False).dropna().astype(int)
sharks['Hour']= sharks[sharks['Hour']<25]['Hour']

In [599]:
time_nonfatal = sharks[sharks['Fatal (Y/N)'] == 'N']
time_fatal = sharks[sharks['Fatal (Y/N)'] == 'Y']
time_na_fatality = sharks[sharks['Fatal (Y/N)'] == 'UNKNOWN']

In [600]:
hour = list(range(0,25,1))
hourtext = ['midnight', '1 am', '2 am', '3 am', '4 am', '5 am', '6 am', '7 am', '8 am', '9 am', '10 am', '11 am',
           'noon', '1 pm', '2 pm', '3 pm', '4 pm', '5 pm', '6 pm', '7 pm', '8 pm', '9 pm', '10 pm', '11 pm']
#[f'{i}h' for i in hour]
trace1 = go.Histogram(
    x=time_nonfatal['Hour'],
    opacity=0.75, name = "Non-fatal"
)
trace2 = go.Histogram(
    x=time_fatal['Hour'],
    opacity=0.75, name = "Fatal"
)

# trace2 = go.Histogram(
#     x=time_fatal['Hour'],
#     opacity=0.75, marker = dict(color = '#FF1493'), name = "Fatal"
# )
trace3 = go.Histogram(
    x=time_na_fatality['Hour'],
    opacity=0.75, name = "Unknown"
)

data = [trace1, trace2, trace3]

layout = go.Layout(barmode='overlay',
        xaxis=dict(title='Time of the day',
            tickvals=list(range(0,25,1)),
            ticktext = hourtext, tickangle=-45),
                   
    title='Number of shark attacks by hour',
    yaxis=dict(
        title='Count of total shark attacks'
    ),
    bargap=0.1
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "Sharks histogram")

## Barplot

In [603]:
sharks.Type.unique()

array(['Boating', 'Unprovoked', 'Invalid', 'Provoked', 'Questionable',
       'Sea Disaster', 'Boat'], dtype=object)

In [608]:
fatal_attack = sharks[sharks['Fatal (Y/N)'] == 'Y'].groupby(['Type']).count().iloc[:, :1]
fatal_attack.rename(index=str, columns={"Date": "Fatal"}, inplace=True)

nonfatal_attack = sharks[sharks['Fatal (Y/N)'] == 'N'].groupby(['Type']).count().iloc[:, :1]
nonfatal_attack.rename(index=str, columns={"Date": "Non-Fatal"}, inplace=True)

fatality_unknown = sharks[sharks['Fatal (Y/N)'] == 'UNKNOWN'].groupby(['Type']).count().iloc[:, :1]
fatality_unknown.rename(index=str, columns={"Date": "Unknown"}, inplace=True)


In [615]:
attacks_type = pd.concat([fatal_attack,nonfatal_attack, fatality_unknown], axis=1).sort_values('Non-Fatal', ascending=True)
attacks_type

Unnamed: 0_level_0,Fatal,Non-Fatal,Unknown
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Questionable,1,3,1
Invalid,8,13,458
Sea Disaster,134,52,2
Boat,2,127,4
Boating,5,169,7
Provoked,14,526,6
Unprovoked,914,3179,45


In [1193]:
trace1 = go.Bar(
    y=attacks_type.index,
    x=attacks_type['Fatal'],
    name='Fatal',
    orientation = 'h',
    marker = dict(
        line = dict(
            width = 1)
    )
)
trace2 = go.Bar(
    y=attacks_type.index,
    x=attacks_type['Non-Fatal'],
    name='Non-Fatal',
    orientation = 'h',
    marker = dict(
        line = dict(
            width = 1)
    )
)

trace3 = go.Bar(
    y=attacks_type.index,
    x=attacks_type['Unknown'],
    name='Unknown',
    orientation = 'h', marker=dict(line = dict(
            width = 1))
)
data = [trace1, trace2, trace3]
layout = go.Layout(
    barmode='stack'
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "Sharks barplot")

## Barplot "Distribution of age of attack victims"

In [627]:
sharks.head()

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species,Hour
0,25-Jun-2018,2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,18.0
1,18-Jun-2018,2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,14.0
2,09-Jun-2018,2018,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,,7.0
3,08-Jun-2018,2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,M,,Minor injury to lower leg,N,,2 m shark,
4,04-Jun-2018,2018,Provoked,MEXICO,Colima,La Ticla,Free diving,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,Tiger shark,


In [629]:
male = sharks[sharks.Sex=='M'].Age.values
female = sharks[sharks.Sex=='F'].Age.values

In [640]:
trace0 = go.Box(x=male, name='Male')
trace1 = go.Box(x=female, name='Female')
data = [trace0, trace1]

layout = go.Layout(
    yaxis=dict(
        title='Age',
        zeroline=False
    )
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "Sharks barplot")

# Heatmap

In [1049]:
sharks.head()

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species,Hour
0,25-Jun-2018,2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,18.0
1,18-Jun-2018,2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,14.0
2,09-Jun-2018,2018,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,,7.0
3,08-Jun-2018,2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,M,,Minor injury to lower leg,N,,2 m shark,
4,04-Jun-2018,2018,Provoked,MEXICO,Colima,La Ticla,Free diving,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,Tiger shark,


In [1057]:
top_activities = list(sharks.groupby('Activity').count().sort_values('Date', ascending=False)[:10].index)

In [1059]:
top_countries = list(sharks.groupby('Country').count().sort_values('Date', ascending=False)[:15].index)

In [1060]:
heatmap = sharks[sharks.Activity.isin(top_activities) & 
           sharks.Country.isin(top_countries)]
heatmap['Count'] =1
heatmap = heatmap[['Country', 'Activity', 'Count']]
a = heatmap.groupby(['Country', 'Activity'], group_keys=False).sum()
heatmap_dict = {}
for i in a.itertuples():
    heatmap_dict[i[0]] =i[1]

In [1061]:
heatmap_dict

{('AUSTRALIA', 'Bathing'): 34,
 ('AUSTRALIA', 'Diving'): 28,
 ('AUSTRALIA', 'Fishing'): 136,
 ('AUSTRALIA', 'Scuba'): 27,
 ('AUSTRALIA', 'Snorkeling'): 21,
 ('AUSTRALIA', 'Spearfishing'): 71,
 ('AUSTRALIA', 'Standing'): 11,
 ('AUSTRALIA', 'Surfing'): 219,
 ('AUSTRALIA', 'Swimming'): 179,
 ('AUSTRALIA', 'Wading'): 15,
 ('BAHAMAS', 'Bathing'): 1,
 ('BAHAMAS', 'Diving'): 5,
 ('BAHAMAS', 'Fishing'): 2,
 ('BAHAMAS', 'Scuba'): 4,
 ('BAHAMAS', 'Snorkeling'): 9,
 ('BAHAMAS', 'Spearfishing'): 34,
 ('BAHAMAS', 'Standing'): 1,
 ('BAHAMAS', 'Surfing'): 2,
 ('BAHAMAS', 'Swimming'): 5,
 ('BAHAMAS', 'Wading'): 1,
 ('BRAZIL', 'Bathing'): 3,
 ('BRAZIL', 'Diving'): 1,
 ('BRAZIL', 'Fishing'): 4,
 ('BRAZIL', 'Scuba'): 1,
 ('BRAZIL', 'Spearfishing'): 2,
 ('BRAZIL', 'Surfing'): 37,
 ('BRAZIL', 'Swimming'): 36,
 ('BRAZIL', 'Wading'): 1,
 ('FIJI', 'Diving'): 1,
 ('FIJI', 'Fishing'): 5,
 ('FIJI', 'Scuba'): 2,
 ('FIJI', 'Snorkeling'): 1,
 ('FIJI', 'Spearfishing'): 11,
 ('FIJI', 'Surfing'): 2,
 ('FIJI', 'Swimmin

In [1062]:
heatmap_dict[('USA', 'Surfing')]

655

In [1063]:
# x - countries
# y - activities
# z data, list of 15 array  (activities) of 25 countries

z = []

for act in top_activities:
    new_row = []
    for country in top_countries:
        if (str(country), str(act)) in heatmap_dict:
            new_row.append(heatmap_dict[(str(country), str(act))])
        else:
            new_row.append(0)
    z.append(list(new_row))

In [1067]:
trace = go.Heatmap(z=z,
                   x=top_countries,
                   y=top_activities, xgap =5, ygap=5,
                  colorscale=[[0.0, 'rgb(49,54,149)'], 
                              [0.035,'rgb(69,117,180)' ],
                              [0.06, 'rgb(116,173,209)'],
                              [0.085, 'rgb(171,217,233)'],
                              [0.11, 'rgb(224,243,248)'],
                              [0.135, 'rgb(254,224,144)'],
                              [0.16, 'rgb(253,174,97)'],
                              [0.185, 'rgb(244,109,67)'],
                              [0.2, 'rgb(215,48,39)'],
                              [1.0, 'rgb(165,0,38)' ]])
data=[trace]


layout = go.Layout(
    title='Number of Shark attacks by Activity and Country',
    xaxis = dict(ticks='', nticks=20, title = 'Country', tickmode = 'linear'),
    yaxis = dict(ticks='', tickprefix = "", side='left',
        position=0.0, title = 'Activity', tickangle=0, tickfont=dict(
            size=10,
        ))
)

fig = go.Figure(data=data, layout=layout)
fig['layout']['yaxis']['autorange'] = "reversed"
iplot(fig, filename = "Sharks heatmap")

# Overlaid area chart

I am going to plot attacks by month, for that I need to assign country to the southern or northen hemisphear since the distribution by hemisphere is different

In [1100]:
# Northen - 0, Southern - 1
countries = {'CROATIA': 0, 'NORWAY': 0, 'FRANCE': 0, 'MARTINIQUE': 0, 'ICELAND': 0, 
            'JAVA': 1, 'Sierra Leone': 0, 'CYPRUS': 0, 'LIBERIA': 0, 'NEW BRITAIN': 1, 
            'URUGUAY': 1, 'NORTH ATLANTIC OCEAN ': 0, 'ADMIRALTY ISLANDS': 1, 
            'PAPUA NEW GUINEA': 1, 'DJIBOUTI': 0, 'TAIWAN': 1, 'EL SALVADOR': 0, 
            'ST. MAARTIN': 0, 'ASIA?': 0, 'NAMIBIA': 1, 'OCEAN': 1, 'CAPE VERDE': 0, 
            'MID ATLANTIC OCEAN': 0, 'MAURITIUS': 1, 'ANTIGUA': 0, 'FRENCH POLYNESIA': 1, 
            'JOHNSTON ISLAND': 0, 'SUDAN': 0, 'SOUTH KOREA': 0, 'TUVALU': 1, 
            'SOUTH ATLANTIC OCEAN': 1, 'UNITED ARAB EMIRATES (UAE)': 0, 'DOMINICAN REPUBLIC': 0, 
            ' PHILIPPINES': 0, 'MALAYSIA': 0, 'BRITISH VIRGIN ISLANDS': 0, 'CHINA': 0, 
            'ATLANTIC OCEAN': 0, 'ITALY': 0, 'VENEZUELA': 0, 'SOLOMON ISLANDS / VANUATU': 1, 
            'SOUTH CHINA SEA': 0, 'Between PORTUGAL & INDIA': 2, 'DIEGO GARCIA': 1, 
            'MEDITERRANEAN SEA?': 0, 'INDIAN OCEAN?': 1, 'INDIA': 0, 'SOUTH AFRICA': 1, 
            'St Helena': 1, 'WESTERN SAMOA': 1, 'TASMAN SEA': 1, 'HONG KONG': 0, 'TONGA': 1, 
            'YEMEN': 0, 'COLUMBIA': 0, 'NORTHERN MARIANA ISLANDS': 0, 'GUAM': 0, 'GUINEA': 0, 
            'CENTRAL PACIFIC': 2, 'GUATEMALA': 0, 'FIJI': 1, 'GULF OF ADEN': 0, 'JAPAN': 0, 
            'MID-PACIFC OCEAN': 0, 'ST. MARTIN': 1, 'USA': 0, 'CRETE': 0, 'BRAZIL': 1, 
            'TURKS & CAICOS': 0, 'SOUTHWEST PACIFIC OCEAN': 1, 'GREENLAND': 0, 
            'BAY OF BENGAL': 1, 'PACIFIC OCEAN': 0, 'LEBANON': 0, 'MALTA': 0, 'NIGERIA': 0, 
            'GREECE': 0, 'MEXICO': 0, 'BERMUDA': 0, 'UNITED KINGDOM': 0, 'SINGAPORE': 0, 
            'BRITISH ISLES': 0, 'TURKEY': 0, 'NEVIS': 1, 'AUSTRALIA': 1, 'ENGLAND': 0, 
            'SIERRA LEONE': 0, 'VANUATU': 1, 'NORTH SEA': 0, 'RUSSIA': 0, 'MICRONESIA': 0, 
            'PORTUGAL': 0, 'RED SEA': 0, 'MONTENEGRO': 0, 'IRAQ': 0, 'SWEDEN': 0, 
            'PERSIAN GULF': 0, 'NORTH ATLANTIC OCEAN': 0, 'Fiji': 1, 'SLOVENIA': 0, 
            'PHILIPPINES': 0, 'IRAN / IRAQ': 0, 'TUNISIA': 0, 'SAN DOMINGO': 1, 'AZORES': 0, 
            'GEORGIA': 0, 'BURMA': 0, 'NEW GUINEA': 1, 'SUDAN?': 0, 'NETHERLANDS ANTILLES': 0, 
            'ALGERIA': 0, 'NICARAGUA': 0, 'SEYCHELLES': 1, 'RED SEA?': 0, 'BRITISH NEW GUINEA': 1, 
            'THAILAND': 0, 'PALESTINIAN TERRITORIES': 0, 'FALKLAND ISLANDS': 1, 'IRELAND': 0, 
            'MONACO': 0, 'PARAGUAY': 1, 'SYRIA': 0, 'EGYPT ': 0, 'MADAGASCAR': 1, 
            'NORTH PACIFIC OCEAN': 0, 'EGYPT / ISRAEL': 0, 'COOK ISLANDS': 1, 
            'TRINIDAD & TOBAGO': 0, 'PACIFIC OCEAN ': 0, 'EQUATORIAL GUINEA / CAMEROON': 0, 
            'ISRAEL': 0, 'SAMOA': 1, 'ECUADOR': 1, 'CARIBBEAN SEA': 0, 'NEW CALEDONIA': 1, 
            'MARSHALL ISLANDS': 0, 'PANAMA': 0, 'UNITED ARAB EMIRATES': 0, 'ITALY / CROATIA': 0, 
            'NEW ZEALAND': 1, 'MALDIVE ISLANDS': 0, 'GHANA': 0, 'MOZAMBIQUE': 0, 'SRI LANKA': 0, 
            'SOLOMON ISLANDS': 1, 'Coast of AFRICA': 1, 'BARBADOS': 0, 'BANGLADESH': 0, 
            'CHILE': 1, 'CANADA': 0, 'HONDURAS': 0, 'PALAU': 0, 'AMERICAN SAMOA': 1, 
            'SAUDI ARABIA': 0, ' TONGA': 1, 'SPAIN': 0, 'ARGENTINA': 1, 'CURACAO': 0, 
            'ANDAMAN / NICOBAR ISLANDAS': 0, 'KENYA': 1, 'EGYPT': 0, 'THE BALKANS': 0, 
            'PUERTO RICO': 0, 'KIRIBATI': 0, 'OKINAWA': 0, 'REUNION': 1, 
            'BRITISH WEST INDIES': 0, 'NICARAGUA ': 0, 'FEDERATED STATES OF MICRONESIA': 0, 
            'IRAN': 0, 'CAYMAN ISLANDS': 0, 'SOMALIA': 0, 'INDONESIA': 1, 'KUWAIT': 0, 
            'Seychelles': 1, 'COSTA RICA': 0, 'INDIAN OCEAN': 1, 'CEYLON (SRI LANKA)': 0, 
            'YEMEN ': 0, 'HAITI': 0, 'SCOTLAND': 0, 'CUBA': 0, 'GUYANA': 0, 'LIBYA': 0, 
            'MEXICO ': 0, 'SENEGAL': 0, 'GRAND CAYMAN': 0, 'GABON': 1, 'GRENADA': 0, 
            'RED SEA / INDIAN OCEAN': 0, 'VIETNAM': 0, 'BAHAMAS': 0, 'BAHREIN': 0, 
            'NORTHERN ARABIAN SEA': 0, 'BELIZE': 0, 'MEDITERRANEAN SEA': 0, 'ANGOLA': 1, 
            'SOUTH PACIFIC OCEAN': 1, 'TANZANIA': 1, 'KOREA': 0, 'JAMAICA': 0, 'ARUBA': 0, 
            'MAYOTTE':1}

In [1089]:
sharks['Month'] = None

In [1098]:
months_dict = {'Jan': 0, 'Feb': 1, 'Mar': 2, 
               'Apr': 3, 'Ap-': 3, 'May': 4, 
               'Jun': 5, 'Jul': 6, 'Aug': 7, 
               'Sep': 8, 'Oct': 9, 'Nov': 10, 
               'Dec': 11}

for index, d in sharks.Date.iteritems():
    date = d.replace(' ', '')
    date = date.replace('July', 'Jul')
    date = date.replace('Sept', 'Sep')
    date = date.replace('--', '-')
    date = date.replace('y2', 'y-2')
    date = date.replace('v2', 'v-2')
    month = 0
    if len(date) >= 11 and len(date) <= 12 and date[2] == '-':
        month = date[3:6]
    elif len(date) == 10 and date[1] == '-':
        month = date[2:5]
    elif len(date) == 19 and date[10] == '-':
        month = date[11:14]
    try:
        sharks['Month'][index] = months_dict[month]
    except:
        pass

In [1099]:
sharks.head()

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species,Hour,Month
0,25-Jun-2018,2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,18.0,5
1,18-Jun-2018,2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,14.0,5
2,09-Jun-2018,2018,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,,7.0,5
3,08-Jun-2018,2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,M,,Minor injury to lower leg,N,,2 m shark,,5
4,04-Jun-2018,2018,Provoked,MEXICO,Colima,La Ticla,Free diving,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,Tiger shark,,5


In [1101]:
def hemisphere(x): 
    try:  
        hem = countries[x]
        return hem
    except:
        return None

sharks["Hemisphere"] = sharks["Country"].apply(lambda x: hemisphere(x));

In [1102]:
sharks.head()

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species,Hour,Month,Hemisphere
0,25-Jun-2018,2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,18.0,5,0.0
1,18-Jun-2018,2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,14.0,5,0.0
2,09-Jun-2018,2018,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,,7.0,5,0.0
3,08-Jun-2018,2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,M,,Minor injury to lower leg,N,,2 m shark,,5,1.0
4,04-Jun-2018,2018,Provoked,MEXICO,Colima,La Ticla,Free diving,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,Tiger shark,,5,0.0


In [1106]:
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

In [1109]:
south = sharks[sharks.Hemisphere==1].groupby('Month').count().Date.values
north = sharks[sharks.Hemisphere==0].groupby('Month').count().Date.values

In [1115]:
trace1 = go.Scatter(
    x=months,
    y=south,
    fill='tonexty',
    mode= 'none', name='Southern Hemisphere'
)
trace2 = go.Scatter(
    x=months,
    y=north,
    fill='tozeroy',
    mode= 'none', name='Northern Hemisphere'
)

data = [trace1, trace2]

layout = go.Layout(barmode='overlay',
        xaxis=dict(title='Month'),
                   
    title='Number of shark attacks by months and hemispheres',
    yaxis=dict(
        title='Count of total shark attacks'
    ),
    bargap=0.1
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "Sharks stacked")

In [1151]:
sharks.to_csv('sharks_hem.csv', index=False)
sharks = pd.read_csv('sharks_hem.csv')

## Stacked /  stream

In [1158]:
types = [i.lower() for i in sharks.Type.unique()]
types = [i.replace(' ', '_') for i in types]

for i, t in enumerate(types):
    vars()[types[i]+'_s'] = sharks[(sharks.Hemisphere==1) & (sharks.Type == sharks.Type.unique()[i]) & \
                                   (sharks.Year < 2018)]\
    .groupby('Month').count().Date.values

In [1187]:
for i, t in enumerate(types):
    vars()[types[i]+'_n'] = sharks[(sharks.Hemisphere==0) & (sharks.Type == sharks.Type.unique()[i]) & \
                                   (sharks.Year < 2018)]\
    .groupby('Month').count().Date.values

In [1188]:
types

['boat', 'unprovoked', 'invalid', 'provoked', 'questionable', 'sea_disaster']

In [1189]:
trace0 = dict(
    x=months,
    y=invalid_s,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5),
              
    stackgroup='one', name = 'Invalid'
)
trace1 = dict(
    x=months,
    y=questionable_s,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5),
    stackgroup='one', name="Questionable"
)
trace2 = dict(
    x=months,
    y=sea_disaster_s,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5),
              
    stackgroup='one', name='Sea Disaster'
)
trace3 = dict(
    x=months,
    y=boat_s,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5),
              
    stackgroup='one', name='Boat'
)
trace4 = dict(
    x=months,
    y=provoked_s,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5),
             
    stackgroup='one', name='Provoked'
)
trace5 = dict(
    x=months,
    y=unprovoked_s,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5),
              
    stackgroup='one', name='Unprovoked'
)
data = [trace0, trace1, trace2, trace3, trace4, trace5]

layout = go.Layout(barmode='overlay',
        xaxis=dict(title='Month'),
                   
    title='Number of shark attacks by months in Southern Hemisphere 1900-2017',
    yaxis=dict(
        title='Count of total shark attacks'
    ),
    bargap=0.1
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "Sharks stacked")

In [1190]:
trace0 = dict(
    x=months,
    y=invalid_n,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5),
              
    stackgroup='one', name = 'Invalid'
)
trace1 = dict(
    x=months,
    y=questionable_n,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5),
    stackgroup='one', name="Questionable"
)
trace2 = dict(
    x=months,
    y=sea_disaster_n,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5),
              
    stackgroup='one', name='Sea Disaster'
)
trace3 = dict(
    x=months,
    y=boat_n,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5),
              
    stackgroup='one', name='Boat'
)
trace4 = dict(
    x=months,
    y=provoked_n,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5),
             
    stackgroup='one', name='Provoked'
)
trace5 = dict(
    x=months,
    y=unprovoked_n,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5),
              
    stackgroup='one', name='Unprovoked'
)
data = [trace0, trace1, trace2, trace3, trace4, trace5]

layout = go.Layout(barmode='overlay',
        xaxis=dict(title='Month'),
                   
    title='Number of shark attacks by months in Nothern Hemisphere 1900-2017',
    yaxis=dict(
        title='Count of total shark attacks'
    ),
    bargap=0.1
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "Sharks stacked")

## Bubble Map

In [1277]:
Image(url= "./bubble_map.png")

## Chloropleth map

In [1278]:
Image(url= "./chloropleth2.png")

# Tree map

In [1276]:
Image(url= "./tree_map.png")

# Connection map

Since my data doesn't really have any foundation for connection map, I decided to plot the approximate major migration routes for sharks. Unfortunately, there is no public data on migration route, I could easily dounload, therefore below routes are based on my knowledge and internet research and does not represent scintific view. There is pacific migration, happening from central California coast to other feeding grounds far away in the Pacific Ocean. Another major migraton route is along East coast, where sharks are migrating from the north to the warmer waters of Florida.

In [1258]:
pacific_migration = [go.Scattergeo(
    lat = [28.822418, 38.170194],
    lon = [-158.859361, -123.720130],
    mode = 'lines',
    line = go.scattergeo.Line(
        width = 2,
        color = 'red',
    ),
)]

atlantic_migration = [go.Scattergeo(
    lat = [25.869109, 44.873876],
    lon = [-78.021723, -54.650979],
    mode = 'lines',
    line = go.scattergeo.Line(
        width = 2,
        color = 'red',
    ),
)]
layout = go.Layout(
    title = go.layout.Title(
        text = 'Approximate shark migration routes in Pacific and Atlantic side of the US'
    ),
    showlegend = False,
    geo = go.layout.Geo(
        resolution = 50,
        showland = True,
        showlakes = True,
        landcolor = 'rgb(102, 153, 204)',
        countrycolor = 'rgb(102, 153, 204)',
        lakecolor = 'rgb(255, 255, 255)',
        projection = go.layout.geo.Projection(
            type = "equirectangular"
        ),
        coastlinewidth = 2,
        lataxis = go.layout.geo.Lataxis(
            range = [20, 60],
            showgrid = True,
            dtick = 10
        ),
        lonaxis = go.layout.geo.Lonaxis(
            range = [-100, 20],
            showgrid = True,
            dtick = 20
        ),
    )
)
fig = go.Figure(data = pacific_migration+atlantic_migration, layout = layout)
iplot(fig, filename = "Sharks stacked")

## Summary

Overall, I must say, many incidents are not really an attack. Look at baskin shark (which has no teeth!!!) bumped the boat in Scotland. 

In [748]:
sharks[sharks.Species == 'Basking shark']

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species,Hour
1251,Reported 02-Jun-2008,2008,Boating,SCOTLAND,Easter Ross,Balintore Bay,Fishing,,,No injury to occupants; shark struck their boat,N,,Basking shark,
2135,26-Feb-1999,1999,Boating,USA,North Carolina,Frying Pan Shoals,Cruising,,,"No injury to occupants, boat sank after collid...",N,,Basking shark,
4839,12-Sep-1937,1937,Boating,SCOTLAND,Argyllshire,Arran,Pleasure boating,,,"No injury to occupants, two 5-foot observation...",N,,Basking shark,
4840,11-Sep-1937,1937,Boating,SCOTLAND,Arran,Fallen Rocks,Fishing,,,"No injury to occupants, propeller shaft damaged",N,,Basking shark,
4841,01-Sep-1937,1937,Unprovoked,SCOTLAND,Argyll,"Carradale Bay, Kintyre Peninsula",Rowing,M,,3 people drowned when the boat was capsized by...,N,,Basking shark,
6194,Before 1908,1908,Unprovoked,USA,California,"Monterey, Montery County",Fishing,M,,FATAL PROVOKED INCIDENTS,Y,,Basking shark,


Or a number of incidents where the shark was feasting on human cadavar (**post-mortem**) either on murdered, drowned or shipwreck bodies.

In [750]:
sharks[sharks.Species=='Invalid']

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species,Hour
226,Sep-2016,2016,Invalid,MEXICO,,Guadalupe Island,Cage Diving,,,"No injury to divers, white shark breached cage",UNKNOWN,,Invalid,
578,13-Mar-2014,2014,Invalid,CAYMAN ISLANDS,,,Scuba diving / culling lionfish,M,,"Caribbean reef shark buzzed him. No injury, no...",UNKNOWN,,Invalid,
676,08-May-2013,2013,Invalid,USA,California,"Tourmaline Surf Park, San Diego County",Surfing,M,42,Shark bites were post-mortem,UNKNOWN,,Invalid,
926,Reported 07-May-2011,2011,Invalid,UNITED ARAB EMIRATES (UAE),Umm al Qaywayan Province,Khor Fakkan,Fishing,M,43,Erroneously reported on several internet sites...,UNKNOWN,,Invalid,
1044,04-Feb-2010,2010,Invalid,GUAM,Merizo,Achang Reef,Spearfishing (free diving),M,31,Shark bites were post-mortem,UNKNOWN,11h00,Invalid,11.0
1093,02-Sep-2009,2009,Invalid,NEVIS,,Castle Beach,Swimming,M,,Death was due to drowning. Two days later his ...,UNKNOWN,,Invalid,
1167,23-Jan-2009,2009,Invalid,BRAZIL,Maranhão,Olho d'Água,Swimming,M,17,"Drowned, body scavenged by shark",UNKNOWN,,Invalid,
1301,19-Dec-2007,2007,Invalid,BRITISH VIRGIN ISLANDS,Green Bay,,Scuba diving,M,53,Shark bites were post-mortem,UNKNOWN,,Invalid,
1314,November 2011,2007,Invalid,MEXICO,Baja California,Guadalupe Island,Shark diving,M,,White shark breached cage. No injury to occupants,UNKNOWN,,Invalid,
1427,30-Sep-2006,2006,Invalid,SOUTH AFRICA,Western Cape Province,Miller's Point,Spearfishing,M,36,No injury; 4m white shark made a threat display,UNKNOWN,,Invalid,


In [None]:
Or there is so many incidents, where the shark involvement is questionable.

In [751]:
sharks[sharks.Species=='Questionable']

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species,Hour
20,25-Apr-2018,2018,Questionable,AUSTRALIA,New South Wales,Lennox Head,Surfing,M,,No injury,N,07h00,Questionable,7.0
79,24-Sep-2017,2017,Invalid,USA,New York,Rockaway,Surfing,M,33,"Lacerations to right ankle, foot & toe",UNKNOWN,15h30,Questionable,15.0
1366,Jul-2007,2007,Invalid,SENEGAL,,,Murder,,,,UNKNOWN,,Questionable,
1679,13-Apr-2004,2004,Invalid,TONGA,Nuku'alofa,30 nautical miles offshore,Five men on makeshift raft after their 10 m fi...,M,,He was was bitten on the arm by small sharks &...,Y,,Questionable,
1711,26-Dec-2003,2003,Invalid,USA,Florida,"Miami, Dade County",Swimming,M,28,Knee lacerated,UNKNOWN,,Questionable,
1900,16-Sep-2001,2001,Invalid,USA,Florida,"2 miles off Pompano Beach, Broward County",Wreck / Technical diving,M,42,FATAL or drowning & scavenging,Y,13h20,Questionable,13.0
2539,21-Aug-1992,1992,Invalid,USA,Hawaii,"Twin Arches, Hana Ranch, Maui",Fell from cliff while fishing & disappeared in...,M,,Body recovered next morning. Injuries appeared...,UNKNOWN,15h00,Questionable,15.0
2542,Aug-1992,1992,Invalid,USA,Florida,St. Lucie County,Fisherman,M,,No details,UNKNOWN,,Questionable,
2564,09-Feb-1992,1992,Invalid,AUSTRALIA,Tasmania,"Clifton Beach, southwest of Hobart",Surfing,M,19,"No injury, shark allegedly took his surfboard ...",UNKNOWN,19h30,Questionable,19.0
2571,Jan-1992,1992,Invalid,JAPAN,Sea of Japan,Kanazawa?,,,,Survived. questionable incident,UNKNOWN,,Questionable,


In [752]:
sharks[sharks.Species=='Shark involvement unconfirmed']

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species,Hour
16,09-May-2018,2018,Questionable,AUSTRALIA,New South Wales,"Sharpes Beach, Ballina",Surfing,M,,"No injury, surfboard damaged",N,10h30,Shark involvement unconfirmed,10.0
31,Reported 10-Apr-2018,2018,Invalid,BRAZIL,Alagoas,"Praia de Sauaçuhy, Maceió",Fishing,M,56,Injury to ankle from marine animal trapped in ...,N,,Shark involvement unconfirmed,
98,26-Aug-2017,2017,Invalid,SPAIN,Castellón,Grao de Moncofa,Swimming,F,11,Lacerations to left foot,UNKNOWN,Midday,Shark involvement unconfirmed,
115,20-Jul-2017,2017,Invalid,USA,South Carolina,"Hilton Head Island, Beaufort County",Swimming,F,8,Foot injured,UNKNOWN,,Shark involvement unconfirmed,
124,02-Jul-2017,2017,Invalid,COMOROS,Anjouan,Moya,Fishing,,,"Skull found in shark, a probable drowning & sc...",UNKNOWN,,Shark involvement unconfirmed,
145,03-May-2017,2017,Invalid,USA,California,"Sunset Beach, Orange County",Surfing,F,18,"Laceration to thigh, likely caused by surfboar...",UNKNOWN,14h30,Shark involvement unconfirmed,14.0
153,20-Apr-2017,2017,Invalid,USA,South Carolina,Georgetown County,Swimming,M,,Laceration & puncture wounds to left foot,UNKNOWN,08h50,Shark involvement unconfirmed,8.0
159,12-Apr-2017,2017,Invalid,SOUTH AFRICA,KwaZulu-Natal,Protea Banks,Scuba Diving,M,68,"Fatal, coroner unable to determine if the dive...",UNKNOWN,,Shark involvement unconfirmed,
189,03-Jan-2017,2017,Invalid,AUSTRALIA,New South Wales,Merimbula,Wading,M,20,Minor injuries to foot & toes,UNKNOWN,18h00,Shark involvement unconfirmed,18.0
195,11-Dec-2016,2016,Invalid,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,M,19,Cuts to right ankle & foot,UNKNOWN,Afternoon,Shark involvement unconfirmed,


Those had some comments like "Questionable incident - shark bite may have precipitated drowning', or 'Questionable incident; reported as shark attack but thought to involve a pinniped instead"

I would approximate to about BLA of all accident. To find out how dangerous are sharks really, I will try to exclude this data and show a glims of shark attacks for the past 10 years

# How dangerous are sharks really?

In [1274]:
# filter out above, then show by hemisphere and split fatal into provoked, unprovoked and activity: surfing

In [1260]:
fatality = [i.lower() for i in sharks['Fatal (Y/N)'].unique()]
fatality 

['n', 'y', 'unknown']

In [1272]:
for i, t in enumerate(fatality):
    vars()[types[i]+'_n'] = sharks[(sharks.Hemisphere==0) & (sharks.Type == sharks.Type.unique()[i]) & \
                                   (sharks.Year.isin(list(range(2007, 2017, 1))))]\
    .groupby('Month').count().Date.values

trace0 = dict(
    x=months,
    y=unknown_n,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5),
              
    stackgroup='one', name = 'Unknown'
)
trace1 = dict(
    x=months,
    y=n_n,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5),
    stackgroup='one', name="Non-Fatal"
)
trace2 = dict(
    x=months,
    y=y_n,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5),
              
    stackgroup='one', name='Fatal'
)


data = [trace0, trace1, trace2]

layout = go.Layout(barmode='overlay',
        xaxis=dict(title='Month'),
                   
    title='Number of shark attacks by months in Nothern Hemisphere 2007-2017',
    yaxis=dict(
        title='Count of total shark attacks'
    ),
    bargap=0.1
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "Sharks stacked")

In [1273]:
for i, t in enumerate(fatality):
    vars()[types[i]+'_s'] = sharks[(sharks.Hemisphere==1) & (sharks.Type == sharks.Type.unique()[i]) & \
                                   (sharks.Year.isin(list(range(2007, 2017, 1))))]\
    .groupby('Month').count().Date.values

trace0 = dict(
    x=months,
    y=unknown_s,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5),
              
    stackgroup='one', name = 'Unknown'
)
trace1 = dict(
    x=months,
    y=n_s,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5),
    stackgroup='one', name="Non-Fatal"
)
trace2 = dict(
    x=months,
    y=y_s,
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5),
              
    stackgroup='one', name='Fatal'
)


data = [trace0, trace1, trace2]

layout = go.Layout(barmode='overlay',
        xaxis=dict(title='Month'),
                   
    title='Number of shark attacks by months in Southern Hemisphere 2007-2017',
    yaxis=dict(
        title='Count of total shark attacks'
    ),
    bargap=0.1
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "Sharks stacked")