# IMPORTING LIBRARIES AND OBSERVING DATA

In [453]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [454]:
data = pd.read_csv('../GSAF5.csv', engine="python")
print(data.shape)

data.columns


(5992, 24)


Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

## HYPOTHESIS: Male surfers in Florida, USA have suffered the highest incidence of shark attacks AND deaths, with increasing frequency since the 1900's up until today.

## DATA CLEANING AND MANIPULATION

In [455]:
# Renaming columns where unintented spaces might cause problems
data.rename(columns={'Sex ': 'Sex', 'Species ': 'Species'}, inplace=True)
data.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

In [456]:
# Checking the types of data
data.dtypes

Case Number               object
Date                      object
Year                       int64
Type                      object
Country                   object
Area                      object
Location                  object
Activity                  object
Name                      object
Sex                       object
Age                       object
Injury                    object
Fatal (Y/N)               object
Time                      object
Species                   object
Investigator or Source    object
pdf                       object
href formula              object
href                      object
Case Number.1             object
Case Number.2             object
original order             int64
Unnamed: 22               object
Unnamed: 23               object
dtype: object

### START BY DELETING DUPLICATED ROWS AND COLUMNS

In [458]:
# For rows, checking for duplicated rows, dropping them
before = len(data)
data = data.drop_duplicates()
after = len(data)
print('Number of duplicate records dropped: ', str(before - after))

Number of duplicate records dropped:  0


In [459]:
# For columns, first Case Number columns are repeated, I keep only 1 column, check that other two were dropped
data.loc[:, ['Case Number','Case Number.1','Case Number.2']]
print('Before: ', data.shape)
data = data.drop(['Case Number.1', 'Case Number.2'], axis=1)
print('After: ', data.shape)

Before:  (5992, 24)
After:  (5992, 22)


In [460]:
# Href, href formula and pdf represent the same, I keep only 1 column, check that other two were dropped
data.loc[:, ['pdf','href','href formula']]
print('Before: ', data.shape)
data = data.drop(['pdf', 'href formula'], axis=1)
print('After: ', data.shape)


Before:  (5992, 22)
After:  (5992, 20)


### DECIDING USEFUL DATA TO TEST HYPOTHESIS

In [461]:
# Before anything, we drop columns with more than 50% Null Values
print('Before: ', data.shape)
cutoff_null = .5 * len(data)
data.dropna(thresh = cutoff_null, axis = 1, inplace = True)
print('After: ', data.shape)

Before:  (5992, 20)
After:  (5992, 17)


In [462]:
# I then look at what is important for my hypothesis
# Key words in Hypothesis: 'Male', 'Surfer', 'Florida', 'USA' 'Deaths', '1900'
# Therefore keep columns: Sex, Activity, Area, Country, Fatal and Year, Drop all other columns
print('Before: ', data.shape)
data = data.drop(['Name', 'Investigator or Source', 'Date', 'Injury', 'original order', 'href', 'Type', 'Location', 'Age', 'Species', 'Case Number'], axis=1)
print('After: ', data.shape)




Before:  (5992, 17)
After:  (5992, 6)


In [463]:
# Now checking for NULL values in columns, many still have null values, proceed cleaning column by column

data.isnull().sum()

Year             0
Country         43
Area           402
Activity       527
Sex            567
Fatal (Y/N)     19
dtype: int64

### GO COLUMN BY COLUMN CLEANING AND SELECTING USEFUL ROWS

#### YEAR COLUMN

In [464]:
# No unkown values. Dropping row if year below 1900, only interested in the past 2 centuries (to test hypothesis)
data = data[data.Year >1900]
data.shape

(5311, 6)

#### SEX COLUMN

In [465]:
# Sex is a crucial variable for hypothesis, drop rows were sex is unkown, drop row with incorrect 
# values (if undetectable) and adjust separate two M´s to Male 

data['Sex'].value_counts()

M      4254
F       548
M         2
.         1
lli       1
N         1
Name: Sex, dtype: int64

In [466]:
data['Sex']= data['Sex'].fillna('Unknown')
data = data[data['Sex'] != 'lli']
data = data[data['Sex'] != '.']
data = data[data['Sex'] != 'N']
data = data[data['Sex'] != 'Unknown']
data['Sex'] = data['Sex'].str.replace('M ', 'M')
data['Sex'].value_counts()

M    4256
F     548
Name: Sex, dtype: int64

In [467]:
data.shape

(4804, 6)

#### Country Column

In [468]:
# Fill Nulls with Unknown and make some minor cleaning adjustments
data['Country']= data['Country'].fillna('Unknown')
data['Country'] = data['Country'].str.replace(' PHILIPPINES', 'PHILIPPINES')
data['Country'] = data['Country'].str.replace('Sierra Leone', 'SIERRA LEONE')
data['Country'].value_counts()


USA                           1893
AUSTRALIA                      992
SOUTH AFRICA                   470
PAPUA NEW GUINEA               110
BAHAMAS                         88
BRAZIL                          86
NEW ZEALAND                     80
MEXICO                          66
REUNION                         49
ITALY                           49
FIJI                            45
PHILIPPINES                     42
MOZAMBIQUE                      36
NEW CALEDONIA                   33
EGYPT                           29
PANAMA                          26
SPAIN                           26
IRAN                            25
JAPAN                           24
SOLOMON ISLANDS                 24
HONG KONG                       23
CUBA                            23
CROATIA                         22
Unknown                         20
FRENCH POLYNESIA                19
JAMAICA                         18
ENGLAND                         13
INDONESIA                       12
TONGA               

In [469]:
# Removing Countries with less than 10 values
value_counts = data['Country'].value_counts()
to_remove = value_counts[value_counts <= 10].index
data = data[~data.Country.isin(to_remove)]
data['Country'].value_counts()

USA                 1893
AUSTRALIA            992
SOUTH AFRICA         470
PAPUA NEW GUINEA     110
BAHAMAS               88
BRAZIL                86
NEW ZEALAND           80
MEXICO                66
ITALY                 49
REUNION               49
FIJI                  45
PHILIPPINES           42
MOZAMBIQUE            36
NEW CALEDONIA         33
EGYPT                 29
SPAIN                 26
PANAMA                26
IRAN                  25
JAPAN                 24
SOLOMON ISLANDS       24
HONG KONG             23
CUBA                  23
CROATIA               22
Unknown               20
FRENCH POLYNESIA      19
JAMAICA               18
ENGLAND               13
TONGA                 12
INDONESIA             12
COSTA RICA            11
VIETNAM               11
Name: Country, dtype: int64

#### Area Column

In [470]:
# Filling Null as Unkown
data['Area']= data['Area'].fillna('Unknown')
#Removing Areas with less than 10 counts
value_counts1 = data['Area'].value_counts()
to_remove1 = value_counts1[value_counts1 <= 50].index
data = data[~data.Area.isin(to_remove1)]
data = data[data['Area'] != 'Unknown']
data['Area'].value_counts()

Florida                  929
New South Wales          357
California               252
Queensland               249
Hawaii                   245
KwaZulu-Natal            185
Eastern Cape Province    141
Western Cape Province    134
Western Australia        134
South Carolina           124
North Carolina            85
South Australia           72
Texas                     66
Torres Strait             63
Pernambuco                63
Victoria                  59
Name: Area, dtype: int64

#### Activity Column

In [471]:
# Filling Unknowns, then Grouping all acitivities into categories using regex, 
# trying to capture as much of the data as possible

data['Activity']= data['Activity'].fillna('Unknown')
def activity_type(x):
    x=str(x)
    if (re.findall("[Ss]urf\w+", x)):
        return 'Surfing'
    elif (re.findall("[Ss]wim|[Ss]nork|[Bb]ath|[Ff]loat|[Tt]read\w+", x)):
        return 'Swimming'
    elif (re.findall("[Ff]ishi\w+", x)):
        return 'Fishing'
    elif (re.findall("[Dd]iv\w+", x)):
        return 'Diving'
    elif (re.findall("[Bb]oard\w+", x)):
        return 'Boarding'
    elif (re.findall("[Ww]ad|[Ww]alk|[Ss]tan|[Dd]ang\w+", x)):
        return 'Wading'
    elif (re.findall("[Kk]aya|[Cc]anoe|[Rr]ow|[Ss]ail\w+", x)):
        return 'Paddling'
    elif (re.findall("[Bb]oat|[Ss]hip\w+", x)):
        return 'Boat'
    elif (re.findall("[Ss]kii\w+", x)):
        return 'Water-Ski'
    elif (re.findall("Unknown", x)):
        return 'Unknown'
    else:
        return 'Other'
    

data['Activity'] = data['Activity'].apply(activity_type)
data['Activity'].value_counts()


    

Surfing      941
Swimming     708
Fishing      447
Diving       243
Wading       232
Other        206
Unknown      173
Boarding     114
Paddling      43
Water-Ski     37
Boat          14
Name: Activity, dtype: int64

#### Fatal (Y/N) Column

In [472]:
# Fill and then Eliminate Null Values plus make cleaning adjustments
data['Fatal (Y/N)']= data['Fatal (Y/N)'].fillna('UNKNOWN')
data = data[data['Fatal (Y/N)'] != 'UNKNOWN']
data['Fatal (Y/N)']= data['Fatal (Y/N)'].str.replace(' N', 'N')
data['Fatal (Y/N)'].value_counts()


N    2600
Y     532
Name: Fatal (Y/N), dtype: int64

## Further Data Manipulation and Analysis of Hypothesis

### Hypothesis Reminder: Male surfers in Florida, USA have suffered the highest incidence of shark attacks and deaths, with increasing frequency since the 1900's up until today.


In [473]:
# Save cleaned data to Csv
data.to_csv('data_clean.csv', index=False)


In [474]:
# Checking current data:
data.shape

(3132, 6)

In [475]:
# Changing Column order to suit order of the hypothesis: Male-Surfer-FloridaUSA-Deaths-Since1900
column_order = ['Sex','Activity','Area','Country','Fatal (Y/N)', 'Year']
data = data[column_order]
data.head()

Unnamed: 0,Sex,Activity,Area,Country,Fatal (Y/N),Year
0,M,Surfing,Florida,USA,N,2016
1,M,Surfing,Florida,USA,N,2016
2,M,Surfing,Florida,USA,N,2016
3,M,Surfing,Victoria,AUSTRALIA,N,2016
4,M,Surfing,Victoria,AUSTRALIA,N,2016


In [476]:
pd.crosstab(data['Activity'],data["Sex"])[['M','F']]
# We can see that most shark attacks have occurred to Males in all Activities, in accordance with the hypothesis


Sex,M,F
Activity,Unnamed: 1_level_1,Unnamed: 2_level_1
Boarding,90,24
Boat,11,3
Diving,233,9
Fishing,436,9
Other,180,24
Paddling,36,6
Surfing,882,54
Swimming,576,130
Unknown,135,26
Wading,149,82


In [481]:
pd.crosstab(data['Activity'],data["Area"])
# We can see that for surfers, Florida is the province where most attacks have ocurred, in accordance with the hypothesis

Area,California,Eastern Cape Province,Florida,Hawaii,KwaZulu-Natal,New South Wales,North Carolina,Pernambuco,Queensland,South Australia,South Carolina,Texas,Torres Strait,Victoria,Western Australia,Western Cape Province
Activity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Boarding,8,5,31,21,5,11,2,4,0,2,11,1,0,0,5,8
Boat,2,0,1,3,0,0,0,0,3,1,1,1,0,0,1,1
Diving,35,3,42,16,11,25,4,1,31,10,0,1,37,3,15,8
Fishing,46,18,82,45,31,46,5,0,40,18,8,17,4,11,31,43
Other,12,4,43,15,20,30,3,1,24,2,15,8,3,4,12,8
Paddling,14,2,2,3,0,6,0,0,5,4,0,0,1,2,3,0
Surfing,100,67,380,69,23,101,22,27,23,25,13,9,5,20,23,29
Swimming,24,29,174,60,61,88,28,24,85,4,39,16,8,13,28,25
Unknown,7,4,50,4,14,18,5,5,23,2,13,3,5,4,2,2
Wading,4,6,114,5,12,13,16,1,12,4,22,10,0,1,7,4


In [492]:

# We can see that attacks have increased in frequency over time ince 1900, in accordance with the hypothesis

KeyError: (1901, 1911, 1921, 1931, 1941)

In [497]:
# We can see that attacks have increased in frequency over time ince 1900, in accordance with the hypothesis
pd.crosstab(data['Activity'],data["Year"])[[1901,1911,1921,1926,1931,1936,1941,1946,1956,1961,1966,1971,1976,1981,1986,1991,1996,2001,2006,2011]]

Year,1901,1911,1921,1926,1931,1936,1941,1946,1956,1961,1966,1971,1976,1981,1986,1991,1996,2001,2006,2011
Activity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Boarding,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,2,8,6,5
Boat,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
Diving,0,1,2,2,2,5,0,1,5,2,3,3,5,4,2,4,3,1,6,5
Fishing,0,1,0,0,4,4,3,1,2,15,11,1,3,5,7,3,3,2,12,14
Other,0,3,1,0,3,2,1,1,1,4,7,0,1,3,2,1,0,1,3,4
Paddling,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,2,3
Surfing,0,0,0,0,0,3,0,1,1,1,2,3,5,14,11,8,16,31,24,24
Swimming,4,6,1,6,5,5,3,9,5,7,3,3,4,6,3,9,9,10,11,9
Unknown,0,1,1,1,2,0,0,1,0,3,0,1,2,1,1,0,1,7,4,3
Wading,0,0,1,0,0,0,0,1,1,8,3,1,1,1,0,1,4,5,9,3


In [482]:
# However, in the next 2 cross tabs we can see that shark Deaths are more prevalent in Swimming, 
# Fishing and Diving than in SURFING, thus rejecting that portion of the hypothesis

pd.crosstab(data['Activity'],data["Fatal (Y/N)"])

Fatal (Y/N),N,Y
Activity,Unnamed: 1_level_1,Unnamed: 2_level_1
Boarding,101,13
Boat,7,7
Diving,184,58
Fishing,388,57
Other,149,55
Paddling,37,5
Surfing,886,50
Swimming,484,222
Unknown,114,47
Wading,213,18


In [483]:
# We can also see that Deaths are more prevalent in New South Wales (Australia), KwaZuluNatal (South Africa) 
# and Hawaii (USA) than in Florida (USA), thus rejecting that poriton of the hypothesis
pd.crosstab(data['Area'],data["Fatal (Y/N)"])

Fatal (Y/N),N,Y
Area,Unnamed: 1_level_1,Unnamed: 2_level_1
California,228,24
Eastern Cape Province,119,22
Florida,872,51
Hawaii,185,56
KwaZulu-Natal,126,57
New South Wales,264,89
North Carolina,74,11
Pernambuco,35,28
Queensland,175,71
South Australia,50,22


## To Conclude, the hypothesis holds for shark attacks, where attacks to Male Surfers in Florida, USA, have been the most prevalent, with increasing frequency since 1900. Nevertheless, the hypothesis does not hold for hark-elated Deaths, where other Activities such as Diving or Fishing, as well as other Areas like New South Wales (Australia), see more deaths.