# Shark KO!
## How likely is a shark attack lethal, and in which top 3 countries are more common.

In [1]:
import pandas as pd 
import numpy as np

### Importing & displaying data

In [2]:
unclean_data=pd.read_csv('GSAF5.csv', encoding = 'ISO-8859-1')# Had to add encoding because of an error of UnicodeDecodeError.
unclean_data.head() # Display first 5 rows of original data frame


Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c,5993,,
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b,5992,,
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.a,2016.09.18.a,5991,,
3,2016.09.17,17-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,...,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.17,2016.09.17,5990,,
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,...,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.16,2016.09.15,5989,,


In [3]:
unclean_data.shape # Checking the size of my Data Frame

(5992, 24)

In [4]:
unclean_data.columns # Verifying that all my columns are correctly written

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

### Renaming Columns

In [5]:
unclean_data = unclean_data.rename(columns={'Fatal (Y/N)':'Fatal'})# Renaming column 
unclean_data.columns # Checking Changes

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

### Checking for duplicates

In [6]:
unclean_data.duplicated().sum() # Checking for duplicades and adding them.

0

### Checking for null

In [7]:
null_cols = unclean_data.isnull().sum()
null_cols[null_cols > 0]
display(null_cols)# Checking how many columns are null and adding them.

Case Number                  0
Date                         0
Year                         0
Type                         0
Country                     43
Area                       402
Location                   496
Activity                   527
Name                       200
Sex                        567
Age                       2681
Injury                      27
Fatal                       19
Time                      3213
Species                   2934
Investigator or Source      15
pdf                          0
href formula                 1
href                         3
Case Number.1                0
Case Number.2                0
original order               0
Unnamed: 22               5991
Unnamed: 23               5990
dtype: int64

In [8]:
unclean_data.isnull().sum().sum() # Adding total numbers of null

23109

### Dropping Columns

In [9]:
drop_cols = list(null_cols[null_cols > 1000].index)
data = unclean_data.drop(drop_cols, axis=1)
data = data.drop(['Case Number.1','Case Number.2','original order','href formula','pdf','Investigator or Source','href','Case Number','Location','Name', 'Type', 'Date', 'Sex ', 'Activity'], axis=1)
data.head()

Unnamed: 0,Year,Country,Area,Injury,Fatal
0,2016,USA,Florida,Minor injury to thigh,N
1,2016,USA,Florida,Lacerations to hands,N
2,2016,USA,Florida,Lacerations to lower leg,N
3,2016,AUSTRALIA,Victoria,Struck by fin on chest & leg,N
4,2016,AUSTRALIA,Victoria,No injury: Knocked off board by shark,N


### Counting Fatal

In [10]:
fatal_count = data.Fatal.value_counts()
print(fatal_count)

N          4315
Y          1552
UNKNOWN      94
 N            8
#VALUE!       1
F             1
N             1
n             1
Name: Fatal, dtype: int64


### Replacing null cells in Fatal

In [11]:
data['Fatal'].fillna('UNKNOWN', inplace = True)

### Changing strings inside Fatal

In [12]:
data['Fatal']= data['Fatal'].str.replace('n','N')
data['Fatal']= data['Fatal'].str.replace(' N','N')
data['Fatal']= data['Fatal'].str.replace('N ','N')

### Recounting Fatal

In [13]:
fatal_count = data.Fatal.value_counts()
print(fatal_count)

N          4325
Y          1552
UNKNOWN     113
#VALUE!       1
F             1
Name: Fatal, dtype: int64


### Verifying if F is a Fatal

In [14]:
data.loc[data['Fatal'] == 'F']

Unnamed: 0,Year,Country,Area,Injury,Fatal
4693,1935,ISRAEL,Herzliyah,human remains washed ahore,F


### Recounting Fatal with F

In [15]:
data['Fatal']= data['Fatal'].str.replace('F','Y')
fatal_count = data.Fatal.value_counts()
print(fatal_count)

N          4325
Y          1553
UNKNOWN     113
#VALUE!       1
Name: Fatal, dtype: int64


### Verifying if #VALUE is a Fatal

In [16]:
data.loc[data['Fatal'] == '#VALUE!']

Unnamed: 0,Year,Country,Area,Injury,Fatal
5461,1890,SOUTH AFRICA,Eastern Cape Province,Forensic evidence indicated death resulted fro...,#VALUE!


### Recounting Fatal with #VALUE

In [17]:
data['Fatal']= data['Fatal'].str.replace('#VALUE!','Y')
fatal_count = data.Fatal.value_counts()
print(fatal_count)

N          4325
Y          1554
UNKNOWN     113
Name: Fatal, dtype: int64


### Display all cells with UNKNOWN in Fatal

In [18]:
data.loc[data['Fatal'] == 'UNKNOWN']

Unnamed: 0,Year,Country,Area,Injury,Fatal
54,2016,AUSTRALIA,New South Wales,"No injury, but sharks repeatedly hit their fin...",UNKNOWN
401,2013,USA,South Carolina,No details,UNKNOWN
1047,2008,AUSTRALIA,New South Wales,,UNKNOWN
1142,2007,SENEGAL,,,UNKNOWN
1844,2000,NEW ZEALAND,North Island,Reported as shark attack but probable drowning,UNKNOWN
2025,1997,BRAZIL,Rio de Janeiro,,UNKNOWN
2035,1997,USA,Hawaii,No details,UNKNOWN
2042,1996,AUSTRALIA,Queensland,,UNKNOWN
2070,1996,USA,Hawaii,No details,UNKNOWN
2082,1996,USA,Hawaii,No details,UNKNOWN


### Changing 'NaN' to 'No details'

In [19]:
data['Injury']= data['Injury'].str.replace('NaN','No details')

In [20]:
data.loc[data.Injury == 'No details']

Unnamed: 0,Year,Country,Area,Injury,Fatal
401,2013,USA,South Carolina,No details,UNKNOWN
2035,1997,USA,Hawaii,No details,UNKNOWN
2070,1996,USA,Hawaii,No details,UNKNOWN
2082,1996,USA,Hawaii,No details,UNKNOWN
2098,1996,AUSTRALIA,Western Australia,No details,UNKNOWN
2103,1995,USA,Hawaii,No details,UNKNOWN
2163,1995,AUSTRALIA,Western Australia,No details,UNKNOWN
2179,1995,USA,South Carolina,No details,UNKNOWN
2184,1994,VANUATU,Tafea Province,No details,UNKNOWN
2317,1992,USA,Florida,No details,UNKNOWN


### Dropping the NEANDERTHAL way all 'No Details'

In [21]:
data = data.drop([401, 2035, 2070, 2082, 2098, 2103, 2163, 2179, 2184, 2317, 2436, 2581, 2657, 2675, 2718, 
                 2771, 2773, 2781, 2816, 3171, 3186, 3204, 3216, 3319, 3422, 3548, 4043, 4084, 4097, 4552, 4559, 
                 4030, 4560, 4768, 4799, 4873, 4928, 5114, 5373, 5432, 5464, 5465, 5583])

### Displaying all 'UNKNOWN' in Fatal 

In [22]:
data.loc[data.Fatal == 'UNKNOWN']

Unnamed: 0,Year,Country,Area,Injury,Fatal
54,2016,AUSTRALIA,New South Wales,"No injury, but sharks repeatedly hit their fin...",UNKNOWN
1047,2008,AUSTRALIA,New South Wales,,UNKNOWN
1142,2007,SENEGAL,,,UNKNOWN
1844,2000,NEW ZEALAND,North Island,Reported as shark attack but probable drowning,UNKNOWN
2025,1997,BRAZIL,Rio de Janeiro,,UNKNOWN
2042,1996,AUSTRALIA,Queensland,,UNKNOWN
2423,1990,USA,Florida,,UNKNOWN
2425,1989,AUSTRALIA,Northern Territory,"No details, ""recovering in Darwin Hospital""",UNKNOWN
2449,1969,BERMUDA,,FATAL,UNKNOWN
2813,1981,BRAZIL,Rio de Janeiro,,UNKNOWN


### Droping all Null values in Injury
#### Apparently this did not work. data['Injury']= data['Injury'].str.replace('NaN','No details')

In [23]:
data = data.dropna(subset=['Injury'])

### Recounting Fatal without NaN

In [24]:
fatal_count = data.Fatal.value_counts()
print(fatal_count)

N          4324
Y          1554
UNKNOWN      44
Name: Fatal, dtype: int64


In [25]:
data.loc[data.Fatal == 'UNKNOWN']

Unnamed: 0,Year,Country,Area,Injury,Fatal
54,2016,AUSTRALIA,New South Wales,"No injury, but sharks repeatedly hit their fin...",UNKNOWN
1844,2000,NEW ZEALAND,North Island,Reported as shark attack but probable drowning,UNKNOWN
2425,1989,AUSTRALIA,Northern Territory,"No details, ""recovering in Darwin Hospital""",UNKNOWN
2449,1969,BERMUDA,,FATAL,UNKNOWN
3280,1967,ITALY,Brindisi Province,"Diver shot the shark, then it injured his arm ...",UNKNOWN
3281,1967,PAPUA NEW GUINEA,New Ireland Province,"No details, listed as PROVOKED INCIDENT",UNKNOWN
3326,1966,AUSTRALIA,Western Australia,Involved a speared shark but no other details ...,UNKNOWN
3435,1964,,,"Disappeared, probable drowning but sharks in a...",UNKNOWN
3568,1962,ISRAEL,Sharon,"Details unknown, possibly a PROVOKED INCIDENT",UNKNOWN
3901,1958,TURKEY,Ahirkapi coast,Boat damaged,UNKNOWN


### Dropping all 'UNKNOWN' from Fatal column

In [26]:
to_drop=['UNKNOWN']
clean_data = data[~data['Fatal'].isin(to_drop)]

### Recounting Fatal column

In [27]:
fatal_count = clean_data.Fatal.value_counts()
print(fatal_count)

N    4324
Y    1554
Name: Fatal, dtype: int64


In [28]:
fatal_count.describe()

count       2.000000
mean     2939.000000
std      1958.685784
min      1554.000000
25%      2246.500000
50%      2939.000000
75%      3631.500000
max      4324.000000
Name: Fatal, dtype: float64

## Primera Conclusion
#### En una muestra de 5.878 ataques de tiburones alrededor del mundo podemos concluir que aproximadamente 29,39% de los ataques de tiburones son letales.

In [29]:
fatal_count.mean()

2939.0

### Adding up all attacks by country

In [30]:
country_count = clean_data.Country.value_counts()
print(country_count)

USA                        2087
AUSTRALIA                  1251
SOUTH AFRICA                560
PAPUA NEW GUINEA            131
NEW ZEALAND                 122
BRAZIL                      100
BAHAMAS                      98
MEXICO                       81
ITALY                        66
FIJI                         61
REUNION                      57
PHILIPPINES                  57
NEW CALEDONIA                51
MOZAMBIQUE                   43
CUBA                         41
SPAIN                        38
EGYPT                        34
PANAMA                       32
JAPAN                        32
INDIA                        31
CROATIA                      30
IRAN                         29
SOLOMON ISLANDS              28
GREECE                       24
HONG KONG                    24
JAMAICA                      23
FRENCH POLYNESIA             22
INDONESIA                    20
ENGLAND                      19
PACIFIC OCEAN                17
                           ... 
GRAND CA

### Selecting top 3 countries with more attacks registered

In [31]:
clean_data = clean_data[clean_data.Country.isin(['USA', 'AUSTRALIA', 'SOUTH AFRICA'])] 

In [32]:
country_count = clean_data.Country.value_counts()
print(country_count)

USA             2087
AUSTRALIA       1251
SOUTH AFRICA     560
Name: Country, dtype: int64


### Creating a table to visualize the attacks of the top 3 countries that ended up being Fatal vs None Fatal

In [33]:
table = pd.crosstab(clean_data.Country, clean_data.Fatal, margins = True)

In [34]:
table.head()

Fatal,N,Y,All
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AUSTRALIA,910,341,1251
SOUTH AFRICA,422,138,560
USA,1841,246,2087
All,3173,725,3898


## Conclusión
### Podemos concluir que los top 3 paises con mas reporte de muertes de tiburones son Australia con 341(27,26%) seguido de USA con 246(11.79%)y por ultimo South Africa con 138(24,64%).
##### Los porcentajes representan la relacion que existe entre los casos donde mueren con respecto a donde no mueren por cada pais.