# Sharks attacks

In [1]:
# modules import
import re
import pandas as pd
from src.functions import sex_filter,fatal_filter,space_cleaner,age_cleaner,integer,activity_cleaner

In [2]:
# Database import from Kaggle's csv

data = pd.read_csv("./input/GSAF5.csv",encoding='cp1252')

df = pd.DataFrame(data)
df = df[["Year","Country","Activity","Sex ","Age","Fatal (Y/N)"]]

df.head()

Unnamed: 0,Year,Country,Activity,Sex,Age,Fatal (Y/N)
0,2016,USA,Surfing,M,16.0,N
1,2016,USA,Surfing,M,36.0,N
2,2016,USA,Surfing,M,43.0,N
3,2016,AUSTRALIA,Surfing,M,,N
4,2016,AUSTRALIA,Surfing,M,,N


In [3]:
# Checking zeroes

null_cols = df.isnull().sum()
null_cols
null_cols[null_cols > 0]

Country          43
Activity        527
Sex             567
Age            2681
Fatal (Y/N)      19
dtype: int64

In [4]:
#Columns data types

print(df.dtypes)

Year            int64
Country        object
Activity       object
Sex            object
Age            object
Fatal (Y/N)    object
dtype: object


In [5]:
# Filtering by year (1900-2016)

df = df[(df["Year"]>= 1900)]

#number of values matching the report range

print("There are ",df.shape[0]," results.")

There are  5324  results.


In [6]:
# Fixing the name of the columns "Sex", it has an extra space at the end.

df = df.rename(columns={"Sex ":"Sex"})

# Cleaning the "Sex" column

df["Sex"]= df["Sex"].fillna("Unknown")
df["Sex"] = df["Sex"].apply(sex_filter)

In [7]:
# Distribution of sharks attacks by sex of the victim

sex_table = df[["Sex","Year"]]
sex_table = sex_table.rename(columns={"Year":"Count"})
sex_table = sex_table.groupby(["Sex"]).count()
sex_table = sex_table.sort_values(by = ["Count"],ascending = False)
sex_table.head(5)

Unnamed: 0_level_0,Count
Sex,Unnamed: 1_level_1
M,4268
F,548
Unknown,508


In [8]:
# Renaming the "fatal" column 

df = df.rename(columns={"Fatal (Y/N)":"Fatal"})
df["Fatal"]= df["Fatal"].fillna("Unknown")

# Cleaning "fatal" column

df["Fatal"]= df["Fatal"].fillna("Unknown")
df["Fatal"] = df["Fatal"].apply(fatal_filter)

In [9]:
# Distribution of sharks attacks by fatal

fatal_table = df[["Fatal","Year"]]
fatal_table = fatal_table.rename(columns={"Year":"Count"})
fatal_table = fatal_table.groupby(["Fatal"]).count()
fatal_table = fatal_table.sort_values(by = ["Count"],ascending = False)

fatal_table.head(5)

Unnamed: 0_level_0,Count
Fatal,Unnamed: 1_level_1
N,4036
Y,1203
Unknown,85


In [10]:
# Cleaning "Country" column

df["Country"]= df["Country"].fillna("Unknown")
df["Country"] = df["Country"].apply(space_cleaner)

In [11]:
# Distribution of sharks attacks by country

country_table = df[["Country","Year"]]
country_table = country_table.rename(columns={"Year":"Count"})
country_table = country_table.groupby(["Country"]).count()
country_table = country_table.sort_values(by = ["Count"],ascending = False)

country_table.head(5)

Unnamed: 0_level_0,Count
Country,Unnamed: 1_level_1
USA,1992
AUSTRALIA,1124
SOUTH AFRICA,535
PAPUA NEW GUINEA,129
BRAZIL,100


In [12]:
# Cleaning "age" column

df["Age"]= df["Age"].fillna("Unknown")
df["Age"] = df["Age"].apply(age_cleaner)

set(df["Age"])
print(df["Age"].value_counts().sort_values(ascending = False))

Unknown    2113
17          143
18          139
20          133
16          132
           ... 
12            1
17            1
6             1
11            1
0             1
Name: Age, Length: 113, dtype: int64


In [13]:
# Mean age of the vicitim

age_table = df[(df["Age"]!= "Unknown")]
age_table["Age"] = age_table["Age"].apply(integer)
print(round(age_table["Age"].mean(),2))

27.22


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [14]:
#Activity performed by the victim 

df["Activity"]= df["Activity"].fillna("Unknown")
set(df["Activity"])
print(df["Activity"].value_counts().sort_values(ascending = False))

Surfing                                                 901
Swimming                                                716
Unknown                                                 429
Fishing                                                 366
Spearfishing                                            321
                                                       ... 
Fishing boat swamped in storm                             1
Investigating shark sighting                              1
Slipped off rocks and fell into the water                 1
Fishing, standing in 2' of water                          1
Jumped overboard from torpedoed Panamanian freighter      1
Name: Activity, Length: 1302, dtype: int64


In [15]:
#No he conseguido hacer funcionar a esta función, me sustituye todos los valores sin importar su contenido...
df["Activity"] = df["Activity"].apply(activity_cleaner)
print(df["Activity"].value_counts().sort_values(ascending = False))

Surfing    5324
Name: Activity, dtype: int64


In [16]:
#Cleaned data frame

df = df[["Year","Country","Sex","Age","Fatal"]]
display(df)

Unnamed: 0,Year,Country,Sex,Age,Fatal
0,2016,USA,M,16,N
1,2016,USA,M,36,N
2,2016,USA,M,43,N
3,2016,AUSTRALIA,M,Unknown,N
4,2016,AUSTRALIA,M,Unknown,N
...,...,...,...,...,...
5320,1900,USA,M,Unknown,Y
5321,1900,USA,Unknown,Unknown,N
5322,1900,AUSTRALIA,M,Unknown,N
5323,1900,USA,M,Unknown,N


In [17]:
# Export to csv
#df.to_csv("sharksattacks.csv")