# Shark Attack Explorations

In [None]:
from IPython.display import Image

# Show the first shark illustration at 400 x 400.
Image(url="img/shark1.png", width=400, height=400)

In [None]:
# load packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## 1. Import Data

In [None]:
# Load the raw attack records from attacks.csv (resolved relative to the
# working directory) and preview the first five rows.
full_attacks = pd.read_csv("attacks.csv")
full_attacks.head(n=5)

## 2. Explore Data

In [None]:
# Show the dtype pandas inferred for every column of the raw data.
full_attacks.dtypes

In [None]:
# Count the missing entries in each column of the raw data.
missing_mask = full_attacks.isna()
missing_mask.sum()

In [None]:
# Dimensions of the raw data as a (rows, columns) tuple.
full_attacks.shape

## 3. Clean Data

In [None]:
# Show the second shark illustration at 400 x 400.
Image(url="img/shark2.png", width=400, height=400)

In [None]:
# Work on a copy so the raw frame stays untouched, and discard every row
# that has no Date or no Year.
attacks_df = full_attacks.copy().dropna(subset=["Date", "Year"])

# Report the remaining missing values per column and the new dimensions.
remaining_missing = attacks_df.isnull().sum()
print(remaining_missing, attacks_df.shape)

In [None]:
# List the column labels so the ones to drop in the next step can be picked.
attacks_df.columns

In [None]:
# Columns that are not used by the analysis below: case identifiers,
# date/time, source, document links, ordering and the two unnamed columns.
drop_columns = [
    "Case Number",
    "Date",
    "Time",
    "Investigator or Source",
    "pdf",
    "href formula",
    "href",
    "Case Number.1",
    "Case Number.2",
    "original order",
    "Unnamed: 22",
    "Unnamed: 23",
]
attacks_df = attacks_df.drop(drop_columns, axis=1)

# Preview the slimmed-down frame.
attacks_df.head()

In [None]:
# Inspect the distinct values of the Year column to see how years are stored
# before converting them.
attacks_df["Year"].unique()

## 4. Filter Data

In [None]:
# Cast Year to whole numbers, keep only records from 1700 onward, then store
# Year as a string so it can be parsed with a "%Y" format afterwards.
# NOTE(review): the original note described the range as 1700 - 2018, but only
# the lower bound is enforced here -- confirm whether an upper cut-off is wanted.
attacks_df["Year"] = attacks_df["Year"].astype(int)

# .copy() detaches the filtered rows from the unfiltered frame; without it the
# column assignment below is made on a slice and pandas emits a
# SettingWithCopyWarning.
attacks_df = attacks_df[attacks_df["Year"] >= 1700].copy()
attacks_df["Year"] = attacks_df["Year"].astype(str)
attacks_df["Year"].unique()

In [None]:
# Parse the four-digit year strings into datetime values and store them back
# in the Year column, then preview the result.
year_as_datetime = pd.to_datetime(attacks_df["Year"], format="%Y")
attacks_df["Year"] = year_as_datetime
attacks_df.head()

In [None]:
# Move Year into the row index so later cells can group on attacks_df.index.
attacks_df = attacks_df.set_index("Year")

In [None]:
# Strip leading/trailing whitespace from the column labels only; the cell
# values themselves are left untouched.
stripped_labels = attacks_df.columns.str.strip()
attacks_df.columns = stripped_labels

## 5/6. Sort and Group Data

In [None]:
# Show the third shark illustration at 400 x 400.
Image(url="img/shark3.png", width=400, height=400)

In [None]:
# Attacks per year: count the rows with a non-missing Type for each index
# value (Year), then keep the ten years with the highest counts.
year_group = attacks_df.groupby(attacks_df.index)
attacks_per_year = year_group["Type"].count()
year_group_df = attacks_per_year.sort_values(ascending=False).head(10)
year_group_df

In [None]:
# Attacks per country: count the rows with a non-missing Type for each
# Country, then keep the ten countries with the highest counts.
country_group = attacks_df.groupby("Country")
attacks_per_country = country_group["Type"].count()
country_group_df = attacks_per_country.sort_values(ascending=False).head(10)
country_group_df

In [None]:
# Attacks per activity: count the rows with a non-missing Type for each
# Activity, then keep the ten activities with the highest counts.
attacks_per_activity = attacks_df.groupby("Activity")["Type"].count()
activity_group = attacks_per_activity.sort_values(ascending=False).head(10)
activity_group

In [None]:
# Attacks per sex: count the rows with a non-missing Type for each value of
# Sex, then keep the three most frequent values.
sex_group = attacks_df.groupby("Sex")
attacks_per_sex = sex_group["Type"].count()
sex_group_df = attacks_per_sex.sort_values(ascending=False).head(3)
sex_group_df

In [None]:
# Attacks per outcome: count the rows with a non-missing Type for each value
# of "Fatal (Y/N)", then keep the three most frequent values.
fatal_group = attacks_df.groupby("Fatal (Y/N)")
attacks_per_outcome = fatal_group["Type"].count()
fatal_group_df = attacks_per_outcome.sort_values(ascending=False).head(3)
fatal_group_df

## 7. Transform Data

In [None]:
# Derive First_Name as the first whitespace-separated token of Name.
name_tokens = attacks_df["Name"].str.split()
attacks_df["First_Name"] = name_tokens.str[0]

In [None]:
# Attacks per first name: count the rows with a non-missing Type for each
# First_Name, keep the 25 most frequent, and hold them in a one-column
# ("Type") DataFrame.
name_group = attacks_df.groupby("First_Name")
attacks_per_name = name_group["Type"].count()
top_first_names = attacks_per_name.sort_values(ascending=False).head(25)
name_group_df = top_first_names.to_frame()
name_group_df

In [None]:
# Tokens that are treated as "no real first name" and pooled together.
# NOTE(review): presumably these come from Name values such as "male" or
# "boat" where no person's name was recorded -- confirm against the data.
unknown_names = ["male", "a", "female",
                 "Mr.", "2", "boat",
                 "boat,", "Captain"]

# Relabel every index entry that appears in unknown_names as "Unknown" and
# leave all other names alone, then merge the relabelled rows by summing
# their counts and re-sort from most to least frequent.
name_group_df.index = [
    "Unknown" if fname in unknown_names else fname
    for fname in name_group_df.index
]
name_group_df = (
    name_group_df.groupby(name_group_df.index)
    .sum()
    .sort_values(by="Type", ascending=False)
)

# Same table without the pooled "Unknown" row. errors="ignore" keeps the cell
# from raising KeyError when none of the top names was a placeholder, i.e.
# when there is no "Unknown" row to remove.
name_group_unknown_drop_df = name_group_df.drop(index="Unknown", errors="ignore")
name_group_unknown_drop_df

## 8. Visualize Data

In [None]:
# Bar plot of shark attacks for the ten most frequent activities.
# The categories and counts are passed explicitly: a bare Series handed to
# `data=` is interpreted differently across seaborn versions (older releases
# can collapse it into a single aggregated bar instead of one bar per entry).
ax = sns.barplot(x=activity_group.index, y=activity_group.to_numpy())
plt.xticks(rotation=60)  # rotate the activity labels so they do not overlap
ax.set(ylabel='Count')

In [None]:
# Bar plot of shark attacks by first name, with the pooled "Unknown" bucket
# included.
ax = sns.barplot(data=name_group_df, x=name_group_df.index, y="Type")
# Rotate the name labels; the return value is not printed, since it would only
# dump the tick objects into the cell output.
plt.xticks(rotation=60)
ax.set(ylabel='Count')

In [None]:
# Bar plot of shark attacks by first name, with the "Unknown" bucket removed.
ax = sns.barplot(
    x=name_group_unknown_drop_df.index,
    y="Type",
    data=name_group_unknown_drop_df,
)
plt.xticks(rotation=60)
ax.set(ylabel="Count")

In [None]:
# Bar plot of shark attacks for the three most frequent values of Sex.
# The categories and counts are passed explicitly: a bare Series handed to
# `data=` is interpreted differently across seaborn versions (older releases
# can collapse it into a single aggregated bar instead of one bar per entry).
ax = sns.barplot(x=sex_group_df.index, y=sex_group_df.to_numpy())
ax.set(ylabel='Count')

In [None]:
# Bar plot of shark attacks for the three most frequent "Fatal (Y/N)" values.
# The categories and counts are passed explicitly: a bare Series handed to
# `data=` is interpreted differently across seaborn versions (older releases
# can collapse it into a single aggregated bar instead of one bar per entry).
ax = sns.barplot(x=fatal_group_df.index, y=fatal_group_df.to_numpy())
ax.set(ylabel='Count')

In [None]:
# Bar plot of shark attacks for the ten countries with the most attacks.
# The categories and counts are passed explicitly: a bare Series handed to
# `data=` is interpreted differently across seaborn versions (older releases
# can collapse it into a single aggregated bar instead of one bar per entry).
ax = sns.barplot(x=country_group_df.index, y=country_group_df.to_numpy())
plt.xticks(rotation=60)  # rotate the country labels so they do not overlap
ax.set(ylabel='Count')

In [None]:
# Show the closing shark illustration at 400 x 400.
Image(url="img/shark4.png", width=400, height=400)