# Project - Global Shark Attacks

In [1]:
#importing libraries
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
import plotly.express as px
import plotly.graph_objects as go
import os

In [2]:
import src.analysis as analysis
import src.cleaning as cleaning

In [3]:
# NOTE(review): this reads output/attacks_updated.csv, which a later cell of this
# notebook writes, so it only works after a previous full run. `attacks` is also
# reassigned from the raw CSV further down, and the recorded output for this cell
# is an AttributeError — confirm whether this cell is still needed.
attacks = analysis.read_df("output/attacks_updated.csv")
attacks

AttributeError: module 'src.cleaning' has no attribute 'read_df'

## Importing and encoding the dataset

In [None]:
# Read the raw shark-attack records; the file is decoded as Latin-1.
attacks = pd.read_csv(
    "./input/attacks.csv",
    encoding="latin1",
)

In [None]:
# display the dataframe as loaded from the raw CSV
attacks

## Exploring the dataset general info

In [None]:
# (rows, columns) of the dataframe as loaded
attacks.shape

### Exploring the dataset missing values info


In [None]:
# Count the NaN entries in every column and tabulate them per feature.
Nan_attacks = attacks.isnull().sum()
missing_value_attack = pd.DataFrame(
    {
        'Feature': attacks.columns,
        'Missing values': Nan_attacks,
    }
)
missing_value_attack

In [None]:
# Percentage of NaN entries per column, relative to the number of rows.
percent_missing = attacks.isna().sum() * 100 / len(attacks)
Nan_attack_percentage = pd.DataFrame(
    {
        'Feature': attacks.columns,
        'Missing %': percent_missing,
    }
)
Nan_attack_percentage

In [None]:
#checking the number of rows and columns
attacks.shape

###  Deleting ignorable missing values

In [None]:
#dropping the rows where Age is missing
attacks.dropna(subset=['Age'], inplace = True)

In [None]:
#dataframe shape (rows, columns) after dropping the rows without Age
attacks.shape

In [None]:
# Recompute the per-column NaN percentages now that rows without Age are gone.
percent_missing = attacks.isna().sum() * 100 / len(attacks)
Nan_attack_percentage = pd.DataFrame(
    {
        'Feature': attacks.columns,
        'Missing %': percent_missing,
    }
)
Nan_attack_percentage

In [None]:
# Mean, max and min of the per-column missing percentages (one value per line),
# computed after the rows without Age were dropped.
print(
    Nan_attack_percentage['Missing %'].mean(),
    Nan_attack_percentage['Missing %'].max(),
    Nan_attack_percentage['Missing %'].min(),
    sep='\n',
)

### Standardizing columns

#### Deleting columns with information that will not be used

In [None]:
#listing the current column names before any column is removed
attacks.columns

In [None]:
#deleting columns with info that will not be needed
# NOTE(review): pop_column is defined in src/cleaning.py (not shown here); its
# return value is discarded, so it presumably edits `attacks` in place — confirm.
cleaning.pop_column(attacks)

In [None]:
#re-checking the column names after the deletion
attacks.columns

In [None]:
#check the shape (rows, columns) after removing columns
attacks.shape

### Standardizing column names

In [None]:
# standardise the column names
# NOTE(review): stand_colum_name is defined in src/cleaning.py (not shown); the
# cells below use lower-case names such as 'date', 'year', 'sex', 'age' and
# 'country', so it presumably lower-cases/trims the headers in place — confirm.
cleaning.stand_colum_name(attacks)

### Exploring type and values

In [None]:
#checking for duplicates (number of rows that repeat an earlier row in every column)
attacks.duplicated().sum()

In [None]:
#printing the dtype of every column and the first row as an example case
print(attacks.dtypes)
print(attacks.iloc[0])

### Cleaning variables

#### Date

In [None]:
#explore how often each raw date value occurs
attacks["date"].value_counts()

In [None]:
 list(attacks["date"].unique())[:10]

In [None]:
#extract month from date
# Keep only the '-Mon-' token (hyphen, three non-digit characters, hyphen);
# rows whose date does not contain such a token become NaN.
# The pattern is a raw string so that \D is not treated as an (invalid) string escape.
attacks['date'] = attacks['date'].str.extract(r'(-\D{3}-)', expand=True)

In [None]:
# NOTE(review): regex_date_month is defined in src/cleaning.py (not shown); it is
# called right after the '-Mon-' token has been extracted into 'date' — confirm
# what it normalises and that it edits `attacks` in place.
cleaning.regex_date_month(attacks)

In [None]:
#remove - as separator
# NOTE(review): helper from src/cleaning.py (not shown); the return value is
# discarded, so it presumably strips the hyphens from 'date' in place — confirm.
cleaning.regex_separator_month(attacks,'date')

In [None]:
#create column month from the cleaned date column (which now holds only the month token)
attacks['month']=attacks['date']

In [None]:
#check the distinct month values left after cleaning
attacks["month"].unique()

In [None]:
#count the number of rows per month
attacks["month"].value_counts()

In [None]:
# NOTE(review): plot_month is defined in src/cleaning.py (not shown); presumably
# draws the per-month attack counts — confirm what it plots and where it saves.
cleaning.plot_month(attacks)

#### Year

In [None]:
# Year cannot be cast to int while it contains NaN, so rows without a year are
# dropped first and the column is then converted.
attacks.dropna(subset=['year'], inplace=True)
attacks['year'] = attacks['year'].astype(int)

In [None]:
# NOTE(review): plot_year is defined in src/cleaning.py (not shown); presumably
# plots the distribution of the 'year' column — confirm.
cleaning.plot_year(attacks, 'year')

In [None]:
# NOTE(review): plot_year_1850 is defined in src/cleaning.py (not shown); judging
# by the name and the > 1850 filter in the next cell it presumably plots years
# after 1850 only — confirm.
cleaning.plot_year_1850(attacks, 'year')

In [None]:
#plotting in another way - year
# Keep only attacks after 1850. The explicit .copy() makes the filtered frame
# independent, so the in-place edits in later cells (loc assignment, drop,
# new columns) do not raise SettingWithCopyWarning.
attacks = attacks.loc[attacks['year'] > 1850, :].copy()
# very wide canvas: one bar per year
fig, ax = plt.subplots(figsize=(80, 9))
sns.countplot(ax=ax, x=attacks.year)
fig.savefig('figures/EDA/Year.png')

#### Sex

In [None]:
#count how often each raw sex value occurs
attacks["sex"].value_counts()

In [None]:
#list the distinct sex values (including NaN, if present)
attacks["sex"].unique()

In [None]:
#cleaning values
# Normalise entries containing 'M ' (e.g. a trailing space) to plain 'M'.
attacks.loc[attacks['sex'].str.contains('M ', case=False, na=False), 'sex'] = 'M'
# Drop the junk value 'lli' and the rows with no recorded sex. Missing values
# are real NaN, which a comparison with the string 'nan' never matches, so
# isna() is needed; the literal 'nan' is still dropped in case it ever appears.
invalid_sex = attacks['sex'].isin(['lli', 'nan']) | attacks['sex'].isna()
attacks.drop(attacks[invalid_sex].index, inplace=True)

In [None]:
# re-check the sex value counts after cleaning
attacks["sex"].value_counts()

In [None]:
# Bar chart with the number of rows per sex value, written to figures/EDA.
fig, ax = plt.subplots(figsize=(15, 8))
sns.countplot(x=attacks['sex'], ax=ax)
fig.savefig('figures/EDA/Gender.png')

In [None]:
# Dummy variable for sex: 'M' becomes 1 and 'F' becomes 0; any other value is
# carried over unchanged.
attacks["sex_count"] = attacks["sex"].replace({'M': 1, 'F': 0})

In [None]:
# display the dataframe with the new sex_count column
attacks

#### Age

In [None]:
#count how often each raw age value occurs
attacks["age"].value_counts()

In [None]:
#explore the distinct raw age values (still strings at this point; digits are extracted in the next cell)
list(attacks["age"].unique())

In [None]:
#extract age
# Keep the first run of one or two digits found in each value; values without
# any digit become NaN. The pattern is a raw string so that \d is not treated
# as an (invalid) string escape.
attacks['age'] = attacks['age'].str.extract(r'(\d{1,2})', expand=True)

In [None]:
# check the distinct age values after the digit extraction
attacks["age"].unique()


In [None]:
# Rows where no age could be extracted are removed, after which the remaining
# digit strings are converted to integers.
attacks.dropna(subset=['age'], inplace=True)
attacks['age'] = attacks['age'].astype(int)

In [None]:
# Histogram of victim age using 30 bins, written to figures/EDA.
fig, ax = plt.subplots(figsize=(12, 8))
sns.histplot(x=attacks['age'], bins=30, ax=ax)
fig.savefig('figures/EDA/Age.png')

In [None]:
# Number of attacks per (age, sex) pair as a bar chart.
# The figure size and style are configured BEFORE plotting; set afterwards they
# would only affect figures created later, not this one.
sns.set(rc={'figure.figsize': (20.,15.)})
sns.set_style('whitegrid')
# NOTE(review): the two colours are applied bar by bar in sequence, so they only
# line up with sex when every age has the same sex values — confirm this is intended.
attacks.groupby(["age", "sex"])["sex"].count().plot(kind="bar", color=["slategray","coral"])

#### Country and hypothesis testing

In [None]:
#list the distinct country values
set(attacks["country"])

In [None]:
# Derive a hemisphere value for every row by passing its country to
# analysis.hemisphere.
attacks['hemisphere'] = attacks['country'].apply(analysis.hemisphere)

In [None]:
# Row by row, combine month and hemisphere into a season label via
# analysis.season_hemisphere, then display the result.
attacks['season_hemisphere'] = attacks.apply(
    lambda row: analysis.season_hemisphere(row['month'], row['hemisphere']),
    axis=1,
)
attacks

In [None]:
# NOTE(review): called without arguments, so `attacks` is not passed in; the
# function is defined in src/analysis.py (not shown) — confirm which data it reads.
analysis.hemisphere_count()

In [None]:
# NOTE(review): called without arguments, so `attacks` is not passed in; the
# function is defined in src/analysis.py (not shown) — confirm which data it plots.
analysis.plot_attacks_season()

In [None]:
# NOTE(review): called without arguments, so `attacks` is not passed in; the
# function is defined in src/analysis.py (not shown) — confirm which data it reads.
analysis.season_hemis_count()

In [None]:
#save the cleaned dataset, without the index column, to output/attacks_updated.csv
attacks.to_csv('output/attacks_updated.csv',index=False)