# PR02_Demonstration of Data Cleaning and Manipulation with Pandas 

### Import Pandas and load data ###

In [1]:
import pandas as pd
import numpy as np

In [2]:
# encoding option to deal with files in different formats
df = pd.read_csv('input/attacks.csv', encoding='ISO-8859-1')
df.head(5)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,


### Exploration of the original DataFrame ###

In [3]:
# dimension
df.shape

(25723, 24)

In [4]:
df.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

### Cleaning Text and Removing Special Characters ###

In [5]:
# making changes at the column's names: deleting white spaces/ lowercases
df.columns = df.columns.to_series().apply(lambda x: x.strip().lower())
df.columns

Index(['case number', 'date', 'year', 'type', 'country', 'area', 'location',
       'activity', 'name', 'sex', 'age', 'injury', 'fatal (y/n)', 'time',
       'species', 'investigator or source', 'pdf', 'href formula', 'href',
       'case number.1', 'case number.2', 'original order', 'unnamed: 22',
       'unnamed: 23'],
      dtype='object')

In [6]:
# rename 'case number column' to elimnate white spaces
df.rename(columns={'case number':'case_number'}, inplace=True)

### Drop all empy rows ###

In [7]:
# drop all the empty rows
df = df.dropna(how = 'all')
df.head(5)

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,...,species,investigator or source,pdf,href formula,href,case number.1,case number.2,original order,unnamed: 22,unnamed: 23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,


In [8]:
df.shape

(8703, 24)

### Searching for duplicate rows ###

In [9]:
# searching duplicates in the column'Case Number'
df['case_number'].duplicated().sum()

2415

In [10]:
df['case_number'].duplicated().value_counts()

False    6288
True     2415
Name: case_number, dtype: int64

In [11]:
# oops! 'case number'==0? 
duplicates = df[df['case_number'].duplicated(keep=False)]['case_number'].tolist()
# duplicates[:50]

In [12]:
# all thes columns can be eliminated because contain irrilevant data
df[df['case_number']=='0']

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,...,species,investigator or source,pdf,href formula,href,case number.1,case number.2,original order,unnamed: 22,unnamed: 23
6302,0,,,,,,,,,,...,,,,,,,,6304.0,,
6303,0,,,,,,,,,,...,,,,,,,,6305.0,,
6304,0,,,,,,,,,,...,,,,,,,,6306.0,,
6305,0,,,,,,,,,,...,,,,,,,,6307.0,,
6306,0,,,,,,,,,,...,,,,,,,,6308.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8697,0,,,,,,,,,,...,,,,,,,,,,
8698,0,,,,,,,,,,...,,,,,,,,,,
8699,0,,,,,,,,,,...,,,,,,,,,,
8700,0,,,,,,,,,,...,,,,,,,,,,


In [13]:
# keeping all the rows with at leas 14 No Nan values
df = df.dropna(thresh =14)
df.head(5)

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,...,species,investigator or source,pdf,href formula,href,case number.1,case number.2,original order,unnamed: 22,unnamed: 23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,


In [14]:
df.shape

(6302, 24)

In [15]:
# reduction of duplicates to 16
df['case_number'].duplicated().value_counts()

False    6286
True       16
Name: case_number, dtype: int64

In [16]:
duplicates = df[df['case_number'].duplicated(keep=False)].count()
duplicates

case_number               32
date                      32
year                      32
type                      32
country                   32
area                      29
location                  32
activity                  27
name                      30
sex                       31
age                       15
injury                    32
fatal (y/n)               27
time                      11
species                   17
investigator or source    32
pdf                       32
href formula              32
href                      32
case number.1             32
case number.2             32
original order            32
unnamed: 22                0
unnamed: 23                0
dtype: int64

In [17]:
#drop all duplicates considering all columns
df.drop_duplicates(subset=list(df.columns))
df.head(5)

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,...,species,investigator or source,pdf,href formula,href,case number.1,case number.2,original order,unnamed: 22,unnamed: 23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,


In [18]:
df.shape

(6302, 24)

In [19]:
# check!
df['case_number'].duplicated().sum()

16

### Missing Values ###

In [20]:
# number of nulls/columns
nulls_cols = df.isnull().sum()
nulls_cols

case_number                  1
date                         0
year                         2
type                         4
country                     50
area                       455
location                   540
activity                   544
name                       210
sex                        565
age                       2831
injury                      28
fatal (y/n)                539
time                      3354
species                   2838
investigator or source      17
pdf                          0
href formula                 1
href                         0
case number.1                0
case number.2                0
original order               0
unnamed: 22               6301
unnamed: 23               6300
dtype: int64

In [21]:
# hp: remove the columns with 40% of missing values (columns NOT relevant for my analysis)
# drop selected columns
drop_cols = list(nulls_cols[nulls_cols > 0.4 * 6302].index)
df = df.drop(drop_cols, axis=1, inplace=False)
df

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,injury,fatal (y/n),investigator or source,pdf,href formula,href,case number.1,case number.2,original order
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,"No injury to occupant, outrigger canoe and pad...",N,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,Minor injury to left thigh,N,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,Injury to left lower leg from surfboard skeg,N,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,Minor injury to lower leg,N,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,Lacerations to leg & hand shark PROVOKED INCIDENT,N,A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6297,ND.0005,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,M,FATAL,Y,"H. Taunton; N. Bartlett, p. 234",ND-0005-RoebuckBay.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0005,ND.0005,6.0
6298,ND.0004,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,Ahmun,M,FATAL,Y,"H. Taunton; N. Bartlett, pp. 233-234",ND-0004-Ahmun.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0004,ND.0004,5.0
6299,ND.0003,1900-1905,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,M,FATAL,Y,"F. Schwartz, p.23; C. Creswell, GSAF",ND-0003-Ocracoke_1900-1905.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0003,ND.0003,4.0
6300,ND.0002,1883-1889,0.0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,Jules Patterson,M,FATAL,Y,"The Sun, 10/20/1938",ND-0002-JulesPatterson.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0002,ND.0002,3.0


In [22]:
# let's analyse the others colums to see what's going on
# number of nulls/columns
nulls_cols = df.isnull().sum()
nulls_cols

case_number                 1
date                        0
year                        2
type                        4
country                    50
area                      455
location                  540
activity                  544
name                      210
sex                       565
injury                     28
fatal (y/n)               539
investigator or source     17
pdf                         0
href formula                1
href                        0
case number.1               0
case number.2               0
original order              0
dtype: int64

In [23]:
# INVESTIGATION!

In [24]:
# 'case number'
null_casenumb = df[(df['case_number'].isnull()==True)]
null_casenumb

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,injury,fatal (y/n),investigator or source,pdf,href formula,href,case number.1,case number.2,original order
5488,,Reported 06-Sep-1905,1905.0,Provoked,USA,Florida,"Fort Pierce, St Lucie County",Fishing,David Curry,M,Lacerations to leg from hooked shark PROVOKED ...,N,"Muncie Evening Press, 8/6/1913",1905.09.06.R-Curry.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1905.09.06.R,1905.09.06.R,815.0


In [25]:
# 'CASE_NUMBER'
df.iloc[5486:5491, :]

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,injury,fatal (y/n),investigator or source,pdf,href formula,href,case number.1,case number.2,original order
5486,1905.12.29,29-Dec-1905,1905.0,Invalid,AUSTRALIA,Western Australia,Geraldton,Bathing,Hugh Carroll,M,"""Bad wound in the leg"" - 7-ft shark caught in ...",,"The Advertiser, 12/30/1905",1905.12.29-Carroll.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1905.12.29,1905.12.29,817.0
5487,1905.09.29,29-Sep-1905,1905.0,Unprovoked,AUSTRALIA,New South Wales,"Waverly, Sydney",Swimming,Jame Crotty,M,FATAL. Shark involvement suspected but not con...,Y,"The Argus, 9/30/1905",1905.09.29-Crotty.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1905.09.29,1905.09.29,816.0
5488,,Reported 06-Sep-1905,1905.0,Provoked,USA,Florida,"Fort Pierce, St Lucie County",Fishing,David Curry,M,Lacerations to leg from hooked shark PROVOKED ...,N,"Muncie Evening Press, 8/6/1913",1905.09.06.R-Curry.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1905.09.06.R,1905.09.06.R,815.0
5489,1905.08.24,24-Aug-1905,1905.0,Invalid,EGYPT,Suez Canal,Port Said,Human head found in shark caught by British st...,,M,Probable drowning & scavenging.,,"C. Moore, GSAF",1905.08.24-Head-in-shark.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1905.08.24,1905.08.24,814.0
5490,1905.08.00.b,Late Aug-1905,1905.0,Unprovoked,USA,New Jersey,"Atlantic City, Atlantic County",Swimming from naptha launch after a day of fis...,George Wright,M,3 toes of right foot were severed,N,The Sun (undated article),1905.08.00.b-Wright.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1905.08.00.b,1905.08.00.b,813.0


In [26]:
# changing the missing 'case number' with the date of the attack
df.case_number = df.case_number.fillna('1905.09.06')

In [27]:
# YEAR
null_year = df[(df['year'].isnull()==True)]
null_year

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,injury,fatal (y/n),investigator or source,pdf,href formula,href,case number.1,case number.2,original order
187,2017.01.08.R,Reported 08-Jan-2017,,Invalid,AUSTRALIA,Queensland,,Spearfishing,Kerry Daniel,M,"No attack, shark made a threat display",,Liquid Vision 1/8/2017,2017.01.08.R-KerryDaniel.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2017.01.08.R,2017.01.08.R,6116.0
6079,1836.08.19.R,Reported 19-Aug-1836,,Unprovoked,ENGLAND,Cumberland,Whitehaven,Swimming,a boy,M,FATAL,Y,"C. Moore, GSAF",1835.08.19.R-Whitehaven.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1836.08.19.R,1836.08.19.R,224.0


In [28]:
df.year = df.year.fillna(0)

In [29]:
# check the 'year' column data type
df.year.dtype

dtype('float64')

In [30]:
# change from float to int
df.year = df.year.astype(int)
df.year.dtype

dtype('int32')

In [31]:
df.loc[187].replace(0, 2017)
df.loc[6079].replace(0, 1836)

case_number                                                    1836.08.19.R
date                                                   Reported 19-Aug-1836
year                                                                   1836
type                                                             Unprovoked
country                                                             ENGLAND
area                                                             Cumberland
location                                                         Whitehaven
activity                                                           Swimming
name                                                                  a boy
sex                                                                       M
injury                                                                FATAL
fatal (y/n)                                                               Y
investigator or source                                       C. Moore, GSAF
pdf         

In [32]:
# TYPE
null_type = df[(df['type'].isnull()==True)]
# creation a subset with the activity people were doing --> maybe there is a correlation
null_type = null_type[['type','activity']]
null_type

Unnamed: 0,type,activity
85,,Fishing
382,,Surfing
4867,,Wreck of a sampam
5705,,Diving


In [33]:
# comparing similar activities, it is reasonable to think that all the missing activities are 'Unprovoked'
df.groupby('activity').get_group('Fishing')
df.groupby('activity').get_group('Surfing')
df.groupby('activity').get_group('Diving')
df.groupby('activity').get_group('Wreck of a sampam')

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,injury,fatal (y/n),investigator or source,pdf,href formula,href,case number.1,case number.2,original order
4867,1936.09.11.R,Reported 11-Sep-1936,1936,,VIETNAM,,Saigon,Wreck of a sampam,8 crew,M,FATAL,Y,"Lansing State Journal, 9/11/1936",1936.09.11-Saigon.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1936.09.11.R,1936.09.11.R,1436.0


In [34]:
# replace missing values with 'Unprovoked'
df.type = df.type.fillna('Unprovoked')

In [35]:
# COUNTRY, AREA, LOCATION
# 
df[(df['country'].isnull()==True) & (df['area'].isnull()==True) & (df['location'].isnull()==True)]

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,injury,fatal (y/n),investigator or source,pdf,href formula,href,case number.1,case number.2,original order
62,2017.11.13.R,Reported 13-Nov-2017,2017,Unprovoked,,,,Surfing,Timur Yunusov,M,Puncture wounds to feet,N,Instagram,2017.11.13.R-Timur.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2017.11.13.R,2017.11.13.R,6241.0
525,2014.08.00,Aug-2014,2014,Invalid,,,,Sea disaster,Cuban refugees,M,Shark involvement prior to death not confirmed,,"Associated Press, 11/27/2014",2014.08.00-Cuban-refugees.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2014.08.00,2014.08.00,5778.0
3378,1970.11.00,Nov-1970,1970,Unprovoked,,,,,Heinz Plotsky,M,Extensive injuries,N,"H.D. Baldridge (1994), SAF Case #1645",1970.11.00-NV-Plotsky.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1970.11.00,1970.11.00,2925.0
3388,1970.07.05,5-Jul-1970,1970,Unprovoked,,,,,male,M,Finger or toe severed,N,"H.D. Baldridge (1994), SAF Case #1628",1970.07.05-NV-male.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1970.07.05,1970.07.05,2915.0
3395,1970.04.00.b,Apr-1970,1970,Provoked,,,,Freediving,Lionel Jarvis,M,Arm abraded & lacerated. Recorded as PROVOKED ...,N,"H.D. Baldridge (1994), SAF Case #1616",1970.04.00.b-NV-Jarvis.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1970.04.00.b,1970.04.00.b,2908.0
3399,1970.02.05,5-Feb-1970,1970,Unprovoked,,,,Wading,Sally Anne Irvine,F,Lacerations to lower leg,N,H.D. Baldridge (1994) SAF Case #1626,1970.02.05-NV-Irvine.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1970.02.05,1970.02.05,2904.0
3425,1969.08.00,Aug-1969,1969,Unprovoked,,,,,Rodney Hughes,M,Am lacerated,N,H.D. Baldridge (1994) SAF Case #1602,1969.08.00-NV-Hughes.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1969.08.00,1969.08.00,2878.0
3661,1964.09.27,27-Sep-1964,1964,Invalid,,,,Spearfishing,Giancarlo Griffon,M,"Disappeared, probable drowning but sharks in a...",,C. Moore. GSAF,1964.09.27-Griffon.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1964.09.27,1964.09.27,2642.0
4726,1942.00.00.e,1942,1942,Sea Disaster,,,,Jumped overboard from torpedoed Panamanian fre...,male,M,FATAL,Y,"V.M. Coppleson (1962), p.258",1942.00.00.e-seaman.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1942.00.00.e,1942.00.00.e,1577.0
4729,1942.00.00.b,1942,1942,Boating,,,,"Days before the surrender of Singapore, the 3 ...","Bombardier J. Hall, Private Green of the Sherw...",M,No injury to occupants. Sharks continually fol...,N,"V.M. Coppleson (1962), p.206",1942.00.00.b-Hall-Green-Jennings.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1942.00.00.b,1942.00.00.b,1574.0


In [36]:
# if the location, area, and country are missing --> 'Unknown'. It's impossible to know the position.
df.loc[(df['country'].isnull()==True) & (df['area'].isnull()==True) & (df['location'].isnull()==True),['country','area','location']] = 'Unknown'

In [37]:
# LOCATION Impossible to know --> 'No location'
null_location = df[(df['location'].isnull()==True)]

In [38]:
df.loc[(df['location'].isnull()==True), ['location']] = 'No Location'

In [39]:
# AREA Impossible to specify it --> 'No area'
null_area = df[(df['area'].isnull()==True)]

In [40]:
df.loc[(df['area'].isnull()==True), ['area']] = 'No Area'

In [41]:
# COUNTRY 

In [42]:
null_country = df[(df['country'].isnull()==True)]
null_country

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,injury,fatal (y/n),investigator or source,pdf,href formula,href,case number.1,case number.2,original order
2956,1983.00.00.d,Ca. 1983,1983,Unprovoked,,English Channel,No Location,Swimming,Padma Shri Taranath Narayan Shenoy,M,Left leg bitten,N,"Times of India, 2/5/2012",1983.00.00.d-Shenoy.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1983.00.00.d,1983.00.00.d,3347.0
3387,1970.08.02,02-Aug-1970,1970,Invalid,,Caribbean Sea,Between St. Kitts & Nevis,Sea Disaster Sinking of ferryboat Christina,,,"Sharks scavenged on bodies, but no record of t...",,"Rome News Tribune, 8/3/1970",1970.08.02-Christina-ferryboat.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1970.08.02,1970.08.02,2916.0
3605,1965.10.21,21-Oct-1965,1965,Unprovoked,,No Area,Florida Strait,The boat Caribou II sank,Mario Castellanos,M,Survived,N,"Lodi News Sentinel, 10/30/1965",1965.10.21-Castellanos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1965.10.21,1965.10.21,2698.0
4018,1960.01.26,26-Jan-1960,1960,Sea Disaster,,"Between Timor & Darwin, Australia",No Location,Portuguese Airliner with 9 people aboard went ...,,,"As searchers approached wreckage, sharks circl...",N,"V.M. Coppleson (1962), p.260",1960.01.26-Portuguese airliner.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1960.01.26,1960.01.26,2285.0
4231,1956.09.13,13-Sep-1956,1956,Unprovoked,,Near the Andaman & Nicobar Islands,No Location,Climbing back on ship,male,M,FATAL,Y,M. Hosina,1956.09.13-TunaBoat.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1956.09.13,1956.09.13,2072.0
4266,1956.00.00.g,1956,1956,Sea Disaster,,Between Comores & Madagascar,Geyser Bank,Shipwreck,"Captain Eric Hunt, the cook & a French passenger",M,FATAL,Y,dinofish.com,1956.00.00.g-Capt-Hunt.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1956.00.00.g,1956.00.00.g,2037.0
4498,1949.12.00.b,Dec-1949,1949,Sea Disaster,,Caribbean Sea,Between Cuba & Costa Rica,"Sea Disaster, sinking of the motorship Wingate","Albert Battles, James Dean & 4 crew",M,Fatal or drowning or scavenging,Y,"Canberra Times, 1/6/1950",1949.12.00.b-Wingate.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1949.12.00.b,1949.12.00.b,1805.0
4639,1944.10.24,24-Oct-1944,1944,Sea Disaster,,No Area,225 miles east of Hong Kong,Japanese POW ship Arisan Maru with 1800 Americ...,,M,Most of the men drowned & some were taken by s...,Y,internet (multiple),1944.10.24-ArisanMaru.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1944.10.24,1944.10.24,1664.0
4700,1942.11.00.a,Nov-1942,1942,Sea Disaster,,Off South American coast,No Location,Dutch merchant ship Zaandam torpedoed by the ...,,M,FATAL,Y,"M. Murphy; V.M. Coppleson (1962), pp.207-208",1942.11.00.a-Izzi.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1942.11.00.a,1942.11.00.a,1603.0
4712,1942.06.00,Jun-1942,1942,Unprovoked,,300 miles east of St. Thomas (Virgin Islands),No Location,On life raft tethered to lifeboat. A seaman pu...,male,M,Forearm lacerated,N,"V.M. Coppleson (1962), p.258",1942.06.00-on-life-raft.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1942.06.00,1942.06.00,1591.0


In [43]:
b = df[df['area'] == 'Ionian Sea']
b

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,injury,fatal (y/n),investigator or source,pdf,href formula,href,case number.1,case number.2,original order
6175,0077.00.00,77 A.D.,77,Unprovoked,,Ionian Sea,No Location,Sponge diving,males,M,FATAL,Y,Perils mentioned by Pliny the Elder (23 A.D. t...,77AD-Pliny.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,0077.00.00,0077.00.00,128.0
6177,0000.0214,Ca. 214 B.C.,0,Unprovoked,,Ionian Sea,No Location,Ascending from a dive,"Tharsys, a sponge diver",M,"FATAL, shark/s bit him in two",Y,"Reported by Greek poet, Leonidas of Tarentum (...",214BC-Tharsys.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,0000.0214,0000.0214,126.0


In [52]:
# Mediterranean Sea
df.loc[(df['area']=='Ionian Sea') & (df['area']=='Mediterranean Sea'),'country'] = 'MEDITERRANEAN SEA'

In [45]:
df['country'].unique()

array(['USA', 'AUSTRALIA', 'MEXICO', 'BRAZIL', 'ENGLAND', 'SOUTH AFRICA',
       'THAILAND', 'COSTA RICA', 'MALDIVES', 'BAHAMAS', 'NEW CALEDONIA',
       'ECUADOR', 'MALAYSIA', 'LIBYA', 'Unknown', 'CUBA', 'MAURITIUS',
       'NEW ZEALAND', 'SPAIN', 'SAMOA', 'SOLOMON ISLANDS', 'JAPAN',
       'EGYPT', 'ST HELENA, British overseas territory', 'COMOROS',
       'REUNION', 'FRENCH POLYNESIA', 'UNITED KINGDOM',
       'UNITED ARAB EMIRATES', 'PHILIPPINES', 'INDONESIA', 'CHINA',
       'COLUMBIA', 'CAPE VERDE', 'Fiji', 'DOMINICAN REPUBLIC',
       'CAYMAN ISLANDS', 'ARUBA', 'MOZAMBIQUE', 'FIJI', 'PUERTO RICO',
       'ITALY', 'ATLANTIC OCEAN', 'GREECE', 'ST. MARTIN', 'FRANCE',
       'PAPUA NEW GUINEA', 'TRINIDAD & TOBAGO', 'KIRIBATI', 'ISRAEL',
       'DIEGO GARCIA', 'TAIWAN', 'JAMAICA', 'PALESTINIAN TERRITORIES',
       'GUAM', 'SEYCHELLES', 'BELIZE', 'NIGERIA', 'TONGA', 'SCOTLAND',
       'CANADA', 'CROATIA', 'SAUDI ARABIA', 'CHILE', 'ANTIGUA', 'KENYA',
       'RUSSIA', 'TURKS & CAICOS', 

In [46]:
df.groupby('country','area')

ValueError: No axis named area for object type <class 'pandas.core.frame.DataFrame'>

In [None]:
# INVESTIGATOR OR SOURCE
# I don't think data of this column are not so relevant... I'll replace nulls with a 'Unknown' (because all the other sources are known)
null_inv = df[(df['investigator or source'].isnull()==True)]

In [None]:
investigators = df['investigator or source'].unique().tolist()

for name in investigators:
    if name != 'Unknown':
        print(f'The investigator/souce is known')
        break
    else:
        print(f'Unknown')

In [None]:
# replace 0 with 'Unknown'
df['investigator or source'] = df['investigator or source'].fillna('Unknown')

In [None]:
# HREF FORMULA
# like above
null_hrefform = df[(df['href formula'].isnull()==True)]
null_hrefform

In [None]:
# df.iloc[3243:3246, :]

In [None]:
df['href formula'] = df['href formula'].fillna(0)

In [53]:
nulls_cols = df.isnull().sum()
nulls_cols

case_number                 0
date                        0
year                        0
type                        0
country                    21
area                        0
location                    0
activity                  544
name                      210
sex                       565
injury                     28
fatal (y/n)               539
investigator or source     17
pdf                         0
href formula                1
href                        0
case number.1               0
case number.2               0
original order              0
dtype: int64

### Final Summary ###

In [None]:
# summary
df.info()

In [None]:
# At this point, a good practice is to save the reduced file 'attacks-reduced.csv' for consulting it, if necessary.
df.to_csv('input/attacks-reduced.csv')

## Analysis

In [None]:
# selection of the columns to use in the analysis
# df = df[['case_number','date','year','type','country','area','activity','sex','injury','fatal (y/n)']]
# df.head()