In [1]:
import numpy as np
import pandas as pd

In [2]:
# read downloaded csv file. Encountered "UnicodeDecodeError: 'utf-8'.....". 
# With read_csv method we can chooose encoding option to deal with different format. 


df = pd.read_csv("shark_attack.csv", encoding = "ISO-8859-1")

In [95]:
# Check datatypes 
df.dtypes

Case Number               object
Date                      object
Year                       int64
Type                      object
Country                   object
Area                      object
Location                  object
Activity                  object
Name                      object
Sex                       object
Injury                    object
Fatal (Y/N)               object
Investigator or Source    object
href                      object
dtype: object

# Approaches:
    1. Handling Missing Values
    2. Check duplicated Columns
    3. Redefine index
    4. Clean up data in several columns e.g Date, Year
    5. Split columns
    6. Check duplicated rows 
    7. Rename and format columns
    8. Rearrange columns

### 1) Missing Values 

In [4]:
# Check percentage of missing values in each column 

cols_missing_values = ((df.isnull().sum())[df.isnull().sum()>0]/len(df)).sort_values()
cols_missing_values

href formula              0.000167
href                      0.000501
Investigator or Source    0.002503
Fatal (Y/N)               0.003171
Injury                    0.004506
Country                   0.007176
Name                      0.033378
Area                      0.067089
Location                  0.082777
Activity                  0.087951
Sex                       0.094626
Age                       0.447430
Species                   0.489653
Time                      0.536215
Unnamed: 23               0.999666
Unnamed: 22               0.999833
dtype: float64

In [5]:
# 5 columns contains over 40% of missing values. It's safe to say that they are no longer representative for analysis. 
# Decision: dropping 5 columns 
# *'Time' column will be very useful to analysize behavior of sharks. However % of missing values are too high. 
#   Decided to still drop it. 

In [6]:
# Create to drop list with columns to be dropped 

to_drop = cols_missing_values[cols_missing_values > 0.4].index
to_drop

Index(['Age', 'Species ', 'Time', 'Unnamed: 23', 'Unnamed: 22'], dtype='object')

In [7]:
df.drop(to_drop,axis=1,inplace=True)

In [8]:
df.isnull().sum()[df.isnull().sum()>0]

Country                    43
Area                      402
Location                  496
Activity                  527
Name                      200
Sex                       567
Injury                     27
Fatal (Y/N)                19
Investigator or Source     15
href formula                1
href                        3
dtype: int64

In [9]:
# For columns that are not related to other columns: Activity, Name, Sex, Investigator, href formula, href 
#   or columns that are the 'base' of other columns e.g. Location, Injury
#  --- > Fill in 'N/A' or 'Unknown' for missing values since they are not available. 

df['Location'][df['Location'].isnull()] = 'Unknown'
df['Activity'][df['Activity'].isnull()] = 'Unknown'
df['Name'][df['Name'].isnull()] = 'Unknown'
df['Sex '][df['Sex '].isnull()] = 'Unknown'
df['Injury'][df['Injury'].isnull()] = 'Unknown'
df['href formula'][df['href formula'].isnull()] = 'N/A'
df['href'][df['href'].isnull()] = 'N/A'
df['Investigator or Source'][df['Investigator or Source'].isnull()] = 'Unknown'


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_g

In [10]:
df.isnull().sum()[df.isnull().sum()>0]

Country         43
Area           402
Fatal (Y/N)     19
dtype: int64

In [11]:
# Now we can also assign 'N/A' for values of 'Area' and 'Country' for records whose 'Location' is N/A

df['Area'][(df['Area'].isnull()) & (df['Location']=='Unknown')] = 'Unknown'
df['Country'][(df['Area']=='Unknown') & (df['Location']=='Unknown')] = 'Unknown'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [12]:
df.isnull().sum()[df.isnull().sum()>0]/len(df)

Country        0.003171
Area           0.031208
Fatal (Y/N)    0.003171
dtype: float64

In [13]:
#Check if we can determine the country with values of Area and Location since no. of missing values is not too big 
df[['Country','Area','Location']][df['Country'].isnull()]

Unnamed: 0,Country,Area,Location
2731,,English Channel,Unknown
3162,,Caribbean Sea,Between St. Kitts & Nevis
3379,,,Florida Strait
3792,,"Between Timor & Darwin, Australia",Unknown
4005,,Near the Andaman & Nicobar Islands,Unknown
4040,,Between Comores & Madagascar,Geyser Bank
4271,,Caribbean Sea,Between Cuba & Costa Rica
4412,,,225 miles east of Hong Kong
4473,,Off South American coast,Unknown
4485,,300 miles east of St. Thomas (Virgin Islands),Unknown


In [14]:
# Insert known values determined by area and location

input_countries = {3792:'AUSTRALIA',4412:'HONG KONG',4790: 'FRANCE',5560:'USA',5847:'BARBADOS'}

for x,y in input_countries.items():
    df.loc[x,"Country"]=y



In [15]:
df[['Country','Area','Location']][df['Country'].isnull()]


Unnamed: 0,Country,Area,Location
2731,,English Channel,Unknown
3162,,Caribbean Sea,Between St. Kitts & Nevis
3379,,,Florida Strait
4005,,Near the Andaman & Nicobar Islands,Unknown
4040,,Between Comores & Madagascar,Geyser Bank
4271,,Caribbean Sea,Between Cuba & Costa Rica
4473,,Off South American coast,Unknown
4485,,300 miles east of St. Thomas (Virgin Islands),Unknown
5189,,,Near the equator
5370,,Mediterranean Sea,Unknown


In [16]:
# The rest of cases tend to happen in international seas
# Assign 'INTERNATIONAL_WATER' in Country columns for those records to indicate that


df['Country'][df['Country'].isnull()] = "INTERNATIONAL_WATER"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [17]:
df.isnull().sum()[df.isnull().sum()>0]/len(df)

Area           0.031208
Fatal (Y/N)    0.003171
dtype: float64

In [18]:
# Looking into the locations values for the rest of missing values of 'Area', 
# inputs are usually still not precise enough for us to determine the Area. 
# Assign 'Unknown' 
df['Area'][df['Area'].isnull()] ='Unknown'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [19]:
# Colunm 'Fatal' should contain only Y/N values 
# Assign 'N' to 'Fatal' if 'Injury' is unknown. 

df['Fatal (Y/N)'][df['Injury']=='Unknown'] = 'N'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [20]:
# Looking into injuries data, A lot of no injuries and others cases victims are dead... 
# Since the remaining percentage of missing values is very low, assign 'N' to the rest of missing values 
# df['Fatal (Y/N)'][df['Fatal (Y/N)'].isnull()] = 'N'

In [21]:
# Checking the injury values and determine the missing values in Fatal column if possible
df['Injury'][df['Fatal (Y/N)'].isnull()]

54      No injury, but sharks repeatedly hit their fin...
1844      Reported as shark attack but probable drowning 
2449                                                FATAL
3280    Diver shot the shark, then it injured his arm ...
3435    Disappeared, probable drowning but sharks in a...
3901                                         Boat damaged
4107    No injury to occupants. Shark tore nets & traw...
4112                         Human remains found in shark
5307       Disappeared, but shark involvement unconfirmed
5437                                 No injury, no attack
5468              Human remains found in 4m, 900 kg shark
5642                                            No injury
5699                     Possible drowning and scavenging
5718                                                FATAL
5747                     Death preceded shark involvement
5793                 Shark caught contained human remains
5794                Shark caught, contained human remains
5820          

In [22]:
fatal_value = {5939:'N',4149:'N',2713:'N',2092:'N',1886:'N',556:'N',351:'N',294:'N',173:'N'}

for x,y in fatal_value.items():
    df.loc[x,"Fatal (Y/N)"]=y

In [23]:
df['Fatal (Y/N)'][df['Fatal (Y/N)'].isnull()]='Y'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [24]:
df.isnull().sum()

Case Number               0
Date                      0
Year                      0
Type                      0
Country                   0
Area                      0
Location                  0
Activity                  0
Name                      0
Sex                       0
Injury                    0
Fatal (Y/N)               0
Investigator or Source    0
pdf                       0
href formula              0
href                      0
Case Number.1             0
Case Number.2             0
original order            0
dtype: int64

### 2. Verify supected duplicates of columns
Suspects:
   - 'Case Number', 'Case Number.1','Case Number.2' 
   - 'pdf','href formula','href'

In [25]:
# Checked quickly if they are identical, result is False
df['Case Number'].equals(df['Case Number.1'])
df['Case Number.1'].equals(df['Case Number.2'])
df['Case Number.2'].equals(df['Case Number'])

False

In [26]:
# Locate the difference
df.loc[(df['Case Number']==df['Case Number.1'])==False][['Case Number','Case Number.1','Case Number.2']]


Unnamed: 0,Case Number,Case Number.1,Case Number.2
4,2016.09.15,2016.09.16,2016.09.15
33,2016.07.14.4,2016.07.14.R,2016.07.14.4
97,2016.01.24.b,2015.01.24.b,2016.01.24.b
116,2015.12.23,2015.11.07,2015.12.23
121,2015.10.28.a,2015.10.28,2015.10.28.a
169,2015.07-10,2015.07.10,2015.07.10
3296,1967.07.05,1967/07.05,1967.07.05
3569,"1962,08.30.b",1962.08.30.b,"1962,08.30.b"
3654,1961.09.02.R,"1961.09,06.R",1961.09.02.R
4177,1952.08.05,1952.08.04,1952.08.05


In [27]:
# Count of difference is really minor and Column'Case Number' contains either values in Case number 1 or 2
# Keep Case number and drop the other two
df.drop(['Case Number.1','Case Number.2'],axis=1,inplace=True)

In [28]:
df.loc[(df['href']==df['href formula'])==False][['href','href formula']].count()/len(df)


href            0.009012
href formula    0.009012
dtype: float64

In [29]:
df.loc[(df['href']==df['href formula'])==False][['href','href formula']]

Unnamed: 0,href,href formula
20,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
27,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
61,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
107,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
114,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
134,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
180,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
193,,http://sharkattackfile.net/spreadsheets/pdf_di...
232,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
262,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...


In [30]:
# 'href' and 'href formula' are almost identical. A few values are missing in either one of the column. 
# Combine the values in the two columns then drop 'href formula'

combine_values = {(193,'href'):(193,'href formula'),
                  (365,'href'):(365,'href formula'),
                  (448,'href formula'):(448,'href'),
                  (2274,'href'):(2274,'href formula'),
                  (3019,'href formula'):(3019,'href'),
                  (5686,'href'):(5686,'href formula')}

for x,y in combine_values.items():
    df.loc[x]=df.loc[y]




In [31]:
df.drop('href formula',axis=1,inplace=True)

In [33]:
# Comparing the values in 'pdf' and in 'href', values in 'pdf' tend to be the filenames of the path in 'href'. 
# Check percentage of values in 'href' contains values of 'pdf' 
pd.Series([b in a for a , b in zip(df['href'], df['pdf'])]).value_counts()/len(df)

# Result: Around 99.5% of values in 'href' contains the values in 'pdf'
# Action: Drop 'pdf'
# Reason: 99.5% chances that the user can refer to 'href' to find the path to pdf.file of the shark attack cases.

True     0.99466
False    0.00534
dtype: float64

In [34]:
df.drop('pdf',axis=1,inplace=True)

### 3. Reset index
- Looking into Case number, they contain usually date of the attach cases. However the user input is too inconsistant and therefore it is not recommended to use this column as index reference. Nonetheless we will keep this column as it provides information in case the input in date column is not complete. 
- Column 'original order' seems to be a good candidate. They seem to follow the order when attacks were reported. 

In [35]:
# Check value counts in 'original order' 
df['original order'].value_counts().sort_values(ascending=False)

5661    2
569     2
3847    2
5739    2
2057    1
       ..
2031    1
4082    1
2035    1
4086    1
3385    1
Name: original order, Length: 5988, dtype: int64

In [36]:
# Out of 5988 records, values of 4 records are duplicated. 
# Action: drop one of the duplicate cases, sort the column and make it as index 

df[(df['original order']==5661) | (df['original order']==569)|(df['original order']==3847) | (df['original order']==5739)]

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Injury,Fatal (Y/N),Investigator or Source,href,original order
253,2014.11.16,16-Nov-14,2014,Unprovoked,USA,Florida,"Indian Harbor Beach, Brevard County",Surfing,male,M,Laceration to left hand,N,"USA Today, 11/16/2014",http://sharkattackfile.net/spreadsheets/pdf_di...,5739
254,2014.11.17,Reported 17-Nov-2014,2014,Boat,USA,California,"Franklin Point, San Mateo County",Unknown,Boat: occupants: Matt Mitchell & 2 other people,Unknown,"Shark bumped boat, no injury to occupants",N,"Inquisitr, 11/17/2014",http://sharkattackfile.net/spreadsheets/pdf_di...,5739
331,2014.05.22,22-May-14,2014,Provoked,AUSTRALIA,New South Wales,The Australian Shark and Ray Centre,Teasing a shark,male,M,Cut to tip of finger by a captive shark PROVOK...,N,"Newcastle Herald, 5/22/2014",http://sharkattackfile.net/spreadsheets/pdf_di...,5661
332,2015.11.20,20-Nov-15,2015,Unprovoked,ECUADOR,Galapagos Islands,"Punta Vicente Roca, Isabella Island",Snorkeling,Graham Hurley,M,Lacerations to left calf,N,G. Hurley,http://sharkattackfile.net/spreadsheets/pdf_di...,5661
2146,1984.11.08,08-Nov-84,1984,Unprovoked,SOUTH AFRICA,Eastern Cape Province,Gonubie River Mouth,Surfing,Wayne Monk,M,Wetsuit lacerated,N,"W. Monk, M. Levine, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...,3847
2147,1995.07.07,07-Jul-95,1995,Unprovoked,BRAZIL,Pernambuco,Candeias,Surfing,Clélio Rosendo Falcão Filho,M,"Arm bitten, FATAL",Y,JCOnline,http://sharkattackfile.net/spreadsheets/pdf_di...,3847
5423,1893.06.22,22-Jun-1893,1893,Unprovoked,LEBANON,Unknown,Off Tripoli,HMS Victoria collided with the HMS Camperdown,crew,M,FATAL,Y,"The Sydney Morning Herald, 6/29/1893",http://sharkattackfile.net/spreadsheets/pdf_di...,569
5424,1893.06.22.R,Reported 22-Jun-1893,1893,Unprovoked,NIGERIA,Bayelsa State,Mouth of the Nun River,A barque wrecked,mate & crew,M,FATAL,Y,"Otago Witness, 6/22/1893",http://sharkattackfile.net/spreadsheets/pdf_di...,569


In [37]:
rows_to_drop = [254,331,2146,5424]
df = df.drop(rows_to_drop)

In [38]:
df['original order'].is_unique

True

In [39]:
df = df.set_index('original order',drop=True)

### 4. Clear data contained in the columns


1. Date
    - 'reported' appeared multiple times in date and provide no useful information
    - remove 'reported'


In [41]:
remove_reported = {'Reported ':'',
                   'Reported':'',
                   'reported ':'',
                   'repored':''}

for x,y in remove_reported.items():
    df['Date'] = df['Date'].str.replace(x,y)

In [56]:
df['Date']

original order
5993      18-Sep-16
5992      18-Sep-16
5991      18-Sep-16
5990      17-Sep-16
5989      16-Sep-16
           ...     
6       Before 1903
5       Before 1903
4         1900-1905
3         1883-1889
2         1845-1853
Name: Date, Length: 5988, dtype: object

2. Country
    - Capitalize the first letter


In [57]:
df['Country']=df['Country'].str.capitalize()

3. Sex
    - Remove extra space and incomprehensible values

In [44]:
df['Sex '].value_counts()

M          4832
F           585
Unknown     566
M             2
lli           1
.             1
N             1
Name: Sex , dtype: int64

In [45]:
df['Sex '] = df['Sex '].str.replace(' ','')

In [46]:
df['Sex '][(df['Sex ']!='M') & (df['Sex ']!='F') & (df['Sex ']!='Unknown')] = 'Unknown'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [74]:
df['Sex '].value_counts()

M          4834
F           585
Unknown     569
Name: Sex , dtype: int64

4. Fatal (Y/N)
    - Replace unwanted format and empty value

In [59]:
df['Fatal (Y/N)'].value_counts()

N          4341
Y          1569
UNKNOWN      66
 N            8
n             1
#VALUE!       1
F             1
N             1
Name: Fatal (Y/N), dtype: int64

In [72]:
df['Fatal (Y/N)'].str.replace(' ','').str.replace('F','Y').str.replace('n','N').str.replace('#VALUE!','UNKNOWN').value_counts()

N          4351
Y          1570
UNKNOWN      67
Name: Fatal (Y/N), dtype: int64

In [70]:
df['Fatal (Y/N)'].str.replace('#VALUE!','')

original order
5993    N
5992    N
5991    N
5990    N
5989    N
       ..
6       Y
5       Y
4       Y
3       Y
2       Y
Name: Fatal (Y/N), Length: 5988, dtype: object

### 5. Split columns
    - 'Investigator or Source' Column contains several informations in each value
    - Split column into two:
        - Investigator or Source
        - Reference page/date

In [106]:
df_copy = df.copy()

In [108]:
df_copy['Investigator or Source'].str.split(pat=', ',expand=True)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
original order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
5993,Orlando Sentinel,9/19/2016,,,,,,,,
5992,Orlando Sentinel,9/19/2016,,,,,,,,
5991,Orlando Sentinel,9/19/2016,,,,,,,,
5990,The Age,9/18/2016,,,,,,,,
5989,The Age,9/16/2016,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
6,H. Taunton; N. Bartlett,p. 234,,,,,,,,
5,H. Taunton; N. Bartlett,pp. 233-234,,,,,,,,
4,F. Schwartz,p.23; C. Creswell,GSAF,,,,,,,
3,The Sun,10/20/1938,,,,,,,,


In [119]:
# Keep the first two columns with more important information
df['Investigator or Source'] = df_copy['Investigator or Source'].str.split(pat=', ',expand=True)[0]

In [121]:
df = df.assign(Ref_date_or_page=df_copy['Investigator or Source'].str.split(pat=', ',expand=True)[1])

### 6. Check duplicated rows 

In [122]:
df.duplicated().sum()

0

### 7. Rename and format columns 
    - Rename 'href' to 'Case file path
    - Remove unessary space in 'Sex ' column name

In [123]:
df.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Injury', 'Fatal (Y/N)',
       'Investigator or Source', 'href', 'Ref_date_or_page'],
      dtype='object')

In [128]:
df.columns = ['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex', 'Injury', 'Fatal (Y/N)',
       'Investigator or Source','Case file path','Ref date or page']

In [129]:
df.head()

Unnamed: 0_level_0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Injury,Fatal (Y/N),Investigator or Source,Case file path,Ref date or page
original order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
5993,2016.09.18.c,18-Sep-16,2016,Unprovoked,Usa,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,Minor injury to thigh,N,Orlando Sentinel,http://sharkattackfile.net/spreadsheets/pdf_di...,9/19/2016
5992,2016.09.18.b,18-Sep-16,2016,Unprovoked,Usa,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,Lacerations to hands,N,Orlando Sentinel,http://sharkattackfile.net/spreadsheets/pdf_di...,9/19/2016
5991,2016.09.18.a,18-Sep-16,2016,Unprovoked,Usa,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,Lacerations to lower leg,N,Orlando Sentinel,http://sharkattackfile.net/spreadsheets/pdf_di...,9/19/2016
5990,2016.09.17,17-Sep-16,2016,Unprovoked,Australia,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,Struck by fin on chest & leg,N,The Age,http://sharkattackfile.net/spreadsheets/pdf_di...,9/18/2016
5989,2016.09.15,16-Sep-16,2016,Unprovoked,Australia,Victoria,Bells Beach,Surfing,male,M,No injury: Knocked off board by shark,N,The Age,http://sharkattackfile.net/spreadsheets/pdf_di...,9/16/2016


### 8. Rearrange columns

    - Move 'Ref date or page' after 'Investigator or Source'

In [132]:
df = df[['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex', 'Injury', 'Fatal (Y/N)',
       'Investigator or Source','Ref date or page','Case file path']]

In [134]:
df.head()

Unnamed: 0_level_0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Injury,Fatal (Y/N),Investigator or Source,Ref date or page,Case file path
original order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
5993,2016.09.18.c,18-Sep-16,2016,Unprovoked,Usa,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,Minor injury to thigh,N,Orlando Sentinel,9/19/2016,http://sharkattackfile.net/spreadsheets/pdf_di...
5992,2016.09.18.b,18-Sep-16,2016,Unprovoked,Usa,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,Lacerations to hands,N,Orlando Sentinel,9/19/2016,http://sharkattackfile.net/spreadsheets/pdf_di...
5991,2016.09.18.a,18-Sep-16,2016,Unprovoked,Usa,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,Lacerations to lower leg,N,Orlando Sentinel,9/19/2016,http://sharkattackfile.net/spreadsheets/pdf_di...
5990,2016.09.17,17-Sep-16,2016,Unprovoked,Australia,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,Struck by fin on chest & leg,N,The Age,9/18/2016,http://sharkattackfile.net/spreadsheets/pdf_di...
5989,2016.09.15,16-Sep-16,2016,Unprovoked,Australia,Victoria,Bells Beach,Surfing,male,M,No injury: Knocked off board by shark,N,The Age,9/16/2016,http://sharkattackfile.net/spreadsheets/pdf_di...


## Export to pickle and csv files

In [135]:
df.to_pickle('cleaned_shark_attacks.pkl')

In [136]:
df.to_csv('cleaned_shark_attacks.csv')