# 1. First Things First
(importing libraries and taking a first look at the data)

In [1]:
# I start with importing pandas, numpy, pymysql 
# to later perform operations with the aid of the pandas library, numpy and pymysql.

import pandas as pd
import numpy as np
import pymysql

# Raise the row limit for rendered tables to 5400, so that frames up to that size
# are shown in full instead of being truncated to a couple of rows.

pd.options.display.max_rows = 5400

In [2]:
# Read the raw data set 'GSAF5.csv' with pandas and bind the resulting
# data frame to the name `sharkattack`. The file is decoded as cp1252.

sharkattack = pd.read_csv(
    'GSAF5.csv',
    encoding="cp1252",
)

In [3]:
# First look at the data: the first five rows via ".head" here,
# summary statistics via ".describe" in the following cell.

sharkattack.head(n=5)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c,5993,,
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b,5992,,
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.a,2016.09.18.a,5991,,
3,2016.09.17,17-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,...,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.17,2016.09.17,5990,,
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,...,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.16,2016.09.15,5989,,


In [4]:
# Summary statistics; the output covers the columns 'Year' and 'original order'.
# NOTE(review): the output reports a 'Year' minimum of 0 - confirm whether 0 marks an unknown year.
sharkattack.describe()

Unnamed: 0,Year,original order
count,5992.0,5992.0
mean,1925.204606,2997.548899
std,286.473712,1729.86021
min,0.0,2.0
25%,1942.0,1499.75
50%,1975.0,2997.5
75%,2003.0,4495.25
max,2016.0,5993.0


In [5]:
# Check the data type of every column.
# In the output below only 'Year' and 'original order' are int64; all other columns are object.

sharkattack.dtypes

Case Number               object
Date                      object
Year                       int64
Type                      object
Country                   object
Area                      object
Location                  object
Activity                  object
Name                      object
Sex                       object
Age                       object
Injury                    object
Fatal (Y/N)               object
Time                      object
Species                   object
Investigator or Source    object
pdf                       object
href formula              object
href                      object
Case Number.1             object
Case Number.2             object
original order             int64
Unnamed: 22               object
Unnamed: 23               object
dtype: object

In [6]:
# Count the missing (NaN / null) values in each column:

sharkattack.isnull().sum()

Case Number                  0
Date                         0
Year                         0
Type                         0
Country                     43
Area                       402
Location                   496
Activity                   527
Name                       200
Sex                        567
Age                       2681
Injury                      27
Fatal (Y/N)                 19
Time                      3213
Species                   2934
Investigator or Source      15
pdf                          0
href formula                 1
href                         3
Case Number.1                0
Case Number.2                0
original order               0
Unnamed: 22               5991
Unnamed: 23               5990
dtype: int64

# 2. Drop Columns, Set New Index

In [7]:
# Look for fully duplicated rows, which could be dropped, using the duplicated method.
# The result is empty: no duplicated rows were detected.

sharkattack.loc[sharkattack.duplicated(keep='first')]

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23


In [8]:
# The NaN check in chapter 1 showed that "Unnamed: 22" and "Unnamed: 23" contain
# almost only null values (5991 and 5990 missing), and that more than half of the
# values in "Time" are missing as well (3213).
# These columns carry no relevant information, so they are dropped:

sharkattack = sharkattack.drop(columns=['Unnamed: 22', 'Unnamed: 23', 'Time'])

In [9]:
# Find out whether column 'Case Number' equals 'Case Number.2'.
# The differing rows are captured BEFORE the column is dropped and are returned as the
# last expression of the cell: only the last expression of a cell is rendered, so a bare
# comparison in the middle of the cell would never be shown.

case_number_differs = sharkattack['Case Number'] != sharkattack['Case Number.2']
case_number_mismatches = sharkattack.loc[case_number_differs, ['Case Number', 'Case Number.2']]

# In the original analysis the two columns held the same values except for two rows.
# Hence 'Case Number.2' is dropped.

sharkattack = sharkattack.drop(columns=['Case Number.2'])

# Show the rows in which the two columns differed:

case_number_mismatches

In [10]:
# 'Case Number' and 'Case Number.1' contain the date, which is already provided in 'Date'.
# They are not useful as a unique numeric identifier for each case because they also
# contain letters, so both are dropped.
#
# 'href' and 'pdf' seem to be unnecessary as well. 'href formula' is kept, as it provides
# the same links as 'href', and is renamed to "Link Case Summary".

sharkattack = (
    sharkattack
    .drop(columns=['Case Number', 'Case Number.1'])
    .drop(columns=['href', 'pdf'])
    .rename(columns={"href formula": "Link Case Summary"})
)


In [11]:
# Check whether 'original order' can serve as a unique row identifier.

check_original_order = sharkattack['original order'].value_counts()

# Values that occur more than once. According to the original analysis there are
# eight such rows, i.e. four values that each appear twice.

check_original_order.loc[check_original_order.gt(1)]

# Look up the affected rows to see their index labels and decide which of them
# gets a new 'original order' value.

sharkattack.loc[sharkattack['original order'].isin([5661, 569, 3847, 5739])]

# Give four of the eight rows a new, unused 'original order' value so that the
# column becomes a unique identifier (row label -> new value, starting at 5994).

new_order_by_row = {332: 5994, 5424: 5995, 2147: 5996, 254: 5997}
for row_label, new_order in new_order_by_row.items():
    sharkattack.at[row_label, 'original order'] = new_order

# Sort by 'original order' to confirm the new values are present in the data frame:

sharkattack.sort_values(by='original order')

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Species,Investigator or Source,Link Case Summary,original order
5991,1845-1853,0,Unprovoked,CEYLON (SRI LANKA),Eastern Province,"Below the English fort, Trincomalee",Swimming,male,M,15,"FATAL. ""Shark bit him in half, carrying away t...",Y,,S.W. Baker,http://sharkattackfile.net/spreadsheets/pdf_di...,2
5990,1883-1889,0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,Jules Patterson,M,,FATAL,Y,,"The Sun, 10/20/1938",http://sharkattackfile.net/spreadsheets/pdf_di...,3
5989,1900-1905,0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,M,,FATAL,Y,,"F. Schwartz, p.23; C. Creswell, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...,4
5988,Before 1903,0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,Ahmun,M,,FATAL,Y,,"H. Taunton; N. Bartlett, pp. 233-234",http://sharkattackfile.net/spreadsheets/pdf_di...,5
5987,Before 1903,0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,M,,FATAL,Y,,"H. Taunton; N. Bartlett, p. 234",http://sharkattackfile.net/spreadsheets/pdf_di...,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,16,Minor injury to thigh,N,,"Orlando Sentinel, 9/19/2016",http://sharkattackfile.net/spreadsheets/pdf_di...,5993
332,20-Nov-15,2015,Unprovoked,ECUADOR,Galapagos Islands,"Punta Vicente Roca, Isabella Island",Snorkeling,Graham Hurley,M,55,Lacerations to left calf,N,Galapagos shark,G. Hurley,http://sharkattackfile.net/spreadsheets/pdf_di...,5994
5424,Reported 22-Jun-1893,1893,Unprovoked,NIGERIA,Bayelsa State,Mouth of the Nun River,A barque wrecked,mate & crew,M,,FATAL,Y,,"Otago Witness, 6/22/1893",http://sharkattackfile.net/spreadsheets/pdf_di...,5995
2147,07-Jul-95,1995,Unprovoked,BRAZIL,Pernambuco,Candeias,Surfing,Clélio Rosendo Falcão Filho,M,18,"Arm bitten, FATAL",Y,,JCOnline,http://sharkattackfile.net/spreadsheets/pdf_di...,5996


In [12]:
# Overview of the remaining columns with their non-null counts and dtypes (16 columns in the output below).
sharkattack.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5992 entries, 0 to 5991
Data columns (total 16 columns):
Date                      5992 non-null object
Year                      5992 non-null int64
Type                      5992 non-null object
Country                   5949 non-null object
Area                      5590 non-null object
Location                  5496 non-null object
Activity                  5465 non-null object
Name                      5792 non-null object
Sex                       5425 non-null object
Age                       3311 non-null object
Injury                    5965 non-null object
Fatal (Y/N)               5973 non-null object
Species                   3058 non-null object
Investigator or Source    5977 non-null object
Link Case Summary         5991 non-null object
original order            5992 non-null int64
dtypes: int64(2), object(14)
memory usage: 749.1+ KB


In [13]:
# Rename the column 'original order' to 'Case Number' and use it as the new index.

sharkattack = (
    sharkattack
    .rename(columns={"original order": "Case Number"})
    .set_index('Case Number')
)

In [14]:
# Keep an independent (deep) copy of the data frame in its current state.

sharkattack_new = sharkattack.copy(deep=True)

# 3. Modify 'Sex'

In [15]:
# Among the column headers, 'Sex' and 'Species' carry a trailing space in their names:

sharkattack.columns.tolist()

# Rename these two columns to remove the trailing spaces:

sharkattack = sharkattack.rename(columns={"Sex ": "Sex", "Species ": "Species"})

In [16]:
# Check 'Sex' for values other than 'M' or 'F':

sharkattack['Sex'].value_counts()

# Standardize all values in 'Sex' to either 'M' or 'F', or 'N/S' if not specified.
# The result is assigned back to the column. Calling `.replace(..., inplace=True)` on
# `sharkattack['Sex']` is chained assignment through an inplace method: pandas warns
# about it, and with Copy-on-Write enabled it no longer updates the data frame.

sharkattack['Sex'] = sharkattack['Sex'].replace({
    'M ': 'M',     # 'M' with a trailing space
    '.': 'N/S',
    'N': 'N/S',
    'lli': 'M',    # treated as 'M', as in the original clean-up
})

In [17]:
# Replace all NaN values in 'Sex' with 'N/S' using .fillna.
# The filled column is assigned back instead of calling `.fillna(..., inplace=True)` on
# `sharkattack['Sex']`: that chained inplace call triggers a pandas warning and does not
# modify the data frame once Copy-on-Write is enabled.

sharkattack['Sex'] = sharkattack['Sex'].fillna('N/S')

In [18]:
# Convert 'Sex' to the 'category' data type.

sharkattack = sharkattack.astype({'Sex': 'category'})

# 4. Modify 'Country'

In [19]:
# Not all country names are written in upper case, and some are padded with spaces:

sharkattack['Country'].value_counts()

# Convert all country names to upper case and remove leading and trailing spaces:

sharkattack['Country'] = sharkattack['Country'].str.upper().str.strip(' ')

# 5. Modify 'Fatal (Y/N)'

In [20]:
# Taking a look at 'Fatal (Y/N)', it can be seen that some values need to be adjusted:

sharkattack['Fatal (Y/N)'].value_counts()

# Normalize the column: upper-case the values, remove surrounding spaces, and convert
# all unknown AND missing values to 'N/S' (not specified).
#
# The cleaned column is assigned back. The previous `.replace(..., inplace=True)` calls
# on `sharkattack['Fatal (Y/N)']` were chained assignment through an inplace method,
# which pandas warns about and which does not update the frame under Copy-on-Write.
# NaN is filled here as well, so that 'N/S' is guaranteed to be a known value before
# the column is later converted to 'category'.

sharkattack['Fatal (Y/N)'] = (
    sharkattack['Fatal (Y/N)']
    .str.upper()
    .str.strip(' ')
    .replace({'UNKNOWN': 'N/S', '#VALUE!': 'N/S', 'F': 'N/S'})
    .fillna('N/S')
)

In [21]:
# Convert 'Fatal (Y/N)' to the 'category' data type.

sharkattack = sharkattack.astype({'Fatal (Y/N)': 'category'})

# 6. Replace NaN's and Export File

In [22]:
# Replace every remaining missing value in the whole data set with 'N/S':

sharkattack = sharkattack.fillna('N/S')

In [23]:
# Export the cleaned data frame to 'clean_csv.csv'.
# The index is written too: it holds the unique 'Case Number' set as index earlier in the
# notebook, and `index=None` silently left it out of the exported file.
# `to_csv` returns None when it is given a path, so the result is not bound to a variable.

sharkattack.to_csv('clean_csv.csv', index=True, header=True)

# Final Evaluation and Problems faced

In general, I am quite satisfied with how I cleaned and manipulated this chaotic data set. However, I had many more points on my to-do list that I could not figure out how to implement. I would have liked to clean up and combine the 'Date' and 'Year' columns but could not find a way to do so. Additionally, I had planned to group some values of the 'Activity' column to reduce the number of unique values in this column and to provide a better overview with fewer categories. The same applies to the 'Injury' column. I did not do so because both columns contain so many unique values that categorizing all of them would have taken me very long. There is probably a smarter, time-saving way to do this which I am not aware of. It would be great to receive some feedback on how to address these pain points :).