In [1]:
# 1. Importing pandas, numpy, re
import pandas as pd
import numpy as np
import re

In [2]:
# 2. Reading the downloaded GSAF5.csv file containing very messy data about shark attacks
df = pd.read_csv('GSAF5.csv', encoding = "ISO-8859-1")

In [3]:
# 3. Creating a working copy
df_copy = df.copy()

In [4]:
# Sanity check :)
len(df_copy) == len(df)

True

In [5]:
colnames = df_copy.columns
colnames

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

In [6]:
# 4. Cleaning the column names
colnames = df_copy.columns
newcolnames = [i
               .capitalize()
               .replace('.',' ')
               .replace(':', '')
               .replace('(y/n)', '')
               .replace('Investigator or source', 'Source')
               .strip() for i in colnames]
print(newcolnames)

['Case number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location', 'Activity', 'Name', 'Sex', 'Age', 'Injury', 'Fatal', 'Time', 'Species', 'Source', 'Pdf', 'Href formula', 'Href', 'Case number 1', 'Case number 2', 'Original order', 'Unnamed 22', 'Unnamed 23']


In [7]:
# 5. Reasigning the cleaed column names 
df_copy.columns = newcolnames
df_copy.head(2)

Unnamed: 0,Case number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Source,Pdf,Href formula,Href,Case number 1,Case number 2,Original order,Unnamed 22,Unnamed 23
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c,5993,,
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b,5992,,


In [8]:
# 6. Checking general content of the dataframe
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5992 entries, 0 to 5991
Data columns (total 24 columns):
Case number       5992 non-null object
Date              5992 non-null object
Year              5992 non-null int64
Type              5992 non-null object
Country           5949 non-null object
Area              5590 non-null object
Location          5496 non-null object
Activity          5465 non-null object
Name              5792 non-null object
Sex               5425 non-null object
Age               3311 non-null object
Injury            5965 non-null object
Fatal             5973 non-null object
Time              2779 non-null object
Species           3058 non-null object
Source            5977 non-null object
Pdf               5992 non-null object
Href formula      5991 non-null object
Href              5989 non-null object
Case number 1     5992 non-null object
Case number 2     5992 non-null object
Original order    5992 non-null int64
Unnamed 22        1 non-null object
U

In [9]:
# 7. Check how the 'Time' column is structured
df_copy['Time'].unique()[:10]

array(['13h00', '11h00', '10h43', nan, '15h15', '14h30', '15h40',
       'Late afternoon', '15h00', '14h00'], dtype=object)

In [10]:
# 8. Defining function to categorize the time in categories:

def daytime(hour):
    if (hour > 4) and (hour <= 8):
        return 'Early morning'
    elif (hour > 8) and (hour <= 12):
        return 'Morning'
    elif (hour > 12) and (hour <= 16):
        return 'Afternoon'
    elif (hour > 16) and (hour <= 20):
        return 'Evening'
    elif (hour > 20) and (hour <= 24):
        return 'Night'
    elif (hour >= 0) and (hour <= 4):
        return 'Late might'
    else:
        return i

In [11]:
# 9 Starting to clean up the time column
time_vals = df_copy['Time']

time_cleaning = []

for i in time_vals[:]:
    if type(i) == float:
        time_cleaning.append(i)
        continue
    if i.split(' ')[0].isalpha() | i.isalpha() :
        time_cleaning.append(i)
        continue
    if type(i) != float and 'h' in i and len(i.strip()) == 5:
        hour = int(i.split('h')[0])
        time_cleaning.append(daytime(hour))
        continue
    else:
        time_cleaning.append(i)

In [12]:
# Sanity check
len(time_cleaning) == len(time_vals)

True

In [13]:
set(time_cleaning)

{' ',
 '  ',
 '   ',
 '"After dark"',
 '"After lunch"',
 '"Early evening"',
 '"Evening"',
 '"Just before 11h00"',
 '"Night"',
 '"shortly before dusk"',
 '--',
 '03h45 - 04h00',
 '06h00 -- 07h00',
 '06j00',
 '07h00 - 08h00',
 '08h00 / 09h30',
 '09h00 - 09h30',
 '09h00 -10h00',
 '09h30 / 10h00',
 '09h30 / 15h30',
 '10h00 -- 11h00',
 '10h00 / 11h00',
 '10h00 or 14h00',
 '10h30 or 13h30',
 '10h45-11h15',
 '11h00 / 11h30',
 '11h01 -time of ship sinking',
 '11h115',
 '12h00 to 14h00',
 '12h45 / 13h45',
 '1300',
 '13h345',
 '14h00 - 15h00',
 '14h30 / 15h30',
 '1500',
 '15h00 or 15h45',
 '15h00j',
 '15j45',
 '1600',
 '16h30 or 18h00',
 '17h00 Sunset',
 '17h00 or 17h40',
 '17h00-18h00',
 '18h15 to 21h30',
 '18h15-18h30',
 '18h30 (Sunset)',
 '18h30?',
 '19h00 / 20h00',
 '19h00-20h00',
 '2 hours after Opperman',
 '2 hrs before sunset',
 '20h45 (Sunset)',
 '30 minutes after 1992.07.08.a',
 '500',
 '830',
 '8:04 PM',
 '<07h30',
 '>06h45',
 '>08h00',
 '>12h00',
 '>14h30',
 '>17h00',
 '>17h30',
 'A.M

In [14]:
a = []
for i in time_cleaning:
    if type(i) == str:
        if re.findall('^[0-9][0-9]h', i) :
            a.append(i[:2])
        else:
            a.append(i)
    else:
        a.append(i)

In [15]:
# Sanity check
len(a) == len(time_cleaning)

True

In [16]:
time_cleaning = a

In [17]:
a = []
for i in time_cleaning:
    if type(i) == str:
        if (re.findall('[0-9][0-9]', i)):
            a.append(daytime(int(re.findall('[0-9][0-9]', i)[0])))
        else:
            a.append(i)
    else:
        a.append(str(i))

In [18]:
# Sanity check
len(a) == len(time_cleaning)

True

In [19]:
time_cleaning = a
set(time_cleaning)

{' ',
 '  ',
 '   ',
 '"After dark"',
 '"After lunch"',
 '"Early evening"',
 '"Evening"',
 '"Night"',
 '"shortly before dusk"',
 '--',
 '2 hours after Opperman',
 '2 hrs before sunset',
 '30 minutes after 1992.07.08.a',
 '500',
 '830',
 'A.M.',
 'AM',
 'After Dusk',
 'After dusk',
 'After midnight',
 'After noon',
 'Afternoon',
 'Before daybreak',
 'Dark',
 'Dawn',
 'Daybreak',
 'Daytime',
 'Dusk',
 'Early Morning',
 'Early afternoon',
 'Early morning',
 'Evening',
 'FATAL  (Wire netting installed at local beaches after this incident.)',
 'Just before dawn',
 'Just before noon',
 'Just before sundown',
 'Late Afternoon',
 'Late afternon',
 'Late afternoon',
 'Late might',
 'Late morning',
 'Late night',
 'Lunchtime',
 'Mid afternoon',
 'Mid morning',
 'Mid-morning',
 'Midday',
 'Midday.',
 'Midnight',
 'Morning',
 'Morning ',
 'Night',
 'Nightfall',
 'Noon',
 'P.M.',
 'Shortly after midnight',
 'Sunset',
 'X',
 'dusk',
 'nan',
 'night',
 '\xa0 '}

In [20]:
time_cleaning = [i.lower().capitalize() for i in time_cleaning]
b = []
for i in time_cleaning:
    i = i.lower()
    if 'aftern' in i or 'noon' in i or 'lunch' in i or 'P.m' in i or 'midday' in i:
        b.append(daytime(13)) # afternoon
    elif 'might' in i or 'night' in i:
        b.append(daytime(23)) # night
    elif 'morning' in i or 'daybreak' in i or 'dawn' in i:
        b.append(daytime(10)) # morning
    elif 'dark' in i or 'dusk' in i or 'evening' in i or 'sunset' in i:
        b.append(daytime(20)) # evening
    else:
        b.append('NaN')

In [21]:
# safety check
len(b) == len(time_cleaning)

True

In [22]:
time_cleaning = b
set(time_cleaning)

{'Afternoon', 'Evening', 'Morning', 'NaN', 'Night'}

In [23]:
df_copy_copy = df_copy.copy()

In [24]:
df_copy_copy['Time'] = time_cleaning
df_copy_copy['Time'].unique()

array(['Afternoon', 'Morning', 'NaN', 'Evening', 'Night'], dtype=object)

In [25]:
# Sanity check
len(df_copy) == len(df_copy_copy)

True

In [26]:
df_copy_copy.head(2)

Unnamed: 0,Case number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Source,Pdf,Href formula,Href,Case number 1,Case number 2,Original order,Unnamed 22,Unnamed 23
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c,5993,,
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b,5992,,


In [27]:
df_copy = df_copy_copy
df_copy_copy = df_copy.copy()

In [28]:
# 10. Removing potential duplicated columns
df_copy_copy = df_copy_copy.drop_duplicates(subset=list(df_copy_copy.columns))

In [29]:
# 11. Checking how many NaN, empty values there are in the dataset
empty_cols = df_copy_copy.isnull().sum()
empty_cols

Case number          0
Date                 0
Year                 0
Type                 0
Country             43
Area               402
Location           496
Activity           527
Name               200
Sex                567
Age               2681
Injury              27
Fatal               19
Time                 0
Species           2934
Source              15
Pdf                  0
Href formula         1
Href                 3
Case number 1        0
Case number 2        0
Original order       0
Unnamed 22        5991
Unnamed 23        5990
dtype: int64

In [30]:
# 12.Removing columns with empty values
df_copy_copy = df_copy_copy.drop(axis = 1, columns =  ['Type','Age','Species', 'Unnamed 22', 'Unnamed 23', 'Original order'])

In [31]:
df_copy_copy.head(2)

Unnamed: 0,Case number,Date,Year,Country,Area,Location,Activity,Name,Sex,Injury,Fatal,Time,Source,Pdf,Href formula,Href,Case number 1,Case number 2
0,2016.09.18.c,18-Sep-16,2016,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,Minor injury to thigh,N,Afternoon,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c
1,2016.09.18.b,18-Sep-16,2016,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,Lacerations to hands,N,Morning,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b


In [32]:
# 13. Removing rows with NaNs and missing info
df_copy_copy_copy = df_copy_copy.dropna( axis=0, how='any', thresh=None, subset=None, inplace=False)

In [33]:
len(df_copy_copy_copy)

4488

In [34]:
df_copy = df_copy_copy_copy
df_copy_copy = df_copy

In [35]:
# 14. Checking the Country column
df_copy_copy['Country'].unique()

array(['USA', 'AUSTRALIA', 'NEW CALEDONIA', 'BAHAMAS', 'SPAIN', 'JAPAN',
       'SOUTH AFRICA', 'EGYPT', 'NEW ZEALAND', 'INDONESIA',
       'FRENCH POLYNESIA', 'BRAZIL', 'DOMINICAN REPUBLIC',
       'CAYMAN ISLANDS', 'UNITED ARAB EMIRATES', 'MOZAMBIQUE', 'THAILAND',
       'REUNION', 'MEXICO', 'FRANCE', 'ECUADOR', 'ISRAEL', 'JAMAICA',
       'PHILIPPINES', 'NIGERIA', 'TONGA', 'SCOTLAND', 'TRINIDAD & TOBAGO',
       'CANADA', 'ITALY', 'SAUDI ARABIA', 'CHILE', 'TAIWAN', 'ANTIGUA',
       'PAPUA NEW GUINEA', 'RUSSIA', 'SEYCHELLES', 'COLUMBIA',
       'TURKS & CAICOS', 'COSTA RICA', 'MALAYSIA',
       'UNITED ARAB EMIRATES (UAE)', 'FIJI', 'SOUTH KOREA', 'VIETNAM',
       'MADAGASCAR', 'GUAM', 'PANAMA', 'UNITED KINGDOM',
       'SOLOMON ISLANDS', 'KENYA', 'CUBA', 'CROATIA', 'ENGLAND', 'NORWAY',
       'BELIZE', 'Sierra Leone', 'ST. MAARTIN', 'GRAND CAYMAN',
       'Seychelles', 'CHINA', 'VANUATU', 'MEXICO ', 'HONDURAS',
       'SRI LANKA', ' TONGA', 'URUGUAY', 'VENEZUELA', 'INDIA',
       '

In [36]:
# 15. Cleaning the Country column
country_colnames =[
    i
    .upper()
    .replace('?' , '')
    .replace('(UAE)', '')
    .replace('CEYLON (SRI LANKA)', 'SRI LANKA')
    .strip()
    for i in df_copy_copy['Country']]


In [37]:
# 16. Reasigning the columns
df_copy_copy['Country'] = country_colnames

In [38]:
# 17. Checking the Sex column
df_copy_copy.head(2)
#df_copy['Sex'].unique()

Unnamed: 0,Case number,Date,Year,Country,Area,Location,Activity,Name,Sex,Injury,Fatal,Time,Source,Pdf,Href formula,Href,Case number 1,Case number 2
0,2016.09.18.c,18-Sep-16,2016,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,Minor injury to thigh,N,Afternoon,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c
1,2016.09.18.b,18-Sep-16,2016,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,Lacerations to hands,N,Morning,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b


In [39]:
# 18. Cleaning up the Sex column 
sex_colnames = [ i 
                .strip()
                .replace('N', 'NaN')
                .replace('lli', 'NaN')
                for i in df_copy_copy['Sex']]
set(sex_colnames)

{'F', 'M', 'NaN'}

In [40]:
# 19. Reasigining the names
df_copy_copy['Sex'] = sex_colnames

In [41]:
df_copy_copy.head(2)

Unnamed: 0,Case number,Date,Year,Country,Area,Location,Activity,Name,Sex,Injury,Fatal,Time,Source,Pdf,Href formula,Href,Case number 1,Case number 2
0,2016.09.18.c,18-Sep-16,2016,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,Minor injury to thigh,N,Afternoon,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c
1,2016.09.18.b,18-Sep-16,2016,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,Lacerations to hands,N,Morning,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b


In [42]:
# 20. The whole activity column is not very informative, it is obvious the victims were somehow swimming...
df_copy_copy['Activity'].unique()

array(['Surfing', 'Wading', 'Swimming', ...,
       'Swimming around anchored ship',
       'Crew swimming alongside their anchored ship',
       '4 men were bathing'], dtype=object)

In [43]:
# 21. Removing 'Activity' column
df_copy_copy = df_copy_copy.drop( axis = 1, columns= 'Activity')

In [44]:
df_copy_copy.head()

Unnamed: 0,Case number,Date,Year,Country,Area,Location,Name,Sex,Injury,Fatal,Time,Source,Pdf,Href formula,Href,Case number 1,Case number 2
0,2016.09.18.c,18-Sep-16,2016,USA,Florida,"New Smyrna Beach, Volusia County",male,M,Minor injury to thigh,N,Afternoon,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c
1,2016.09.18.b,18-Sep-16,2016,USA,Florida,"New Smyrna Beach, Volusia County",Chucky Luciano,M,Lacerations to hands,N,Morning,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b
2,2016.09.18.a,18-Sep-16,2016,USA,Florida,"New Smyrna Beach, Volusia County",male,M,Lacerations to lower leg,N,Morning,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.a,2016.09.18.a
3,2016.09.17,17-Sep-16,2016,AUSTRALIA,Victoria,Thirteenth Beach,Rory Angiolella,M,Struck by fin on chest & leg,N,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.17,2016.09.17
4,2016.09.15,16-Sep-16,2016,AUSTRALIA,Victoria,Bells Beach,male,M,No injury: Knocked off board by shark,N,,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.16,2016.09.15


In [45]:
# 22. The Pdf, Href formula and Href seems kind off redundant
df_copy_copy[['Pdf', 'Href formula', 'Href']]

Unnamed: 0,Pdf,Href formula,Href
0,2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
1,2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
2,2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
3,2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
4,2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
...,...,...,...
5980,ND-0012-Durban-Scotsman.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
5986,ND-0006-ArabBoy-Prymount.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
5987,ND-0005-RoebuckBay.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
5989,ND-0003-Ocracoke_1900-1905.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...


In [46]:
pdfs = [str(i).strip() for i in df_copy_copy['Pdf']]
href_formulas = [str(i).strip() for i in df_copy_copy['Href formula']]
hrefs = [str(i).strip() for i in df_copy_copy['Href']]

In [47]:
len(pdfs), len(href_formulas), len(hrefs)

(4488, 4488, 4488)

In [48]:
pdfs[0], href_formulas[0], hrefs[0]

('2016.09.18.c-NSB.pdf',
 'http://sharkattackfile.net/spreadsheets/pdf_directory/2016.09.18.c-NSB.pdf',
 'http://sharkattackfile.net/spreadsheets/pdf_directory/2016.09.18.c-NSB.pdf')

In [49]:
# Checking how redundat are the informations in the columns
[pdfs[i] for i in range(len(pdfs)) if pdfs[i] not in href_formulas[i]]

[]

In [50]:
# There are some mismatches, but I assume that the hrefs don't contain the true row information
[(pdfs[i], hrefs[i]) for i in range(len(pdfs)) if pdfs[i] not in hrefs[i]][:5]

[('2016.07.23.a-Cutbirth.pdf',
  'http://sharkattackfile.net/spreadsheets/pdf_directory/2016.07.23-Cutbirth.pdf'),
 ('2015.10.05.a-Slaughter.pdf',
  'http://sharkattackfile.net/spreadsheets/pdf_directory/2015.10.05-Slaughter.pdf'),
 ('2015.06.27.a-Swanepoel.pdf',
  'http://sharkattackfile.net/spreadsheets/pdf_directory/2015.06.27-Swanepoel.pdf'),
 ('2014.10.18-Roberson.pdf',
  'http://sharkattackfile.net/spreadsheets/pdf_directory/2014.10.18-Roberson'),
 ('2014.10.17-Olsson, pdf',
  'http://sharkattackfile.net/spreadsheets/pdf_directory/2014.10.14-Bandy.pdf')]

In [51]:
# 23. Removing the Pdfs and Hrefs columns
df_copy_copy = df_copy_copy.drop(axis = 1, columns =  ['Pdf', 'Href'])
df_copy_copy.head(2)

Unnamed: 0,Case number,Date,Year,Country,Area,Location,Name,Sex,Injury,Fatal,Time,Source,Href formula,Case number 1,Case number 2
0,2016.09.18.c,18-Sep-16,2016,USA,Florida,"New Smyrna Beach, Volusia County",male,M,Minor injury to thigh,N,Afternoon,"Orlando Sentinel, 9/19/2016",http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c
1,2016.09.18.b,18-Sep-16,2016,USA,Florida,"New Smyrna Beach, Volusia County",Chucky Luciano,M,Lacerations to hands,N,Morning,"Orlando Sentinel, 9/19/2016",http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b


In [52]:
# 24. Renaming the "Href formula" column 

df_copy_copy.rename(columns={'Href formula':'Link'}, inplace=True)

In [53]:
df_copy_copy.head(2)

Unnamed: 0,Case number,Date,Year,Country,Area,Location,Name,Sex,Injury,Fatal,Time,Source,Link,Case number 1,Case number 2
0,2016.09.18.c,18-Sep-16,2016,USA,Florida,"New Smyrna Beach, Volusia County",male,M,Minor injury to thigh,N,Afternoon,"Orlando Sentinel, 9/19/2016",http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c
1,2016.09.18.b,18-Sep-16,2016,USA,Florida,"New Smyrna Beach, Volusia County",Chucky Luciano,M,Lacerations to hands,N,Morning,"Orlando Sentinel, 9/19/2016",http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b


In [54]:
df_copy = df_copy_copy
df_copy_copy = df_copy.copy()

In [55]:
# 25. Checking the 'Case number', 'Case number 1' and 'Case number 2' columns
df_copy_copy[['Case number', 'Case number 1', 'Case number 2']]

Unnamed: 0,Case number,Case number 1,Case number 2
0,2016.09.18.c,2016.09.18.c,2016.09.18.c
1,2016.09.18.b,2016.09.18.b,2016.09.18.b
2,2016.09.18.a,2016.09.18.a,2016.09.18.a
3,2016.09.17,2016.09.17,2016.09.17
4,2016.09.15,2016.09.16,2016.09.15
...,...,...,...
5980,ND.0012,ND.0012,ND.0012
5986,ND.0006,ND.0006,ND.0006
5987,ND.0005,ND.0005,ND.0005
5989,ND.0003,ND.0003,ND.0003


In [56]:
# Seems kind of redundant, let's examine it a bit more
case = [str(i).strip() for i in df_copy_copy['Case number']]
case_1 = [str(i).strip() for i in df_copy_copy['Case number 1']]
case_2 = [str(i).strip() for i in df_copy_copy['Case number 2']]

In [57]:
[(case[i], case_1[i]) for i in range(len(case)) if case[i] != case_1[i]]

[('2016.09.15', '2016.09.16'),
 ('2016.01.24.b', '2015.01.24.b'),
 ('2015.12.23', '2015.11.07'),
 ('2015.10.28.a', '2015.10.28'),
 ('2015.07-10', '2015.07.10'),
 ('1967.07.05', '1967/07.05'),
 ('1961.09.02.R', '1961.09,06.R'),
 ('1934.01.08.R', '1934.02.08.R'),
 ('1911.07.31.R', '1911.07.31.T')]

In [58]:
[(case[i], case_2[i]) for i in range(len(case)) if case[i] != case_2[i]]

[('2015.07-10', '2015.07.10'), ('1934.01.08.R', '1934.02.08.R')]

In [59]:
# Let's check what is the reference doscumentation file
[(case[i], case_1[i] ,case_2[i] , hrefs[i]) for i in range(len(case)) if case[i] != case_1[i]][:2]

[('2016.09.15',
  '2016.09.16',
  '2016.09.15',
  'http://sharkattackfile.net/spreadsheets/pdf_directory/2016.09.16-BellsBeach.pdf'),
 ('2016.01.24.b',
  '2015.01.24.b',
  '2016.01.24.b',
  'http://sharkattackfile.net/spreadsheets/pdf_directory/2016.01.24.b-Love.pdf')]

In [60]:
# 26. Cleaning up that mess by using the 'Link' column to extract the real Case numbers

links = [str(i).strip() for i in df_copy_copy['Link']]

In [61]:
links[:5]

['http://sharkattackfile.net/spreadsheets/pdf_directory/2016.09.18.c-NSB.pdf',
 'http://sharkattackfile.net/spreadsheets/pdf_directory/2016.09.18.b-Luciano.pdf',
 'http://sharkattackfile.net/spreadsheets/pdf_directory/2016.09.18.a-NSB.pdf',
 'http://sharkattackfile.net/spreadsheets/pdf_directory/2016.09.17-Angiolella.pdf',
 'http://sharkattackfile.net/spreadsheets/pdf_directory/2016.09.16-BellsBeach.pdf']

In [62]:
len([j for j in [i.split('-')[0].split('/')[-1] for i in links] if j])

4488

In [63]:
case_nrs = [i.split('-')[0].split('/')[-1] for i in links]

In [64]:
# 27. Adding 'Case nr' column at the first position
df_copy_copy.insert(loc=0, column='Case nr',value=case_nrs)

In [65]:
# 28. Deleting the 3 Case number columns
df_copy_copy = df_copy_copy.drop(axis = 1, columns =  ['Case number',
                                                       'Case number 1', 
                                                       'Case number 2'])

In [66]:
# 29. Checking which Case nrs are duplicated, they should be uniqe...
df_copy_copy[df_copy_copy.duplicated(['Case nr'], keep=False)]


Unnamed: 0,Case nr,Date,Year,Country,Area,Location,Name,Sex,Injury,Fatal,Time,Source,Link
19,2016.06.04,04-Aug-16,2016,USA,Florida,"New Smyrna Beach, Volusia County",Nolan Tyler,M,Big toe bitten,N,,"News 965, 8/5/2016",http://sharkattackfile.net/spreadsheets/pdf_di...
53,2016.06.04,04-Jun-16,2016,EGYPT,Suez,Ain Sokhna,Omar Abdel Qader,M,"Leg severely bitten, surgically amputated",N,Morning,"Ahram Online, 6/4/2016",http://sharkattackfile.net/spreadsheets/pdf_di...
297,2014.08.08,08-Aug-14,2014,USA,Louisiana,"Lake Ponchartain off Southshore Harbor, New Or...",Trent Trentacosta,M,Minor lacerations to left heel and big toe,N,Afternoon,"The Times-Picayune, 8/9/2014",http://sharkattackfile.net/spreadsheets/pdf_di...
300,2014.08.08,02-Aug-14,2014,USA,Florida,"South of Cocoa Beach, Brevard County",male,M,Foot bitten,N,,"Florida Today, 8/8/2014",http://sharkattackfile.net/spreadsheets/pdf_di...
412,2013.08.25,25-Aug-2013,2013,USA,Florida,"Winterhaven Park, Ponce Inlet, Volus...",Riley Breihan,F,Minor injury to left lower leg & heel,N,Evening,"Daytona Beach News-Journal, 8/28/2013",http://sharkattackfile.net/spreadsheets/pdf_di...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5979,ND,No date (3 days after preceding incident) & pr...,0,SOUTH AFRICA,KwaZulu-Natal,Durban,a native fisherman,M,"FATAL, body not recovered but shark was caught...",Y,,"Rural New Yorker, 7/19/1913",http://sharkattackfile.net/spreadsheets/pdf_di...
5980,ND,Before 19-Jul-1913,0,SOUTH AFRICA,KwaZulu-Natal,Durban,a young Scotsman,M,"FATAL, leg stripped of flesh",Y,,"Rural New Yorker, 7/19/1913",http://sharkattackfile.net/spreadsheets/pdf_di...
5986,ND,Before 1906,0,AUSTRALIA,New South Wales,,Arab boy,M,FATAL,Y,,"L. Becke in New York Sun, 9/9/1906; L. Schultz...",http://sharkattackfile.net/spreadsheets/pdf_di...
5987,ND,Before 1903,0,AUSTRALIA,Western Australia,Roebuck Bay,male,M,FATAL,Y,,"H. Taunton; N. Bartlett, p. 234",http://sharkattackfile.net/spreadsheets/pdf_di...


In [67]:
# 30. Converting the non unique Case Nrs to unique
unique_case_nrs = []
index = 0
for i in df_copy_copy['Case nr']:
    if i not in unique_case_nrs:
        unique_case_nrs.append(i.strip())
        index = 0
        continue
    else:
        unique_case_nrs.append(i.strip() + '_' + str(index))
        index += 1
        continue
        

In [68]:
len(set(unique_case_nrs))

4488

In [69]:
df_copy_copy['Case nr'] = unique_case_nrs

In [70]:
# 31. Making the unique case nr column the index
df_copy_copy.set_index('Case nr', inplace=True)

In [71]:
df_copy_copy.head(2)

Unnamed: 0_level_0,Date,Year,Country,Area,Location,Name,Sex,Injury,Fatal,Time,Source,Link
Case nr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2016.09.18.c,18-Sep-16,2016,USA,Florida,"New Smyrna Beach, Volusia County",male,M,Minor injury to thigh,N,Afternoon,"Orlando Sentinel, 9/19/2016",http://sharkattackfile.net/spreadsheets/pdf_di...
2016.09.18.b,18-Sep-16,2016,USA,Florida,"New Smyrna Beach, Volusia County",Chucky Luciano,M,Lacerations to hands,N,Morning,"Orlando Sentinel, 9/19/2016",http://sharkattackfile.net/spreadsheets/pdf_di...


In [72]:
df_copy = df_copy_copy
df_copy_copy = df_copy.copy()

In [73]:
# 32. Checking the 'Injury' column
df_copy_copy['Injury'][:10]

Case nr
2016.09.18.c                         Minor injury to thigh
2016.09.18.b                          Lacerations to hands
2016.09.18.a                      Lacerations to lower leg
2016.09.17                    Struck by fin on chest & leg
2016.09.16           No injury: Knocked off board by shark
2016.09.11                             Minor injury to arm
2016.09.07        Severe lacerations to shoulder & forearm
2016.09.06                                           FATAL
2016.09.05.b    Lacerations & punctures to lower right leg
2016.09.05.a       No inury, board broken in half by shark
Name: Injury, dtype: object

In [74]:
# 33. Creating a list with body parts to categorize the injuries

body_parts = ['fatal',
              'death',
              'remain',
              'drown',
              'disappeared',
              'abrasion',
              'bod', # bodies etc
              'corpse',
              'recover', # for not recovered
              'head',
              'scalp',
              'teeth',
              'tooth',
              'chin',
              'neck', 
              'arm',
              'hand',
              'wrist',
              'finger',
              'finger',
              'chest',
              'shoulder',
              'torso',
              'bit',
              'abdomen',
              'buttock',
              'hip',
              'thigh',
              'knee',
              'calf',
              'lacerat',
              'leg',
              'ankle',
              'feet',
              'foot',
              'toe',
              'heel',
              'bruise',
              'back',
              'minor',
              'missing',
              'no injury', 
              'no inury',
              'not injured',
              'no details',
              'no shark',
              'not a shark',
              'multiple',
             ]
            
injuries = [i.lower().strip() for i in df_copy_copy['Injury']]


In [75]:
# 34. Cleaning up the 'Injury' column
injuries_cleaner = []
for i in injuries:
    for j in body_parts:
        b = 0
        if j in i:
            injuries_cleaner.append(j)
            b += 1
            break
    if not b:
        injuries_cleaner.append('NaN')


In [76]:
len(injuries), len(injuries_cleaner), set(injuries_cleaner)


(4488,
 4488,
 {'NaN',
  'abdomen',
  'abrasion',
  'ankle',
  'arm',
  'back',
  'bit',
  'bod',
  'bruise',
  'buttock',
  'calf',
  'chest',
  'chin',
  'corpse',
  'death',
  'disappeared',
  'drown',
  'fatal',
  'feet',
  'finger',
  'foot',
  'hand',
  'head',
  'heel',
  'hip',
  'knee',
  'lacerat',
  'leg',
  'minor',
  'missing',
  'multiple',
  'neck',
  'no details',
  'no injury',
  'no inury',
  'no shark',
  'not a shark',
  'not injured',
  'recover',
  'remain',
  'scalp',
  'shoulder',
  'teeth',
  'thigh',
  'toe',
  'tooth',
  'torso',
  'wrist'})

In [77]:
# 35. Creating a dictionary with body parts to categorize the injuries even better 
body_parts_dict = {'dead' : ['fatal','death','remain','drown','disappeared', 'bod', 'corpse','recover', 'missing'],
              'lacerations': ['abrasion', 'bit', 'lacerat', 'bruise', 'minor', 'multiple'],
              'head': ['head','scalp', 'teeth', 'tooth', 'chin', 'neck'],
              'upper body': ['arm', 'hand', 'wrist', 'finger', 'chest', 'shoulder', 'torso', 'abdomen'],
              'lower body': ['buttock', 'hip','thigh','knee','calf','leg','ankle','feet', 'foot', 'toe','heel', 'back'],
              'no injury': ['no injury', 'no inury','not injured'],
              'no shark' : ['no shark', 'not a shark'], 
             }
injuries_cleaner[:5]

['thigh', 'hand', 'lacerat', 'chest', 'no injury']

In [78]:
clean_injuries = []
for i in injuries_cleaner:            
    for keys, vals in body_parts_dict.items():    # for name, age in dictionary.iteritems():  (for Python 2.x)
        c = 0
        if i in vals:
            clean_injuries.append(keys)
            c+=1
            break
    if not c:
        clean_injuries.append('NaN')

In [79]:
len(clean_injuries)


4488

In [80]:
set(clean_injuries)

{'NaN',
 'dead',
 'head',
 'lacerations',
 'lower body',
 'no injury',
 'no shark',
 'upper body'}

In [81]:
# 36. Updating the 'Injury' column with cleaned data

df_copy_copy['Injury'] = clean_injuries

In [82]:
df_copy_copy.head(5)

Unnamed: 0_level_0,Date,Year,Country,Area,Location,Name,Sex,Injury,Fatal,Time,Source,Link
Case nr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2016.09.18.c,18-Sep-16,2016,USA,Florida,"New Smyrna Beach, Volusia County",male,M,lower body,N,Afternoon,"Orlando Sentinel, 9/19/2016",http://sharkattackfile.net/spreadsheets/pdf_di...
2016.09.18.b,18-Sep-16,2016,USA,Florida,"New Smyrna Beach, Volusia County",Chucky Luciano,M,upper body,N,Morning,"Orlando Sentinel, 9/19/2016",http://sharkattackfile.net/spreadsheets/pdf_di...
2016.09.18.a,18-Sep-16,2016,USA,Florida,"New Smyrna Beach, Volusia County",male,M,lacerations,N,Morning,"Orlando Sentinel, 9/19/2016",http://sharkattackfile.net/spreadsheets/pdf_di...
2016.09.17,17-Sep-16,2016,AUSTRALIA,Victoria,Thirteenth Beach,Rory Angiolella,M,upper body,N,,"The Age, 9/18/2016",http://sharkattackfile.net/spreadsheets/pdf_di...
2016.09.16,16-Sep-16,2016,AUSTRALIA,Victoria,Bells Beach,male,M,no injury,N,,"The Age, 9/16/2016",http://sharkattackfile.net/spreadsheets/pdf_di...


In [83]:
# 37. Saving the cleaned data as csv file
df = df_copy_copy
df.to_csv('data-wrangling.csv')