# Proyecto de la semana 2

In [13]:
# El propósito de este proyecto es depurar los datos de un archivo csv, para que después
# sea copiado en un archivo nuevo.

# Generalidades

In [None]:
# En esta parte del proyecto la corrección de los datos va a ser para más de una columna,
# es decir, con instrucciones o comandos que afectan a más de una columna a la vez. Por
# ejemplo: La eliminación de registros nulos o el cambio de nombre de algunas columnas.

In [1]:
# Se importan las librerías necesarias.
import pandas as pd
import numpy as np
import re
import math
#import warnings as wn

In [2]:
# Se almacena el contenido del archivo csv en una tabla.
tiburones = pd.read_csv('attacks.csv', encoding = 'ISO-8859-1')

In [3]:
# Se revisa el contenido de la tabla, para hallar datos con problemas.
tiburones.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,


In [19]:
# Se observa la cantidad de filas y columnas que posee el dataframe.
tiburones.shape

(25723, 24)

In [84]:
# Se visualiza el tipo de dato de cada columna.
tiburones.dtypes

Case Number                object
Date                       object
Year                      float64
Type                       object
Country                    object
Area                       object
Location                   object
Activity                   object
Name                       object
Sex                        object
Age                        object
Injury                     object
Fatal (Y/N)                object
Time                       object
Species                    object
Investigator or Source     object
pdf                        object
href formula               object
href                       object
Case Number.1              object
Case Number.2              object
original order            float64
Unnamed: 22                object
Unnamed: 23                object
dtype: object

In [20]:
# Se visualizan las columnas del dataframe.
tiburones.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

In [4]:
# Se cambia el nombre de las columnas que así lo requieran.
tiburones = tiburones.rename(columns={'Sex ':'Sex', 'Species ':'Species', 'Fatal (Y/N)':'Fatal', 'Investigator or Source':'Source'})
tiburones.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex', 'Age', 'Injury', 'Fatal', 'Time', 'Species',
       'Source', 'pdf', 'href formula', 'href', 'Case Number.1',
       'Case Number.2', 'original order', 'Unnamed: 22', 'Unnamed: 23'],
      dtype='object')

In [22]:
# Se muestra el total de valores nulos por cada columna.
tiburones.isnull().sum()

Case Number       17021
Date              19421
Year              19423
Type              19425
Country           19471
Area              19876
Location          19961
Activity          19965
Name              19631
Sex               19986
Age               22252
Injury            19449
Fatal             19960
Time              22775
Species           22259
Source            19438
pdf               19421
href formula      19422
href              19421
Case Number.1     19421
Case Number.2     19421
original order    19414
Unnamed: 22       25722
Unnamed: 23       25721
dtype: int64

In [5]:
# Se filtran los registros nulos y se almacenan sus índices en una lista.
tiburones_2 = tiburones.T
indices_nulos = []
for x in tiburones.index:
    if tiburones_2[x].isnull().all():
        indices_nulos.append(x)

In [6]:
# Se eliminan los registros nulos.
tiburones = tiburones.drop(indices_nulos,axis=0)

In [26]:
# Se verifica el número de valores nulos por columna.
tiburones.isnull().sum()

Case Number          1
Date              2401
Year              2403
Type              2405
Country           2451
Area              2856
Location          2941
Activity          2945
Name              2611
Sex               2966
Age               5232
Injury            2429
Fatal             2940
Time              5755
Species           5239
Source            2418
pdf               2401
href formula      2402
href              2401
Case Number.1     2401
Case Number.2     2401
original order    2394
Unnamed: 22       8702
Unnamed: 23       8701
dtype: int64

In [27]:
# Se habilita el dataframe para mostrar todas las filas.
pd.set_option('display.max_rows', None)

In [28]:
# Se habilita el dataframe para mostrar todas las columnas.
pd.set_option('display.max_columns', None)

# Columna Case Number

In [29]:
# Se imprimen en pantalla los registros donde la columna Case Number equivale a '0'.
tiburones[tiburones['Case Number'] == '0']

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
6302,0,,,,,,,,,,,,,,,,,,,,,6304.0,,
6303,0,,,,,,,,,,,,,,,,,,,,,6305.0,,
6304,0,,,,,,,,,,,,,,,,,,,,,6306.0,,
6305,0,,,,,,,,,,,,,,,,,,,,,6307.0,,
6306,0,,,,,,,,,,,,,,,,,,,,,6308.0,,
6307,0,,,,,,,,,,,,,,,,,,,,,6309.0,,
6308,0,,,,,,,,,,,,,,,,,,,,,6310.0,,
6309,0,,,,,,,,,,,,,,,,,,,,,,,
6310,0,,,,,,,,,,,,,,,,,,,,,,,
6311,0,,,,,,,,,,,,,,,,,,,,,,,


In [30]:
# Los registros en donde la columna Case Number es '0', tienen valores nulos en las columnas
# restantes, por lo que es necesario eliminarlos, y para ello se almacenan sus índices en
# una lista.
indices_case_number_cero = tiburones[tiburones['Case Number'] == '0'].index
indices_case_number_cero

Int64Index([6302, 6303, 6304, 6305, 6306, 6307, 6308, 6309, 6310, 6311,
            ...
            8692, 8693, 8694, 8695, 8696, 8697, 8698, 8699, 8700, 8701],
           dtype='int64', length=2400)

In [31]:
# Se procede a eliminar los registros irrelevantes.
tiburones = tiburones.drop(indices_case_number_cero,axis=0)

In [7]:
# Se imprime el dataframe y puede observarse que los registros mencionados ya no están.
tiburones[tiburones['Case Number'] == '0']

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
6302,0,,,,,,,,,,...,,,,,,,,6304.0,,
6303,0,,,,,,,,,,...,,,,,,,,6305.0,,
6304,0,,,,,,,,,,...,,,,,,,,6306.0,,
6305,0,,,,,,,,,,...,,,,,,,,6307.0,,
6306,0,,,,,,,,,,...,,,,,,,,6308.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8697,0,,,,,,,,,,...,,,,,,,,,,
8698,0,,,,,,,,,,...,,,,,,,,,,
8699,0,,,,,,,,,,...,,,,,,,,,,
8700,0,,,,,,,,,,...,,,,,,,,,,


In [34]:
# Se visualiza y almacena el índice de un registro que tiene valores nulos en todos sus
# campos, excepto Case Number.
index_case_number_xx = tiburones[tiburones['Case Number'] == 'xx'].index
index_case_number_xx

Int64Index([25722], dtype='int64')

In [35]:
# Se borra el registro mencionado.
tiburones = tiburones.drop(index_case_number_xx,axis=0)
tiburones[tiburones['Case Number'] == 'xx']

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23


# Columna Date

In [36]:
# Con las acciones ejecutadas para la columna Case Number se eliminaron los datos nulos de
# la columna Date.
tiburones[tiburones['Date'].isnull()]

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23


In [37]:
# Se imprimen los valores de la columna Date.
tiburones['Date'].value_counts()

1957                                                                11
1942                                                                 9
1956                                                                 8
1950                                                                 7
1941                                                                 7
1958                                                                 7
1949                                                                 6
No date                                                              6
05-Oct-2003                                                          5
28-Jul-1995                                                          5
1955                                                                 5
1940                                                                 5
1959                                                                 5
No date, Before 1963                                                 5
Oct-19

In [38]:
# Se elabora una función para extraer día y mes del ataque, debido a que el año ya está
# en otra columna.
def dia_mes(fecha):
    date_without_year = re.findall('[0-9]{2}-[A-Z][a-z]{2}', fecha)
    if len(date_without_year) > 0:
        return date_without_year[0]
    else:
        return 'Unknown'

In [39]:
# Se aplica la función dia_mes en la columna Date.
tiburones['Date'] = tiburones['Date'].apply(dia_mes)
tiburones['Date'].value_counts()

Unknown    925
02-Sep      33
15-Jul      33
27-Jul      31
29-Aug      31
09-Apr      28
26-Jun      28
26-Jul      28
07-Jul      26
25-Jul      25
28-Jul      25
15-Jun      25
19-Jul      25
03-Sep      25
24-Jun      25
24-Aug      24
02-Jun      24
26-Dec      24
16-Sep      24
23-Jul      24
13-Sep      24
05-Oct      24
26-Aug      24
19-Aug      23
10-Aug      23
04-Jul      23
27-Jan      23
06-Sep      23
22-Aug      23
30-Aug      22
25-Nov      22
01-Jan      22
17-Jul      22
09-Jan      22
14-Jan      21
21-Dec      21
05-Jan      21
20-Apr      21
15-Aug      21
23-Jan      21
30-Nov      21
04-Sep      21
24-May      20
25-Jun      20
13-Apr      20
27-Sep      20
30-Sep      20
27-Aug      20
12-Apr      20
08-Jul      20
02-Jul      20
28-Aug      20
10-Oct      19
11-Dec      19
19-Sep      19
25-Jan      19
06-Jul      19
24-Jul      19
31-Dec      19
24-Jan      19
28-Dec      19
08-Aug      19
31-May      19
12-Nov      19
14-Jul      19
10-Mar      19
03-Jan    

# Columna Year

In [40]:
# Se revisan los valores nulos contenidos en la columna.
tiburones[tiburones['Year'].isnull()]

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
187,2017.01.08.R,08-Jan,,Invalid,AUSTRALIA,Queensland,,Spearfishing,Kerry Daniel,M,35.0,"No attack, shark made a threat display",,,Bull shark,Liquid Vision 1/8/2017,2017.01.08.R-KerryDaniel.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2017.01.08.R,2017.01.08.R,6116.0,,
6079,1836.08.19.R,19-Aug,,Unprovoked,ENGLAND,Cumberland,Whitehaven,Swimming,a boy,M,,FATAL,Y,,,"C. Moore, GSAF",1835.08.19.R-Whitehaven.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1836.08.19.R,1836.08.19.R,224.0,,


In [41]:
# Se asigna la palabra Unknown a el campo Year de estos dos registros.
tiburones['Year'] = tiburones['Year'].fillna('Unknown')
tiburones[tiburones['Year'].isnull()]

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23


In [42]:
# Se visualizan los valores de la columna Year.
tiburones['Year'].value_counts()

2015.0     143
2017.0     136
2016.0     130
2011.0     128
2014.0     127
0.0        125
2013.0     122
2008.0     122
2009.0     120
2012.0     117
2007.0     112
2006.0     103
2005.0     103
2010.0     101
2000.0      97
1960.0      93
1959.0      93
2003.0      92
2001.0      92
2004.0      92
2002.0      88
1962.0      86
1961.0      78
1995.0      76
1964.0      66
1999.0      66
1998.0      65
1996.0      61
1963.0      61
1966.0      58
1997.0      57
1994.0      56
1992.0      56
1993.0      56
1988.0      55
1958.0      54
2018.0      53
1989.0      53
1965.0      51
1956.0      51
1983.0      50
1975.0      49
1981.0      49
1967.0      48
1968.0      47
1950.0      43
1955.0      43
1954.0      42
1970.0      42
1942.0      41
1957.0      41
1984.0      41
1982.0      40
1976.0      39
1986.0      39
1929.0      38
1991.0      38
1974.0      38
1990.0      38
1985.0      37
1953.0      36
1987.0      35
1980.0      35
1972.0      35
1935.0      32
1951.0      32
1936.0    

In [43]:
# Se observa que los años están en formato flotante, por lo que se procede a eliminar la
# parte decimal.
tiburones = tiburones.astype({'Year':'str'})
tiburones['Year'] = tiburones['Year'].apply(lambda x : x.replace('.0', ''))
tiburones['Year'].value_counts()

2015       143
2017       136
2016       130
2011       128
2014       127
0          125
2013       122
2008       122
2009       120
2012       117
2007       112
2005       103
2006       103
2010       101
2000        97
1960        93
1959        93
2004        92
2003        92
2001        92
2002        88
1962        86
1961        78
1995        76
1999        66
1964        66
1998        65
1963        61
1996        61
1966        58
1997        57
1993        56
1994        56
1992        56
1988        55
1958        54
2018        53
1989        53
1956        51
1965        51
1983        50
1981        49
1975        49
1967        48
1968        47
1950        43
1955        43
1954        42
1970        42
1957        41
1984        41
1942        41
1982        40
1986        39
1976        39
1991        38
1990        38
1974        38
1929        38
1985        37
1953        36
1980        35
1972        35
1987        35
1935        32
1951        32
1936      

In [44]:
# Hay registros con 0 en la columna Year, se imprimen estos para analizarlos y determinar
# si conservan ese valor.
tiburones[tiburones['Year'] == '0']

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
6177,0000.0214,Unknown,0,Unprovoked,,Ionian Sea,,Ascending from a dive,"Tharsys, a sponge diver",M,,"FATAL, shark/s bit him in two",Y,,,"Reported by Greek poet, Leonidas of Tarentum (...",214BC-Tharsys.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,0000.0214,0000.0214,126.0,,
6178,0000.0336,Unknown,0,Unprovoked,GREECE,Piraeus,In the haven of Cantharus,Washing his pig in preparation for a religious...,A candidate for initiation,M,,"FATAL, shark ""bit off all lower parts of him u...",Y,,,Plutarch (45 - 125 A.D.) in Life of Phoecion (...,336-BC-Carnathus.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,0000.0336,0000.0336,125.0,,
6179,0000.0493,Unknown,0,Sea Disaster,GREECE,Off Thessaly,,Shipwrecked Persian Fleet,males,M,,Herodotus tells of sharks attacking men in the...,Y,,,Herodotus (485 - 425 B.C.),493BC-PersianFleet.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,0000.0493,0000.0493,124.0,,
6180,0000.0725,Unknown,0,Sea Disaster,ITALY,Tyrrhenian Sea,Krater found during excavations at Lacco Ameno...,Shipwreck,males,M,,Depicts shipwrecked sailors attacked by a sha...,Y,,,"V.M. Coppleson (1958), p.262, et al",725BC-vase.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,0000.0725,0000.0725,123.0,,
6181,ND-0154,Unknown,0,Unprovoked,CANADA,,Grand Banks,Fishing,Joe Folsom,M,,Arm bitten,N,,,"C.E.Russell, pp. 310-311",ND-0154-Folsom.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND-0154,ND-0154,122.0,,
6182,ND-0153,Unknown,0,Unprovoked,KENYA,Mombasa,Kilindini,Diving,Conway Plough & Dr. Jonathan Higgs,M,,Conway's leg was bitten Higgs injury was FATAL,N,,,A.J. Venter,ND-0153-Plough-Higgs.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND-0153,ND-0153,121.0,,
6183,ND-0152,Unknown,0,Unprovoked,KENYA,Mombasa,Kilindini,Diving,Hamisi Njenga,M,,FATAL,Y,,,eadestination,ND-0152-Kenya.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND-0152,ND-0152,120.0,,
6184,ND-0151,Unknown,0,Unprovoked,PANAMA,Bocas del Toro Province,Red Frog Beach,Swimming/,male,M,20,FATAL,Y,,,C. Mendieta & A. Duarte,ND-0151-Panama.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND-0151,ND-0151,119.0,,
6185,ND-0150,Unknown,0,Unprovoked,URUGUAY,Rocha,"Isla Chica, La Paloma",Swimming,,,,Foot bitten,N,,,"Di Candia, 2004",ND-0150-Uruguay.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND-0150,ND-0150,118.0,,
6186,ND-0149,Unknown,0,Unprovoked,URUGUAY,Rocha,"Playa del Barco, La Pedrera",Swimming,Maciello,M,,FATAL,Y,,,"Di Candia, 2004",ND-0149-Maciello.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND-0149,ND-0149,117.0,,


In [45]:
# Se observa que los registros no tienen alguna columna en la que se constate que el ataque
# fue en el año 0 de la era cristiana, por lo que este valor será sustituido por la palabra
# Unknown.
indices = tiburones[tiburones['Year'] == '0'].index
for x in indices:
    tiburones['Year'][x] = 'Unknown'
tiburones[tiburones['Year'] == 'Unknown']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
187,2017.01.08.R,08-Jan,Unknown,Invalid,AUSTRALIA,Queensland,,Spearfishing,Kerry Daniel,M,35,"No attack, shark made a threat display",,,Bull shark,Liquid Vision 1/8/2017,2017.01.08.R-KerryDaniel.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2017.01.08.R,2017.01.08.R,6116.0,,
6079,1836.08.19.R,19-Aug,Unknown,Unprovoked,ENGLAND,Cumberland,Whitehaven,Swimming,a boy,M,,FATAL,Y,,,"C. Moore, GSAF",1835.08.19.R-Whitehaven.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1836.08.19.R,1836.08.19.R,224.0,,
6177,0000.0214,Unknown,Unknown,Unprovoked,,Ionian Sea,,Ascending from a dive,"Tharsys, a sponge diver",M,,"FATAL, shark/s bit him in two",Y,,,"Reported by Greek poet, Leonidas of Tarentum (...",214BC-Tharsys.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,0000.0214,0000.0214,126.0,,
6178,0000.0336,Unknown,Unknown,Unprovoked,GREECE,Piraeus,In the haven of Cantharus,Washing his pig in preparation for a religious...,A candidate for initiation,M,,"FATAL, shark ""bit off all lower parts of him u...",Y,,,Plutarch (45 - 125 A.D.) in Life of Phoecion (...,336-BC-Carnathus.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,0000.0336,0000.0336,125.0,,
6179,0000.0493,Unknown,Unknown,Sea Disaster,GREECE,Off Thessaly,,Shipwrecked Persian Fleet,males,M,,Herodotus tells of sharks attacking men in the...,Y,,,Herodotus (485 - 425 B.C.),493BC-PersianFleet.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,0000.0493,0000.0493,124.0,,
6180,0000.0725,Unknown,Unknown,Sea Disaster,ITALY,Tyrrhenian Sea,Krater found during excavations at Lacco Ameno...,Shipwreck,males,M,,Depicts shipwrecked sailors attacked by a sha...,Y,,,"V.M. Coppleson (1958), p.262, et al",725BC-vase.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,0000.0725,0000.0725,123.0,,
6181,ND-0154,Unknown,Unknown,Unprovoked,CANADA,,Grand Banks,Fishing,Joe Folsom,M,,Arm bitten,N,,,"C.E.Russell, pp. 310-311",ND-0154-Folsom.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND-0154,ND-0154,122.0,,
6182,ND-0153,Unknown,Unknown,Unprovoked,KENYA,Mombasa,Kilindini,Diving,Conway Plough & Dr. Jonathan Higgs,M,,Conway's leg was bitten Higgs injury was FATAL,N,,,A.J. Venter,ND-0153-Plough-Higgs.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND-0153,ND-0153,121.0,,
6183,ND-0152,Unknown,Unknown,Unprovoked,KENYA,Mombasa,Kilindini,Diving,Hamisi Njenga,M,,FATAL,Y,,,eadestination,ND-0152-Kenya.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND-0152,ND-0152,120.0,,
6184,ND-0151,Unknown,Unknown,Unprovoked,PANAMA,Bocas del Toro Province,Red Frog Beach,Swimming/,male,M,20,FATAL,Y,,,C. Mendieta & A. Duarte,ND-0151-Panama.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND-0151,ND-0151,119.0,,


# Columna Type

In [46]:
# Se visualizan los valores de la columna.
tiburones = tiburones.astype({'Type':'str'})
tiburones['Type'].value_counts()

Unprovoked      4595
Provoked         574
Invalid          547
Sea Disaster     239
Boating          203
Boat             137
nan                4
Questionable       2
Boatomg            1
Name: Type, dtype: int64

In [47]:
# Los valores Boat y Boatomg hacen referencia al mismo tipo de incidente (Boating), por lo
# tanto será conveniente reemplazarlos por Boating, también se sustituyen los que contienen
# nan por Unknown.
tiburones['Type'] = tiburones['Type'].apply(lambda x : 'Boating' if 'Boa' in x else x)
tiburones['Type'] = tiburones['Type'].apply(lambda x : 'Unknown' if 'nan' in x else x)
tiburones['Type'].value_counts()

Unprovoked      4595
Provoked         574
Invalid          547
Boating          341
Sea Disaster     239
Unknown            4
Questionable       2
Name: Type, dtype: int64

In [48]:
# Se visualizan los registros de tipo Invalid.
tiburones[tiburones['Type'] == 'Invalid']

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
2,2018.06.09,09-Jun,2018,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
13,2018.05.13.a,13-May,2018,Invalid,ENGLAND,Cornwall,Off Land's End,Fishing,Max Berryman,M,21,Injured by teeth of a dead porbeagle shark he ...,N,08h15,Invalid incident,"K. McMurray, TrackingSharks.com",2018.05.13.a-Berryman.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.05.13.a,2018.05.13.a,6290.0,,
31,2018.04.10.R,10-Apr,2018,Invalid,BRAZIL,Alagoas,"Praia de Sauaçuhy, Maceió",Fishing,Josias Paz,M,56,Injury to ankle from marine animal trapped in ...,N,,Shark involvement not confirmed,"K. McMurray, TrackingSharks.com",2018.04.10.R-Paz.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.04.10.R,2018.04.10.R,6272.0,,
43,2018.02.14,14-Feb,2018,Invalid,AUSTRALIA,Queensland,Mooloolaba Beach,Swimming,Sharna Babd,F,,"Collision / No injury, no attack",N,18h00,Possibly a wobbegong,"Sunshine Coast Daily, 2/15/2018",2018.02.14-Babd.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.02.14,2018.02.14,6260.0,,
44,2018.02.11,11-Feb,2018,Invalid,BRAZIL,"Boi Island, Victoria",Espirito Santo,Cleaning fish,Rosalida Souza,F,46,Lacerations to 4 toes of right foot,N,,"Injury believed caused by an eel, not a shark",TrackingSharks.com,2018.02.11-Rosilda.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.02.11,2018.02.11,6259.0,,
46,2018.02.01,01-Feb,2018,Invalid,AUSTRALIA,Western Australia,"Avalon Point, Manurah",Spearfishing,Lucas Martin,M,14,"No injury no attack. This is considerd an ""enc...",N,,2m shark,"The West Australian, 2/2/2018",2018.02.01-Martin.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.02.01,2018.02.01,6257.0,,
76,2017.10.01,01-Oct,2017,Invalid,SOUTH AFRICA,Western Cape Province,Dyer Island,Scuba Diving,Bradley Fick,M,31,FATAL,,,Death may have been due to drowning,"All Africa, 10/11/2017",2017.10.01-Fick.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2017.10.01,2017.10.01,6227.0,,
79,2017.09.24.b,24-Sep,2017,Invalid,USA,New York,Rockaway,Surfing,Michah Behrend,M,33,"Lacerations to right ankle, foot & toe",,15h30,Questionable,"S. Curatolo-Wageman, GSAF",2017.09.24.b-Behrend.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2017.09.24.b,2017.09.24.b,6224.0,,
84,2017.09.15.b,15-Sep,2017,Invalid,SOUTH AFRICA,Western Cape Province,Hawston,Scuba Diving,Wayon Love,M,25,"FATAL, but death was probably due to drowning",,Afternoon,,"Ground Up, 9/20/2017",2017.09.15.b-Love.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2017.09.15.b,2017.09.15.b,6219.0,,
98,2017.08.26.b,26-Aug,2017,Invalid,SPAIN,Castellón,Grao de Moncofa,Swimming,female,F,11,Lacerations to left foot,,Midday,Shark involvement questionable,"El Periodico Mediterraneo, 8/27/2017",2017.08.26.b-Spain.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2017.08.26.b,2017.08.26.b,6205.0,,


In [49]:
# Se sustituye ese valor por la palabra Unknown.
tiburones['Type'] = tiburones['Type'].apply(lambda x : 'Unknown' if 'Invalid' in x else x)
tiburones['Type'].value_counts()

Unprovoked      4595
Provoked         574
Unknown          551
Boating          341
Sea Disaster     239
Questionable       2
Name: Type, dtype: int64

In [50]:
# Se imprimen los de tipo Questionable.
tiburones[tiburones['Type'] == 'Questionable']

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
16,2018.05.09,09-May,2018,Questionable,AUSTRALIA,New South Wales,"Sharpes Beach, Ballina",Surfing,male,M,,"No injury, surfboard damaged",N,10h30,Shark involvement not confirmed,"B. Myatt, GSAF",2018.05.09-SharpesBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.05.09,2018.05.09,6287.0,,
20,2018.04.25.b,25-Apr,2018,Questionable,AUSTRALIA,New South Wales,Lennox Head,Surfing,Matthew Lee,M,,No injury,N,07h00,Questionable,"B. Myatt, GSAF",2018.04.25.b-Lee.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.04.25.b,2018.04.25.b,6283.0,,


In [51]:
# En ambos registros no está confirmado que haya sido un tiburón el animal que realizó el
# ataque, por lo que será necesario eliminarlos.
indices = tiburones[tiburones['Type'] == 'Questionable'].index
tiburones.drop(indices, inplace = True)
tiburones[tiburones['Type'] == 'Questionable']

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23


# Columna Country

In [52]:
# Se visualizan los valores de la columna.
tiburones['Country'].value_counts()

USA                                      2229
AUSTRALIA                                1336
SOUTH AFRICA                              579
PAPUA NEW GUINEA                          134
NEW ZEALAND                               128
BRAZIL                                    112
BAHAMAS                                   109
MEXICO                                     89
ITALY                                      71
FIJI                                       62
PHILIPPINES                                61
REUNION                                    60
NEW CALEDONIA                              53
CUBA                                       46
MOZAMBIQUE                                 45
SPAIN                                      44
INDIA                                      40
EGYPT                                      38
JAPAN                                      34
CROATIA                                    34
PANAMA                                     32
SOLOMON ISLANDS                   

In [53]:
# Se revisan los valores nulos.
tiburones[tiburones['Country'].isnull()]

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
62,2017.11.13.R,13-Nov,2017,Unprovoked,,,,Surfing,Timur Yunusov,M,24.0,Puncture wounds to feet,N,,,Instagram,2017.11.13.R-Timur.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2017.11.13.R,2017.11.13.R,6241.0,,
525,2014.08.00,Unknown,2014,Unknown,,,,Sea disaster,Cuban refugees,M,,Shark involvement prior to death not confirmed,,,Shark involvement not confirmed,"Associated Press, 11/27/2014",2014.08.00-Cuban-refugees.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2014.08.00,2014.08.00,5778.0,,
2956,1983.00.00.d,Unknown,1983,Unprovoked,,English Channel,,Swimming,Padma Shri Taranath Narayan Shenoy,M,,Left leg bitten,N,,,"Times of India, 2/5/2012",1983.00.00.d-Shenoy.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1983.00.00.d,1983.00.00.d,3347.0,,
3378,1970.11.00,Unknown,1970,Unprovoked,,,,,Heinz Plotsky,M,,Extensive injuries,N,,,"H.D. Baldridge (1994), SAF Case #1645",1970.11.00-NV-Plotsky.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1970.11.00,1970.11.00,2925.0,,
3387,1970.08.02,02-Aug,1970,Unknown,,Caribbean Sea,Between St. Kitts & Nevis,Sea Disaster Sinking of ferryboat Christina,,,,"Sharks scavenged on bodies, but no record of t...",,Afternoon,Shark involvement prior to death was not confi...,"Rome News Tribune, 8/3/1970",1970.08.02-Christina-ferryboat.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1970.08.02,1970.08.02,2916.0,,
3388,1970.07.05,Unknown,1970,Unprovoked,,,,,male,M,,Finger or toe severed,N,Night,Mako shark,"H.D. Baldridge (1994), SAF Case #1628",1970.07.05-NV-male.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1970.07.05,1970.07.05,2915.0,,
3395,1970.04.00.b,Unknown,1970,Provoked,,,,Freediving,Lionel Jarvis,M,,Arm abraded & lacerated. Recorded as PROVOKED ...,N,,Wobbegong shark,"H.D. Baldridge (1994), SAF Case #1616",1970.04.00.b-NV-Jarvis.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1970.04.00.b,1970.04.00.b,2908.0,,
3399,1970.02.05,Unknown,1970,Unprovoked,,,,Wading,Sally Anne Irvine,F,8.0,Lacerations to lower leg,N,,Carpet shark,H.D. Baldridge (1994) SAF Case #1626,1970.02.05-NV-Irvine.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1970.02.05,1970.02.05,2904.0,,
3425,1969.08.00,Unknown,1969,Unprovoked,,,,,Rodney Hughes,M,25.0,Am lacerated,N,,,H.D. Baldridge (1994) SAF Case #1602,1969.08.00-NV-Hughes.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1969.08.00,1969.08.00,2878.0,,
3605,1965.10.21,21-Oct,1965,Unprovoked,,,Florida Strait,The boat Caribou II sank,Mario Castellanos,M,39.0,Survived,N,,,"Lodi News Sentinel, 10/30/1965",1965.10.21-Castellanos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1965.10.21,1965.10.21,2698.0,,


In [54]:
# Se procede a su sustitución por Unknown.
tiburones['Country'] = tiburones['Country'].fillna('Unknown')
tiburones[tiburones['Country'].isnull()]

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23


In [55]:
# Se corrigen los valores inconsistentes.
tiburones['Country'] = tiburones['Country'].apply(lambda x : 'FIJI' if 'Fiji' in x else x)
tiburones['Country'] = tiburones['Country'].apply(lambda x : 'JAPAN' if 'OKINAWA' in x else x)
tiburones['Country'] = tiburones['Country'].apply(lambda x : 'SPAIN' if 'AZORES' in x else x)
tiburones['Country'] = tiburones['Country'].apply(lambda x : 'SAINT HELENA' if 'British overseas territory' in x else x)
tiburones['Country'] = tiburones['Country'].apply(lambda x : 'UNITED ARAB EMIRATES' if '(UAE)' in x else x)
tiburones['Country'] = tiburones['Country'].apply(lambda x : 'GREECE' if 'CRETE' in x else x)
tiburones['Country'] = tiburones['Country'].apply(lambda x : 'SUDAN' if 'SUDAN?' in x else x)
tiburones['Country'] = tiburones['Country'].apply(lambda x : 'ASIA' if 'ASIA?' in x else x)
tiburones['Country'] = tiburones['Country'].apply(lambda x : 'INDIAN OCEAN' if 'INDIAN OCEAN?' in x else x)
tiburones['Country'] = tiburones['Country'].apply(lambda x : 'RED SEA' if 'RED SEA?' in x else x)
tiburones['Country'] = tiburones['Country'].apply(lambda x : 'SRI LANKA' if 'CEYLON' in x else x)
tiburones['Country'] = tiburones['Country'].apply(lambda x : 'SIERRA LEONE' if 'Sierra Leone' in x else x)
tiburones['Country'] = tiburones['Country'].apply(lambda x : 'INDONESIA' if 'JAVA' in x else x)
tiburones['Country'] = tiburones['Country'].apply(lambda x : 'UNKNOWN' if 'DIEGO GARCIA' in x else x)
tiburones['Country'] = tiburones['Country'].apply(lambda x : 'SEYCHELLES' if 'Seychelles' in x else x)
tiburones['Country'] = tiburones['Country'].apply(lambda x : 'UNKNOWN' if 'Between PORTUGAL & INDIA' in x else x)
tiburones['Country'] = tiburones['Country'].apply(lambda x : 'PHILIPPINES' if ' PHILIPPINES' in x else x)
tiburones['Country'] = tiburones['Country'].apply(lambda x : 'TONGA' if ' TONGA' in x else x)
tiburones['Country'] = tiburones['Country'].apply(lambda x : 'AFRICA' if 'Coast of AFRICA' in x else x)
tiburones['Country'] = tiburones['Country'].apply(lambda x : 'UNKNOWN' if 'Unknown' in x else x)
tiburones['Country'].value_counts()

USA                               2229
AUSTRALIA                         1336
SOUTH AFRICA                       579
PAPUA NEW GUINEA                   134
NEW ZEALAND                        128
BRAZIL                             112
BAHAMAS                            109
MEXICO                              89
ITALY                               71
FIJI                                65
PHILIPPINES                         62
REUNION                             60
NEW CALEDONIA                       53
UNKNOWN                             52
SPAIN                               49
CUBA                                46
MOZAMBIQUE                          45
JAPAN                               40
INDIA                               40
EGYPT                               38
CROATIA                             34
PANAMA                              32
SOLOMON ISLANDS                     30
IRAN                                29
GREECE                              27
JAMAICA                  

# Columna Area

In [56]:
# Se imprimen los valores de la columna.
tiburones['Area'].value_counts()

Florida                                                           1037
New South Wales                                                    484
Queensland                                                         311
Hawaii                                                             298
California                                                         290
KwaZulu-Natal                                                      213
Western Cape Province                                              195
Western Australia                                                  189
South Carolina                                                     160
Eastern Cape Province                                              160
South Australia                                                    104
North Carolina                                                     101
Victoria                                                            90
Pernambuco                                                          74
Texas 

In [57]:
# Se corrige la columna.
tiburones['Area'] = tiburones['Area'].fillna('Unknown')
tiburones['Area'] = tiburones['Area'].apply(lambda x : 'Bengal' if '22ºN, 88ºE' in x else x)
tiburones['Area'] = tiburones['Area'].apply(lambda x : 'Split-Dalmatia County' if ' Split-Dalmatia Count' in x else x)
tiburones['Area'] = tiburones['Area'].apply(lambda x : 'Split-Dalmatia County' if ' Split-Dalmatia County' in x else x)
tiburones['Area'] = tiburones['Area'].apply(lambda x : 'Lau Province' if ' Lau Province' in x else x)
tiburones['Area'] = tiburones['Area'].apply(lambda x : 'North Carolina' if ' North Carolina' in x else x)
tiburones['Area'] = tiburones['Area'].apply(lambda x : 'Upolu Island' if ' Upolu Island' in x else x)
tiburones['Area'] = tiburones['Area'].apply(lambda x : 'Kikori River mouth' if ' Kikori River mouth' in x else x)
tiburones['Area'] = tiburones['Area'].apply(lambda x : 'Nusa Tenggara' if ' Nusa Tenggara' in x else x)
tiburones['Area'] = tiburones['Area'].apply(lambda x : 'North Atlantic Ocean' if '33N, 68W' in x else x)
tiburones['Area'] = tiburones['Area'].apply(lambda x : 'Southwestern Pacific' if '(Southwestern Pacific)' in x else x)
tiburones['Area'] = tiburones['Area'].apply(lambda x : 'West of Ceylon' if 'West of Ceylon (Sri  Lanka)' in x else x)
tiburones['Area'] = tiburones['Area'].apply(lambda x : 'Unknown' if '9.35N 79.35W' in x else x)
tiburones['Area'] = tiburones['Area'].apply(lambda x : 'Manila Bay' if ' Manila Bay' in x else x)
tiburones['Area'] = tiburones['Area'].apply(lambda x : 'Loyalty Islands' if ' Loyalty Islands' in x else x)
tiburones['Area'] = tiburones['Area'].apply(lambda x : 'New Jersey' if ' New Jersey' in x else x)
tiburones['Area'] = tiburones['Area'].apply(lambda x : 'La Libertad' if ' La Libertad' in x else x)
tiburones['Area'] = tiburones['Area'].apply(lambda x : 'Gulf of Guinea' if "35º39 : 165º8'" in x else x)
tiburones['Area'] = tiburones['Area'].apply(lambda x : 'South Island' if ' South Island' in x else x)
tiburones['Area'] = tiburones['Area'].apply(lambda x : 'Panama' if 'PANAMA' in x else x)
tiburones['Area'] = tiburones['Area'].apply(lambda x : 'Nalotu' if '19S, 178?E' in x else x)
tiburones['Area'] = tiburones['Area'].apply(lambda x : 'North Atlantic Ocean' if '04.05N-13.23W' in x else x)
tiburones['Area'] = tiburones['Area'].apply(lambda x : 'Primorje-Gorski Kotar County' if ' Primorje-Gorski Kotar County' in x else x)
tiburones['Area'] = tiburones['Area'].apply(lambda x : 'Torres Strait' if '10ºS, 142ºE' in x else x)
tiburones['Area'] = tiburones['Area'].apply(lambda x : 'Indian Ocean' if '18S / 50E' in x else x)
tiburones['Area'] = tiburones['Area'].apply(lambda x : 'Cuba' if 'CUBA' in x else x)
tiburones['Area'].value_counts()

Florida                                                           1037
New South Wales                                                    484
Unknown                                                            456
Queensland                                                         311
Hawaii                                                             298
California                                                         290
KwaZulu-Natal                                                      213
Western Cape Province                                              195
Western Australia                                                  189
South Carolina                                                     160
Eastern Cape Province                                              160
South Australia                                                    104
North Carolina                                                     102
Victoria                                                            90
Pernam

# Columna Location

In [58]:
# Se revisa el contenido de la columna.
tiburones['Location'].value_counts()

New Smyrna Beach, Volusia County                                                                                           163
Daytona Beach, Volusia County                                                                                               30
Ponce Inlet, Volusia County                                                                                                 20
Melbourne Beach, Brevard County                                                                                             18
Myrtle Beach, Horry County                                                                                                  17
Durban                                                                                                                      16
Boa Viagem, Recife                                                                                                          14
Isle of Palms, Charleston County                                                                               

In [59]:
# Se corrige la columna.
tiburones['Location'] = tiburones['Location'].fillna('Unknown')
tiburones['Location'] = tiburones['Location'].apply(lambda x : 'Long Island' if "Long Island near Madang, about 500 km (310 miles) north of the South Pacific nation's capital of Port Moresby" in x else x)
tiburones['Location'] = tiburones['Location'].apply(lambda x : 'Margaret River District' if 'Perth? (Margaret River District)' in x else x)
tiburones['Location'] = tiburones['Location'].apply(lambda x : 'Madras' if ' Chennai (formerly Madras)' in x else x)
tiburones['Location'] = tiburones['Location'].apply(lambda x : 'Hadrians Island' if 'Hadrians (Haidana?) Island, near Port Moresby' in x else x)
tiburones['Location'] = tiburones['Location'].apply(lambda x : 'Lundy Island' if ' Lundy Island' in x else x)
tiburones['Location'] = tiburones['Location'].apply(lambda x : 'Port Isabela de Basilan' if ' Port Isabela de Basilan' in x else x)
tiburones['Location'] = tiburones['Location'].apply(lambda x : 'New Smyrna Beach, Volusia County' if ' New Smyrna Beach, Volusia County' in x else x)
tiburones['Location'] = tiburones['Location'].apply(lambda x : 'Unknown' if ' ' in x else x)
tiburones['Location'] = tiburones['Location'].apply(lambda x : 'Chioggia' if ' Chioggia' in x else x)
tiburones['Location'] = tiburones['Location'].apply(lambda x : 'Bangkok' if 'Shark caught in Indonesia, offloaded to trawler that docked at Samut Prakan, 20 miles south of Bangkok, Thailand' in x else x)
tiburones['Location'] = tiburones['Location'].apply(lambda x : 'Vũng Tàu' if 'V?ng Tàu' in x else x)
tiburones['Location'] = tiburones['Location'].apply(lambda x : 'Kanazawa' if 'Kanazawa?' in x else x)
tiburones['Location'] = tiburones['Location'].apply(lambda x : 'A pearl farm in Roebuck Bay' if ' A pearl  farm in Roebuck Bay' in x else x)
tiburones['Location'] = tiburones['Location'].apply(lambda x : 'Florida' if 'Victim was on said to be on tanker that collided off Florida coast on 10-20-1943' in x else x)
tiburones['Location'] = tiburones['Location'].apply(lambda x : 'Black Head, south of Taree' if ' Black Head, south of Taree' in x else x)
tiburones['Location'] = tiburones['Location'].apply(lambda x : "D'Entrecasteaux islands, 20 miles off the coast" if " D'Entrecasteaux islands, 20 miles off the coast" in x else x)
tiburones['Location'] = tiburones['Location'].apply(lambda x : "Palm Beach" if " Palm Beach?" in x else x)
tiburones['Location'] = tiburones['Location'].apply(lambda x : "Huntington Beach, Orange County" if " Huntington Beach, Orange County" in x else x)
tiburones['Location'] = tiburones['Location'].apply(lambda x : "Kalepolepo, Kihei, Maui" if " Kalepolepo, Kihei, Maui" in x else x)
tiburones['Location'] = tiburones['Location'].apply(lambda x : "Cirali Beach" if "?nciralti Beach, ?zmir" in x else x)
tiburones['Location'] = tiburones['Location'].apply(lambda x : "Botany Bay" if " Botany Bay, ?zmir" in x else x)
tiburones['Location'] = tiburones['Location'].apply(lambda x : "Huon Gulf" if " Huon Gulf" in x else x)
tiburones['Location'] = tiburones['Location'].apply(lambda x : "Perth" if " Perth" in x else x)
tiburones['Location'] = tiburones['Location'].apply(lambda x : "Côte-Blanche, Nouméa" if " Côte-Blanche, Nouméa" in x else x)
tiburones['Location'] = tiburones['Location'].apply(lambda x : "Côte-Blanche, Nouméa" if " Côte-Blanche, Nouméa" in x else x)
tiburones['Location'].value_counts()

Unknown              5470
Durban                 16
Piedade                11
Sydney                 10
Nahoon                 10
Amanzimtoti             8
Bunbury                 7
Charleston              7
Townsville              6
Acapulco                6
Brisbane                6
Bondi                   6
Wollongong              6
Coogee                  6
Albany                  5
Newcastle               5
Winkelspruit            5
Arniston                5
Cojimar                 5
Macassar                5
Nassau                  5
Muizenberg              5
Isipingo                5
Darwin                  5
Galveston               5
Margate                 5
Noumea                  5
Tahiti                  4
Tampico                 4
Maui                    4
Scottburgh              4
Geraldton               4
Cronulla                4
Madras                  4
Balian                  4
Paiva                   4
Hartenbos               4
Glenelg                 3
Manuelita   

# Columna Activity

In [60]:
# Se visualizan los valores.
tiburones['Activity'].value_counts()

Surfing                                                                                                                                                                                                                                                           969
Swimming                                                                                                                                                                                                                                                          869
Fishing                                                                                                                                                                                                                                                           431
Spearfishing                                                                                                                                                                                                          

In [61]:
# Se extraen los verbos de acuerdo con el patrón ing.
tiburones = tiburones.astype({'Activity':'str'})
def extraer_verbo(oracion):
    verbos = re.findall('\w+ing', oracion)
    a = len(verbos)
    if a > 0:
        verbo = ''
        for x in range(a):
            verbos[x] = verbos[x].capitalize()
            verbo += verbos[x]
            verbo += '/'
        verbo = verbo.rstrip('/')
        return verbo
    else:
        return 'Unknown'
tiburones['Activity'] = tiburones['Activity'].apply(extraer_verbo)
tiburones['Activity'].value_counts()

Surfing                                        1050
Swimming                                       1041
Unknown                                         948
Fishing                                         655
Diving                                          450
Spearfishing                                    365
Bathing                                         183
Wading                                          156
Boarding                                        132
Standing                                        117
Snorkeling                                       90
Floating                                         43
Treading                                         36
Skiing                                           36
Kayaking                                         35
Attempting                                       27
Playing                                          22
Walking                                          21
Sitting                                          20
Spearfishing

# Columna Name

In [62]:
# Se visualizan los valores de la columna.
tiburones['Name'].value_counts()

male                                                                                                                                                                                                                              549
female                                                                                                                                                                                                                             97
boy                                                                                                                                                                                                                                23
2 males                                                                                                                                                                                                                            16
boat                                                                            

In [63]:
# Se procede a su corrección.
tiburones = tiburones.astype({'Name':'str'})
tiburones['Name'] = tiburones['Name'].apply(lambda x : x.lstrip().rstrip().title())
tiburones['Name'] = tiburones['Name'].apply(lambda x : 'Unknown' if 'Woman' in x else x)
tiburones['Name'] = tiburones['Name'].apply(lambda x : 'Unknown' if 'Man' in x else x)
tiburones['Name'] = tiburones['Name'].apply(lambda x : 'Unknown' if 'Female' in x else x)
tiburones['Name'] = tiburones['Name'].apply(lambda x : 'Unknown' if 'Male' in x else x)
tiburones['Name'] = tiburones['Name'].apply(lambda x : 'Unknown' if 'Youth' in x else x)
tiburones['Name'] = tiburones['Name'].apply(lambda x : 'Unknown' if 'Boy' in x else x)
tiburones['Name'] = tiburones['Name'].apply(lambda x : 'Unknown' if 'Nan' in x else x)
tiburones['Name'] = tiburones['Name'].apply(lambda x : 'Unknown' if 'Anonymous' in x else x)
tiburones['Name'] = tiburones['Name'].apply(lambda x : 'Unknown' if 'Girl' in x else x)
tiburones['Name'] = tiburones['Name'].apply(lambda x : 'Unknown' if 'Sailor' in x else x)
tiburones['Name'] = tiburones['Name'].apply(lambda x : 'Unknown' if 'Child' in x else x)
tiburones['Name'] = tiburones['Name'].apply(lambda x : 'Unknown' if 'Native' in x else x)
tiburones['Name'] = tiburones['Name'].apply(lambda x : 'Unknown' if 'Diver' in x else x)
tiburones['Name'] = tiburones['Name'].apply(lambda x : 'Unknown' if 'boy' in x else x)
tiburones['Name'] = tiburones['Name'].apply(lambda x : 'Unknown' if 'Fisherman' in x else x)
tiburones['Name'] = tiburones['Name'].apply(lambda x : 'Unknown' if 'Soldier' in x else x)
tiburones['Name'] = tiburones['Name'].apply(lambda x : 'Unknown' if 'Fishermen' in x else x)
tiburones['Name'].value_counts()

Unknown                                                                                                                                                                                                                          1337
Boat                                                                                                                                                                                                                               14
Unidentified                                                                                                                                                                                                                        6
John Williams                                                                                                                                                                                                                       3
Andre Hartman                                                                   

# Columna Sex

In [64]:
# Se visualizan los datos de la columna.
tiburones['Sex'].value_counts()

M      5092
F       637
N         2
M         2
lli       1
.         1
Name: Sex, dtype: int64

In [65]:
# Se corrige la columna.
tiburones['Location'] = tiburones['Location'].fillna('Unknown')
tiburones['Sex'] = tiburones['Sex'].apply(lambda x : x if x == 'M' or x == 'F' else 'Unknown')
tiburones['Sex'].value_counts()

M          5092
F           637
Unknown     571
Name: Sex, dtype: int64

# Columna Age

In [66]:
# Se imprimien los datos.
tiburones['Age'].value_counts()

17                         154
18                         150
19                         142
20                         141
15                         139
16                         138
21                         119
22                         117
25                         108
24                         106
14                         101
13                          94
26                          83
28                          80
23                          80
29                          78
27                          78
30                          76
12                          73
32                          69
35                          69
40                          56
10                          56
31                          52
34                          50
38                          48
33                          44
43                          43
36                          43
37                          42
41                          38
42                          38
39      

In [72]:
# Corrección de la columna.
tiburones = tiburones.astype({'Age':'str'})
tiburones['Age'] = tiburones['Age'].apply(lambda x : x.replace(' ',''))
def filtro_edad(oracion):
    edades = re.findall('[0-9]{2}or[0-9]{2}', oracion)
    a = len(edades)
    if a > 0:
        edad = edades[0].split('or')
        resultado = (int(edad[0]) + int(edad[1])) / 2
        resultado = math.floor(resultado)
        return str(resultado)
    else:
        edades = re.findall('[0-9]{2}&[0-9]{2}', oracion)
        b = len(edades)
        if b > 0:
            edad = edades[0].split('&')
            resultado = (int(edad[0]) + int(edad[1])) / 2
            resultado = math.floor(resultado)
            return str(resultado)
        else:
            edades = re.findall('[0-9]{2}[0-9]', oracion)
            c = len(edades)
            if c > 0:
                return edades[0]
            else:
                return 'Unknown'
tiburones['Age'] = tiburones['Age'].apply(filtro_edad)
tiburones['Age'].value_counts()

Unknown    6274
26            4
29            3
31            3
40            2
35            2
21            1
19            1
15            1
33            1
23            1
13            1
11            1
24            1
16            1
12            1
32            1
27            1
Name: Age, dtype: int64

# Columna Injury

In [73]:
# Se visualizan los valores de la columna.
tiburones['Injury'].value_counts()

FATAL                                                                                                                                                                                                                                          802
Survived                                                                                                                                                                                                                                        97
Foot bitten                                                                                                                                                                                                                                     87
No injury                                                                                                                                                                                                                                       81
Leg bitten                  

In [77]:
# Se observa el número de nulos.
tiburones['Injury'].isnull().value_counts()

False    6272
True       28
Name: Injury, dtype: int64

In [78]:
# Se eliminan los nulos.
tiburones['Injury'] = tiburones['Injury'].fillna('Unknown')

In [79]:
# Se observa nuevamente el número de nulos.
tiburones['Injury'].isnull().value_counts()

False    6300
Name: Injury, dtype: int64

# Columna Fatal

In [81]:
# Se imprime la información de la columna.
tiburones['Fatal'].value_counts()

N          4291
Y          1388
UNKNOWN      71
 N            7
N             1
y             1
2017          1
M             1
Name: Fatal, dtype: int64

In [82]:
#Se corrigen los datos de la columna.
tiburones['Fatal'] = tiburones['Fatal'].replace(' N|N ','N',regex=True)
tiburones['Fatal'] = tiburones['Fatal'].replace('M|2017','UNKNOWN',regex=True)
tiburones['Fatal'] = tiburones['Fatal'].replace('y','Y',regex=True)
tiburones['Fatal'].value_counts()

N          4299
Y          1389
UNKNOWN      73
Name: Fatal, dtype: int64

# Columna Time

In [83]:
# Se visualizan los datos de la columna.
tiburones['Time'].value_counts()

Afternoon                                                                187
11h00                                                                    128
Morning                                                                  121
12h00                                                                    109
15h00                                                                    108
16h00                                                                    101
14h00                                                                     97
16h30                                                                     74
14h30                                                                     73
17h00                                                                     73
13h00                                                                     72
17h30                                                                     70
18h00                                                                     69

In [89]:
# Se corrige la columna.
tiburones['Time'] = tiburones['Time'].fillna('Unknown')
tiburones = tiburones.astype({'Time':'str'})
tiburones['Time'] = tiburones['Time'].apply(lambda x : x.lstrip().rstrip())
horas_indices = []
for x in tiburones.index:
    horas = re.findall('[0-9]{2}[h][0-9]{2}', tiburones.Time[x])
    a = len(horas)
    if a > 0:
        horas_indices.append({'indice':x, 'horas':horas[0].split('h')[0]})
    else:
        horas = re.findall('[0-9]{2}', tiburones.Time[x])
        b = len(horas)
        if b > 0:
            horas_indices.append({'indice':x, 'horas':horas[0]})
for x in horas_indices:
    tiburones.Time[x['indice']] = x['horas']
tiburones['Time'] = tiburones['Time'].apply(lambda x : x.lower())
tiburones['Time'] = tiburones['Time'].apply(lambda x : '06' if 'dawn' in x else x)
tiburones['Time'] = tiburones['Time'].apply(lambda x : '09' if 'morning' in x else x)
tiburones['Time'] = tiburones['Time'].apply(lambda x : '12' if 'midday' in x else x)
tiburones['Time'] = tiburones['Time'].apply(lambda x : '12' if 'noon' in x else x)
tiburones['Time'] = tiburones['Time'].apply(lambda x : '15' if 'afternoon' in x else x)
tiburones['Time'] = tiburones['Time'].apply(lambda x : '15' if 'after noon' in x else x)
tiburones['Time'] = tiburones['Time'].apply(lambda x : '18' if 'dusk' in x else x)
tiburones['Time'] = tiburones['Time'].apply(lambda x : '21' if 'evening' in x else x)
tiburones['Time'] = tiburones['Time'].apply(lambda x : '22' if 'night' in x else x)
tiburones['Time'] = tiburones['Time'].apply(lambda x : '24' if 'midnight' in x else x)
tiburones['Time'] = tiburones['Time'].apply(lambda x : x if x.isnumeric() else 'Unknown')
tiburones['Time'] = tiburones['Time'].apply(lambda x : 'Unknown' if x == '30' else x)
tiburones['Time'].value_counts()

Unknown    3410
12          452
09          260
11          259
16          242
15          239
14          237
13          201
17          199
10          180
18          150
08           93
07           82
22           77
19           53
06           42
21           41
20           31
05           12
03           10
23            8
04            7
02            7
01            6
00            2
Name: Time, dtype: int64

# Columna Species

In [90]:
# Se visualizan los datos contenidos en la columna.
tiburones['Species'].value_counts()

White shark                                                                                                                                                                                             163
Shark involvement prior to death was not confirmed                                                                                                                                                      105
Invalid                                                                                                                                                                                                 102
Shark involvement not confirmed                                                                                                                                                                          87
Tiger shark                                                                                                                                                                             

In [93]:
# Se corrige la columna.
tiburones['Species'] = tiburones['Species'].fillna('Unknown')
tiburones = tiburones.astype({'Species':'str'})
tiburones['Species'] = tiburones['Species'].apply(lambda x : x.lower().lstrip().rstrip())
tiburones['Species'] = tiburones['Species'].apply(lambda x : ''.join([i for i in x if i.isalpha()]).replace('shark', ''))
def nombre_especie(tiburon):
    if 'whit' in tiburon:
        return 'White'
    elif 'nur' in tiburon:
        return 'Nurse'
    elif 'spin' in tiburon:
        return 'Spinner'
    elif 'bask' in tiburon:
        return 'Basking'
    elif 'lem' in tiburon:
        return 'Lemon'
    elif 'ang' in tiburon:
        return 'Angel'
    elif 'beag' in tiburon:
        return 'Beagle'
    elif 'sch' in tiburon:
        return 'School'
    elif 'tig' in tiburon:
        return 'Tiger'
    elif 'leu' in tiburon:
        return 'Bull'
    elif 'wob' in tiburon:
        return 'Wobbegong'
    elif 'dus' in tiburon:
        return 'Dusky'
    elif 'car' in tiburon:
        return 'Carpet'
    elif 'cop' in tiburon:
        return 'Copper'
    elif 'sil' in tiburon:
        return 'Silky'
    elif 'bul' in tiburon:
        return 'Bull'
    elif 'blu' in tiburon:
        return 'Blue'
    elif 'sal' in tiburon:
        return 'Salmon'
    elif 'gra' in tiburon:
        return 'Gray'
    elif 'blac' in tiburon:
        return 'Black Tip'
    elif 'ree' in tiburon:
        return 'Reef'
    elif 'hamm' in tiburon:
        return 'Hammerhead'
    elif 'zam' in tiburon:
        return 'Bull'
    elif 'gil' in tiburon:
        return 'Gill'
    elif 'mak' in tiburon:
        return 'Mako'
    elif 'gal' in tiburon:
        return 'Galapagos'
    elif 'rag' in tiburon:
        return 'Ragged Tooth'
    elif 'whal' in tiburon:
        return 'Whale'
    else:
        return 'Unknown'
tiburones['Species'] = tiburones['Species'].apply(nombre_especie)
tiburones['Species'].value_counts()

Unknown         4404
White            672
Tiger            282
Bull             206
Nurse             97
Black Tip         78
Whale             70
Blue              55
Mako              50
Wobbegong         49
Spinner           48
Ragged Tooth      48
Hammerhead        48
Lemon             44
Reef              32
Carpet            29
Gill              20
Dusky             16
Angel              8
Basking            7
Galapagos          7
Silky              6
Beagle             6
School             6
Copper             5
Gray               4
Salmon             3
Name: Species, dtype: int64

In [94]:
tiburones.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2018.06.25,25-Jun,2018,Boating,USA,California,Unknown,Paddling,Julie Wolfe,F,Unknown,"No injury to occupant, outrigger canoe and pad...",N,18,White,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun,2018,Unprovoked,USA,Georgia,Unknown,Standing,Adyson Mcneely,F,Unknown,Minor injury to left thigh,N,14,Unknown,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun,2018,Unknown,USA,Hawaii,Unknown,Surfing,John Denges,M,Unknown,Injury to left lower leg from surfboard skeg,N,07,Unknown,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun,2018,Unprovoked,AUSTRALIA,New South Wales,Unknown,Surfing,Unknown,M,Unknown,Minor injury to lower leg,N,Unknown,Unknown,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun,2018,Provoked,MEXICO,Colima,Unknown,Diving,Gustavo Ramos,M,Unknown,Lacerations to leg & hand shark PROVOKED INCIDENT,N,Unknown,Tiger,A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,


In [95]:
tiburones.to_csv('attacks_result.csv', index=False)