In [1]:
import re
from datetime import datetime 

import numpy as np 
import pandas as pd

In [2]:
sharks = pd.read_csv('./attacks.csv', encoding= 'unicode_escape')

# 1. Data exploration

In [3]:
#Primero identificamos que tipo de información tenemos en la data
#Se puede observar que se tienen diferentes valores nulos
sharks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25723 entries, 0 to 25722
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Case Number             8702 non-null   object 
 1   Date                    6302 non-null   object 
 2   Year                    6300 non-null   float64
 3   Type                    6298 non-null   object 
 4   Country                 6252 non-null   object 
 5   Area                    5847 non-null   object 
 6   Location                5762 non-null   object 
 7   Activity                5758 non-null   object 
 8   Name                    6092 non-null   object 
 9   Sex                     5737 non-null   object 
 10  Age                     3471 non-null   object 
 11  Injury                  6274 non-null   object 
 12  Fatal (Y/N)             5763 non-null   object 
 13  Time                    2948 non-null   object 
 14  Species                 3464 non-null 

In [4]:
#Vemos el nombre de las columnas 
sharks.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

In [5]:
sharks.isnull().mean() * 100  #Identificamos el porcentaje de valores nulos, cada columna tiene un buen porcentaje
                                #el plan es mantener la mayor cantidad de observaciones posibles. 

Case Number               66.170353
Date                      75.500525
Year                      75.508300
Type                      75.516075
Country                   75.694903
Area                      77.269370
Location                  77.599813
Activity                  77.615364
Name                      76.316915
Sex                       77.697003
Age                       86.506240
Injury                    75.609377
Fatal (Y/N)               77.595926
Time                      88.539439
Species                   86.533453
Investigator or Source    75.566614
pdf                       75.500525
href formula              75.504412
href                      75.500525
Case Number.1             75.500525
Case Number.2             75.500525
original order            75.473312
Unnamed: 22               99.996112
Unnamed: 23               99.992225
dtype: float64

In [6]:
#Creamos una copia del df original
sharks_clean=sharks.copy()

# 2. Delete duplicated columns and filling ```NaN``` values plan:

*The unnamed columns 22 and 23 should be dropped, because are almost entirely null.

In [7]:
sharks_clean.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

In [8]:
sharks_clean.drop(columns=['Unnamed: 22',
       'Unnamed: 23'], inplace=True)

In [9]:
sharks_clean.columns #Se puede observar que el nombre de las columnas tiene espacios entre palabras o al final. 

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order'],
      dtype='object')

In [10]:
#Procedemos a eliminar los espacios en el nombre de columnas 
sharks_clean.columns = sharks_clean.columns.str.strip()

In [11]:
column_names=sharks_clean.columns
column_names

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order'],
      dtype='object')

*Due to the relevance of the ```"Species"``` column in the data set, we cannot drop this column, even when the percentage of ```NaN```'s there is near to 50%

In [12]:
sharks_clean['Species'].describe()

count            3464
unique           1549
top       White shark
freq              163
Name: Species, dtype: object

* The column ```"Time"``` and ```"Year"``` seems to be irrelevant. However we will explore it further.

In [13]:
sharks_clean[['Time', 'Year']].head()

Unnamed: 0,Time,Year
0,18h00,2018.0
1,14h00 -15h00,2018.0
2,07h45,2018.0
3,,2018.0
4,,2018.0


In [14]:
sharks_clean[['Time', 'Year']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25723 entries, 0 to 25722
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    2948 non-null   object 
 1   Year    6300 non-null   float64
dtypes: float64(1), object(1)
memory usage: 402.0+ KB


* The columns ```"Country"```, ```"Area"``` and ```"Location"``` are related, so it might be possible to infer the missing values one from the others.

In [15]:
sharks_clean.loc[(sharks_clean["Country"].isnull()) & (sharks_clean["Location"].notnull()),["Country", "Area", 'Location']]

Unnamed: 0,Country,Area,Location
3387,,Caribbean Sea,Between St. Kitts & Nevis
3605,,,Florida Strait
4266,,Between Comores & Madagascar,Geyser Bank
4498,,Caribbean Sea,Between Cuba & Costa Rica
4639,,,225 miles east of Hong Kong
5020,,French Southern Territories,Île Saint-Paul
5425,,,Near the equator
5810,,,Santa Cruz
6137,,St Helena,Landing Place
6155,,,Carlisle Bay


In [16]:
sharks_clean.loc[(sharks_clean["Area"]=='Caribbean Sea') & (sharks_clean["Location"].notnull()),["Country", "Area", 'Location']]

Unnamed: 0,Country,Area,Location
3387,,Caribbean Sea,Between St. Kitts & Nevis
3603,PANAMA,Caribbean Sea,Golfo de los Mosquitos
4439,COLUMBIA,Caribbean Sea,Cartegena
4498,,Caribbean Sea,Between Cuba & Costa Rica
5210,CUBA,Caribbean Sea,20 miles from Havana


# 4. Filling values in ```"Injury"``` and ```"Fatal(Y/N)"``` heuristically

In [17]:
sharks_clean["Injury"].str.lower()

0        no injury to occupant, outrigger canoe and pad...
1                               minor injury to left thigh
2             injury to left lower leg from surfboard skeg
3                                minor injury to lower leg
4        lacerations to leg & hand shark provoked incident
                               ...                        
25718                                                  NaN
25719                                                  NaN
25720                                                  NaN
25721                                                  NaN
25722                                                  NaN
Name: Injury, Length: 25723, dtype: object

In [18]:
#Con este filtro vemos que se tuvieron lesiones fatales, por lo que podemos pasar una Y a la columna Fatal (Y/N).
sharks_clean.loc[(sharks_clean["Injury"].str.lower().str.contains("fatal", na=False)) & (sharks_clean["Fatal (Y/N)"].isnull()),["Injury", "Fatal (Y/N)"]]

Unnamed: 0,Injury,Fatal (Y/N)
76,FATAL,
84,"FATAL, but death was probably due to drowning",
159,"Fatal, coroner unable to determine if the dive...",
437,"FATAL, but shark involvement prior to death un...",
874,FATAL,
1495,"FATAL, but shark involvement prior to death un...",
1563,"FATAL, shark involvement prior to death is unc...",
1879,"PRESUMED FATAL, body not recovered",
1947,Fatal or drowned & remains scavenged by shark,
2025,Non-fatal,


In [19]:
# #reemplazamos los valores de NaN con Y en la columna Fatal (Y/N) si la columna injury contiene Fatal y comprobamos 
sharks_clean["Fatal (Y/N)"] = np.where((sharks_clean["Injury"].str.lower().str.contains("fatal", na=False)), 'Y', sharks_clean["Fatal (Y/N)"])
sharks_clean.loc[(sharks_clean["Injury"].str.lower().str.contains("fatal", na=False)) & (sharks_clean["Fatal (Y/N)"].notnull()),["Injury", "Fatal (Y/N)"]]

Unnamed: 0,Injury,Fatal (Y/N)
6,FATAL,Y
58,FATAL,Y
59,FATAL,Y
64,FATAL,Y
76,FATAL,Y
...,...,...
6297,FATAL,Y
6298,FATAL,Y
6299,FATAL,Y
6300,FATAL,Y


In [20]:
# #Revisamos si se tiene en la columna Fatal (Y/N) el valor Y 
sharks_clean.loc[(sharks_clean["Injury"].isnull()) & (sharks_clean["Fatal (Y/N)"]=='Y'),["Injury", "Fatal (Y/N)"]]

Unnamed: 0,Injury,Fatal (Y/N)
6140,,Y


In [21]:
# Con base en identificar que el ataque fue Fatal, reemplazamos el NaN de la columna Injury con FATAL
sharks_clean["Injury"] = np.where(sharks_clean["Fatal (Y/N)"]=='Y', 'fatal', sharks_clean["Injury"])

In [22]:
# #Revisamos si se tiene en la columna Fatal (Y/N) el valor Y 
sharks_clean.loc[(sharks_clean["Injury"].isnull()) & (sharks_clean["Fatal (Y/N)"]=='Y'),["Injury", "Fatal (Y/N)"]]

Unnamed: 0,Injury,Fatal (Y/N)


In [23]:
#Revisamos si las lesiones no fueron fatales y si hay NaN en la columna de Fatal
sharks_clean.loc[(sharks_clean["Injury"].notnull()) & (sharks_clean["Fatal (Y/N)"].isnull()),["Injury", "Fatal (Y/N)"]]

Unnamed: 0,Injury,Fatal (Y/N)
27,"Multiple severe injuries to arms and leg, leg ...",
79,"Lacerations to right ankle, foot & toe",
98,Lacerations to left foot,
107,Abrasions and cuts to sole of foot,
115,Foot injured,
...,...,...
6126,human remains (male) found in sharks gut,
6147,Lacerations to arm & leg,
6157,"Partial hominid remains recovered from shark, ...",
6158,Struck on thigh,


In [24]:
#Sustituimos los valores nulos en Fatal, a partir de no identificira la palabra Fatal en injury. 
sharks_clean['Fatal (Y/N)'] = np.where((sharks_clean["Injury"].notnull()) & (sharks_clean["Fatal (Y/N)"].isnull()), 'N', sharks_clean["Fatal (Y/N)"])

In [25]:
#Revisamos la sustitución de los valores Nan de Fatal 
sharks_clean.loc[(sharks_clean["Injury"].notnull()) & (sharks_clean["Fatal (Y/N)"].isnull()),["Injury", "Fatal (Y/N)"]]

Unnamed: 0,Injury,Fatal (Y/N)


In [26]:
#Ya no hay valores nulos en la columna de Fatal cuando se tiene información en la columna de Injury
#Sin embargo, aparecen en Fatal valares de UNKNOWN y en la de Injury, no aparece nada, por lo que se determina
#ponerlo igual en Injury
sharks_clean.loc[(sharks_clean["Injury"].isnull()) & (sharks_clean["Fatal (Y/N)"].notnull()),['Case Number',"Injury", "Fatal (Y/N)"]]

Unnamed: 0,Case Number,Injury,Fatal (Y/N)
1270,2008.04.20.a,,UNKNOWN
2250,1997.06.07,,UNKNOWN
2267,1996.12.29,,UNKNOWN
2648,1990.00.00,,UNKNOWN
3038,1981.03.00,,UNKNOWN
3298,1973.08.27,,UNKNOWN
3434,1969.05.22,,UNKNOWN
3712,1964.01.01.b,,UNKNOWN
3838,1962.02.02,,UNKNOWN
3841,1962.01.26,,UNKNOWN


In [27]:
sharks_clean["Injury"] = np.where((sharks_clean["Injury"].isnull()) & (sharks_clean["Fatal (Y/N)"].notnull()), 'UNKNOWN', sharks_clean["Injury"])

In [28]:
sharks_clean.loc[(sharks_clean["Injury"].isnull()) & (sharks_clean["Fatal (Y/N)"]=='UNKNOWN'),['Case Number',"Injury", "Fatal (Y/N)"]]

Unnamed: 0,Case Number,Injury,Fatal (Y/N)


In [29]:
#Se identifican diversa filas con valores Nullos, por lo que se procede a elimnar estas
#A partir de la column con menor número de valores nulos, asignaremos a una nueva variable sharks2
sharks2=sharks_clean.dropna(axis=0,subset=['Case Number'])
sharks2
sharks2.info()
sharks2.isnull().mean() * 100

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8702 entries, 0 to 25722
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Case Number             8702 non-null   object 
 1   Date                    6301 non-null   object 
 2   Year                    6299 non-null   float64
 3   Type                    6297 non-null   object 
 4   Country                 6251 non-null   object 
 5   Area                    5846 non-null   object 
 6   Location                5761 non-null   object 
 7   Activity                5757 non-null   object 
 8   Name                    6091 non-null   object 
 9   Sex                     5736 non-null   object 
 10  Age                     3471 non-null   object 
 11  Injury                  6294 non-null   object 
 12  Fatal (Y/N)             6294 non-null   object 
 13  Time                    2948 non-null   object 
 14  Species                 3464 non-null  

Case Number                0.000000
Date                      27.591358
Year                      27.614342
Type                      27.637325
Country                   28.165939
Area                      32.820041
Location                  33.796828
Activity                  33.842795
Name                      30.004597
Sex                       34.084119
Age                       60.112618
Injury                    27.671800
Fatal (Y/N)               27.671800
Time                      66.122730
Species                   60.193059
Investigator or Source    27.786716
pdf                       27.591358
href formula              27.602850
href                      27.591358
Case Number.1             27.591358
Case Number.2             27.591358
original order            27.510917
dtype: float64

# 5. Filling values in ```"Country"```, ```"Area"``` and ```"Location"``` heuristically

In [30]:
#Para llenar los valores nulos de Area y Location primero identificamos si tenemos al menos dos valores
sharks2.loc[(sharks_clean["Country"].notnull()) & (sharks2["Area"].notnull()) & (sharks2["Location"].isnull()),["Country", "Area", 'Location']]

Unnamed: 0,Country,Area,Location
110,BAHAMAS,40 miles off Grand Bahama Island,
151,USA,Florida,
171,BAHAMAS,Great Exuma,
174,REUNION,Saint-Andre,
180,BAHAMAS,Bimini,
...,...,...,...
6244,BAHAMAS,Andros Islands,
6255,MARSHALL ISLANDS,Bikini Atoll,
6274,ITALY,Adriatic Sea,
6287,FIJI,Moala Island,


In [31]:
sharks2.loc[(sharks2["Country"].notnull()) & (sharks2["Area"].notnull()) & (sharks2["Location"].isnull()),["Country", "Area", 'Location']]

Unnamed: 0,Country,Area,Location
110,BAHAMAS,40 miles off Grand Bahama Island,
151,USA,Florida,
171,BAHAMAS,Great Exuma,
174,REUNION,Saint-Andre,
180,BAHAMAS,Bimini,
...,...,...,...
6244,BAHAMAS,Andros Islands,
6255,MARSHALL ISLANDS,Bikini Atoll,
6274,ITALY,Adriatic Sea,
6287,FIJI,Moala Island,


In [32]:
#sharks2["Location"] = np.where((sharks2["Country"].notnull()) & (sharks2["Area"].notnull()) & (sharks2["Location"].isnull()), 'UNKNOWN', sharks2["Location"])

In [33]:
#sharks2.loc[(sharks2["Country"].isnull()) & (sharks2["Area"].notnull()) & (sharks2["Location"].notnull()),["Country", "Area", 'Location']]

In [34]:
#Se sustituyen los valores nulos de la columna de Location con los valores de la columna Area.
sharks2['Location'].fillna(sharks2['Area'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sharks2['Location'].fillna(sharks2['Area'], inplace=True)


In [35]:
sharks2

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,N,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,N,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8698,0,,,,,,,,,,...,,,,,,,,,,
8699,0,,,,,,,,,,...,,,,,,,,,,
8700,0,,,,,,,,,,...,,,,,,,,,,
8701,0,,,,,,,,,,...,,,,,,,,,,


In [36]:
sharks2.loc[(sharks2["Country"].isnull()) & (sharks2["Area"].notnull()) & (sharks2["Location"].notnull()),["Country", "Area", 'Location']]

Unnamed: 0,Country,Area,Location
2956,,English Channel,English Channel
3387,,Caribbean Sea,Between St. Kitts & Nevis
4018,,"Between Timor & Darwin, Australia","Between Timor & Darwin, Australia"
4231,,Near the Andaman & Nicobar Islands,Near the Andaman & Nicobar Islands
4266,,Between Comores & Madagascar,Geyser Bank
4498,,Caribbean Sea,Between Cuba & Costa Rica
4700,,Off South American coast,Off South American coast
4712,,300 miles east of St. Thomas (Virgin Islands),300 miles east of St. Thomas (Virgin Islands)
5020,,French Southern Territories,Île Saint-Paul
5612,,Mediterranean Sea,Mediterranean Sea


In [37]:
# Se sutituye el valor nulo de Countru, si en Area aparcee Australia
sharks2["Country"] = np.where((sharks2["Country"].isnull()) & (sharks2["Area"].str.contains('Australia')), 'Australia', sharks2["Country"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sharks2["Country"] = np.where((sharks2["Country"].isnull()) & (sharks2["Area"].str.contains('Australia')), 'Australia', sharks2["Country"])


In [38]:
#Los valore nulos de Country que no se pueden inferir a partir de la información de Area y Location se sustituyen por unknown
sharks2["Country"] = np.where((sharks2["Country"].isnull()) & (sharks2["Area"].notnull()) & (sharks2["Location"].notnull()), 'unknown', sharks2["Country"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sharks2["Country"] = np.where((sharks2["Country"].isnull()) & (sharks2["Area"].notnull()) & (sharks2["Location"].notnull()), 'unknown', sharks2["Country"])


In [39]:
sharks2.loc[(sharks2["Country"].notnull()) & (sharks2["Area"].isnull()) & (sharks2["Location"].notnull()),["Country", "Area", 'Location']]

Unnamed: 0,Country,Area,Location
32,NEW CALEDONIA,,"Magenta Beach, Noumea"
33,BAHAMAS,,Bimini
48,NEW CALEDONIA,,Nouville
59,LIBYA,,Gars Garabulli
90,SOLOMON ISLANDS,,Owarigi Island
...,...,...,...
6161,FRANCE,,Nice
6181,CANADA,,Grand Banks
6237,SINGAPORE,,"Keppel Harbor, 2 miles from Singapore city ce..."
6257,NORTH PACIFIC OCEAN,,Wake Island


In [40]:
#Los valores nulos de Area, sabiendo que se tiene Contry y localidad se sustituyen con Location
sharks2.loc[sharks2['Area'].isnull(), 'Area']=sharks2['Location']

In [41]:
sharks2.loc[(sharks2["Country"].isnull()) & (sharks2["Area"].notnull()) & (sharks2["Location"].notnull()),["Country", "Area", 'Location']]

Unnamed: 0,Country,Area,Location
3605,,Florida Strait,Florida Strait
4639,,225 miles east of Hong Kong,225 miles east of Hong Kong
5425,,Near the equator,Near the equator
5810,,Santa Cruz,Santa Cruz
6155,,Carlisle Bay,Carlisle Bay
6206,,In a river feeding into the Bay of Bengal,In a river feeding into the Bay of Bengal


In [42]:
#Como se tienen valores nulos de Country sin saber Area y Location, se identifican con unknown estas columnas
sharks2.loc[(sharks2["Country"].isnull()) & (sharks2["Area"].notnull()) & (sharks2["Location"].notnull()), 'Country']='unknown'

In [43]:
sharks2.loc[(sharks2["Country"].notnull()) & (sharks2["Area"].isnull()) & (sharks2["Location"].isnull()),["Country", "Area", 'Location']]

Unnamed: 0,Country,Area,Location
56,BAHAMAS,,
101,BAHAMAS,,
132,BAHAMAS,,
296,BAHAMAS,,
370,FIJI,,
...,...,...,...
6280,AUSTRALIA,,
6291,ASIA?,,
6293,AUSTRALIA,,
6294,AUSTRALIA,,


In [44]:
#Se tienen los valores en Country pero no se conoce area y loction, por lo que se sustituyen los NAN con unknown en Area
sharks2.loc[(sharks2["Country"].notnull()) & (sharks2["Area"].isnull()) & (sharks2["Location"].isnull()), 'Area']='unknown'


In [45]:
sharks2.loc[(sharks2["Country"].notnull()) & (sharks2["Area"]== 'unknown') & (sharks2["Location"].isnull()), 'Location']='unknown'

In [46]:
sharks2.loc[(sharks2["Country"].notnull()) & (sharks2["Area"].isnull()) & (sharks2["Location"].isnull()),["Country", "Area", 'Location']]

Unnamed: 0,Country,Area,Location


# 6. Cleaning and parsing the column ```"Date"```

In [47]:
sharks2['Date'].info()

<class 'pandas.core.series.Series'>
Int64Index: 8702 entries, 0 to 25722
Series name: Date
Non-Null Count  Dtype 
--------------  ----- 
6301 non-null   object
dtypes: object(1)
memory usage: 394.0+ KB


In [48]:
len(sharks2)

8702

In [49]:
from datetime import datetime


# 7. Droping the column ```"Case Number"```, ```"Year"``` and ```"original order"```.

In [50]:
sharks2.drop(columns=["Case Number", "Year", "original order"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sharks2.drop(columns=["Case Number", "Year", "original order"], inplace=True)


In [51]:
sharks2

Unnamed: 0,Date,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2
0,25-Jun-2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25
1,18-Jun-2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18
2,09-Jun-2018,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09
3,08-Jun-2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08
4,04-Jun-2018,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8698,,,,,,,,,,,,,,,,,,,
8699,,,,,,,,,,,,,,,,,,,
8700,,,,,,,,,,,,,,,,,,,
8701,,,,,,,,,,,,,,,,,,,


In [52]:
#Una vez que se elimina la columna de Case Number, tambien se dropean los valores nulos de las filas de Country, Area y Location
sharks3=sharks2.dropna(axis=0,subset=['Country'])
sharks3

Unnamed: 0,Date,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2
0,25-Jun-2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25
1,18-Jun-2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18
2,09-Jun-2018,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09
3,08-Jun-2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08
4,04-Jun-2018,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6297,Before 1903,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,M,,fatal,Y,,,"H. Taunton; N. Bartlett, p. 234",ND-0005-RoebuckBay.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0005,ND.0005
6298,Before 1903,Unprovoked,AUSTRALIA,Western Australia,Western Australia,Pearl diving,Ahmun,M,,fatal,Y,,,"H. Taunton; N. Bartlett, pp. 233-234",ND-0004-Ahmun.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0004,ND.0004
6299,1900-1905,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,M,,fatal,Y,,,"F. Schwartz, p.23; C. Creswell, GSAF",ND-0003-Ocracoke_1900-1905.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0003,ND.0003
6300,1883-1889,Unprovoked,PANAMA,"Panama Bay 8ºN, 79ºW","Panama Bay 8ºN, 79ºW",,Jules Patterson,M,,fatal,Y,,,"The Sun, 10/20/1938",ND-0002-JulesPatterson.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0002,ND.0002


# 8. Filling values for ```"Name"``` and ```"Investigator or Source"```

In [53]:
sharks3.loc[sharks3['Name'].isnull().value_counts()]

Unnamed: 0,Date,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2
6062,06-Jul-1842,Provoked,USA,New Jersey,"Absecon, Atlantic County",Harassing a shark,male,,,Lacerations to leg PROVOKED INCIDENT,N,,,"New York Evening Post, 7/11/1842",1842.07.06-Absecon.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1842.07.06,1842.07.06
210,01-Oct-2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfng,male,M,32.0,Minor injuries,N,17h30,,"Orlando Sentinel, 10/2/2016",2016.10.01-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.10.01,2016.10.01


In [54]:
sharks3.loc[(sharks3["Name"].isnull()) & (sharks3["Investigator or Source"].notnull()),["Name", 'Investigator or Source']]

Unnamed: 0,Name,Investigator or Source
32,,"Les Nouvelles Caledoniennes, 4/10/2018"
86,,"B. Myatt, GSAF"
124,,"Linfo, 7/3/2017"
154,,"Daytona Beach News-Journal, 4/17/2017"
184,,"Brisbane Times,1/13/2017"
...,...,...
6185,,"Di Candia, 2004"
6197,,"C. Moore, GSAF"
6218,,G. Van Grevelynghe
6237,,"V.M. Coppleson (1958), p.266"


In [55]:
#Sustituir los valores nulos de la columna Name con los de Investigator or source
sharks3.loc[sharks3['Name'].isnull(), 'Name']=sharks3['Investigator or Source']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sharks3.loc[sharks3['Name'].isnull(), 'Name']=sharks3['Investigator or Source']


In [56]:
#Sustituir los valores nulos de la columna Investigator or source con la de los valores de name
sharks3.loc[sharks3['Investigator or Source'].isnull(), 'Investigator or Source']=sharks3['Name']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sharks3.loc[sharks3['Investigator or Source'].isnull(), 'Investigator or Source']=sharks3['Name']


In [57]:
sharks3.loc[(sharks3["Name"].isnull()) & (sharks3["Investigator or Source"].notnull()),["Name", 'Investigator or Source']]

Unnamed: 0,Name,Investigator or Source


In [58]:
sharks3

Unnamed: 0,Date,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2
0,25-Jun-2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25
1,18-Jun-2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18
2,09-Jun-2018,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09
3,08-Jun-2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08
4,04-Jun-2018,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6297,Before 1903,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,M,,fatal,Y,,,"H. Taunton; N. Bartlett, p. 234",ND-0005-RoebuckBay.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0005,ND.0005
6298,Before 1903,Unprovoked,AUSTRALIA,Western Australia,Western Australia,Pearl diving,Ahmun,M,,fatal,Y,,,"H. Taunton; N. Bartlett, pp. 233-234",ND-0004-Ahmun.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0004,ND.0004
6299,1900-1905,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,M,,fatal,Y,,,"F. Schwartz, p.23; C. Creswell, GSAF",ND-0003-Ocracoke_1900-1905.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0003,ND.0003
6300,1883-1889,Unprovoked,PANAMA,"Panama Bay 8ºN, 79ºW","Panama Bay 8ºN, 79ºW",,Jules Patterson,M,,fatal,Y,,,"The Sun, 10/20/1938",ND-0002-JulesPatterson.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0002,ND.0002


# 9. Tiding column ```"Activity"``` by extracting verbs in gerund.

In [59]:
sharks3["Activity"].isnull().value_counts()

False    5735
True      537
Name: Activity, dtype: int64

In [60]:
#sharks3.loc[(sharks3["Activity"].str.endswith('ing')),["Name", 'Investigator or Source', 'Activity']]

# 10. Create a Predictor Function and with it, filling NaN values in Columns ```"Age"``` and ```"Activity"```.

In [61]:
sharks3['Age'].notnull().value_counts()

True     3465
False    2807
Name: Age, dtype: int64

In [62]:
#Interpolation with help of padding simply means filling missing values with the same value present 
#above them in the dataset. If the missing value is in the first row then this method will not work. 
#While using this technique you also need to specify the limit which means how many NaN values 
#to fill. Se realizará la interpolación con edad y actividad.
sharks3['Age']=sharks3['Age'].interpolate(method="pad", limit=2807)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sharks3['Age']=sharks3['Age'].interpolate(method="pad", limit=2807)


In [63]:
sharks3['Age'].value_counts()

17        327
18        311
16        301
19        295
15        267
         ... 
>50         1
adult       1
9 & 12      1
? & 19      1
74          1
Name: Age, Length: 157, dtype: int64

In [64]:
sharks3['Activity']=sharks3['Activity'].interpolate(method="pad", limit=2807)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sharks3['Activity']=sharks3['Activity'].interpolate(method="pad", limit=2807)


In [65]:
sharks3['Activity'].notnull().value_counts()

True    6272
Name: Activity, dtype: int64

# 11. Use this Predictor for filling ```NaN```'s in Area 

In [66]:
sharks3['Area'].notnull().value_counts

<bound method IndexOpsMixin.value_counts of 0       True
1       True
2       True
3       True
4       True
        ... 
6297    True
6298    True
6299    True
6300    True
6301    True
Name: Area, Length: 6272, dtype: bool>

# 12. Filling values for column ```"Species"```: Unfortunately we could not apply the Predictor in this case because there are no enough information in the data set

In [67]:
sharks3['Species'].isnull().value_counts()

False    3456
True     2816
Name: Species, dtype: int64

In [68]:
sharks3['Species']= sharks3['Species'].interpolate(method="pad", limit=2816)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sharks3['Species']= sharks3['Species'].interpolate(method="pad", limit=2816)


In [69]:
sharks3['Species'].isnull().value_counts()

False    6272
Name: Species, dtype: int64

In [70]:
sharks3['Species'].value_counts()

White shark                                                        267
Shark involvement prior to death unconfirmed                       249
Invalid                                                            209
Shark involvement prior to death was not confirmed                 190
Shark involvement not confirmed                                    147
                                                                  ... 
Possiby white shark                                                  1
Tiger shark, 3.7 m [12'], (tooth fragment recovered from wound)      1
Two 3 m [10'] oceanic whitetip sharks                                1
Tiger shark, 3 m to 4.9 m [10' to 16']                               1
Raggedtooth shark, 2.7 m [9']                                        1
Name: Species, Length: 1548, dtype: int64

# 13. Droping irrelevant rows based on ```NaN```'s counting by columns

In [71]:
sharks3.isnull().mean() * 100 #Se identifica que la columna Time sigue teniendo un alto 
#porcentaje de valores nulos, por lo que se eliminara la misma

Date                       0.000000
Type                       0.063776
Country                    0.000000
Area                       0.000000
Location                   0.000000
Activity                   0.000000
Name                       0.015944
Sex                        8.976403
Age                        0.000000
Injury                     0.111607
Fatal (Y/N)                0.111607
Time                      53.061224
Species                    0.000000
Investigator or Source     0.015944
pdf                        0.000000
href formula               0.015944
href                       0.000000
Case Number.1              0.000000
Case Number.2              0.000000
dtype: float64

In [72]:
sharks3.isnull().tail(25)

Unnamed: 0,Date,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2
6277,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False
6278,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False
6279,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False
6280,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False
6281,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False
6282,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False
6283,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False
6284,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False
6285,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False
6286,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False


In [73]:
#Se eliminará la columna 'Time'
sharks3.drop(columns=['Time'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sharks3.drop(columns=['Time'], inplace=True)


In [74]:
sharks3.isnull().mean() * 100

Date                      0.000000
Type                      0.063776
Country                   0.000000
Area                      0.000000
Location                  0.000000
Activity                  0.000000
Name                      0.015944
Sex                       8.976403
Age                       0.000000
Injury                    0.111607
Fatal (Y/N)               0.111607
Species                   0.000000
Investigator or Source    0.015944
pdf                       0.000000
href formula              0.015944
href                      0.000000
Case Number.1             0.000000
Case Number.2             0.000000
dtype: float64

# 15. Assing boolean values fo column ```"Fatal (Y/N)"```.

In [75]:
#Asignamos a los Y como True
sharks3['Fatal (Y/N)']=np.where((sharks3['Fatal (Y/N)']=='Y'), True, sharks3['Fatal (Y/N)'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sharks3['Fatal (Y/N)']=np.where((sharks3['Fatal (Y/N)']=='Y'), True, sharks3['Fatal (Y/N)'])


In [76]:
sharks3['Fatal (Y/N)']

0          N
1          N
2          N
3          N
4          N
        ... 
6297    True
6298    True
6299    True
6300    True
6301    True
Name: Fatal (Y/N), Length: 6272, dtype: object

In [77]:
# #Asignamos a los diferentes de Y como False
sharks3['Fatal (Y/N)']=np.where((sharks3['Fatal (Y/N)']=='N'), False, sharks3['Fatal (Y/N)'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sharks3['Fatal (Y/N)']=np.where((sharks3['Fatal (Y/N)']=='N'), False, sharks3['Fatal (Y/N)'])


# 16. Changing column names and Re-indexing

In [78]:
sharks3.columns

Index(['Date', 'Type', 'Country', 'Area', 'Location', 'Activity', 'Name',
       'Sex', 'Age', 'Injury', 'Fatal (Y/N)', 'Species',
       'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2'],
      dtype='object')

In [79]:
sharks3.rename(columns={'Fatal (Y/N)': 'Fatal_(Y/N)', 'Investigator or Source':'Investigator_or_Source',
                       'href formula' : 'href_formula', 'Case Number.1' : 'Case_Number.1', 
                        'Case Number.2':'Case_Number.2'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sharks3.rename(columns={'Fatal (Y/N)': 'Fatal_(Y/N)', 'Investigator or Source':'Investigator_or_Source',


In [80]:
sharks3.columns

Index(['Date', 'Type', 'Country', 'Area', 'Location', 'Activity', 'Name',
       'Sex', 'Age', 'Injury', 'Fatal_(Y/N)', 'Species',
       'Investigator_or_Source', 'pdf', 'href_formula', 'href',
       'Case_Number.1', 'Case_Number.2'],
      dtype='object')

In [92]:
sharks3.reindex(columns =['Date', 'Type', 'Country', 'Area', 'Location', 'Activity', 'Name',
       'Sex', 'Age', 'Injury', 'Fatal_(Y/N)', 'Species',
       'Investigator_or_Source', 'pdf', 'href_formula', 'href',
       'Case_Number.1', 'Case_Number.2'])

Unnamed: 0,Date,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal_(Y/N),Species,Investigator_or_Source,pdf,href_formula,href,Case_Number.1,Case_Number.2
1,25-Jun-2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",False,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25
2,18-Jun-2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11,Minor injury to left thigh,False,White shark,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18
3,09-Jun-2018,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48,Injury to left lower leg from surfboard skeg,False,White shark,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09
4,08-Jun-2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,48,Minor injury to lower leg,False,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08
5,04-Jun-2018,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,48,Lacerations to leg & hand shark PROVOKED INCIDENT,False,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6268,Before 1903,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,M,16,fatal,True,Said to involve a grey nurse shark that leapt ...,"H. Taunton; N. Bartlett, p. 234",ND-0005-RoebuckBay.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0005,ND.0005
6269,Before 1903,Unprovoked,AUSTRALIA,Western Australia,Western Australia,Pearl diving,Ahmun,M,16,fatal,True,Said to involve a grey nurse shark that leapt ...,"H. Taunton; N. Bartlett, pp. 233-234",ND-0004-Ahmun.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0004,ND.0004
6270,1900-1905,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,M,16,fatal,True,Said to involve a grey nurse shark that leapt ...,"F. Schwartz, p.23; C. Creswell, GSAF",ND-0003-Ocracoke_1900-1905.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0003,ND.0003
6271,1883-1889,Unprovoked,PANAMA,"Panama Bay 8ºN, 79ºW","Panama Bay 8ºN, 79ºW",Swimming,Jules Patterson,M,16,fatal,True,Said to involve a grey nurse shark that leapt ...,"The Sun, 10/20/1938",ND-0002-JulesPatterson.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0002,ND.0002


In [93]:
sharks3

Unnamed: 0,Date,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal_(Y/N),Species,Investigator_or_Source,pdf,href_formula,href,Case_Number.1,Case_Number.2
1,25-Jun-2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",False,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25
2,18-Jun-2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11,Minor injury to left thigh,False,White shark,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18
3,09-Jun-2018,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48,Injury to left lower leg from surfboard skeg,False,White shark,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09
4,08-Jun-2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,48,Minor injury to lower leg,False,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08
5,04-Jun-2018,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,48,Lacerations to leg & hand shark PROVOKED INCIDENT,False,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6268,Before 1903,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,M,16,fatal,True,Said to involve a grey nurse shark that leapt ...,"H. Taunton; N. Bartlett, p. 234",ND-0005-RoebuckBay.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0005,ND.0005
6269,Before 1903,Unprovoked,AUSTRALIA,Western Australia,Western Australia,Pearl diving,Ahmun,M,16,fatal,True,Said to involve a grey nurse shark that leapt ...,"H. Taunton; N. Bartlett, pp. 233-234",ND-0004-Ahmun.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0004,ND.0004
6270,1900-1905,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,M,16,fatal,True,Said to involve a grey nurse shark that leapt ...,"F. Schwartz, p.23; C. Creswell, GSAF",ND-0003-Ocracoke_1900-1905.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0003,ND.0003
6271,1883-1889,Unprovoked,PANAMA,"Panama Bay 8ºN, 79ºW","Panama Bay 8ºN, 79ºW",Swimming,Jules Patterson,M,16,fatal,True,Said to involve a grey nurse shark that leapt ...,"The Sun, 10/20/1938",ND-0002-JulesPatterson.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0002,ND.0002


# 17. Exporting data frame as .csv file 

In [96]:
from pathlib import Path
filepath = Path('Sharks_clean.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
sharks3.to_csv(filepath)