
# Shark Tank

## imports

In [1]:
import pandas as pd
import seaborn as sns
import re
import sys
sys.path.append('../')

import src.date_format as dt
import src.moon_ph as m

## load data

In [2]:
sh = pd.read_csv('./attacks.csv',encoding="ISO-8859-1")
#sh.head()

## Limpiar columnas

In [3]:
#fix col names
diccio_todas = {col: col.replace(" ","_")  for col in list(sh.columns)}
sh.rename(columns=diccio_todas, inplace=True)


In [4]:
#drop unnecessary cols
sh.drop(["Unnamed:_22","Unnamed:_23", "Case_Number.1","Case_Number.2","href_formula","href","pdf"], axis=1, inplace=True)

In [5]:
sh.shape

(25723, 17)

In [6]:
#work data frame
sh.head(2)

Unnamed: 0,Case_Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex_,Age,Injury,Fatal_(Y/N),Time,Species_,Investigator_or_Source,original_order
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",6303.0
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",6302.0


## Clean the data
### NAN values


In [7]:
sh.isnull().sum()

Case_Number               17021
Date                      19421
Year                      19423
Type                      19425
Country                   19471
Area                      19876
Location                  19961
Activity                  19965
Name                      19631
Sex_                      19986
Age                       22252
Injury                    19449
Fatal_(Y/N)               19960
Time                      22775
Species_                  22259
Investigator_or_Source    19438
original_order            19414
dtype: int64

## drop NaN rows

In [8]:
sh.dropna(axis=0, how = "all",inplace=True)

In [9]:
sh.shape

(8703, 17)

In [10]:
sh.drop_duplicates(inplace=True)

In [11]:
sh.shape

(6311, 17)

In [12]:
sh.drop_duplicates(subset=['Case_Number'],inplace=True)
sh.shape

(6288, 17)

## ask for odd cases *

In [13]:
sh.drop([6302,25722], axis=0, inplace=True)

In [14]:
sh.shape

(6286, 17)

## fill NaN values

In [15]:
sh.Year.interpolate(method='linear', limit_direction='forward', axis=0)

0       2018.0
1       2018.0
2       2018.0
3       2018.0
4       2018.0
         ...  
6297       0.0
6298       0.0
6299       0.0
6300       0.0
6301       0.0
Name: Year, Length: 6286, dtype: float64

### fix year

In [16]:
sh["Year"] = sh.Date.apply(dt.get_year)

### pick object columns  **

In [17]:
string = list(sh.select_dtypes(include="object").columns) 
print(string)

['Case_Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location', 'Activity', 'Name', 'Sex_', 'Age', 'Injury', 'Fatal_(Y/N)', 'Time', 'Species_', 'Investigator_or_Source']


In [18]:
#sh[string].fillna("unknown", inplace=True)
#sh.tail(6)

## done the fools way

In [19]:
sh.fillna("UNKNOWN",inplace=True)

## Invalid Type is unusable data

In [20]:
sh.Type.unique()

array(['Boating', 'Unprovoked', 'Invalid', 'Provoked', 'Questionable',
       'Sea Disaster', 'UNKNOWN', 'Boat', 'Boatomg'], dtype=object)

In [21]:
sh = sh[sh["Type"] != "Invalid"]
sh.shape


(5743, 17)

In [22]:
sh.head(6)

Unnamed: 0,Case_Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex_,Age,Injury,Fatal_(Y/N),Time,Species_,Investigator_or_Source,original_order
0,2018.06.25,25-Jun-2018,2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",6303.0
1,2018.06.18,18-Jun-2018,2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11,Minor injury to left thigh,N,14h00 -15h00,UNKNOWN,"K.McMurray, TrackingSharks.com",6302.0
3,2018.06.08,08-Jun-2018,2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,UNKNOWN,Minor injury to lower leg,N,UNKNOWN,2 m shark,"B. Myatt, GSAF",6300.0
4,2018.06.04,04-Jun-2018,2018,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,UNKNOWN,Lacerations to leg & hand shark PROVOKED INCIDENT,N,UNKNOWN,"Tiger shark, 3m",A .Kipper,6299.0
5,2018.06.03.b,03-Jun-2018,2018,Unprovoked,AUSTRALIA,New South Wales,"Flat Rock, Ballina",Kite surfing,Chris,M,UNKNOWN,"No injury, board bitten",N,UNKNOWN,UNKNOWN,"Daily Telegraph, 6/4/2018",6298.0
6,2018.06.03.a,03-Jun-2018,2018,Unprovoked,BRAZIL,Pernambuco,"Piedade Beach, Recife",Swimming,Jose Ernesto da Silva,M,18,FATAL,Y,Late afternoon,Tiger shark,"Diario de Pernambuco, 6/4/2018",6297.0


## subset fecha formato

In [23]:
sh["Date_format"] = sh.Date.apply(dt.date_r)
#sh.loc[r"\d+\-\w{3}\-\d+"].shape

In [24]:
sh.head(6)

Unnamed: 0,Case_Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex_,Age,Injury,Fatal_(Y/N),Time,Species_,Investigator_or_Source,original_order,Date_format
0,2018.06.25,25-Jun-2018,2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",6303.0,25-Jun-2018
1,2018.06.18,18-Jun-2018,2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11,Minor injury to left thigh,N,14h00 -15h00,UNKNOWN,"K.McMurray, TrackingSharks.com",6302.0,18-Jun-2018
3,2018.06.08,08-Jun-2018,2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,UNKNOWN,Minor injury to lower leg,N,UNKNOWN,2 m shark,"B. Myatt, GSAF",6300.0,08-Jun-2018
4,2018.06.04,04-Jun-2018,2018,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,UNKNOWN,Lacerations to leg & hand shark PROVOKED INCIDENT,N,UNKNOWN,"Tiger shark, 3m",A .Kipper,6299.0,04-Jun-2018
5,2018.06.03.b,03-Jun-2018,2018,Unprovoked,AUSTRALIA,New South Wales,"Flat Rock, Ballina",Kite surfing,Chris,M,UNKNOWN,"No injury, board bitten",N,UNKNOWN,UNKNOWN,"Daily Telegraph, 6/4/2018",6298.0,03-Jun-2018
6,2018.06.03.a,03-Jun-2018,2018,Unprovoked,BRAZIL,Pernambuco,"Piedade Beach, Recife",Swimming,Jose Ernesto da Silva,M,18,FATAL,Y,Late afternoon,Tiger shark,"Diario de Pernambuco, 6/4/2018",6297.0,03-Jun-2018


## new data frame sh_f

In [25]:
sh_f = sh[sh["Date_format"] != "date not complete"]


In [26]:
sh_f.shape

(4902, 18)

## nueva columna Moon_phase
### subset?

In [27]:
sh_f["Moon_phase"] = sh_f.Date_format.apply(m.moon)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sh_f["Moon_phase"] = sh_f.Date_format.apply(m.moon)


In [28]:
sh_f.head(7)

Unnamed: 0,Case_Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex_,Age,Injury,Fatal_(Y/N),Time,Species_,Investigator_or_Source,original_order,Date_format,Moon_phase
0,2018.06.25,25-Jun-2018,2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",6303.0,25-Jun-2018,full
1,2018.06.18,18-Jun-2018,2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11,Minor injury to left thigh,N,14h00 -15h00,UNKNOWN,"K.McMurray, TrackingSharks.com",6302.0,18-Jun-2018,waxing
3,2018.06.08,08-Jun-2018,2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,UNKNOWN,Minor injury to lower leg,N,UNKNOWN,2 m shark,"B. Myatt, GSAF",6300.0,08-Jun-2018,waning
4,2018.06.04,04-Jun-2018,2018,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,UNKNOWN,Lacerations to leg & hand shark PROVOKED INCIDENT,N,UNKNOWN,"Tiger shark, 3m",A .Kipper,6299.0,04-Jun-2018,waning
5,2018.06.03.b,03-Jun-2018,2018,Unprovoked,AUSTRALIA,New South Wales,"Flat Rock, Ballina",Kite surfing,Chris,M,UNKNOWN,"No injury, board bitten",N,UNKNOWN,UNKNOWN,"Daily Telegraph, 6/4/2018",6298.0,03-Jun-2018,waning
6,2018.06.03.a,03-Jun-2018,2018,Unprovoked,BRAZIL,Pernambuco,"Piedade Beach, Recife",Swimming,Jose Ernesto da Silva,M,18,FATAL,Y,Late afternoon,Tiger shark,"Diario de Pernambuco, 6/4/2018",6297.0,03-Jun-2018,waning
7,2018.05.27,27-May-2018,2018,Unprovoked,USA,Florida,"Lighhouse Point Park, Ponce Inlet, Volusia County",Fishing,male,M,52,Minor injury to foot. PROVOKED INCIDENT,N,UNKNOWN,"Lemon shark, 3'","K. McMurray, TrackingSharks.com",6296.0,27-May-2018,full


In [29]:
sh_f.Moon_phase.value_counts()

full      1315
waxing    1159
waning    1136
new       1131
Name: Moon_phase, dtype: int64

## new column Day/Night
### fix time

In [30]:
#sh_f.Time= dt.time_fix(sh_f, 'Time', 'Age','Fatal_(Y/N)')

In [31]:
#sh_f["Time"]= sh_f.apply(lambda x: dt.fix_time(x["Time"],x["Age"],x["Fatal_(Y/N)"]))

In [32]:
sh_f["Day/"] = sh_f.Time + "@" + sh_f.Age + "@" + sh_f["Fatal_(Y/N)"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sh_f["Day/"] = sh_f.Time + "@" + sh_f.Age + "@" + sh_f["Fatal_(Y/N)"]


In [33]:
sh_f["Time"] = sh_f["Day/"].apply(dt.fix_time)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sh_f["Time"] = sh_f["Day/"].apply(dt.fix_time)


In [34]:
sh_f["Day/Night"]=sh_f.Time.apply(dt.daytime)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sh_f["Day/Night"]=sh_f.Time.apply(dt.daytime)


In [39]:
sh_f.drop(["Day/"], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [40]:
sh_f.sample(6)

Unnamed: 0,Case_Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex_,Age,Injury,Fatal_(Y/N),Time,Species_,Investigator_or_Source,original_order,Date_format,Moon_phase,Day/Night
5268,1919.05.29,29-May-1919,1919,Unprovoked,USA,South Carolina,"James Island Sound, Charleston","""Swimming vigorously""",W.E. Davis,M,UNKNOWN,Left foot bitten & abraded,N,12h00,UNKNOWN,E. M. Burton,1035.0,29-May-1919,new,Day
5718,1889.07.08,08-Jul-1889,1889,Unprovoked,AUSTRALIA,Victoria,Port Phillip,Fell into the water,male,M,UNKNOWN,FATAL,Y,evening,UNKNOWN,"Brisbane Courier, 7/9/1889",585.0,08-Jul-1889,waxing,Night
4117,1959.01.12,12-Jan-1959,1959,Unprovoked,NEW GUINEA,Milne Bay Province,"Uga, Banaira, Milne Bay",Dragging banana seeds through the shallows,male,M,UNKNOWN,Left calf & right thigh bitten,N,UNKNOWN,1.4 m [4.5'] shark,"A.M. Rapson, p.149; L. Schultz & M. Malin, p.544",2186.0,12-Jan-1959,new,UNKNOWN
5464,1906.11.16,16-Nov-1906,1906,Unprovoked,JAMAICA,Westmoreland Parish,Cabaritta River mouth,Fishing,Zacey Allen,M,UNKNOWN,Leg bitten,N,UNKNOWN,UNKNOWN,"The Gleaner (Jamaica), 11/20/1906",839.0,16-Nov-1906,new,UNKNOWN
3108,1978.12.29,29-Dec-1978,1978,Unprovoked,AUSTRALIA,Queensland,Bribie Island,UNKNOWN,Wayne Brown,M,,Survived,N,UNKNOWN,UNKNOWN,"R. McKenzie, Sunday Mail, 9/6/1987, p.11",3195.0,29-Dec-1978,new,UNKNOWN
216,2016.09.17.a,17-Sep-2016,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,UNKNOWN,Struck by fin on chest & leg,N,UNKNOWN,UNKNOWN,"The Age, 9/18/2016",6087.0,17-Sep-2016,full,UNKNOWN


## export df

In [41]:
sh.to_csv ('sh.csv')
sh_f.to_csv ('sh_f.csv')