In [1]:
import getpass
import re
from statistics import mean
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import pymysql
import sqlalchemy as db
from sqlalchemy import create_engine

In [2]:
# Ask for the MySQL root password interactively so it is never stored in the notebook.
password = getpass.getpass("Insert password:")
# Interpolate the value that was typed (the original URL contained the literal
# text "password"). quote_plus escapes characters such as '@' or '/' that would
# otherwise corrupt the connection URL.
engine = db.create_engine(f'mysql+pymysql://root:{quote_plus(password)}@localhost')
# create_engine() is lazy and opens no connection by itself; connect once so the
# success message is only printed when the server actually accepted the login.
with engine.connect():
    print("Connected to server!")

Insert password:········
Connected to server!


In [3]:
# Load the raw shark-attack records; the file is read with the ISO-8859-1 codec.
df_sharks = pd.read_csv(
    './input/GSAF5.csv',
    encoding="ISO-8859-1",
)

In [4]:
# 1) Diagnosing the data

In [5]:
# (rows, columns) of the raw dataset.
df_sharks.shape

(5992, 24)

In [6]:
# Raw column names; note that 'Sex ' and 'Species ' carry a trailing space.
df_sharks.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

In [7]:
# Preview the first rows of the raw data.
df_sharks.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c,5993,,
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b,5992,,
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.a,2016.09.18.a,5991,,
3,2016.09.17,17-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,...,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.17,2016.09.17,5990,,
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,...,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.16,2016.09.15,5989,,


In [8]:
# Non-null counts and dtypes per column; Age and Time are missing in a large share of rows.
df_sharks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5992 entries, 0 to 5991
Data columns (total 24 columns):
Case Number               5992 non-null object
Date                      5992 non-null object
Year                      5992 non-null int64
Type                      5992 non-null object
Country                   5949 non-null object
Area                      5590 non-null object
Location                  5496 non-null object
Activity                  5465 non-null object
Name                      5792 non-null object
Sex                       5425 non-null object
Age                       3311 non-null object
Injury                    5965 non-null object
Fatal (Y/N)               5973 non-null object
Time                      2779 non-null object
Species                   3058 non-null object
Investigator or Source    5977 non-null object
pdf                       5992 non-null object
href formula              5991 non-null object
href                      5989 non-null object
C

In [9]:
# Only 'Year' and 'original order' were parsed as integers; every other column, including Age, is object.
df_sharks.dtypes

Case Number               object
Date                      object
Year                       int64
Type                      object
Country                   object
Area                      object
Location                  object
Activity                  object
Name                      object
Sex                       object
Age                       object
Injury                    object
Fatal (Y/N)               object
Time                      object
Species                   object
Investigator or Source    object
pdf                       object
href formula              object
href                      object
Case Number.1             object
Case Number.2             object
original order             int64
Unnamed: 22               object
Unnamed: 23               object
dtype: object

In [10]:
# 2 Fixing the column names

In [11]:
# Rename the raw headers to snake_case. The keys 'Sex ' and 'Species ' keep their
# trailing space because that is how they appear in the CSV header.
df_sharks = df_sharks.rename(
    columns={
        'Case Number': 'case_number',
        'Date': 'date',
        'Year': 'year',
        'Type': 'type',
        'Country': 'country',
        'Area': 'area',
        'Location': 'location',
        'Activity': 'activity',
        'Name': 'name',
        'Sex ': 'sex',
        'Age': 'age',
        'Injury': 'injury',
        'Fatal (Y/N)': 'fatal',
        'Time': 'time',
        'Species ': 'species',
        'Investigator or Source': 'investigator_source',
        'href formula': 'href_formula',
        'Case Number.1': 'case_1',
        'Case Number.2': 'case_2',
        'original order': 'original_order',
    }
)

In [12]:
# Confirm the rename took effect.
df_sharks.columns

Index(['case_number', 'date', 'year', 'type', 'country', 'area', 'location',
       'activity', 'name', 'sex', 'age', 'injury', 'fatal', 'time', 'species',
       'investigator_source', 'pdf', 'href_formula', 'href', 'case_1',
       'case_2', 'original_order', 'Unnamed: 22', 'Unnamed: 23'],
      dtype='object')

In [13]:
# 3 Dealing with duplicates

In [14]:
# Count rows that are exact duplicates of an earlier row (none in this dataset).
duplicate_rows = df_sharks.duplicated().sum()
duplicate_rows

0

In [15]:
# Some case numbers are used by more than one row: collect the repeated
# identifiers, then show every row that carries one of them.
duplicated_cases = df_sharks.loc[df_sharks['case_number'].duplicated(), 'case_number'].tolist()
df_sharks_duplicate_cases = df_sharks.loc[df_sharks['case_number'].isin(duplicated_cases)]
df_sharks_duplicate_cases

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,...,species,investigator_source,pdf,href_formula,href,case_1,case_2,original_order,Unnamed: 22,Unnamed: 23
300,2014.08.02,02-Aug-14,2014,Unprovoked,USA,Florida,"South of Cocoa Beach, Brevard County",Surfing,male,M,...,,"Florida Today, 8/8/2014",2014.08.08-CocoaBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2014.08.02,2014.08.02,5692,,
301,2014.08.02,02-Aug-14,2014,Unprovoked,USA,Florida,"Table Beach, Brevard County",Boogie boarding,Christian Sanhueza,M,...,,"Florida Today, 8/2/2014",2014.08.02-Sanhueza.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2014.08.02,2014.08.02,5691,,
392,2013.10.05,06-Oct-13,2013,Unprovoked,USA,California,"Bunkers, Humboldt Bay, Eureka, Humboldt County",Surfing,Jay Scrivner,M,...,"White shark, 8' to 10'","R. Collier, GSAF",2013.10.06-Scrivner.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2013.10.05,2013.10.05,5601,,
393,2013.10.05,10-Oct-13,2013,Unprovoked,USA,Florida,"Destin, Okaloosa County",Wading,Zachary Tyke Standridge,M,...,Small bull shark,"Monroe County Advocate, 10/9/2013",2013.10.05-Standridge.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2013.10.05,2013.10.05,5600,,
523,2012.09.02.b,02-Sep-12,2012,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Boogie boarding,female,F,...,3.5' to 4' shark,"WYTV, 9/3/2012",2012.09.02.b-NSB-girl.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2012.09.02.b,2012.09.02.b,5470,,
524,2012.09.02.b,02-Sep-12,2012,Provoked,USA,Hawaii,"Spreckelsville, Maui",Spearfishing,M. Malabon,,...,"Tiger shark, 10' to 12'",HawaiiNow.com,2012.09.02.c-Malabon.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2012.09.02.b,2012.09.02.b,5469,,
840,2009.12.18,18-Dec-09,2009,Unprovoked,SOUTH AFRICA,Eastern Cape Province,"Second Beach, Port St. Johns",Paddling on kneeboard,Tshintshekile Nduva,M,...,,"B. Jordan & A. Ferreira, Times Live, 12/21/2009",2009.12.18.a-Nduva.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2009.12.18,2009.12.18,5153,,
841,2009.12.18,18-Dec-09,2009,Invalid,SOUTH AFRICA,KwaZulu-Natal,"North Beach, Durban",Surfing,Lance Morris,M,...,No shark involvement,"M. Addison, C. Eckstander, GSAF",2009.12.18.b-Morris-barracuda bite.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2009.12.18,2009.12.18,5152,,
1212,2006.09.02,02-Sep-06,2006,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Swimming,male,M,...,,"S. Petersohn, GSAF",2006.09.02.b-Child-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2006.09.02,2006.09.02,4781,,
1213,2006.09.02,02-Sep-06,2006,Unprovoked,SOUTH AFRICA,Western Cape Province,Noordhoek,Surfing,Steven Harcourt-Wood,M,...,"White shark, 3.5m","Cape Times, 9/3/2006",2006.09.02.a-Harcourt-Wood.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2006.09.02,2006.09.02,4780,,


In [16]:
# How many distinct identifiers occur more than once in each case-number column?
count_cn = df_sharks['case_number'].value_counts()
print((count_cn > 1).sum())

count_c1 = df_sharks['case_1'].value_counts()
print((count_c1 > 1).sum())

count_c2 = df_sharks['case_2'].value_counts()
print((count_c2 > 1).sum())

# Is case_2 an exact copy of case_number?
print(df_sharks['case_number'].equals(df_sharks['case_2']))

# Keep case_number as the only identifier column.
df_sharks = df_sharks.drop(columns=['case_1', 'case_2'])



16
17
16
False


In [17]:
# Confirm case_1 and case_2 are gone.
df_sharks.columns

Index(['case_number', 'date', 'year', 'type', 'country', 'area', 'location',
       'activity', 'name', 'sex', 'age', 'injury', 'fatal', 'time', 'species',
       'investigator_source', 'pdf', 'href_formula', 'href', 'original_order',
       'Unnamed: 22', 'Unnamed: 23'],
      dtype='object')

In [18]:
# 4 Deleting columns that are not useful

In [19]:
# Number of cases per distinct value of each candidate column.
for column in ('type', 'activity', 'injury', 'species'):
    print(df_sharks.groupby(column)['case_number'].count())
# These columns are not interesting for my purpose.



type
Boat             200
Boating          110
Invalid          519
Provoked         557
Sea Disaster     220
Unprovoked      4386
Name: case_number, dtype: int64
activity
                                                                                  1
                                                                                  1
 a canoe was pursuing a schooner that had forcibily abducted 5 young girls        1
"Boat accident"                                                                   1
"Climbing up to ship after repairing the stern in water"                          1
                                                                                 ..
ship torpedoed 400 miles off the African coas. Man was clinging to hatch cover    1
small boat                                                                        1
wreck of the State Oil Company ship Permina                                       1
yachting accident                                                       

In [20]:
# Remove the columns that are not needed for this analysis.
df_sharks = df_sharks.drop(
    columns=[
        'type',
        'investigator_source',
        'pdf',
        'href_formula',
        'href',
        'activity',
        'name',
        'injury',
        'species',
        'original_order',
    ]
)

# TODO: some of the dropped columns could be interesting for future research on this topic; decide which ones.

df_sharks.columns

Index(['case_number', 'date', 'year', 'country', 'area', 'location', 'sex',
       'age', 'fatal', 'time', 'Unnamed: 22', 'Unnamed: 23'],
      dtype='object')

In [21]:
# 5 Dealing with missing values

In [22]:
# Count missing values per column. null_cols is reused further down to pick
# the columns whose NaN rows get dropped.
null_cols = df_sharks.isnull().sum()
null_cols

case_number       0
date              0
year              0
country          43
area            402
location        496
sex             567
age            2681
fatal            19
time           3213
Unnamed: 22    5991
Unnamed: 23    5990
dtype: int64

In [23]:
# Drop the two 'Unnamed' columns, which are NaN in almost every row.
df_sharks = df_sharks.drop(columns = ['Unnamed: 22', 'Unnamed: 23'])

In [24]:
# Confirm the 'Unnamed' columns are gone.
df_sharks.columns

Index(['case_number', 'date', 'year', 'country', 'area', 'location', 'sex',
       'age', 'fatal', 'time'],
      dtype='object')

In [25]:
# Row count before dropping rows with missing values (used to report how many are lost).
num_rows_0 = df_sharks.shape[0]

In [26]:
# Drop rows with a NaN in any column that has at most 600 missing values
# (country, area, location, sex, fatal). age and time have far more NaN and
# are handled separately below.
df_sharks = df_sharks.dropna(subset = list(null_cols[null_cols <= 600].index))

In [27]:
# Re-count missing values: only age and time should still contain NaN.
null_cols = df_sharks.isnull().sum()
null_cols

case_number       0
date              0
year              0
country           0
area              0
location          0
sex               0
age            1760
fatal             0
time           2247
dtype: int64

In [28]:
# Number of rows removed by the dropna step above.
num_rows_1 = df_sharks.shape[0]
num_rows_dif = num_rows_0 - num_rows_1
print(num_rows_dif)

1164


In [29]:
# Filling the age NaN with the average of ages

In [30]:
# List the distinct raw age entries: besides plain numbers there are ranges
# ("10 or 12"), words ("Teen", "adult") and blanks, so the column needs parsing.
df_sharks.groupby('age').count().index

Index([' ', '  ', '"middle-age"', '"young"', '(adult)', '10', '10 or 12', '11',
       '12', '12 or 13',
       ...
       'F', 'Teen', 'Teens', 'X', 'adult', 'mid-20s', 'mid-30s', 'teen',
       'young', '  '],
      dtype='object', name='age', length=135)

In [31]:
# put in the functions file and import!

def age_fixer(text):
    """Normalise a free-text age entry to a numeric string.

    Rules, applied in this order:

    * the entry mentions "month"/"months"         -> "1"
    * two numbers separated by text ("10 or 12")  -> mean of all numbers found
    * a single number ("35", "mid-20s")           -> the first number found
    * a word starting with "teen" ("Teen")        -> "16"
    * contains "adult"                            -> "40"
    * contains "young"                            -> "30"

    Parameters
    ----------
    text : object
        Raw cell value. It is converted with ``str``, so NaN / float inputs
        are accepted.

    Returns
    -------
    str or float
        The age as a string that ``pd.to_numeric`` can parse, or ``np.nan``
        when no age can be inferred from the entry.
    """
    # str.strip() returns a new string, so the result has to be kept.
    text = str(text).strip()
    # Must run before the digit rules and must search the whole entry,
    # otherwise "18 months" would be read as 18 years.
    if re.search(r'[Mm]onth', text):
        return "1"
    if re.search(r'(\d{1,2})\s(.+)\s(\d{1,2})', text):
        return str(mean([int(s) for s in re.findall(r'\d{1,2}', text)]))
    if re.search(r'\d{1,2}', text):
        return re.findall(r'\d{1,2}', text)[0]
    # \b keeps number words such as "seventeen" from being read as "teen".
    if re.search(r'\b[Tt]een', text):
        return "16"
    if re.search(r'[aA]dult', text):
        return "40"
    if re.search(r'[Yy]oung', text):
        return "30"
    # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
    return np.nan


In [32]:
# Normalise every age entry; entries age_fixer cannot interpret become NaN.
df_sharks['age'] = df_sharks['age'].apply(age_fixer)

In [33]:
# Re-count missing values: the age count can grow here because entries that
# age_fixer could not interpret are now NaN.
null_cols = df_sharks.isnull().sum()
null_cols

case_number       0
date              0
year              0
country           0
area              0
location          0
sex               0
age            1768
fatal             0
time           2247
dtype: int64

In [34]:
# age_fixer returns strings; convert the column to a numeric dtype so its mean can be computed.
df_sharks['age'] = pd.to_numeric(df_sharks['age'])

In [35]:
# Impute the remaining missing ages with the mean of the known ages.
df_sharks = df_sharks.fillna(value = {'age': df_sharks.age.mean()})

In [36]:
# Re-count missing values: age should now be complete, leaving only time.
null_cols = df_sharks.isnull().sum()
null_cols

case_number       0
date              0
year              0
country           0
area              0
location          0
sex               0
age               0
fatal             0
time           2247
dtype: int64

In [37]:
def time_fixer(text):
    """Bucket a free-text time-of-day entry into a coarse period.

    Entries that start with a clock time ("16h00", "0930", "8:15") are
    bucketed by hour: 0-5 night, 6-11 morning, 12-17 afternoon, 18-23
    evening. Anything else is classified by keyword (night/dark,
    evening/sunset/dusk, lunch/afternoon/day/noon, A.M./morning/dawn).

    Parameters
    ----------
    text : object
        Raw cell value. It is converted with ``str``, so NaN / float inputs
        are accepted.

    Returns
    -------
    str or float
        One of "night", "morning", "afternoon", "evening", or ``np.nan``
        when the entry cannot be classified.
    """
    # str.strip() returns a new string, so the result has to be kept;
    # otherwise an entry like " 16h00" never reaches the clock-time branch.
    text = str(text).strip()
    clock = re.match(r'(\d{1,2})\D?(\d\d)', text)
    if clock:
        # Group 1 is the hour part (one or two digits) of the leading clock time.
        hour = int(clock.group(1))
        if 0 <= hour <= 5:
            return "night"
        if 5 < hour <= 11:
            return "morning"
        if 11 < hour <= 17:
            return "afternoon"
        if 17 < hour <= 23:
            return "evening"
        # An hour above 23 is not a clock time; fall through to the keywords.
    if re.search(r'[Nn]ight|[Dd]ark', text):
        return "night"
    if re.search(r'[Ee]vening|[Ss]unset|[Dd]usk', text):
        return "evening"
    if re.search(r'[Ll]unch|[Aa]fterno+n|[Dd]ay|[Nn]oon', text):
        return "afternoon"
    if re.search(r'A.?M|[Mm]orning|[Dd]awn', text):
        return "morning"
    # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
    return np.nan


In [38]:
# Bucket every time entry into night / morning / afternoon / evening; unclassifiable entries become NaN.
df_sharks['time'] = df_sharks['time'].apply(time_fixer)

In [39]:
# Re-count missing values: the time count can grow here because entries that
# time_fixer could not classify are now NaN.
null_cols = df_sharks.isnull().sum()
null_cols

case_number       0
date              0
year              0
country           0
area              0
location          0
sex               0
age               0
fatal             0
time           2291
dtype: int64

In [40]:
# Label missing times explicitly as "unknown" instead of dropping those rows.
df_sharks = df_sharks.fillna(value = {'time': "unknown"})

In [41]:
# Final check: no column should contain missing values any more.
null_cols = df_sharks.isnull().sum()
null_cols

case_number    0
date           0
year           0
country        0
area           0
location       0
sex            0
age            0
fatal          0
time           0
dtype: int64

In [42]:
# Careful: if rows were dropped because of NaN in columns I do not care about,
# it is better to drop those columns instead, so the otherwise complete rows stay usable.

'''['case_number', 'date', 'year', 'type', 'country', 'area', 'location', 
       'activity', 'name', 'sex', 'age', 'injury', 'fatal', 'time', 'species',
       'investigator_source', 'pdf', 'href_formula', 'href', 'case_1',
       'case_2', 'original_order']'''

# TODO: clean 'activity' with a regex
# TODO: apply a regex to 'year' (?) and to 'date' --> extract the month


"['case_number', 'date', 'year', 'type', 'country', 'area', 'location', \n       'activity', 'name', 'sex', 'age', 'injury', 'fatal', 'time', 'species',\n       'investigator_source', 'pdf', 'href_formula', 'href', 'case_1',\n       'case_2', 'original_order']"

In [43]:
# Columns kept after cleaning.
df_sharks.columns

Index(['case_number', 'date', 'year', 'country', 'area', 'location', 'sex',
       'age', 'fatal', 'time'],
      dtype='object')

Unnamed: 0,case_number,date,year,country,area,location,sex,age,fatal,time
0,2016.09.18.c,18-Sep-16,2016,USA,Florida,"New Smyrna Beach, Volusia County",M,16.000000,N,afternoon
1,2016.09.18.b,18-Sep-16,2016,USA,Florida,"New Smyrna Beach, Volusia County",M,36.000000,N,morning
2,2016.09.18.a,18-Sep-16,2016,USA,Florida,"New Smyrna Beach, Volusia County",M,43.000000,N,morning
3,2016.09.17,17-Sep-16,2016,AUSTRALIA,Victoria,Thirteenth Beach,M,26.923529,N,unknown
4,2016.09.15,16-Sep-16,2016,AUSTRALIA,Victoria,Bells Beach,M,26.923529,N,unknown
6,2016.09.11,11-Sep-16,2016,USA,Florida,"Ponte Vedra, St. Johns County",M,60.000000,N,afternoon
7,2016.09.07,07-Sep-16,2016,USA,Hawaii,"Makaha, Oahu",F,51.000000,N,afternoon
8,2016.09.06,06-Sep-16,2016,NEW CALEDONIA,North Province,Koumac,M,50.000000,Y,afternoon
9,2016.09.05.b,05-Sep-16,2016,USA,South Carolina,"Kingston Plantation, Myrtle Beach, Horry County",F,12.000000,N,afternoon
10,2016.09.05.a,05-Sep-16,2016,AUSTRALIA,Western Australia,Injidup,M,26.923529,N,afternoon
