In [56]:
import getpass
import re
from statistics import mean
from urllib.parse import quote_plus

import pandas as pd
import pymysql
import sqlalchemy as db
from sqlalchemy import create_engine

In [2]:
# Ask for the MySQL root password interactively so it is never stored in the notebook.
password = getpass.getpass("Insert password:")
# Put the entered password into the connection URL (previously the literal
# text "password" was sent instead). quote_plus escapes characters such as
# "@", ":" or "/" that would otherwise break URL parsing.
engine = db.create_engine(
    'mysql+pymysql://root:{}@localhost'.format(quote_plus(password))
)
# NOTE: create_engine() is lazy - no connection is opened until the engine is
# first used, so this message does not prove the credentials are valid.
print("Connected to server!")

Insert password:········
Connected to server!


In [3]:
# Load the shark-attack records; the file is decoded as ISO-8859-1 (Latin-1).
df_sharks = pd.read_csv(
    './input/GSAF5.csv',
    encoding="ISO-8859-1",
)

In [4]:
# 1) Diagnosing the data

In [5]:
# Dimensions of the loaded frame as (rows, columns).
df_sharks.shape

(5992, 24)

In [6]:
# Column labels exactly as read from the CSV; note that some carry a trailing
# space ('Sex ', 'Species ').
df_sharks.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

In [7]:
# Preview the first five rows.
df_sharks.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c,5993,,
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b,5992,,
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.a,2016.09.18.a,5991,,
3,2016.09.17,17-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,...,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.17,2016.09.17,5990,,
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,...,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.16,2016.09.15,5989,,


In [8]:
# Per-column non-null counts and dtypes.
df_sharks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5992 entries, 0 to 5991
Data columns (total 24 columns):
Case Number               5992 non-null object
Date                      5992 non-null object
Year                      5992 non-null int64
Type                      5992 non-null object
Country                   5949 non-null object
Area                      5590 non-null object
Location                  5496 non-null object
Activity                  5465 non-null object
Name                      5792 non-null object
Sex                       5425 non-null object
Age                       3311 non-null object
Injury                    5965 non-null object
Fatal (Y/N)               5973 non-null object
Time                      2779 non-null object
Species                   3058 non-null object
Investigator or Source    5977 non-null object
pdf                       5992 non-null object
href formula              5991 non-null object
href                      5989 non-null object
C

In [9]:
# Data type of every column.
df_sharks.dtypes

Case Number               object
Date                      object
Year                       int64
Type                      object
Country                   object
Area                      object
Location                  object
Activity                  object
Name                      object
Sex                       object
Age                       object
Injury                    object
Fatal (Y/N)               object
Time                      object
Species                   object
Investigator or Source    object
pdf                       object
href formula              object
href                      object
Case Number.1             object
Case Number.2             object
original order             int64
Unnamed: 22               object
Unnamed: 23               object
dtype: object

In [10]:
# 2) Fixing the column names

In [11]:
# Rename columns to snake_case. The keys must match the raw labels exactly,
# including the trailing space in 'Sex ' and 'Species '. 'pdf' and 'href'
# are left as they are.
df_sharks = df_sharks.rename(
    columns={
        'Case Number': 'case_number',
        'Date': 'date',
        'Year': 'year',
        'Type': 'type',
        'Country': 'country',
        'Area': 'area',
        'Location': 'location',
        'Activity': 'activity',
        'Name': 'name',
        'Sex ': 'sex',
        'Age': 'age',
        'Injury': 'injury',
        'Fatal (Y/N)': 'fatal',
        'Time': 'time',
        'Species ': 'species',
        'Investigator or Source': 'investigator_source',
        'href formula': 'href_formula',
        'Case Number.1': 'case_1',
        'Case Number.2': 'case_2',
        'original order': 'original_order',
    }
)

In [12]:
# Confirm the renamed column labels.
df_sharks.columns

Index(['case_number', 'date', 'year', 'type', 'country', 'area', 'location',
       'activity', 'name', 'sex', 'age', 'injury', 'fatal', 'time', 'species',
       'investigator_source', 'pdf', 'href_formula', 'href', 'case_1',
       'case_2', 'original_order', 'Unnamed: 22', 'Unnamed: 23'],
      dtype='object')

In [13]:
# 3) Dealing with duplicates

In [14]:
# Count rows that are exact copies of an earlier row (none are).
duplicate_rows = df_sharks.duplicated(keep='first').sum()
duplicate_rows

0

In [15]:
# Some case numbers appear on more than one row: collect those numbers,
# then select every row that carries one of them.
repeated_mask = df_sharks['case_number'].duplicated()
duplicated_cases = df_sharks.loc[repeated_mask, 'case_number'].tolist()
has_repeated_case = df_sharks['case_number'].isin(duplicated_cases)
df_sharks_duplicate_cases = df_sharks[has_repeated_case]
df_sharks_duplicate_cases

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,...,species,investigator_source,pdf,href_formula,href,case_1,case_2,original_order,Unnamed: 22,Unnamed: 23
300,2014.08.02,02-Aug-14,2014,Unprovoked,USA,Florida,"South of Cocoa Beach, Brevard County",Surfing,male,M,...,,"Florida Today, 8/8/2014",2014.08.08-CocoaBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2014.08.02,2014.08.02,5692,,
301,2014.08.02,02-Aug-14,2014,Unprovoked,USA,Florida,"Table Beach, Brevard County",Boogie boarding,Christian Sanhueza,M,...,,"Florida Today, 8/2/2014",2014.08.02-Sanhueza.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2014.08.02,2014.08.02,5691,,
392,2013.10.05,06-Oct-13,2013,Unprovoked,USA,California,"Bunkers, Humboldt Bay, Eureka, Humboldt County",Surfing,Jay Scrivner,M,...,"White shark, 8' to 10'","R. Collier, GSAF",2013.10.06-Scrivner.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2013.10.05,2013.10.05,5601,,
393,2013.10.05,10-Oct-13,2013,Unprovoked,USA,Florida,"Destin, Okaloosa County",Wading,Zachary Tyke Standridge,M,...,Small bull shark,"Monroe County Advocate, 10/9/2013",2013.10.05-Standridge.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2013.10.05,2013.10.05,5600,,
523,2012.09.02.b,02-Sep-12,2012,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Boogie boarding,female,F,...,3.5' to 4' shark,"WYTV, 9/3/2012",2012.09.02.b-NSB-girl.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2012.09.02.b,2012.09.02.b,5470,,
524,2012.09.02.b,02-Sep-12,2012,Provoked,USA,Hawaii,"Spreckelsville, Maui",Spearfishing,M. Malabon,,...,"Tiger shark, 10' to 12'",HawaiiNow.com,2012.09.02.c-Malabon.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2012.09.02.b,2012.09.02.b,5469,,
840,2009.12.18,18-Dec-09,2009,Unprovoked,SOUTH AFRICA,Eastern Cape Province,"Second Beach, Port St. Johns",Paddling on kneeboard,Tshintshekile Nduva,M,...,,"B. Jordan & A. Ferreira, Times Live, 12/21/2009",2009.12.18.a-Nduva.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2009.12.18,2009.12.18,5153,,
841,2009.12.18,18-Dec-09,2009,Invalid,SOUTH AFRICA,KwaZulu-Natal,"North Beach, Durban",Surfing,Lance Morris,M,...,No shark involvement,"M. Addison, C. Eckstander, GSAF",2009.12.18.b-Morris-barracuda bite.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2009.12.18,2009.12.18,5152,,
1212,2006.09.02,02-Sep-06,2006,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Swimming,male,M,...,,"S. Petersohn, GSAF",2006.09.02.b-Child-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2006.09.02,2006.09.02,4781,,
1213,2006.09.02,02-Sep-06,2006,Unprovoked,SOUTH AFRICA,Western Cape Province,Noordhoek,Surfing,Steven Harcourt-Wood,M,...,"White shark, 3.5m","Cape Times, 9/3/2006",2006.09.02.a-Harcourt-Wood.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2006.09.02,2006.09.02,4780,,


In [16]:
# 4) Dealing with missing values

In [17]:
# Number of missing (NaN) values in each column.
null_cols = df_sharks.isna().sum()
null_cols

case_number               0
date                      0
year                      0
type                      0
country                  43
area                    402
location                496
activity                527
name                    200
sex                     567
age                    2681
injury                   27
fatal                    19
time                   3213
species                2934
investigator_source      15
pdf                       0
href_formula              1
href                      3
case_1                    0
case_2                    0
original_order            0
Unnamed: 22            5991
Unnamed: 23            5990
dtype: int64

In [18]:
# Drop the two unnamed columns, which are almost entirely NaN.
df_sharks = df_sharks.drop(['Unnamed: 22', 'Unnamed: 23'], axis=1)

In [19]:
# Confirm the two unnamed columns are gone.
df_sharks.columns

Index(['case_number', 'date', 'year', 'type', 'country', 'area', 'location',
       'activity', 'name', 'sex', 'age', 'injury', 'fatal', 'time', 'species',
       'investigator_source', 'pdf', 'href_formula', 'href', 'case_1',
       'case_2', 'original_order'],
      dtype='object')

In [20]:
# Row count before any rows are dropped, kept for the later comparison.
num_rows_0 = len(df_sharks)

In [26]:
# Drop every row that has a NaN in any column with at most 500 missing values.
# NOTE(review): `null_cols` is reassigned by later cells, so re-running this
# cell after them selects a different set of columns. The recorded null counts
# after this step (0 for activity, sex and species, which each started above
# 500) suggest that may have happened - confirm with Restart & Run All.
low_null_columns = null_cols[null_cols <= 500].index.tolist()
df_sharks = df_sharks.dropna(subset=low_null_columns)

In [27]:
# Recount the missing values per column after the row drop.
null_cols = df_sharks.isna().sum()
null_cols

case_number               0
date                      0
year                      0
type                      0
country                   0
area                      0
location                  0
activity                  0
name                      0
sex                       0
age                    1534
injury                    0
fatal                     0
time                   1964
species                   0
investigator_source       0
pdf                       0
href_formula              0
href                      0
case_1                    0
case_2                    0
original_order            0
dtype: int64

In [28]:
# How many rows were removed by the NaN drop.
num_rows_1 = len(df_sharks)
num_rows_dif = num_rows_0 - num_rows_1
print(num_rows_dif)

1504


In [29]:
# Replace any remaining NaN in the species column with the label "Unknown".
species_filled = df_sharks['species'].fillna(value="Unknown")
df_sharks['species'] = species_filled

In [30]:
# Recount the missing values per column after filling species.
null_cols = df_sharks.isna().sum()
null_cols

case_number               0
date                      0
year                      0
type                      0
country                   0
area                      0
location                  0
activity                  0
name                      0
sex                       0
age                    1534
injury                    0
fatal                     0
time                   1964
species                   0
investigator_source       0
pdf                       0
href_formula              0
href                      0
case_1                    0
case_2                    0
original_order            0
dtype: int64

In [31]:
# Fill the missing (NaN) ages with the average age

In [42]:
# Count the non-null entries per column for each distinct raw age value.
# The index exposes non-numeric ages such as '(adult)' or '10 or 12'.
df_sharks.groupby('age').count()

Unnamed: 0_level_0,case_number,date,year,type,country,area,location,activity,name,sex,...,fatal,time,species,investigator_source,pdf,href_formula,href,case_1,case_2,original_order
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
"""young""",1,1,1,1,1,1,1,1,1,1,...,1,0,1,1,1,1,1,1,1,1
(adult),1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
10,47,47,47,47,47,47,47,47,47,47,...,47,30,47,47,47,47,47,47,47,47
10 or 12,1,1,1,1,1,1,1,1,1,1,...,1,0,1,1,1,1,1,1,1,1
11,33,33,33,33,33,33,33,33,33,33,...,33,23,33,33,33,33,33,33,33,33
12,61,61,61,61,61,61,61,61,61,61,...,61,35,61,61,61,61,61,61,61,61
12 or 13,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
13,83,83,83,83,83,83,83,83,83,83,...,83,59,83,83,83,83,83,83,83,83


In [77]:
#(\d{1,2})|([Tt]een)|[aA]dult

def age_fixer(text):
    """Normalise a raw ``age`` entry to a numeric string where possible.

    Rules, applied in order to the whitespace-stripped text:

    * Two numbers separated by other text (e.g. ``"9 or 10"``): return the
      mean of every 1-2 digit number found, as a string (``"9.5"``).
    * Text that starts with a 1-2 digit number (e.g. ``"22"``): return that
      number as a string.
    * Anything else (e.g. ``"teen"``): return the stripped text unchanged.

    Args:
        text: Raw age value. Normally a string; any non-string value (such
            as the float NaN pandas uses for a missing age) is returned
            untouched so the function can be mapped over the whole column.

    Returns:
        The normalised age as a string, or the original object when it is
        not a string.
    """
    # Missing ages arrive as float NaN, which has no .strip(); pass them through.
    if not isinstance(text, str):
        return text

    text = text.strip()

    # Raw strings keep \d and \s from being read as (invalid) string escapes.
    if re.search(r'(\d{1,2})\s(.+)\s(\d{1,2})', text):
        numbers = [int(s) for s in re.findall(r'(\d{1,2})', text)]
        return str(mean(numbers))

    leading_number = re.match(r'(\d{1,2})', text)
    if leading_number:
        return str(int(leading_number.group()))

    return text
    
# Quick check of the three cases: a range, a plain number, free text.
for sample_age in ("9 or 10", "22", "teen"):
    print(age_fixer(sample_age))

9.5
    22
teen
