In [1]:
# Importing the packages

import pandas as pd
import numpy as np

In [2]:
# Importing the csv file and debugging an encoding error

df = pd.read_csv("GSAF5.csv", encoding = "latin1")

In [3]:
# Displaying the first rows

df.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c,5993,,
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b,5992,,
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.a,2016.09.18.a,5991,,
3,2016.09.17,17-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,...,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.17,2016.09.17,5990,,
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,...,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.16,2016.09.15,5989,,


In [4]:
# Getting the clean date ouf of the case number column

df["clean_date"] = df["Case Number"].str.slice(0,10)

In [5]:
# Setting an index in order to access the interested in data directly

df.rename(columns = {"Case Number" : "case_number"}, inplace = True)
df = df.set_index("case_number")



### Missing values & preparing the dataframe for analysis

In [6]:
# Counting the null values in columns

null_columns = df.isnull().sum()
null_columns[null_columns > 0]

Country                     43
Area                       402
Location                   496
Activity                   527
Name                       200
Sex                        567
Age                       2681
Injury                      27
Fatal (Y/N)                 19
Time                      3213
Species                   2934
Investigator or Source      15
href formula                 1
href                         3
Unnamed: 22               5991
Unnamed: 23               5990
dtype: int64

In [7]:
# Are these nulls a majority for these columns?

null_columns[null_columns > 0] / len(df)

Country                   0.007176
Area                      0.067089
Location                  0.082777
Activity                  0.087951
Name                      0.033378
Sex                       0.094626
Age                       0.447430
Injury                    0.004506
Fatal (Y/N)               0.003171
Time                      0.536215
Species                   0.489653
Investigator or Source    0.002503
href formula              0.000167
href                      0.000501
Unnamed: 22               0.999833
Unnamed: 23               0.999666
dtype: float64

In [8]:
# It makes sence to drop the columns, which have more than 10% of null values
# as well as column, which doesn't provide any interesting information for us, such as pdf, href formula, href,
# case number 1, case number 2, original order

perc = null_columns[null_columns > 0] / len(df)
to_drop = perc[perc > 0.1].index
to_drop

Index(['Age', 'Time', 'Species ', 'Unnamed: 22', 'Unnamed: 23'], dtype='object')

In [9]:
# Dropping the columns, with specifired criteria (axis = 1)

df_adj = df.drop(to_drop, axis = 1)
df_adj = df_adj.drop(["href formula","href","pdf","Case Number.1","Case Number.2","original order","Date"], axis = 1)

In [10]:
# Look at new dataframe

df_adj.head()

Unnamed: 0_level_0,Year,Type,Country,Area,Location,Activity,Name,Sex,Injury,Fatal (Y/N),Investigator or Source,clean_date
case_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2016.09.18.c,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,Minor injury to thigh,N,"Orlando Sentinel, 9/19/2016",2016.09.18
2016.09.18.b,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,Lacerations to hands,N,"Orlando Sentinel, 9/19/2016",2016.09.18
2016.09.18.a,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,Lacerations to lower leg,N,"Orlando Sentinel, 9/19/2016",2016.09.18
2016.09.17,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,Struck by fin on chest & leg,N,"The Age, 9/18/2016",2016.09.17
2016.09.15,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,No injury: Knocked off board by shark,N,"The Age, 9/16/2016",2016.09.15


In [11]:
# Updating the column names

df_adj.columns = (df_adj.columns.str.replace(' ','_').str.lower())

In [12]:
# Checking if everything went well

df_adj.head()

Unnamed: 0_level_0,year,type,country,area,location,activity,name,sex_,injury,fatal_(y/n),investigator_or_source,clean_date
case_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2016.09.18.c,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,Minor injury to thigh,N,"Orlando Sentinel, 9/19/2016",2016.09.18
2016.09.18.b,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,Lacerations to hands,N,"Orlando Sentinel, 9/19/2016",2016.09.18
2016.09.18.a,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,Lacerations to lower leg,N,"Orlando Sentinel, 9/19/2016",2016.09.18
2016.09.17,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,Struck by fin on chest & leg,N,"The Age, 9/18/2016",2016.09.17
2016.09.15,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,No injury: Knocked off board by shark,N,"The Age, 9/16/2016",2016.09.15


In [13]:
# It did, but sex has now sex_, so we have to change it

df_adj.rename(columns = {"sex_" : "sex"}, inplace = True)

In [14]:
# Replacing the null values in country column

df_adj["country"] = df_adj["country"].fillna("UNKNOWN")

In [15]:
# Adjusting the values in country column

df_adj["country"] = df_adj["country"].str.title()
df_adj["country"] = df_adj["country"].replace("Usa","USA")
df_adj.head()

Unnamed: 0_level_0,year,type,country,area,location,activity,name,sex,injury,fatal_(y/n),investigator_or_source,clean_date
case_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2016.09.18.c,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,Minor injury to thigh,N,"Orlando Sentinel, 9/19/2016",2016.09.18
2016.09.18.b,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,Lacerations to hands,N,"Orlando Sentinel, 9/19/2016",2016.09.18
2016.09.18.a,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,Lacerations to lower leg,N,"Orlando Sentinel, 9/19/2016",2016.09.18
2016.09.17,2016,Unprovoked,Australia,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,Struck by fin on chest & leg,N,"The Age, 9/18/2016",2016.09.17
2016.09.15,2016,Unprovoked,Australia,Victoria,Bells Beach,Surfing,male,M,No injury: Knocked off board by shark,N,"The Age, 9/16/2016",2016.09.15


In [16]:
# Adjusting the values in columns sex and fatal

df_adj["sex"] = df_adj["sex"].str.lower()
df_adj["fatal_(y/n)"] = df_adj["fatal_(y/n)"].str.lower()
df_adj.head()

Unnamed: 0_level_0,year,type,country,area,location,activity,name,sex,injury,fatal_(y/n),investigator_or_source,clean_date
case_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2016.09.18.c,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,m,Minor injury to thigh,n,"Orlando Sentinel, 9/19/2016",2016.09.18
2016.09.18.b,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,m,Lacerations to hands,n,"Orlando Sentinel, 9/19/2016",2016.09.18
2016.09.18.a,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,m,Lacerations to lower leg,n,"Orlando Sentinel, 9/19/2016",2016.09.18
2016.09.17,2016,Unprovoked,Australia,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,m,Struck by fin on chest & leg,n,"The Age, 9/18/2016",2016.09.17
2016.09.15,2016,Unprovoked,Australia,Victoria,Bells Beach,Surfing,male,m,No injury: Knocked off board by shark,n,"The Age, 9/16/2016",2016.09.15


In [17]:
# Splitting the investigator_or_sourse to get rid of the date in it

check = df_adj["investigator_or_source"].str.split(",", n = 1, expand = True)
df_adj["investigator_source"] = check[0]
df_adj["to_be_dropped"] = check[1]
df_adj = df_adj.drop(["investigator_or_source","to_be_dropped"], axis = 1)

Unnamed: 0_level_0,year,type,country,area,location,activity,name,sex,injury,fatal_(y/n),clean_date,investigator_source
case_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2016.09.18.c,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,m,Minor injury to thigh,n,2016.09.18,Orlando Sentinel
2016.09.18.b,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,m,Lacerations to hands,n,2016.09.18,Orlando Sentinel
2016.09.18.a,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,m,Lacerations to lower leg,n,2016.09.18,Orlando Sentinel
2016.09.17,2016,Unprovoked,Australia,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,m,Struck by fin on chest & leg,n,2016.09.17,The Age
2016.09.15,2016,Unprovoked,Australia,Victoria,Bells Beach,Surfing,male,m,No injury: Knocked off board by shark,n,2016.09.15,The Age
...,...,...,...,...,...,...,...,...,...,...,...,...
ND.0005,0,Unprovoked,Australia,Western Australia,Roebuck Bay,Diving,male,m,FATAL,y,ND.0005,H. Taunton; N. Bartlett
ND.0004,0,Unprovoked,Australia,Western Australia,,Pearl diving,Ahmun,m,FATAL,y,ND.0004,H. Taunton; N. Bartlett
ND.0003,0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,m,FATAL,y,ND.0003,F. Schwartz
ND.0002,0,Unprovoked,Panama,,"Panama Bay 8ºN, 79ºW",,Jules Patterson,m,FATAL,y,ND.0002,The Sun


In [18]:
# Getting rid out of the 0-year data

df_new = df_adj[df_adj["year"] != 0]
df_new

Unnamed: 0_level_0,year,type,country,area,location,activity,name,sex,injury,fatal_(y/n),clean_date,investigator_source
case_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2016.09.18.c,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,m,Minor injury to thigh,n,2016.09.18,Orlando Sentinel
2016.09.18.b,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,m,Lacerations to hands,n,2016.09.18,Orlando Sentinel
2016.09.18.a,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,m,Lacerations to lower leg,n,2016.09.18,Orlando Sentinel
2016.09.17,2016,Unprovoked,Australia,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,m,Struck by fin on chest & leg,n,2016.09.17,The Age
2016.09.15,2016,Unprovoked,Australia,Victoria,Bells Beach,Surfing,male,m,No injury: Knocked off board by shark,n,2016.09.15,The Age
...,...,...,...,...,...,...,...,...,...,...,...,...
1554.00.00,1554,Unprovoked,France,Nice & Marseilles,,,males (wearing armor),m,,unknown,1554.00.00,G. Rondelet
1543.00.00,1543,Unprovoked,Venezuela,Magarita or Cubagua Islands,,Pearl diving,Indian slave,m,FATAL,y,1543.00.00,J. Castro
0500.00.00,500,Unprovoked,Mexico,,,,male,,Foot severed,n,0500.00.00,J. Castro
0077.00.00,77,Unprovoked,Unknown,Ionian Sea,,Sponge diving,males,m,FATAL,y,0077.00.00,Perils mentioned by Pliny the Elder (23 A.D. t...


In [25]:
# We don't want to leave any null values left, doesn't matter if they have influence or not

df_new.isnull().sum()

year                     0
type                     0
country                  0
area                   377
location               462
activity               508
name                   193
sex                    557
injury                  26
fatal_(y/n)             19
clean_date               0
investigator_source     15
dtype: int64

In [45]:
# Removing the null values and putting "unknown" instead

df_new["area"] = df_new["area"].fillna("unknown")
df_new["location"] = df_new["location"].fillna("unknown")
df_new["activity"] = df_new["activity"].fillna("unknown")
df_new["name"] = df_new["name"].fillna("unknown")
df_new["sex"] = df_new["sex"].fillna("unknown")
df_new["injury"] = df_new["injury"].fillna("unknown")
df_new["fatal_(y/n)"] = df_new["fatal_(y/n)"].fillna("unknown")
df_new["investigator_source"] = df_new["investigator_source"].fillna("unknown")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

In [27]:
# All the null values are removed

df_new.isnull().sum()

year                   0
type                   0
country                0
area                   0
location               0
activity               0
name                   0
sex                    0
injury                 0
fatal_(y/n)            0
clean_date             0
investigator_source    0
dtype: int64

## Data types

In [29]:
# Getting info about the data types in our dataframe

df_new.dtypes

year                    int64
type                   object
country                object
area                   object
location               object
activity               object
name                   object
sex                    object
injury                 object
fatal_(y/n)            object
clean_date             object
investigator_source    object
dtype: object

In [30]:
# Changing the data type of year

df_new["year"].astype("object")

case_number
2016.09.18.c    2016
2016.09.18.b    2016
2016.09.18.a    2016
2016.09.17      2016
2016.09.15      2016
                ... 
1554.00.00      1554
1543.00.00      1543
0500.00.00       500
0077.00.00        77
0005.00.00         5
Name: year, Length: 5868, dtype: object

## Changing the order of the columns for dataframe

In [48]:
df_final = df_new[["clean_date","year","type","country","area","location","activity","name","sex","injury","fatal_(y/n)","investigator_source"]]

In [49]:
# Final check for null values

df_final.isnull().sum()

clean_date             0
year                   0
type                   0
country                0
area                   0
location               0
activity               0
name                   0
sex                    0
injury                 0
fatal_(y/n)            0
investigator_source    0
dtype: int64

In [51]:
# Result

df_final.head()

Unnamed: 0_level_0,clean_date,year,type,country,area,location,activity,name,sex,injury,fatal_(y/n),investigator_source
case_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2016.09.18.c,2016.09.18,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,m,Minor injury to thigh,n,Orlando Sentinel
2016.09.18.b,2016.09.18,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,m,Lacerations to hands,n,Orlando Sentinel
2016.09.18.a,2016.09.18,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,m,Lacerations to lower leg,n,Orlando Sentinel
2016.09.17,2016.09.17,2016,Unprovoked,Australia,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,m,Struck by fin on chest & leg,n,The Age
2016.09.15,2016.09.15,2016,Unprovoked,Australia,Victoria,Bells Beach,Surfing,male,m,No injury: Knocked off board by shark,n,The Age


In [57]:
# Exporting

df_final.to_csv(r'..\data-wrangling.csv')

In [58]:
import os


In [63]:
os.path.abspath('.')

'/Users/ilya.ivolgin/data-ber-10-19/module-1_projects/pandas-project/your-code'