In [1]:
#Import packages
import pandas as pd
from datetime import datetime

In [2]:
#Read data
raw_excel_path = '../data/raw/MadreDeDios_MercuryMasterData.xlsx'
df_raw = pd.read_excel(raw_excel_path)
#Show the (rows, columns) of the loaded table
df_raw.shape

(4627, 12)

In [3]:
#Explore: list every column with its non-null count and dtype
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4627 entries, 0 to 4626
Data columns (total 12 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Author                       4627 non-null   object 
 1   Date data was collected      4627 non-null   object 
 2   Sample type                  4627 non-null   object 
 3   Species                      4223 non-null   object 
 4   Sample size                  4627 non-null   int64  
 5   Longitude (X)                4626 non-null   float64
 6   Latitude (Y)                 4626 non-null   float64
 7   Mercury concentration (ppm)  4627 non-null   float64
 8   Margin of error              766 non-null    float64
 9   Mercury type                 4627 non-null   object 
 10  Age (human only)             3191 non-null   float64
 11  Sex (human only)             3191 non-null   object 
dtypes: float64(5), int64(1), object(6)
memory usage: 433.9+ KB


In [None]:
#Preview the first two records
df_raw.head(2)

In [None]:
#Extract genus to new field
#df_raw['Genus'] = df_raw['Species'].apply(lambda x: str(x).split()[0])

### Task 1 - Fix the dates
The dates come in three formats. First, some records have proper date time formats (e.g. `2017-08-15 15:30:00`). Others, however, just have years (e.g. `2009`) or spans of years (e.g. `2001-2003`). 

To fix this, we'll construct a few new columns:
* One column ("`DateType`") will list the type of date provided in the record: `datetime`, `year`, `range`. 
* A second column ("`DateCollected`") will include the full date and time of collection for those records where it is provided and others being set to null values. 
* A third column ("`StartYear`") will include the year collected. Values here will be taken directly from the 'datetime' and 'year' records. For the 'range' records, the year will be _the first year of collection_.
* And finally, a fourth column ("`EndYear`") will include the last year if it's a 'range' record, or a repeat of "`StartYear`" if not. 

In [None]:
#%% Add the "DateType" column, using a function to assign its value 

#Define function to assign date formats
def get_date_type(the_date):
    """Classify a raw collection-date value as 'datetime', 'year' or 'range'.

    Args:
        the_date: One value from the 'Date data was collected' column.

    Returns:
        'datetime' if the value is a datetime instance (including subclasses
        such as pandas.Timestamp), 'year' if it is a plain integer, and
        'range' for anything else (e.g. a 'YYYY-YYYY' string).
    """
    #isinstance rather than an exact type match, so datetime subclasses
    #(e.g. pandas.Timestamp) are not misfiled as 'range'
    if isinstance(the_date, datetime):
        return 'datetime'
    #bool is a subclass of int; exclude it so True/False are never read as years
    if isinstance(the_date, int) and not isinstance(the_date, bool):
        return 'year'
    return 'range'

#Classify every record by the data type of its raw collection-date value
date_types = df_raw['Date data was collected'].apply(get_date_type)
df_raw['DateType'] = date_types

#Reveal the counts of each type (Most have just a year)
date_type_counts = df_raw['DateType'].value_counts()
date_type_counts.plot(kind='bar');

In [None]:
#%% Create conversion functions
def convert_date(the_date):
    """Split a raw collection-date value into (timestamp, start year, end year).

    Args:
        the_date: One value from the 'Date data was collected' column: a
            datetime, an integer year, or a string holding either a single
            year ('2009') or a span of years ('2001-2003').

    Returns:
        pd.Series of three items, in order: the full datetime (None unless
        the input was a datetime), the first year, and the last year. For
        datetime and single-year inputs the two years are equal.

    Raises:
        ValueError: if a string has more than two '-'-separated parts or a
            part cannot be parsed as an integer.
    """
    #isinstance rather than an exact type match, so datetime subclasses
    #(e.g. pandas.Timestamp) take this branch instead of failing on .split below
    if isinstance(the_date, datetime):
        out_date = the_date
        start_year = end_year = the_date.year
    #bool is a subclass of int; exclude it so True/False are never read as years
    elif isinstance(the_date, int) and not isinstance(the_date, bool):
        out_date = None
        start_year = end_year = the_date
    else:
        out_date = None
        year_parts = the_date.split("-")
        if len(year_parts) > 2:
            raise ValueError("Expected 'YYYY' or 'YYYY-YYYY', got %r" % (the_date,))
        #A lone year stored as text has one part, which serves as both start and end
        start_year, end_year = year_parts[0], year_parts[-1]
    return pd.Series([out_date, int(start_year), int(end_year)])

#Run the converter on every raw date and store its three outputs as new columns
date_parts = df_raw['Date data was collected'].apply(convert_date)
df_raw[['DateCollected','StartYear','EndYear']] = date_parts

#Fix the fact that integers are upcasted as floats
for year_col in ('StartYear', 'EndYear'):
    df_raw[year_col] = df_raw[year_col].astype('int')

In [None]:
#

In [None]:
#For records with a proper datetime format, set that value in a "DateCollected" field
#df_raw.loc[df_raw["DateType"]=='datetime','DateCollected'] = df_raw['Date data was collected']

#df_raw.loc[df_raw["DateType"]=='year','DateCollected'] = pd.to_datetime(df_raw['Date data was collected'])
#Display the rows at positions 11 through 16
df_raw.iloc[11:17]

In [None]:
#Add the "YearCollected" field
#Use one .loc[rows, column] assignment: a chained df.loc[rows][column] = ...
#would write to a temporary copy and leave df_raw unchanged.
#Rows without a full datetime are left as NaN in the new column.
has_full_date = df_raw["DateType"] == 'datetime'
df_raw.loc[has_full_date, 'YearCollected'] = pd.to_datetime(df_raw.loc[has_full_date, 'DateCollected']).dt.year

In [None]:
#Push properly time stamped data to a new dataframe
is_timestamped = df_raw['DateType'] == 'datetime'
df_fulldate = df_raw.loc[is_timestamped]
#Index that dataframe by its collection timestamps, then summarize it
df_fulldate.index = pd.to_datetime(df_fulldate['Date data was collected'])
df_fulldate.info()

In [None]:
#Bar chart of record counts per date type; the trailing ';' keeps the Axes repr out of the cell output
df_raw['DateType'].value_counts().plot(kind='bar');

In [None]:
#NOTE(review): no visible cell creates a 'Date Filter' column on df_raw; on a
#fresh kernel this lookup presumably raises KeyError - confirm where it is defined
date_filter = df_raw['Date Filter']
#Records whose filter value equals False, and all remaining records
df_subset = df_raw.loc[date_filter == False]
df_subset2 = df_raw.loc[date_filter != False]


In [None]:
#List the distinct raw collection-date values present in df_subset
df_subset['Date data was collected'].unique()

In [None]:
#Scratch check: parsing a bare year with '%Y' yields a datetime at January 1, 00:00 of that year
datetime.strptime("2020", '%Y')