This notebook is used for development of helper functions to evantually be included in the feature explorer module.

In [3]:
import pandas as pd

## Load Datasets for testing

In [4]:
# King County
king = pd.read_csv('https://raw.githubusercontent.com/ryanleeallred/datasets/master/kc_house_data.csv')

# Water Pumps
pumps = pd.read_csv('Data/pumps.csv')
pumps_target = pd.read_csv('Data/pump_train_labels.csv')
pumps_target = pumps_target['status_group']

# Moores Law
tables = pd.read_html('https://en.wikipedia.org/wiki/Transistor_count', header=0)
moore = tables[0]

# Pedestrian Crossing
crossing = pd.read_csv('Data/daily.csv')

king.shape, pumps.shape, moore.shape, crossing.shape

((21613, 21), (59400, 40), (139, 6), (1063, 9))

In [5]:
pumps.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


# Categorical Features 

### Collapse High Cardinality Features Function

In [23]:
def collapse_cardinality(data, threshold=20): 
    ''' 
    Collapse a Pandas Series with high cardinality to have a specificed number of categories. 
    Categories with frequencies below the threshold will be grouped into an 'other' category. 
    The returned series will have cardinality equal to the threshold + 1.  
    
    Parameters: Pandas Dataframe, threshold to determine cutoff point for cardinality
    '''
    keep = data.value_counts().index[:threshold]
    for cat in data.value_counts().index:
        if (cat in keep) == False:
            data = data.replace(cat, 'other')
    return data
    
collapse_cardinality(pumps['lga'], 40).nunique()

41

# Dates 

### Date Expander

In [83]:
import numpy as np

def date_expander(data, year=True, month=True, holiday=True, dow=True, workingday=True, day=False, hour=False, minute=False):
    '''
    Accpets a series containing dates. Extracts potentially useful information into distinct columns. 
    
    Returns a DataFrame containing the newly created features columns.
    '''
    data = pd.to_datetime(data, infer_datetime_format=True)
    df = pd.DataFrame()
    if year: 
        year = pd.Series( data.dt.year, name='year')
    if month:
        month = pd.Series( data.dt.month, name='month')
        df = pd.concat([year, month], axis=1)
    if holiday:
        holiday = is_holiday(data)
        df = pd.concat([df, holiday], axis=1)
    if dow:
        dow = pd.Series( data.dt.dayofweek, name='dow')
        df = pd.concat([df, dow], axis=1)
    if workingday:
        workingday = is_working_day(data)
        df = pd.concat([df, workingday], axis=1)
    if day: 
        day = pd.Series( data.dt.day, name='day')
        df = pd.concat([df, day], axis=1)
    if hour:
        hour = pd.Series( data.dt.hour, name='hour')
        df = pd.concat([df, hour], axis=1)
    if minute:
        minute = pd.Series( data.dt.minute, name='minute')
        df = pd.concat([df, minute], axis=1)
        
    return df

date_expander(pumps['date_recorded'], year=True, month=True, holiday=True, workingday=True, dow=True, day=True, hour=True, minute=True).head()

Unnamed: 0,year,month,holiday,dow,workingday,day,hour,minute
0,2011,3,False,0,True,14,0,0
1,2013,3,False,2,True,6,0,0
2,2013,2,False,0,True,25,0,0
3,2013,1,False,0,True,28,0,0
4,2011,7,False,2,True,13,0,0


### Is_holiday 

In [77]:
def is_holiday(data):
    from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
    cal = calendar()
    holidays = cal.holidays(start=data.min(), end=data.max())
    holiday = pd.Series( data.isin(holidays), name='holiday')
    return holiday

### Is_working_day 

In [82]:
def is_working_day(data):
    workingday = np.where( (is_holiday(data) | (data.dt.dayofweek > 4)), False, True )
    workingday = pd.Series( workingday, name='workingday')
    return workingday

### Net_Days_From

In [90]:
def net_days_from(data, date):
    data_days = pd.to_datetime(data).dt.day
    date = pd.to_datetime(date).day
    net_days = pd.Series( data_days - date, name='netdays')
    return net_days

net_days_from(pumps['date_recorded'], '2014-11-01').head()

0    13
1     5
2    24
3    27
4    12
Name: netdays, dtype: int64

### Net_Working_Days_From

In [99]:
def net_working_days_from(data, date):
    data_days = pd.to_datetime(data).dt.day
    date = pd.to_datetime(date).day
    days = []
    for day in data_days:
        count = pd.bdate_range(day, date)
        days.append(count)
    return days
net_working_days_from(pumps['date_recorded'], '2014-11-01')

[DatetimeIndex(['1970-01-01'], dtype='datetime64[ns]', freq='B'),
 DatetimeIndex(['1970-01-01'], dtype='datetime64[ns]', freq='B'),
 DatetimeIndex(['1970-01-01'], dtype='datetime64[ns]', freq='B'),
 DatetimeIndex(['1970-01-01'], dtype='datetime64[ns]', freq='B'),
 DatetimeIndex(['1970-01-01'], dtype='datetime64[ns]', freq='B'),
 DatetimeIndex(['1970-01-01'], dtype='datetime64[ns]', freq='B'),
 DatetimeIndex(['1970-01-01'], dtype='datetime64[ns]', freq='B'),
 DatetimeIndex(['1970-01-01'], dtype='datetime64[ns]', freq='B'),
 DatetimeIndex(['1970-01-01'], dtype='datetime64[ns]', freq='B'),
 DatetimeIndex(['1970-01-01'], dtype='datetime64[ns]', freq='B'),
 DatetimeIndex(['1970-01-01'], dtype='datetime64[ns]', freq='B'),
 DatetimeIndex(['1970-01-01'], dtype='datetime64[ns]', freq='B'),
 DatetimeIndex(['1970-01-01'], dtype='datetime64[ns]', freq='B'),
 DatetimeIndex(['1970-01-01'], dtype='datetime64[ns]', freq='B'),
 DatetimeIndex(['1970-01-01'], dtype='datetime64[ns]', freq='B'),
 DatetimeI

# Location  

### Create Polar Coordinates

### Create Cartesian Coordinates

### Convert Coordinates to Address

### Convert Address to Coordinates

### Coordinate Clusters

### Distance From 
- Default to mean coordinates

### Extract Features From Address

### Extract Features from coordinates