# Coercing to Booleans

1. Get the object-dtype columns of `df`

In [6]:
# df

# Restrict the frame to its object-dtype columns.
object_df = df.select_dtypes(include='object')

# [col for col in object_df.columns if is_date_time(object_df[col])]

2. Identify columns to change similarly

In [8]:
# almost_datetime_cols = object_df.apply(lambda col: is_date_time(col))

3. Loop through and make the change

In [9]:
# steps = [ ( [almost_dt_col], FunctionTransformer(change_it_to_date)) for almost_dt_col in almost_datetime_cols]

In [None]:
from sklearn_pandas import DataFrameMapper

# NOTE(review): `steps` is only assigned in the commented-out cell above (In [9]);
# that cell has to be restored before this one can run.
mapper = DataFrameMapper(steps, df_out = True)
# fit_transform needs the frame to transform; calling it with no argument raises a TypeError.
# NOTE(review): `df` is assumed to be the intended input - confirm.
mapper.fit_transform(df)

# columns that we coerced

### Loading our AirBnb Data

For this lesson, we'll work with [Airbnb listings in Berlin](https://www.kaggle.com/brittabettendorf/berlin-airbnb-data). Let's load our data.

In [10]:
import pandas as pd
# Read the listings file; the first CSV column becomes the index.
df = pd.read_csv(filepath_or_buffer='./nums_and_dates_ten_k.csv', index_col=0)

In [12]:
# Columns expected to hold dates.
potential_date_cols = [
    'last_scraped',
    'host_since',
    'calendar_last_scraped',
    'first_review',
    'last_review',
]
# Spell out the unit: a unit-less 'datetime64' cast is rejected by pandas >= 2.0.
df[potential_date_cols] = df[potential_date_cols].astype('datetime64[ns]')

In [221]:
def contains_date(column):
    """Return True if any entry of ``column`` is, in full, a date-shaped string.

    Four layouts are recognised, with one- or two-digit day/month parts and a
    four-digit year: ``D-M-YYYY``, ``YYYY-M-D``, ``D/M/YYYY`` and ``YYYY/M/D``.
    Only the digit layout is checked; the values are not validated as real
    calendar dates.

    Parameters
    ----------
    column : pandas.Series
        Series of strings. Missing entries are treated as non-matching.

    Returns
    -------
    bool
        True when at least one entry matches one of the layouts.
    """
    # Every fragment is a raw string so that `\d` and `\/` reach the regex
    # engine untouched instead of being parsed as (invalid) string escapes.
    date_patterns = (
        r'^\d{1,2}-\d{1,2}-\d{4}$',
        r'^\d{4}-\d{1,2}-\d{1,2}$',
        r'^\d{1,2}\/\d{1,2}\/\d{4}$',
        r'^\d{4}\/\d{1,2}\/\d{1,2}$',
    )
    regex_string = '|'.join(date_patterns)
    # na=False makes missing entries count as "no match" rather than NaN.
    return bool(column.str.contains(regex_string, na=False).any())

### Feature engineering

In [17]:
# How often each scrape date occurs.
df['last_scraped'].value_counts()

2018-11-07    7999
2018-11-09       1
Name: last_scraped, dtype: int64

In [19]:
# Distinct values present in the `last_scraped` column.
df['last_scraped'].unique()

array(['2018-11-07T00:00:00.000000000', '2018-11-09T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [21]:
def percent_different(df_series):
    """Return the share of distinct values among the non-null entries.

    Despite the name, the result is a fraction in (0, 1], not a percentage:
    number of distinct non-null values divided by number of non-null values.

    Parameters
    ----------
    df_series : pandas.Series
        Column to inspect; missing entries are ignored.

    Returns
    -------
    float
        The distinct-value fraction, or NaN when the series has no non-null
        entries (the ratio is undefined there and would otherwise raise
        ZeroDivisionError).
    """
    non_null = df_series.dropna()
    if non_null.empty:
        # NaN compares False against any threshold, so callers that test
        # `percent_different(col) < threshold` simply skip such a column.
        return float('nan')
    return non_null.nunique() / len(non_null)

In [24]:
# percent_different(df['last_scraped'])

In [25]:
def find_categorical(df, threshold = .5):
    """Return the columns of ``df`` whose distinct-value share is below ``threshold``.

    A column is kept when ``percent_different(df[column]) < threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are screened.
    threshold : float, default 0.5
        Upper bound (exclusive) on the distinct-value share.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the selected columns in their original order and
        with the row index of ``df`` (also when no column qualifies).
    """
    selected = [
        column for column in df.columns
        if percent_different(df[column]) < threshold
    ]
    # Select in one step instead of inserting columns one at a time into an
    # empty frame; .copy() keeps the result independent of `df`.
    return df.loc[:, selected].copy()

In [92]:
# Screen only the object-dtype columns, using find_categorical's default threshold.
cat_df = find_categorical(df.select_dtypes(include='object'))

In [94]:
# cat_df

### Combine with Selecting Categorical Columns

In [33]:
def get_multiple_val_counts(df, num_vals = 1):
    """Return, per column, the relative frequencies of its most common values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are summarised.
    num_vals : int, default 1
        How many of the most frequent values to keep per column.

    Returns
    -------
    list of pandas.Series
        One Series per column, in column order. Each is indexed by value,
        holds normalised frequencies (missing values excluded), and is named
        after its source column.
    """
    # pandas >= 2.0 names the normalised value_counts result 'proportion'
    # rather than the column, so the name is set explicitly.
    return [
        df[column].value_counts(normalize=True).iloc[:num_vals].rename(column)
        for column in df.columns
    ]

In [35]:
# get_multiple_val_counts(cat_df, 2)

In [37]:
import numpy as np
def summarize_cats(df):
    """Summarise each column by its most frequent value and that value's share.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are summarised.

    Returns
    -------
    numpy.ndarray
        String array of shape (k, 3) with rows
        ``[column name, most frequent value, share]``, ordered by share from
        highest to lowest. Columns without any non-null value are left out;
        the array has zero rows when nothing can be summarised.
    """
    top_counts = get_multiple_val_counts(df)
    # Pair each result with its column label instead of reading Series.name,
    # which is 'proportion' on pandas >= 2.0.
    rows = [
        np.array([column, val_count.index[0], float(val_count.values[0])])
        for column, val_count in zip(df.columns, top_counts)
        if len(val_count) > 0  # an all-null column has no top value
    ]
    if not rows:
        return np.empty((0, 3), dtype=str)
    stacked_counts = np.vstack(rows)
    # Shares are stored as strings in the stacked array; sort on their float value.
    sorted_cols = np.argsort(stacked_counts[:, 2].astype('float'))
    return stacked_counts[sorted_cols[::-1]]

In [95]:
# One row per column of cat_df: [column name, most frequent value, share of that value].
summary = summarize_cats(cat_df)

In [96]:
# Rows are ordered by share, highest first.
summary

array([['requires_license', 't', '0.99975'],
       ['market', 'Berlin', '0.99975'],
       ['state', 'Berlin', '0.9978704747588626'],
       ['host_has_profile_pic', 't', '0.9976193459466233'],
       ['smart_location', 'Berlin, Germany', '0.99175'],
       ['city', 'Berlin', '0.9917489686210776'],
       ['street', 'Berlin, Berlin, Germany', '0.989125'],
       ['require_guest_profile_picture', 'f', '0.984375'],
       ['require_guest_phone_verification', 'f', '0.975125'],
       ['bed_type', 'Real Bed', '0.93525'],
       ['property_type', 'Apartment', '0.899'],
       ['host_is_superhost', 'f', '0.8743265254980579'],
       ['instant_bookable', 'f', '0.839'],
       ['host_location', 'Berlin, Berlin, Germany', '0.8262015309323629'],
       ['is_location_exact', 't', '0.761625'],
       ['room_type', 'Entire home/apt', '0.54125'],
       ['host_identity_verified', 't', '0.5090840746773587'],
       ['host_response_time', 'within an hour', '0.45789473684210524'],
       ['cancellatio

In [98]:
import numpy as np
# Keep the summary rows whose most frequent value (second column) is 't' or 'f'.
top_is_flag = np.isin(summary[:, 1], ['t', 'f'])
bool_cols = summary[top_is_flag]
# The first column of each row is the column name.
bool_cols[:, 0]

array(['requires_license', 'host_has_profile_pic',
       'require_guest_profile_picture',
       'require_guest_phone_verification', 'host_is_superhost',
       'instant_bookable', 'is_location_exact', 'host_identity_verified'],
      dtype='<U55')

In [54]:
# Pull the 't'/'f' columns out of cat_df by name.
bool_df = cat_df.loc[:, bool_cols[:, 0]]

In [103]:
# steps = [([col], [SimpleImputer(strategy='constant', fill_value='f'),MissingIndicator(missing_values='t')) for col in bool_cols[:, 0]]

In [70]:
# get_multiple_val_counts(bool_df, 2)

# bool_df

In [74]:
from sklearn.impute import MissingIndicator, SimpleImputer
# Per-column recipe for turning 't'/'f' strings into booleans:
#   1. SimpleImputer replaces missing entries with the constant 'f';
#   2. MissingIndicator(missing_values='t') then marks every 't' as True.
# features='all' keeps the output column even when a column holds no 't' at fit
# time; the default 'missing-only' would drop such a column from the result.
missing_indicator_steps = [
    ([col], [SimpleImputer(strategy='constant', fill_value='f'),
             MissingIndicator(missing_values='t', features='all')])
    for col in bool_cols[:, 0]
]

In [75]:
from sklearn_pandas import DataFrameMapper


# Build the mapper from the per-column steps; df_out=True requests a DataFrame result.
mapper = DataFrameMapper(missing_indicator_steps, df_out = True)

In [77]:
# Transform from the untouched source columns rather than from `bool_df` itself,
# so re-running this cell does not feed already-converted values back in.
bool_df = mapper.fit_transform(cat_df[bool_cols[:, 0]])

In [80]:
# bool_df

In [84]:
# Copy of df without the original 't'/'f' string columns.
df_updated_bool = df.drop(bool_df.columns, axis='columns')

In [91]:
# Number of boolean columns produced by the mapper.
bool_df.shape[1]

8

In [85]:
# Add the converted columns back; they were dropped from df_updated_bool above,
# so they are inserted as new columns.
df_updated_bool[bool_df.columns] = bool_df

In [86]:
# Shape of the object-dtype part after the conversion.
df_updated_bool.select_dtypes(include='object').shape

(8000, 37)

In [87]:
# Shape of the object-dtype part of the original frame, for comparison.
df.select_dtypes(include='object').shape

(8000, 45)

### Identifying Boolean Values