# Boolean Values

### Introduction

Deciding which features to include and focus on in our linear model is an important skill for any data scientist.  As we saw previously, if we include features that are too collinear, we will improperly measure the coefficients of those collinear features.  In addition, selecting features and prioritizing them by feature importance helps us understand which features to devote our attention to in terms of feature engineering and domain understanding.  Finally, limiting the number of features in our model and identifying the most crucial ones will make our models, and their insights, more understandable.

### Working with Airbnb

For this lesson, we'll work with [Airbnb listings in Berlin](https://www.kaggle.com/brittabettendorf/berlin-airbnb-data).

In [3]:
import pandas as pd
# Load the listings table straight from the zipped CSV (pandas infers the
# compression from the file extension).
df = pd.read_csv('listings_summary.csv.zip')

In [324]:
# Let pandas print up to 100 rows before truncating displayed output.
pd.set_option('display.max_rows',100)

### Feature engineering

Let's try to capture as much of the information held in the `object`-dtype columns as possible.

In [326]:
def find_object_features(df):
    """Return the names of the columns of ``df`` whose dtype is ``object``.

    The names are returned as a plain list, in the frame's column order.
    """
    column_dtypes = df.dtypes
    is_object = column_dtypes == 'object'
    return column_dtypes[is_object].index.tolist()

In [327]:
def find_object_feature_values(df):
    """Return the first row's values for every ``object``-dtype column.

    Useful for eyeballing one sample value per object column. Raises
    ``IndexError`` when ``df`` has no rows.
    """
    object_columns = find_object_features(df)
    first_row = df[object_columns][:1]
    return first_row.values[0]

In [47]:
import numpy as np
def find_booleans(df):
    """Find the columns of ``df`` that take exactly two distinct non-null values.

    Returns a 2-D array with one row per matching column, laid out as
    ``[column_name, most_frequent_value, other_value]`` (the value order is
    the order produced by ``value_counts``). When no column qualifies, an
    empty array of shape ``(0, 3)`` is returned so callers can still slice
    it by column.
    """
    names = []
    value_pairs = []
    for column in df.columns:
        # One value_counts pass per column gives both the "exactly two
        # values" test and the values themselves.
        distinct = df[column].value_counts(dropna=True).index.to_list()
        if len(distinct) == 2:
            names.append(column)
            value_pairs.append(distinct)

    if not names:
        # Without this guard the 2-D slicing below would raise IndexError
        # on an empty 1-D array.
        return np.empty((0, 3), dtype=str)

    boolean_columns = np.array(names)
    boolean_values = np.array(value_pairs)
    return np.stack((boolean_columns, boolean_values[:, 0], boolean_values[:, 1])).T

In [None]:
# Every two-valued column in the listings data, with its two values.
boolean_columns = find_booleans(df)

In [63]:
def select_booleans(df, values=()):
    """Return the two-valued columns whose most frequent value is in ``values``.

    ``values`` is any collection of candidate values (for example
    ``['t', 'f']``). Only the first value listed for each column by
    ``find_booleans`` is checked against it. The result has the same
    ``[column_name, value, value]`` row layout as ``find_booleans``.
    With the default empty ``values`` nothing matches.
    """
    boolean_columns = find_booleans(df)
    # Column 1 holds the first (most frequent) value of each candidate column.
    matches = np.isin(boolean_columns[:, 1], values)
    return boolean_columns[matches]

In [65]:
# 't' / 'f' are the string flags we want to treat as true / false.
boolean_values = ['t', 'f']
select_booleans(df, boolean_values)

array([['host_is_superhost', 'f', 't'],
       ['host_has_profile_pic', 't', 'f'],
       ['host_identity_verified', 'f', 't'],
       ['is_location_exact', 't', 'f'],
       ['requires_license', 't', 'f'],
       ['instant_bookable', 'f', 't'],
       ['require_guest_profile_picture', 'f', 't'],
       ['require_guest_phone_verification', 'f', 't']], dtype='<U32')

In [72]:
# Encode the 't' / 'f' string flags as 1 / 0.
boolean_mapping = {'t': 1, 'f': 0}

In [79]:
import numpy as np
def to_booleans(df, boolean_mapping):
    """Encode the two-valued flag columns of ``df`` using ``boolean_mapping``.

    ``boolean_mapping`` maps raw values to their encoded form, e.g.
    ``{'t': 1, 'f': 0}``. Columns are picked with ``select_booleans`` using
    the mapping's keys. Returns a new DataFrame holding only the encoded
    columns; values missing from the mapping (including NaN) come out as
    NaN, as ``Series.map`` does.
    """
    boolean_values = list(boolean_mapping.keys())
    boolean_features = select_booleans(df, boolean_values)[:, 0]
    boolean_df = pd.DataFrame({})
    for feature in boolean_features:
        boolean_df[feature] = df[feature].map(boolean_mapping)
    return boolean_df[boolean_features]

In [81]:
# Encode the 't' / 'f' columns and peek at the first two rows.
new_boolean_cols = to_booleans(df, boolean_mapping)
new_boolean_cols[0:2]

Unnamed: 0,host_is_superhost,host_has_profile_pic,host_identity_verified,is_location_exact,requires_license,instant_bookable,require_guest_profile_picture,require_guest_phone_verification
0,1.0,1.0,1.0,0,1,0,0,0
1,0.0,1.0,1.0,1,1,0,0,0


### Detecting Almost Binary Features

In [110]:
def almost_binary(df):
    """Return the share of rows taken by each column's most frequent value.

    Columns that are entirely NaN are skipped. The result is an ``(n, 1)``
    array, one row per remaining column, in column order. A later cell in
    this notebook redefines ``almost_binary`` with a ``threshold`` parameter.
    """
    populated_columns = df.dropna(axis=1, how='all').columns
    top_shares = []
    for name in populated_columns:
        shares = df[name].value_counts(normalize=True)
        top_shares.append(shares.values[0])
    return np.array(top_shares).reshape(-1, 1)


In [114]:
def summarize_counts(df):
    """Summarize the dominant value of every non-empty column of ``df``.

    Columns that are entirely NaN are skipped. Returns a 2-D array with one
    row per remaining column, laid out as
    ``[column_name, top_value_frequency, top_value]``, where the frequency
    is the fraction of non-null rows holding the most common value. Rows
    are sorted by frequency, highest first.
    """
    non_empty_columns = df.dropna(axis=1, how='all').columns

    frequencies = []
    top_values = []
    for column in non_empty_columns:
        # A single value_counts pass yields both the top value and its share.
        shares = df[column].value_counts(normalize=True)
        frequencies.append(shares.values[0])
        top_values.append(shares.index[0])

    summary = np.hstack((
        non_empty_columns.to_numpy().reshape(-1, 1),
        np.array(frequencies).reshape(-1, 1),
        np.array(top_values).reshape(-1, 1),
    ))
    # argsort is ascending; reverse it to put the most dominant columns first.
    return summary[summary[:, 1].argsort()[::-1]]

In [118]:
# Rows of [column, top-value frequency, top value], most dominant first.
summary = summarize_counts(df)

In [119]:
# Peek at the two most dominated columns.
summary[:2]

array([['country', 1.0, 'Germany'],
       ['country_code', 1.0, 'DE']], dtype=object)

In [120]:
def almost_binary(df, threshold = .95):
    """Find columns dominated by one value without being constant.

    A column qualifies when its most frequent value accounts for more than
    ``threshold`` but less than 100% of its non-null rows. Returns a 2-D
    array of ``[column_name, top_value]`` rows, ordered as in
    ``summarize_counts``. When nothing qualifies, an empty array of shape
    ``(0, 2)`` is returned so callers can still slice it by column.
    """
    pairs = [
        np.array([cat, top])
        for cat, frequency, top in summarize_counts(df)
        if threshold < frequency < 1.0
    ]
    if not pairs:
        # np.array([]) would be 1-D and break callers that index [:, 0].
        return np.empty((0, 2), dtype=str)
    return np.array(pairs)

In [122]:
# Columns whose top value covers more than 95% (but not all) of the rows.
almost_bin_feats = almost_binary(df)

In [123]:
# Display each almost-binary column next to its dominant value.
almost_bin_feats

array([['last_scraped', '2018-11-07'],
       ['calendar_last_scraped', '2018-11-07'],
       ['market', 'Berlin'],
       ['requires_license', 't'],
       ['state', 'Berlin'],
       ['host_has_profile_pic', 't'],
       ['city', 'Berlin'],
       ['smart_location', 'Berlin, Germany'],
       ['require_guest_profile_picture', 'f'],
       ['street', 'Berlin, Berlin, Germany'],
       ['require_guest_phone_verification', 'f'],
       ['bed_type', 'Real Bed']], dtype='<U32')

In [126]:
def remove_punctuation(string):
    """Turn a raw value into a lowercase, underscore-separated name fragment.

    Surrounding whitespace is stripped, the text is lowercased, spaces
    become underscores, and parentheses and commas are removed.
    """
    replacements = str.maketrans({' ': '_', '(': None, ')': None, ',': None})
    cleaned = string.strip().lower()
    return cleaned.translate(replacements)

In [127]:
def matrix_new_features(df):
    """Propose an indicator-column name for every almost-binary column.

    Returns a 2-D array of ``[column_name, top_value, new_name]`` rows,
    where ``new_name`` is ``<column_name>_is_<cleaned top value>`` and the
    top value is cleaned with ``remove_punctuation``.
    """
    bin_feats = almost_binary(df)
    proposed_names = np.array([
        f'{column}_is_{remove_punctuation(top)}' for column, top in bin_feats
    ])
    # Append the proposed names as a third column beside (column, top value).
    return np.hstack((bin_feats[:, :2], proposed_names.reshape(-1, 1)))

In [129]:
# Rows of [column, dominant value, proposed indicator-column name].
potential_new_features = matrix_new_features(df)

In [143]:
def booleans_without_top_values(df, not_values):
    """Return the almost-binary candidates whose top value is not excluded.

    ``not_values`` lists the top values to filter out. The result keeps the
    ``[column_name, top_value, new_name]`` row layout of
    ``matrix_new_features``.
    """
    candidates = matrix_new_features(df)
    is_excluded = np.isin(candidates[:, 1], not_values)
    return candidates[~is_excluded]

In [147]:
# Skip candidates dominated by 't' / 'f' (covered by the boolean mapping)
# and the ones dominated by the '2018-11-07' scrape date.
selected_booleans = booleans_without_top_values(df, ['t', 'f', '2018-11-07'])

In [153]:
# Column 0 holds the original column names; pull just those columns from df.
selected_bool_cols = selected_booleans[:, 0]
selected_booleans_df = df[selected_bool_cols]

In [131]:
def almost_to_boolean(df):
    """Build 0/1 indicator columns for the almost-binary columns of ``df``.

    For every row of ``matrix_new_features(df)`` a column named
    ``<column>_is_<top value>`` is created, holding 1 where the original
    column equals its dominant value and 0 elsewhere (NaN included).
    Returns a new DataFrame with only the indicator columns. The columns
    are built from plain NumPy arrays, so the result does not carry over
    ``df``'s index.
    """
    # Compute the feature matrix once; it scans every column of df.
    feature_matrix = matrix_new_features(df)
    return pd.DataFrame({
        new_name: np.where(df[column] == value, 1, 0)
        for column, value, new_name in feature_matrix
    })

In [151]:
# Turn each selected almost-binary column into a 0/1 indicator column.
almost = almost_to_boolean(selected_booleans_df)

In [152]:
# Check the dtypes of the new indicator columns.
almost.dtypes

market_is_berlin                    int64
state_is_berlin                     int64
city_is_berlin                      int64
smart_location_is_berlin_germany    int64
street_is_berlin_berlin_germany     int64
bed_type_is_real_bed                int64
dtype: object

In [171]:
def df_with_replaced_columns(original_df, selected_booleans_df):
    """Swap the almost-binary columns of ``original_df`` for ``selected_booleans_df``.

    The almost-binary columns detected in ``selected_booleans_df`` are
    dropped from ``original_df`` and ``selected_booleans_df`` is then
    concatenated on column-wise. ``original_df`` itself is not modified;
    a new DataFrame is returned.
    """
    matrix_features = matrix_new_features(selected_booleans_df)
    cols_to_drop = matrix_features[:, 0]
    # Drop from the frame that was passed in (previously this read an
    # undefined ``copied_df``).
    pruned_df = original_df.drop(cols_to_drop, axis = 1)
    # NOTE(review): this re-attaches selected_booleans_df as given; if the
    # 0/1-encoded columns are wanted instead, the caller presumably needs
    # to encode them first - confirm the intent.
    return pd.concat([pruned_df, selected_booleans_df], axis = 1)

In [172]:
# Swap the selected almost-binary columns back into the full frame, using the
# function defined in the cell above (``new_df_with_na_cols`` is not defined
# anywhere in this notebook).
new_df = df_with_replaced_columns(df, selected_booleans_df)

In [170]:
# Count the columns in the rebuilt frame.
len(new_df.columns)

96

### Summary