# Categorical Features

### Introduction

### Working with AirBnb

For this lesson, we'll work with [AirBnb listings in Berlin](https://www.kaggle.com/brittabettendorf/berlin-airbnb-data).

In [74]:
import pandas as pd
df = pd.read_csv('listings_summary.csv.zip')

In [82]:
pd.set_option('display.max_rows',100)

In [85]:
# df.T

### Feature engineering

Let's try to capture as much of this object data as possible.

In [78]:
def find_object_features(df):
    return list(df.dtypes[df.dtypes == 'object'].index)

In [95]:
# df.dtypes[df.dtypes == 'float64']

In [96]:
def find_object_feature_values(df):
    object_features = find_object_features(df)
    return df[object_features][:1].values[0]

In [97]:
def informative(df):
    non_informative = [column for column in df.columns if len(df[column].unique()) == 1]
    informative_columns = list(set(df.columns.to_list()) - set(non_informative))
    return df[informative_columns]

In [102]:
# informative(df)[:2]

In [6]:
def percentage_unique(df_series):
    series_filled = df_series.dropna()
    return len(series_filled.unique())/len(series_filled)

In [7]:
def find_categorical(df, threshold = .5):    
    categorical_df = pd.DataFrame({})
    for column in df.columns:
        if percentage_unique(df[column]) < threshold:
            categorical_df[column] = df[column]
    return categorical_df 

In [88]:
# find_object_feature_values(df)

In [116]:
df_informative = informative(df)

potential_categorical = find_categorical(df_informative)
# potential_categorical

### Combine with Selecting Categorical Columns

In [104]:
import numpy as np
def summarize_counts(df):
    non_empty_columns = df.dropna(axis=1,how='all').columns
    frequencies = np.array([df[column].value_counts(normalize=True).values[0] for column in non_empty_columns]).reshape(-1, 1)
    columns = non_empty_columns.to_numpy().reshape(-1, 1)
    top_values = np.array([df[column].value_counts(normalize=True).index[0] for column in non_empty_columns]).reshape(-1, 1)
    summarize = np.hstack((columns, frequencies, top_values))
    return summarize[summarize[:,1].argsort()[::-1]]

In [121]:

# summarize_counts(potential_categorical)

In [None]:
top_counts

In [18]:
def selected_summaries(df, not_values = [], lower_bound = .1, upper_bound = 1):
    potential_cols = summarize_counts(df)
    potential_cols = potential_cols[potential_cols[:, 1] > lower_bound]
    potential_cols = potential_cols[potential_cols[:, 1] < upper_bound]
    not_tf = ~np.isin(potential_cols[:, 2], not_values)
    return potential_cols[not_tf]

In [123]:
selected = selected_summaries(df, not_values = ['t', 'f'], upper_bound = .90)
selected

array([['property_type', 0.8968162468960624, 'Apartment'],
       ['bathrooms', 0.8795293072824156, '1.0'],
       ['review_scores_communication', 0.805393184074115, '10.0'],
       ['review_scores_checkin', 0.7908940397350993, '10.0'],
       ['guests_included', 0.7746984746363959, '1'],
       ['host_location', 0.7660902121590302, 'Berlin, Berlin, Germany'],
       ['calculated_host_listings_count', 0.7643667967364314, '1'],
       ['bedrooms', 0.761693441022455, '1.0'],
       ['review_scores_accuracy', 0.7544381960524865, '10.0'],
       ['host_response_rate', 0.7380138759449104, '100%'],
       ['host_listings_count', 0.7170824824647074, '1.0'],
       ['host_total_listings_count', 0.7170824824647074, '1.0'],
       ['availability_30', 0.6433575736076623, '0'],
       ['beds', 0.636549395877754, '1.0'],
       ['review_scores_location', 0.616356713205673, '10.0'],
       ['review_scores_cleanliness', 0.568325891626702, '10.0'],
       ['availability_60', 0.5574228449804896, '0'],


* But we may not want values with digits, as we could change them to floats.

In [126]:
def num_is_digit(array, str_index = 0):
    return np.array([value[str_index].isdigit() for value in array])

In [127]:
num_is_digit(selected[:, 2], str_index = 0)[0:10]

array([False,  True,  True,  True,  True, False,  True,  True,  True,
        True])

In [128]:
def remove_digits_from_selected(selected_matrix, col_idx, str_indices = [0, -1]):
    for idx in str_indices:
        selected_col = selected_matrix[~num_is_digit(selected_matrix[:, col_idx], idx)]
    return selected_col

In [130]:
selected_sums_no_digits = remove_digits_from_selected(selected, 2, [0, -1])
selected_sums_no_digits

array([['property_type', 0.8968162468960624, 'Apartment'],
       ['host_location', 0.7660902121590302, 'Berlin, Berlin, Germany'],
       ['host_response_rate', 0.7380138759449104, '100%'],
       ['host_response_time', 0.5260923586663906, 'within an hour'],
       ['room_type', 0.511440227030862, 'Private room'],
       ['cancellation_policy', 0.403600567577155, 'flexible'],
       ['neighbourhood_group_cleansed', 0.2437477829017382,
        'Friedrichshain-Kreuzberg'],
       ['host_verifications', 0.18193508336289466,
        "['email', 'phone', 'reviews']"],
       ['neighbourhood', 0.1498062648802577, 'Neukölln'],
       ['host_neighbourhood', 0.14640852331309429, 'Neukölln'],
       ['calendar_updated', 0.11160872649875843, 'today']], dtype=object)

### Cleaning Values

1. Find columns to clean

In [31]:
def categorical_plus_values(df, threshold = 5):
    categorical_cols = find_categorical(df)
    return [column for column in categorical_cols if len(df[column].value_counts()) > threshold]

In [131]:
selected_cat_cols = selected_sums_no_digits[:, 0]

selected_cat_cols

array(['property_type', 'host_location', 'host_response_rate',
       'host_response_time', 'room_type', 'cancellation_policy',
       'neighbourhood_group_cleansed', 'host_verifications',
       'neighbourhood', 'host_neighbourhood', 'calendar_updated'],
      dtype=object)

In [132]:
cat_cols_df = df_informative[selected_cat_cols]
cat_cols_df[:3]

Unnamed: 0,property_type,host_location,host_response_rate,host_response_time,room_type,cancellation_policy,neighbourhood_group_cleansed,host_verifications,neighbourhood,host_neighbourhood,calendar_updated
0,Guesthouse,"Key Biscayne, Florida, United States",96%,within an hour,Entire home/apt,strict_14_with_grace_period,Mitte,"['email', 'phone', 'reviews', 'jumio', 'offlin...",Mitte,Mitte,3 months ago
1,Apartment,"Berlin, Berlin, Germany",,,Private room,flexible,Pankow,"['email', 'phone', 'reviews', 'jumio', 'govern...",,Prenzlauer Berg,7 weeks ago
2,Apartment,"Coledale, New South Wales, Australia",100%,within a day,Entire home/apt,strict_14_with_grace_period,Pankow,"['email', 'phone', 'facebook', 'reviews', 'man...",Prenzlauer Berg,Prenzlauer Berg,a week ago


In [133]:
updated_non_digits = categorical_plus_values(cat_cols_df)

In [134]:
len(updated_non_digits)

8

In [135]:
updated_non_digits

['property_type',
 'host_location',
 'host_response_rate',
 'neighbourhood_group_cleansed',
 'host_verifications',
 'neighbourhood',
 'host_neighbourhood',
 'calendar_updated']

In [64]:
df[updated_non_digits].describe()

Unnamed: 0,property_type,host_location,host_response_rate,neighbourhood_group_cleansed,host_verifications,neighbourhood,host_neighbourhood,calendar_updated
count,22552,22436,9657,22552,22552,21421,17458,22552
unique,33,1036,64,12,301,91,181,75
top,Apartment,"Berlin, Berlin, Germany",100%,Friedrichshain-Kreuzberg,"['email', 'phone', 'reviews']",Neukölln,Neukölln,today
freq,20225,17188,7127,5497,4103,3209,2556,2517


In [141]:
# df['property_type'].value_counts(normalize = True)

### Clean Values of Relevant Columns

In [54]:
def selected_cat_values(column, threshold = .02):
    values_counted = column.value_counts(normalize=True)
    return values_counted[values_counted > threshold]

In [144]:
selected = selected_cat_values(df.neighbourhood_cleansed, .02)

In [145]:
selected

Tempelhofer Vorstadt        0.058753
Frankfurter Allee Süd FK    0.056846
Alexanderplatz              0.048377
Reuterstraße                0.044431
Rixdorf                     0.039021
Neuköllner Mitte/Zentrum    0.035341
Brunnenstr. Süd             0.034276
Frankfurter Allee Nord      0.032591
Schillerpromenade           0.029354
südliche Luisenstadt        0.028512
Prenzlauer Berg Nordwest    0.027625
Prenzlauer Berg Südwest     0.027403
Schöneberg-Nord             0.025142
Prenzlauer Berg Süd         0.024610
Wedding Zentrum             0.022925
Moabit West                 0.021728
nördliche Luisenstadt       0.021462
Schöneberg-Süd              0.021018
Helmholtzplatz              0.020353
Name: neighbourhood_cleansed, dtype: float64

In [146]:
def reduce_cat_values(column, threshold = .02):
    column = column.copy()
    selected_values = selected_cat_values(column, threshold).index
    column[~column.isin(selected_values)] = 'other'
    return column.astype('category')

In [149]:
reduced

NameError: name 'reduced' is not defined

In [69]:
len(df[updated_non_digits].columns)

8

In [70]:
categoricals = ['property_type', 'host_location', 'neighbourhood_cleansed', 'room_type', 'cancellation_policy', 'neighbourhood_group_cleansed', 'host_verifications', 'neighbourhood', 'host_neighbourhood']



In [71]:
def df_reduced_categories(df, categoricals, threshold = .01):
    new_df = pd.DataFrame()
    for category in categoricals:
        new_df[category] = reduce_cat_values(df[category], threshold)
    return new_df

In [72]:
df_reduced = df_reduced_categories(df, categoricals)

In [73]:
df_reduced.describe()

Unnamed: 0,property_type,host_location,neighbourhood_cleansed,room_type,cancellation_policy,neighbourhood_group_cleansed,host_verifications,neighbourhood,host_neighbourhood
count,22552,22552,22552,22552,22552,22552,22552,22552,22552
unique,5,4,31,3,4,11,18,14,14
top,Apartment,"Berlin, Berlin, Germany",other,Private room,flexible,Friedrichshain-Kreuzberg,"['email', 'phone', 'reviews']",other,other
freq,20225,17188,5031,11534,9102,5497,4103,4152,7648


In [198]:
summarize_counts(df_reduced)

array([['property_type', 0.8968162468960624, 'Apartment'],
       ['host_location', 0.7621496984746364, 'Berlin, Berlin, Germany'],
       ['neighbourhood_cleansed', 0.51232706633558, 'other'],
       ['room_type', 0.511440227030862, 'Private room'],
       ['cancellation_policy', 0.403600567577155, 'flexible'],
       ['host_neighbourhood', 0.38479957431713374, 'other'],
       ['host_verifications', 0.2920361830436325, 'other'],
       ['neighbourhood_group_cleansed', 0.2437477829017382,
        'Friedrichshain-Kreuzberg'],
       ['neighbourhood', 0.21882759843916283, 'other']], dtype=object)

In [236]:
def replace_df_columns(original_df, replacing_df):
    replacing_cols = replacing_df.columns
    original_df = original_df.drop(columns = replacing_cols)
    new_df = pd.concat([original_df, replacing_df], axis = 1)
    return new_df

In [233]:
new_df = replace_df_columns(df, df_reduced)