In [1]:
import pandas as pd
# Download the California housing training CSV into a DataFrame.
df = pd.read_csv(
    "https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv",
    sep=",",
)
# Independent copy of the freshly loaded frame; later cells overwrite
# df.latitude in place while still reading the original values from temp_df.
temp_df = df.copy()

In [2]:
# Preview the first five latitude values.
df.latitude.head(n=5)

0    34.19
1    34.40
2    33.69
3    33.64
4    33.57
Name: latitude, dtype: float64

In [10]:
import numpy as np
def get_quantile_based_boundaries(feature_values, num_buckets):
  """Return the interior quantile cut points that split a feature into buckets.

  Args:
    feature_values: object exposing a pandas-style `.quantile(q)` method
      that accepts an array of probabilities (the notebook passes
      DataFrame columns).
    num_buckets: number of buckets; the probabilities
      1/num_buckets, 2/num_buckets, ..., (num_buckets - 1)/num_buckets
      are evaluated.

  Returns:
    A list holding the quantile value for each probability, in ascending
    probability order (num_buckets - 1 entries for an integer num_buckets).
  """
  probabilities = np.arange(1.0, num_buckets) / num_buckets
  cut_points = feature_values.quantile(probabilities)
  # One entry per requested probability, in the order they were requested.
  return list(cut_points.values)

In [11]:
# Quantile cut points that divide latitude into five buckets.
bins = get_quantile_based_boundaries(df["latitude"], num_buckets=5)
bins

[33.86, 34.09, 36.64, 37.81]

In [12]:
# Show the first ten latitude values before they are bucketized.
df["latitude"].head(n=10)

0    34.19
1    34.40
2    33.69
3    33.64
4    33.57
5    33.63
6    33.61
7    34.83
8    33.61
9    34.83
Name: latitude, dtype: float64

In [13]:
# Replace each latitude in df with the index of the quantile bucket it falls in.
# The values are read from temp_df (the copy taken right after loading) rather
# than from df.latitude itself, so re-running this cell does not re-bucket
# values that were already converted to bucket indices.
df["latitude"] = np.digitize(temp_df["latitude"], bins)

In [14]:
# Preview the first five bucket indices now stored in the latitude column.
df["latitude"].head(n=5)

0    2
1    2
2    0
3    0
4    0
Name: latitude, dtype: int64

In [15]:
# Count how many rows landed in each latitude bucket.
df["latitude"].value_counts()

2    3547
4    3412
3    3389
0    3331
1    3321
Name: latitude, dtype: int64

In [18]:
# Split the latitude column into three quantile-based intervals and show the
# interval assigned to each of the first twenty rows.
pd.qcut(df["latitude"], q=3).head(n=20)

0        (1.0, 3.0]
1        (1.0, 3.0]
2     (-0.001, 1.0]
3     (-0.001, 1.0]
4     (-0.001, 1.0]
5     (-0.001, 1.0]
6     (-0.001, 1.0]
7        (1.0, 3.0]
8     (-0.001, 1.0]
9        (1.0, 3.0]
10    (-0.001, 1.0]
11    (-0.001, 1.0]
12       (1.0, 3.0]
13       (1.0, 3.0]
14    (-0.001, 1.0]
15       (1.0, 3.0]
16    (-0.001, 1.0]
17    (-0.001, 1.0]
18    (-0.001, 1.0]
19    (-0.001, 1.0]
Name: latitude, dtype: category
Categories (3, interval[float64]): [(-0.001, 1.0] < (1.0, 3.0] < (3.0, 4.0]]

In [20]:
# One-hot encode the qcut intervals of the first five rows. qcut runs on the
# whole column before head() is taken, so all three interval categories show
# up as columns.
pd.get_dummies(pd.qcut(df["latitude"], q=3).head(n=5))

latitude,"(-0.001, 1.0]","(1.0, 3.0]","(3.0, 4.0]"
0,0,1,0
1,0,1,0
2,1,0,0
3,1,0,0
4,1,0,0


In [22]:
from sklearn.preprocessing import KBinsDiscretizer
# Five-bin discretizer that emits one-hot encoded bin memberships.
enc = KBinsDiscretizer(
    n_bins=5,
    encode="onehot",
)
# Fit on the original latitude values from temp_df, reshaped into a single
# feature column of shape (n_rows, 1), and transform them in one step.
x = enc.fit_transform(
    temp_df.latitude.values.reshape(-1, 1)
)

In [23]:
# Convert the encoder output to a dense array so the one-hot rows can be inspected.
x.toarray()

array([[0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.]])

In [25]:
import tensorflow as tf
# Raw numeric feature columns keyed by their DataFrame column names.
longitude = tf.feature_column.numeric_column(key="longitude")
latitude = tf.feature_column.numeric_column(key="latitude")

# Bucketize longitude at the cut points of ten quantile buckets.
bucketized_longitude = tf.feature_column.bucketized_column(
    source_column=longitude,
    boundaries=get_quantile_based_boundaries(df["longitude"], num_buckets=10),
)

  from ._conv import register_converters as _register_converters


In [26]:
# Slice the column object to display its leading fields; the recorded output
# shows two entries: the source numeric column and the tuple of boundaries.
bucketized_longitude[:5]

(NumericColumn(key='longitude', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 (-122.28,
  -121.98,
  -121.36000000000001,
  -119.87,
  -118.49,
  -118.3,
  -118.12,
  -117.88,
  -117.24))

In [27]:
# Bucketize latitude at the cut points of ten quantile buckets.
# The quantiles are taken from temp_df (the copy made right after loading):
# an earlier cell overwrites df.latitude with bucket indices, so reading
# df["latitude"] here would yield boundaries over those indices instead of
# over real latitude values.
bucketized_latitude = tf.feature_column.bucketized_column(
    latitude,
    boundaries=get_quantile_based_boundaries(temp_df["latitude"], 10),
)
# Display the source column and the boundaries that were chosen.
bucketized_latitude[:5]

(NumericColumn(key='latitude', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 (33.62, 33.86, 34.0, 34.09, 34.25, 36.64, 37.47, 37.81, 38.48))

In [28]:
# Cross the two bucketized columns into one hashed categorical feature with
# 1000 hash buckets. The keys are passed as a list rather than a set: a set
# has no guaranteed iteration order, so the order in which the keys reach
# crossed_column could otherwise differ between interpreter sessions.
long_x_lat = tf.feature_column.crossed_column(
    [bucketized_longitude, bucketized_latitude], hash_bucket_size=1000)

In [29]:
# Inspect the concrete class of the crossed-column object.
type(long_x_lat)

tensorflow.python.feature_column.feature_column_v2.CrossedColumn